Make semantic vector-table misconfigurations fail with live evidence
Constraint: Phase-1 semantic jobs were already blocked by missing audio and model runtimes, so vector-table regressions needed their own isolated live proof to avoid being masked by the same environment failures.
Rejected: Infer vector-table coverage from code inspection only | It would not prove the worker writes the correct blocker reasons into PostgreSQL metadata.
Confidence: high
Scope-risk: narrow
Directive: When semantic extraction fails, inspect vector_table_report.reason before assuming the host is only missing mounts or model dependencies.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_embedding_vector_table_negative_matrix_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_embedding_vector_table_negative_matrix_live.py --dsn 'postgres://d2:***@127.0.0.1:5432/d2' --output data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json
Not-tested: No successful semantic extraction path exists yet on this host; this commit validates negative preflight cases only.
Showing
9 changed files
with
1024 additions
and
0 deletions
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T14:00:28.270203+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_192", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": "vector_table_dim_mismatch", | ||
| 88 | "resolved": false, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": false, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T14:00:28.270203+08:00", | ||
| 115 | "finished_at": "2026-06-04T14:00:28.271729+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_192", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "vector_table_dim_mismatch", | ||
| 152 | "model_runtime_unavailable" | ||
| 153 | ], | ||
| 154 | "scope_window_count": 20, | ||
| 155 | "write_target_table": "audio_embedding", | ||
| 156 | "vector_table_report": { | ||
| 157 | "reason": "vector_table_dim_mismatch", | ||
| 158 | "resolved": false, | ||
| 159 | "expected_dim": 768, | ||
| 160 | "table_exists": false, | ||
| 161 | "allowed_vector_tables": [ | ||
| 162 | "audio_embedding_vector_192", | ||
| 163 | "audio_embedding_vector_768" | ||
| 164 | ], | ||
| 165 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 166 | }, | ||
| 167 | "missing_window_count": 20, | ||
| 168 | "target_scope_summary": { | ||
| 169 | "scope_type": "reference_set", | ||
| 170 | "scope_value": "phase1_hot_reference_v1", | ||
| 171 | "recording_count": 20, | ||
| 172 | "reference_set_id": 2, | ||
| 173 | "ready_asset_count": 20, | ||
| 174 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 175 | "active_window_count": 20 | ||
| 176 | }, | ||
| 177 | "missing_window_samples": [ | ||
| 178 | { | ||
| 179 | "reason": "missing_audio", | ||
| 180 | "asset_id": 1, | ||
| 181 | "window_id": 1, | ||
| 182 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 183 | }, | ||
| 184 | { | ||
| 185 | "reason": "missing_audio", | ||
| 186 | "asset_id": 2, | ||
| 187 | "window_id": 2, | ||
| 188 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 189 | }, | ||
| 190 | { | ||
| 191 | "reason": "missing_audio", | ||
| 192 | "asset_id": 3, | ||
| 193 | "window_id": 3, | ||
| 194 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 195 | }, | ||
| 196 | { | ||
| 197 | "reason": "missing_audio", | ||
| 198 | "asset_id": 4, | ||
| 199 | "window_id": 4, | ||
| 200 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 201 | }, | ||
| 202 | { | ||
| 203 | "reason": "missing_audio", | ||
| 204 | "asset_id": 5, | ||
| 205 | "window_id": 5, | ||
| 206 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 207 | } | ||
| 208 | ] | ||
| 209 | } | ||
| 210 | }, | ||
| 211 | "resolved_vector_table": "audio_embedding_vector_192", | ||
| 212 | "vector_table_report": { | ||
| 213 | "requested_vector_table": "audio_embedding_vector_192", | ||
| 214 | "expected_dim": 768, | ||
| 215 | "allowed_vector_tables": [ | ||
| 216 | "audio_embedding_vector_192", | ||
| 217 | "audio_embedding_vector_768" | ||
| 218 | ], | ||
| 219 | "resolved": false, | ||
| 220 | "table_exists": false, | ||
| 221 | "reason": "vector_table_dim_mismatch" | ||
| 222 | }, | ||
| 223 | "runtime_report": { | ||
| 224 | "model_name": "mert", | ||
| 225 | "requirements": [ | ||
| 226 | "numpy", | ||
| 227 | "torch", | ||
| 228 | "torchaudio", | ||
| 229 | "transformers" | ||
| 230 | ], | ||
| 231 | "availability": { | ||
| 232 | "numpy": true, | ||
| 233 | "torch": false, | ||
| 234 | "torchaudio": false, | ||
| 235 | "transformers": false | ||
| 236 | }, | ||
| 237 | "missing_dependencies": [ | ||
| 238 | "torch", | ||
| 239 | "torchaudio", | ||
| 240 | "transformers" | ||
| 241 | ], | ||
| 242 | "ready": false | ||
| 243 | }, | ||
| 244 | "processed_windows": [], | ||
| 245 | "notes": [ | ||
| 246 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 247 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 248 | ] | ||
| 249 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/pgvector_eval/music20/embedding_vector_table_missing_in_schema_attempt.json
0 → 100644
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_vector_table_missing_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T14:00:28.943358+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": "vector_table_missing_in_schema", | ||
| 88 | "resolved": false, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": false, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T14:00:28.943358+08:00", | ||
| 115 | "finished_at": "2026-06-04T14:00:28.944578+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "vector_table_missing_in_schema", | ||
| 152 | "model_runtime_unavailable" | ||
| 153 | ], | ||
| 154 | "scope_window_count": 20, | ||
| 155 | "write_target_table": "audio_embedding", | ||
| 156 | "vector_table_report": { | ||
| 157 | "reason": "vector_table_missing_in_schema", | ||
| 158 | "resolved": false, | ||
| 159 | "expected_dim": 768, | ||
| 160 | "table_exists": false, | ||
| 161 | "allowed_vector_tables": [ | ||
| 162 | "audio_embedding_vector_192", | ||
| 163 | "audio_embedding_vector_768" | ||
| 164 | ], | ||
| 165 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 166 | }, | ||
| 167 | "missing_window_count": 20, | ||
| 168 | "target_scope_summary": { | ||
| 169 | "scope_type": "reference_set", | ||
| 170 | "scope_value": "phase1_hot_reference_v1", | ||
| 171 | "recording_count": 20, | ||
| 172 | "reference_set_id": 2, | ||
| 173 | "ready_asset_count": 20, | ||
| 174 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 175 | "active_window_count": 20 | ||
| 176 | }, | ||
| 177 | "missing_window_samples": [ | ||
| 178 | { | ||
| 179 | "reason": "missing_audio", | ||
| 180 | "asset_id": 1, | ||
| 181 | "window_id": 1, | ||
| 182 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 183 | }, | ||
| 184 | { | ||
| 185 | "reason": "missing_audio", | ||
| 186 | "asset_id": 2, | ||
| 187 | "window_id": 2, | ||
| 188 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 189 | }, | ||
| 190 | { | ||
| 191 | "reason": "missing_audio", | ||
| 192 | "asset_id": 3, | ||
| 193 | "window_id": 3, | ||
| 194 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 195 | }, | ||
| 196 | { | ||
| 197 | "reason": "missing_audio", | ||
| 198 | "asset_id": 4, | ||
| 199 | "window_id": 4, | ||
| 200 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 201 | }, | ||
| 202 | { | ||
| 203 | "reason": "missing_audio", | ||
| 204 | "asset_id": 5, | ||
| 205 | "window_id": 5, | ||
| 206 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 207 | } | ||
| 208 | ] | ||
| 209 | } | ||
| 210 | }, | ||
| 211 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 212 | "vector_table_report": { | ||
| 213 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 214 | "expected_dim": 768, | ||
| 215 | "allowed_vector_tables": [ | ||
| 216 | "audio_embedding_vector_192", | ||
| 217 | "audio_embedding_vector_768" | ||
| 218 | ], | ||
| 219 | "resolved": false, | ||
| 220 | "table_exists": false, | ||
| 221 | "reason": "vector_table_missing_in_schema" | ||
| 222 | }, | ||
| 223 | "runtime_report": { | ||
| 224 | "model_name": "mert", | ||
| 225 | "requirements": [ | ||
| 226 | "numpy", | ||
| 227 | "torch", | ||
| 228 | "torchaudio", | ||
| 229 | "transformers" | ||
| 230 | ], | ||
| 231 | "availability": { | ||
| 232 | "numpy": true, | ||
| 233 | "torch": false, | ||
| 234 | "torchaudio": false, | ||
| 235 | "transformers": false | ||
| 236 | }, | ||
| 237 | "missing_dependencies": [ | ||
| 238 | "torch", | ||
| 239 | "torchaudio", | ||
| 240 | "transformers" | ||
| 241 | ], | ||
| 242 | "ready": false | ||
| 243 | }, | ||
| 244 | "processed_windows": [], | ||
| 245 | "notes": [ | ||
| 246 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 247 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 248 | ] | ||
| 249 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "source_schema": "acr_test", | ||
| 3 | "missing_table_schema": "acr_vector_table_missing_test", | ||
| 4 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 5 | "cases": [ | ||
| 6 | { | ||
| 7 | "case": "vector_table_dim_mismatch", | ||
| 8 | "schema": "acr_test", | ||
| 9 | "vector_table": "audio_embedding_vector_192", | ||
| 10 | "job_status": "failed", | ||
| 11 | "failure_reason": "preflight_failed", | ||
| 12 | "preflight_blockers": [ | ||
| 13 | "unreadable_audio_assets", | ||
| 14 | "vector_table_dim_mismatch", | ||
| 15 | "model_runtime_unavailable" | ||
| 16 | ], | ||
| 17 | "vector_table_report": { | ||
| 18 | "reason": "vector_table_dim_mismatch", | ||
| 19 | "resolved": false, | ||
| 20 | "expected_dim": 768, | ||
| 21 | "table_exists": false, | ||
| 22 | "allowed_vector_tables": [ | ||
| 23 | "audio_embedding_vector_192", | ||
| 24 | "audio_embedding_vector_768" | ||
| 25 | ], | ||
| 26 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 27 | }, | ||
| 28 | "artifact": "data/pgvector_eval/music20/embedding_vector_table_dim_mismatch_attempt.json" | ||
| 29 | }, | ||
| 30 | { | ||
| 31 | "case": "vector_table_not_allowlisted", | ||
| 32 | "schema": "acr_test", | ||
| 33 | "vector_table": "audio_embedding_vector_1024", | ||
| 34 | "job_status": "failed", | ||
| 35 | "failure_reason": "preflight_failed", | ||
| 36 | "preflight_blockers": [ | ||
| 37 | "unreadable_audio_assets", | ||
| 38 | "vector_table_not_allowlisted", | ||
| 39 | "model_runtime_unavailable" | ||
| 40 | ], | ||
| 41 | "vector_table_report": { | ||
| 42 | "reason": "vector_table_not_allowlisted", | ||
| 43 | "resolved": false, | ||
| 44 | "expected_dim": 768, | ||
| 45 | "table_exists": false, | ||
| 46 | "allowed_vector_tables": [ | ||
| 47 | "audio_embedding_vector_192", | ||
| 48 | "audio_embedding_vector_768" | ||
| 49 | ], | ||
| 50 | "requested_vector_table": "audio_embedding_vector_1024" | ||
| 51 | }, | ||
| 52 | "artifact": "data/pgvector_eval/music20/embedding_vector_table_not_allowlisted_attempt.json" | ||
| 53 | }, | ||
| 54 | { | ||
| 55 | "case": "vector_table_missing_in_schema", | ||
| 56 | "schema": "acr_vector_table_missing_test", | ||
| 57 | "vector_table": "audio_embedding_vector_768", | ||
| 58 | "job_status": "failed", | ||
| 59 | "failure_reason": "preflight_failed", | ||
| 60 | "preflight_blockers": [ | ||
| 61 | "unreadable_audio_assets", | ||
| 62 | "vector_table_missing_in_schema", | ||
| 63 | "model_runtime_unavailable" | ||
| 64 | ], | ||
| 65 | "vector_table_report": { | ||
| 66 | "reason": "vector_table_missing_in_schema", | ||
| 67 | "resolved": false, | ||
| 68 | "expected_dim": 768, | ||
| 69 | "table_exists": false, | ||
| 70 | "allowed_vector_tables": [ | ||
| 71 | "audio_embedding_vector_192", | ||
| 72 | "audio_embedding_vector_768" | ||
| 73 | ], | ||
| 74 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 75 | }, | ||
| 76 | "artifact": "data/pgvector_eval/music20/embedding_vector_table_missing_in_schema_attempt.json" | ||
| 77 | } | ||
| 78 | ], | ||
| 79 | "summary": { | ||
| 80 | "expected_reasons": { | ||
| 81 | "vector_table_dim_mismatch": "vector_table_dim_mismatch", | ||
| 82 | "vector_table_not_allowlisted": "vector_table_not_allowlisted", | ||
| 83 | "vector_table_missing_in_schema": "vector_table_missing_in_schema" | ||
| 84 | }, | ||
| 85 | "all_failed": true | ||
| 86 | } | ||
| 87 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/pgvector_eval/music20/embedding_vector_table_not_allowlisted_attempt.json
0 → 100644
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T14:00:28.602175+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_1024", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": "vector_table_not_allowlisted", | ||
| 88 | "resolved": false, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": false, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_1024" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T14:00:28.602175+08:00", | ||
| 115 | "finished_at": "2026-06-04T14:00:28.603652+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_1024", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "vector_table_not_allowlisted", | ||
| 152 | "model_runtime_unavailable" | ||
| 153 | ], | ||
| 154 | "scope_window_count": 20, | ||
| 155 | "write_target_table": "audio_embedding", | ||
| 156 | "vector_table_report": { | ||
| 157 | "reason": "vector_table_not_allowlisted", | ||
| 158 | "resolved": false, | ||
| 159 | "expected_dim": 768, | ||
| 160 | "table_exists": false, | ||
| 161 | "allowed_vector_tables": [ | ||
| 162 | "audio_embedding_vector_192", | ||
| 163 | "audio_embedding_vector_768" | ||
| 164 | ], | ||
| 165 | "requested_vector_table": "audio_embedding_vector_1024" | ||
| 166 | }, | ||
| 167 | "missing_window_count": 20, | ||
| 168 | "target_scope_summary": { | ||
| 169 | "scope_type": "reference_set", | ||
| 170 | "scope_value": "phase1_hot_reference_v1", | ||
| 171 | "recording_count": 20, | ||
| 172 | "reference_set_id": 2, | ||
| 173 | "ready_asset_count": 20, | ||
| 174 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 175 | "active_window_count": 20 | ||
| 176 | }, | ||
| 177 | "missing_window_samples": [ | ||
| 178 | { | ||
| 179 | "reason": "missing_audio", | ||
| 180 | "asset_id": 1, | ||
| 181 | "window_id": 1, | ||
| 182 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 183 | }, | ||
| 184 | { | ||
| 185 | "reason": "missing_audio", | ||
| 186 | "asset_id": 2, | ||
| 187 | "window_id": 2, | ||
| 188 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 189 | }, | ||
| 190 | { | ||
| 191 | "reason": "missing_audio", | ||
| 192 | "asset_id": 3, | ||
| 193 | "window_id": 3, | ||
| 194 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 195 | }, | ||
| 196 | { | ||
| 197 | "reason": "missing_audio", | ||
| 198 | "asset_id": 4, | ||
| 199 | "window_id": 4, | ||
| 200 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 201 | }, | ||
| 202 | { | ||
| 203 | "reason": "missing_audio", | ||
| 204 | "asset_id": 5, | ||
| 205 | "window_id": 5, | ||
| 206 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 207 | } | ||
| 208 | ] | ||
| 209 | } | ||
| 210 | }, | ||
| 211 | "resolved_vector_table": "audio_embedding_vector_1024", | ||
| 212 | "vector_table_report": { | ||
| 213 | "requested_vector_table": "audio_embedding_vector_1024", | ||
| 214 | "expected_dim": 768, | ||
| 215 | "allowed_vector_tables": [ | ||
| 216 | "audio_embedding_vector_192", | ||
| 217 | "audio_embedding_vector_768" | ||
| 218 | ], | ||
| 219 | "resolved": false, | ||
| 220 | "table_exists": false, | ||
| 221 | "reason": "vector_table_not_allowlisted" | ||
| 222 | }, | ||
| 223 | "runtime_report": { | ||
| 224 | "model_name": "mert", | ||
| 225 | "requirements": [ | ||
| 226 | "numpy", | ||
| 227 | "torch", | ||
| 228 | "torchaudio", | ||
| 229 | "transformers" | ||
| 230 | ], | ||
| 231 | "availability": { | ||
| 232 | "numpy": true, | ||
| 233 | "torch": false, | ||
| 234 | "torchaudio": false, | ||
| 235 | "transformers": false | ||
| 236 | }, | ||
| 237 | "missing_dependencies": [ | ||
| 238 | "torch", | ||
| 239 | "torchaudio", | ||
| 240 | "transformers" | ||
| 241 | ], | ||
| 242 | "ready": false | ||
| 243 | }, | ||
| 244 | "processed_windows": [], | ||
| 245 | "notes": [ | ||
| 246 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 247 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 248 | ] | ||
| 249 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | import subprocess | ||
| 7 | from pathlib import Path | ||
| 8 | import sys | ||
| 9 | from typing import Any | ||
| 10 | |||
| 11 | import psycopg | ||
| 12 | |||
| 13 | ROOT = Path(__file__).resolve().parents[1] | ||
| 14 | if str(ROOT) not in sys.path: | ||
| 15 | sys.path.insert(0, str(ROOT)) | ||
| 16 | |||
| 17 | from workers._job_common import validate_schema | ||
| 18 | |||
| 19 | PYTHON_BIN = '/usr/local/miniconda3/bin/python' | ||
| 20 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'embedding_vector_table_negative_matrix_report.json' | ||
| 21 | SOURCE_SCHEMA = 'acr_test' | ||
| 22 | MINIMAL_TABLES = [ | ||
| 23 | 'canonical_song', | ||
| 24 | 'work', | ||
| 25 | 'recording', | ||
| 26 | 'recording_asset', | ||
| 27 | 'audio_window', | ||
| 28 | 'model_registry', | ||
| 29 | 'feature_set_registry', | ||
| 30 | 'feature_extraction_job', | ||
| 31 | 'reference_set_registry', | ||
| 32 | 'reference_set_member', | ||
| 33 | ] | ||
| 34 | |||
| 35 | |||
def run_cmd(cmd: list[str]) -> subprocess.CompletedProcess[str]:
    """Execute ``cmd`` with the repository root as working directory.

    stdout and stderr are captured and decoded as text. A non-zero exit
    status is not raised here; callers inspect ``returncode`` themselves.
    """
    completed = subprocess.run(
        cmd,
        cwd=ROOT,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed
| 38 | |||
| 39 | |||
def reset_source_jobs(dsn: str) -> None:
    """Run the phase-1 extraction-job bootstrap script against ``SOURCE_SCHEMA``.

    Presumably this puts the job rows back into their initial state before a
    case runs -- confirm against the bootstrap script itself.

    Raises:
        SystemExit: carrying the script's stderr (or stdout when stderr is
            empty) if the script exits with a non-zero status.
    """
    bootstrap_cmd = [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py']
    bootstrap_cmd += ['--dsn', dsn]
    bootstrap_cmd += ['--schema', SOURCE_SCHEMA]

    result = run_cmd(bootstrap_cmd)
    if result.returncode == 0:
        return
    raise SystemExit(result.stderr or result.stdout)
| 49 | |||
| 50 | |||
def clone_minimal_schema_without_vectors(dsn: str, target_schema: str) -> None:
    """Rebuild ``target_schema`` as a data copy of the tables in ``MINIMAL_TABLES``.

    The schema is dropped with CASCADE and recreated, then every listed table
    is copied from ``SOURCE_SCHEMA`` via ``CREATE TABLE ... AS TABLE ... WITH
    DATA``. ``MINIMAL_TABLES`` contains no ``audio_embedding_vector_*`` table,
    so the clone ends up without any vector table.
    """
    # The schema name is interpolated into SQL text below, so it goes through
    # validate_schema first (defined in workers._job_common; presumably it
    # rejects unsafe identifiers -- confirm there).
    safe_schema = validate_schema(target_schema)

    statements = [
        f'DROP SCHEMA IF EXISTS {safe_schema} CASCADE;',
        f'CREATE SCHEMA {safe_schema};',
    ]
    statements.extend(
        f'CREATE TABLE {safe_schema}.{table_name} AS TABLE {SOURCE_SCHEMA}.{table_name} WITH DATA;'
        for table_name in MINIMAL_TABLES
    )

    with psycopg.connect(dsn, autocommit=True) as conn:
        for statement in statements:
            conn.execute(statement)
| 58 | |||
| 59 | |||
def run_worker_case(*, dsn: str, schema: str, vector_table: str, output_name: str) -> dict[str, Any]:
    """Run the embedding worker once for job 2 (mert / v1-95m) and summarise the failure it recorded.

    Args:
        dsn: PostgreSQL connection string forwarded to the worker.
        schema: Schema the worker should operate on.
        vector_table: Value passed as ``--vector-table``.
        output_name: File name of the worker artifact, written under
            ``data/pgvector_eval/music20`` below the repository root.

    Returns:
        A dict with ``schema``, ``vector_table``, ``job_status``,
        ``failure_reason``, ``preflight_blockers``, ``vector_table_report`` and
        the repository-relative ``artifact`` path. Fields read from the worker
        artifact are ``None`` when the artifact does not contain them.

    Raises:
        SystemExit: if the worker process exits with a non-zero status.
    """
    artifact_path = ROOT / 'data' / 'pgvector_eval' / 'music20' / output_name
    worker_cmd = [
        PYTHON_BIN, 'workers/run_embedding_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', '2',
        '--model-name', 'mert',
        '--model-version', 'v1-95m',
        '--vector-table', vector_table,
        '--output', str(artifact_path),
    ]
    result = run_cmd(worker_cmd)
    if result.returncode != 0:
        raise SystemExit(result.stderr or result.stdout)

    report = json.loads(artifact_path.read_text(encoding='utf-8'))
    # Either level may be absent or null in the artifact; fall back to empty dicts.
    failed_status = report.get('status_after_failed') or {}
    failed_metadata = failed_status.get('metadata_json') or {}

    summary: dict[str, Any] = {
        'schema': schema,
        'vector_table': vector_table,
        'job_status': failed_status.get('job_status'),
    }
    for key in ('failure_reason', 'preflight_blockers', 'vector_table_report'):
        summary[key] = failed_metadata.get(key)
    summary['artifact'] = str(artifact_path.relative_to(ROOT))
    return summary
| 87 | |||
| 88 | |||
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles URL-style DSNs (``postgres://user:pass@host/db``, including a
    ``password=`` query parameter) and libpq keyword/value strings
    (``host=... password=...``). A DSN without a password is returned unchanged.
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    if '://' in dsn:
        parts = urlsplit(dsn)
        if parts.password is not None:
            userinfo, _, hostinfo = parts.netloc.rpartition('@')
            user = userinfo.split(':', 1)[0]
            parts = parts._replace(netloc=f'{user}:***@{hostinfo}')
        redacted_query = re.sub(r'((?:^|&)password=)[^&]*', r'\g<1>***', parts.query)
        return urlunsplit(parts._replace(query=redacted_query))
    return re.sub(r"(\bpassword\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\g<1>***', dsn)


def main() -> None:
    """Run the three vector-table negative cases and write the combined report.

    Cases, each preceded by a reset of the source-schema jobs:
      * ``audio_embedding_vector_192`` on the source schema (dimension mismatch),
      * ``audio_embedding_vector_1024`` on the source schema (not allowlisted),
      * ``audio_embedding_vector_768`` on a freshly cloned schema that has no
        vector tables (table missing in schema).

    The report is written to ``--output`` and echoed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    ap.add_argument('--missing-table-schema', default='acr_vector_table_missing_test')
    args = ap.parse_args()

    reset_source_jobs(args.dsn)
    dim_mismatch = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_192',
        output_name='embedding_vector_table_dim_mismatch_attempt.json',
    )

    reset_source_jobs(args.dsn)
    not_allowlisted = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_1024',
        output_name='embedding_vector_table_not_allowlisted_attempt.json',
    )

    reset_source_jobs(args.dsn)
    clone_minimal_schema_without_vectors(args.dsn, args.missing_table_schema)
    missing_table = run_worker_case(
        dsn=args.dsn,
        schema=args.missing_table_schema,
        vector_table='audio_embedding_vector_768',
        output_name='embedding_vector_table_missing_in_schema_attempt.json',
    )

    # Case name -> worker summary; insertion order fixes the order in the report.
    cases = {
        'vector_table_dim_mismatch': dim_mismatch,
        'vector_table_not_allowlisted': not_allowlisted,
        'vector_table_missing_in_schema': missing_table,
    }
    # run_worker_case returns None for 'vector_table_report' when the worker
    # metadata lacks it; report a None reason instead of crashing on .get().
    observed_reasons = {
        name: (result.get('vector_table_report') or {}).get('reason')
        for name, result in cases.items()
    }

    payload = {
        'source_schema': SOURCE_SCHEMA,
        'missing_table_schema': args.missing_table_schema,
        # Derived from the DSN actually used, so the report cannot disagree with the run.
        'dsn_redacted': _redact_dsn(args.dsn),
        'cases': [{'case': name, **result} for name, result in cases.items()],
        'summary': {
            # Key name kept for existing consumers; the values are the reasons observed in this run.
            'expected_reasons': observed_reasons,
            'all_failed': all(result['job_status'] == 'failed' for result in cases.values()),
            # Each case is named after the blocker reason it is meant to trigger.
            'reasons_match_case': all(reason == name for name, reason in observed_reasons.items()),
        },
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `scripts/run_embedding_vector_table_negative_matrix_live.py` 与 `embedding_vector_table_negative_matrix_report.json`,在 live PostgreSQL 上补齐 semantic preflight 的三类向量表负例:维度不匹配、未 allowlist、schema 缺表;三类 case 都会稳定落到 `preflight_failed`,且 `vector_table_report.reason` 与预期一致。 | ||
| 3 | - 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。 | 4 | - 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。 |
| 4 | - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。 | 5 | - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。 |
| 5 | - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 | 6 | - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 | ... | ... |
| ... | @@ -312,6 +312,21 @@ worker 会把这些 blocker 聚合到: | ... | @@ -312,6 +312,21 @@ worker 会把这些 blocker 聚合到: |
| 312 | 312 | ||
| 313 | 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。 | 313 | 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。 |
| 314 | 314 | ||
| 315 | ### 当前 vector table 负例证据 | ||
| 316 | |||
| 317 | 除了对 `audio_embedding_vector_768` 的常规存在性校验外,本轮还对 semantic lane 补了 3 类 live 负例: | ||
| 318 | |||
| 319 | - `audio_embedding_vector_192` -> `vector_table_dim_mismatch` | ||
| 320 | - `audio_embedding_vector_1024` -> `vector_table_not_allowlisted` | ||
| 321 | - 缺失 `audio_embedding_vector_768` 的隔离 schema -> `vector_table_missing_in_schema` | ||
| 322 | |||
| 323 | 对应产物: | ||
| 324 | |||
| 325 | - `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py` | ||
| 326 | - `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json` | ||
| 327 | |||
| 328 | 这说明 semantic worker 当前不只是会在“环境缺依赖”时失败,也能把 **配置错误的向量表** 的具体原因精确写入 job metadata(`vector_table_report.reason`)。 | ||
| 329 | |||
| 315 | ### 当前 live 证据 | 330 | ### 当前 live 证据 |
| 316 | 331 | ||
| 317 | MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: | 332 | MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: | ... | ... |
| ... | @@ -845,3 +845,30 @@ cd /workspace/acr-engine | ... | @@ -845,3 +845,30 @@ cd /workspace/acr-engine |
| 845 | - 当前阻塞已经非常明确,主要不是 orchestration,而是环境: | 845 | - 当前阻塞已经非常明确,主要不是 orchestration,而是环境: |
| 846 | - `/workspace/downloads` 未挂载 | 846 | - `/workspace/downloads` 未挂载 |
| 847 | - semantic model runtime 未安装 | 847 | - semantic model runtime 未安装 |
| 848 | |||
| 849 | |||
| 850 | ## 新增:semantic vector table 负例矩阵 | ||
| 851 | |||
| 852 | 为了避免后续把 semantic worker 的失败都误归因为“缺模型/缺音频”,本轮新增: | ||
| 853 | |||
| 854 | - `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py` | ||
| 855 | - `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json` | ||
| 856 | |||
| 857 | 它真实验证了 3 类向量表配置错误: | ||
| 858 | |||
| 859 | | case | schema | vector table | reason | | ||
| 860 | |---|---|---|---| | ||
| 861 | | `vector_table_dim_mismatch` | `acr_test` | `audio_embedding_vector_192` | `vector_table_dim_mismatch` | | ||
| 862 | | `vector_table_not_allowlisted` | `acr_test` | `audio_embedding_vector_1024` | `vector_table_not_allowlisted` | | ||
| 863 | | `vector_table_missing_in_schema` | `acr_vector_table_missing_test` | `audio_embedding_vector_768` | `vector_table_missing_in_schema` | | ||
| 864 | |||
| 865 | 共同点: | ||
| 866 | |||
| 867 | - 3 条 case 全部 `job_status = failed` | ||
| 868 | - `failure_reason = preflight_failed` | ||
| 869 | - `preflight_blockers` 中除了环境 blocker,还会额外带上精确的 vector-table blocker | ||
| 870 | |||
| 871 | 这说明: | ||
| 872 | |||
| 873 | - 当前 semantic preflight 已经能够把“运行环境问题”和“配置错误问题”分层暴露 | ||
| 874 | - 后续只要看 `vector_table_report.reason`,就能快速区分是 DDL/配置错误,还是模型 runtime/音频挂载错误 | ... | ... |
| ... | @@ -194,6 +194,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -194,6 +194,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 194 | - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 | 194 | - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 |
| 195 | - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 | 195 | - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 |
| 196 | - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失 | 196 | - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失 |
| 197 | - `scripts/run_embedding_vector_table_negative_matrix_live.py` 已在 live PostgreSQL 上补齐 semantic vector-table 负例矩阵:`vector_table_dim_mismatch`、`vector_table_not_allowlisted`、`vector_table_missing_in_schema` 三类错误都能被稳定写入 `vector_table_report.reason` | ||
| 197 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 198 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 198 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 199 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 199 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 200 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment