Freeze a live blocker matrix for semantic extraction jobs
Constraint: The current container still lacks mounted source audio and the semantic model runtimes, so repeated manual spot-checks are noisy and wasteful.
Rejected: Ad-hoc one-job validation only | It would not show whether failures are contract-wide or model-specific.
Confidence: high
Scope-risk: narrow
Directive: Re-run the matrix before claiming any semantic worker progress so blocker drift across MERT/MuQ/ECAPA is visible.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_phase1_embedding_preflight_matrix_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_phase1_embedding_preflight_matrix_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_embedding_preflight_matrix_report.json
Not-tested: This matrix still cannot prove successful semantic inference until assets and runtime dependencies are available.
Showing 9 changed files with 1293 additions and 0 deletions
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:52:49.952665+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:52:49.952665+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:52:49.954302+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 768, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 213 | "expected_dim": 768, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "mert", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "transformers" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "transformers": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "transformers" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 3, | ||
| 6 | "feature_set_id": 4, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/10s_5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "long_context_validation", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 10.0, | ||
| 19 | "hop_sec": 5.0, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "long_context_validation" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 3, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:52:50.210469+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "long_context_validation", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 3, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:52:50.210469+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:52:50.211993+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "long_context_validation", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 768, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 213 | "expected_dim": 768, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "mert", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "transformers" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "transformers": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "transformers" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 4, | ||
| 6 | "feature_set_id": 5, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "challenger", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "semantic_challenger" | ||
| 24 | }, | ||
| 25 | "model_id": 4, | ||
| 26 | "model_name": "muq", | ||
| 27 | "model_version": "large-msd-iter", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "challenger", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 4, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:52:50.434101+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "challenger", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "muq", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 4, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:52:50.434101+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:52:50.435667+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "challenger", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "muq", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 768, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 213 | "expected_dim": 768, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "muq", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "transformers" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "transformers": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "transformers" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 5, | ||
| 6 | "feature_set_id": 6, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "historical_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 192, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "historical_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 5, | ||
| 26 | "model_name": "ecapa", | ||
| 27 | "model_version": "acr-baseline-v1", | ||
| 28 | "model_family": "speech_derived", | ||
| 29 | "input_sample_rate": 16000, | ||
| 30 | "output_embedding_dim": 192, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "historical_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 5, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:52:50.702135+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "historical_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_192", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "ecapa", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "speechbrain": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "speechbrain" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "speechbrain" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 192, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 5, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:52:50.702135+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:52:50.703634+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "historical_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_192", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "ecapa", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "speechbrain": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "speechbrain" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "speechbrain" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 192, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_192", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_192", | ||
| 213 | "expected_dim": 192, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "ecapa", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "speechbrain" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "speechbrain": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "speechbrain" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "schema": "acr_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "semantic_job_count": 4, | ||
| 5 | "results": [ | ||
| 6 | { | ||
| 7 | "extraction_job_id": 2, | ||
| 8 | "model_name": "mert", | ||
| 9 | "model_version": "v1-95m", | ||
| 10 | "vector_table": "audio_embedding_vector_768", | ||
| 11 | "returncode": 0, | ||
| 12 | "job_status": "failed", | ||
| 13 | "failure_reason": "preflight_failed", | ||
| 14 | "preflight_blockers": [ | ||
| 15 | "unreadable_audio_assets", | ||
| 16 | "model_runtime_unavailable" | ||
| 17 | ], | ||
| 18 | "missing_window_count": 20, | ||
| 19 | "runtime_missing_dependencies": [ | ||
| 20 | "torch", | ||
| 21 | "torchaudio", | ||
| 22 | "transformers" | ||
| 23 | ], | ||
| 24 | "vector_table_report": { | ||
| 25 | "reason": null, | ||
| 26 | "resolved": true, | ||
| 27 | "expected_dim": 768, | ||
| 28 | "table_exists": true, | ||
| 29 | "allowed_vector_tables": [ | ||
| 30 | "audio_embedding_vector_192", | ||
| 31 | "audio_embedding_vector_768" | ||
| 32 | ], | ||
| 33 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 34 | }, | ||
| 35 | "attempt_artifact": "data/pgvector_eval/music20/job2_mert_preflight_attempt.json" | ||
| 36 | }, | ||
| 37 | { | ||
| 38 | "extraction_job_id": 3, | ||
| 39 | "model_name": "mert", | ||
| 40 | "model_version": "v1-95m", | ||
| 41 | "vector_table": "audio_embedding_vector_768", | ||
| 42 | "returncode": 0, | ||
| 43 | "job_status": "failed", | ||
| 44 | "failure_reason": "preflight_failed", | ||
| 45 | "preflight_blockers": [ | ||
| 46 | "unreadable_audio_assets", | ||
| 47 | "model_runtime_unavailable" | ||
| 48 | ], | ||
| 49 | "missing_window_count": 20, | ||
| 50 | "runtime_missing_dependencies": [ | ||
| 51 | "torch", | ||
| 52 | "torchaudio", | ||
| 53 | "transformers" | ||
| 54 | ], | ||
| 55 | "vector_table_report": { | ||
| 56 | "reason": null, | ||
| 57 | "resolved": true, | ||
| 58 | "expected_dim": 768, | ||
| 59 | "table_exists": true, | ||
| 60 | "allowed_vector_tables": [ | ||
| 61 | "audio_embedding_vector_192", | ||
| 62 | "audio_embedding_vector_768" | ||
| 63 | ], | ||
| 64 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 65 | }, | ||
| 66 | "attempt_artifact": "data/pgvector_eval/music20/job3_mert_preflight_attempt.json" | ||
| 67 | }, | ||
| 68 | { | ||
| 69 | "extraction_job_id": 4, | ||
| 70 | "model_name": "muq", | ||
| 71 | "model_version": "large-msd-iter", | ||
| 72 | "vector_table": "audio_embedding_vector_768", | ||
| 73 | "returncode": 0, | ||
| 74 | "job_status": "failed", | ||
| 75 | "failure_reason": "preflight_failed", | ||
| 76 | "preflight_blockers": [ | ||
| 77 | "unreadable_audio_assets", | ||
| 78 | "model_runtime_unavailable" | ||
| 79 | ], | ||
| 80 | "missing_window_count": 20, | ||
| 81 | "runtime_missing_dependencies": [ | ||
| 82 | "torch", | ||
| 83 | "torchaudio", | ||
| 84 | "transformers" | ||
| 85 | ], | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "attempt_artifact": "data/pgvector_eval/music20/job4_muq_preflight_attempt.json" | ||
| 98 | }, | ||
| 99 | { | ||
| 100 | "extraction_job_id": 5, | ||
| 101 | "model_name": "ecapa", | ||
| 102 | "model_version": "acr-baseline-v1", | ||
| 103 | "vector_table": "audio_embedding_vector_192", | ||
| 104 | "returncode": 0, | ||
| 105 | "job_status": "failed", | ||
| 106 | "failure_reason": "preflight_failed", | ||
| 107 | "preflight_blockers": [ | ||
| 108 | "unreadable_audio_assets", | ||
| 109 | "model_runtime_unavailable" | ||
| 110 | ], | ||
| 111 | "missing_window_count": 20, | ||
| 112 | "runtime_missing_dependencies": [ | ||
| 113 | "torch", | ||
| 114 | "torchaudio", | ||
| 115 | "speechbrain" | ||
| 116 | ], | ||
| 117 | "vector_table_report": { | ||
| 118 | "reason": null, | ||
| 119 | "resolved": true, | ||
| 120 | "expected_dim": 192, | ||
| 121 | "table_exists": true, | ||
| 122 | "allowed_vector_tables": [ | ||
| 123 | "audio_embedding_vector_192", | ||
| 124 | "audio_embedding_vector_768" | ||
| 125 | ], | ||
| 126 | "requested_vector_table": "audio_embedding_vector_192" | ||
| 127 | }, | ||
| 128 | "attempt_artifact": "data/pgvector_eval/music20/job5_ecapa_preflight_attempt.json" | ||
| 129 | } | ||
| 130 | ], | ||
| 131 | "summary": { | ||
| 132 | "failed_jobs": 4, | ||
| 133 | "models": [ | ||
| 134 | "mert", | ||
| 135 | "mert", | ||
| 136 | "muq", | ||
| 137 | "ecapa" | ||
| 138 | ], | ||
| 139 | "unique_blockers": [ | ||
| 140 | "model_runtime_unavailable", | ||
| 141 | "unreadable_audio_assets" | ||
| 142 | ] | ||
| 143 | } | ||
| 144 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | import subprocess | ||
| 7 | from pathlib import Path | ||
| 8 | from typing import Any | ||
| 9 | |||
| 10 | import psycopg | ||
| 11 | |||
| 12 | ROOT = Path(__file__).resolve().parents[1] | ||
| 13 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_embedding_preflight_matrix_report.json' | ||
| 14 | PYTHON_BIN = '/usr/local/miniconda3/bin/python' | ||
| 15 | |||
| 16 | |||
def load_semantic_jobs(conn: psycopg.Connection) -> list[dict[str, Any]]:
    """Return every window-level ``semantic_embedding`` extraction job, ordered by job id.

    Args:
        conn: Open connection; tables are referenced unqualified, so the
            caller is responsible for having set the schema search path.

    Returns:
        One dict per job with ``extraction_job_id``, ``model_name``,
        ``model_version``, ``embedding_dim`` (``None`` when the registry has
        no dimension) and ``vector_table`` (``None`` unless the dimension is
        192 or 768).
    """
    query = """
        SELECT
            fej.extraction_job_id,
            mr.model_name,
            mr.model_version,
            fs.embedding_dim
        FROM feature_extraction_job fej
        JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
        JOIN model_registry mr ON mr.model_id = fs.model_id
        WHERE fs.feature_name = 'semantic_embedding'
          AND fs.feature_level = 'window'
        ORDER BY fej.extraction_job_id;
        """
    jobs: list[dict[str, Any]] = []
    for record in conn.execute(query).fetchall():
        dim = record[3]
        # Only these two dimensions map to a vector table name here.
        if dim in (192, 768):
            table_name = f"audio_embedding_vector_{int(dim)}"
        else:
            table_name = None
        jobs.append(
            {
                'extraction_job_id': int(record[0]),
                'model_name': record[1],
                'model_version': record[2],
                'embedding_dim': None if dim is None else int(dim),
                'vector_table': table_name,
            }
        )
    return jobs
| 43 | |||
| 44 | |||
def reset_jobs(dsn: str, schema: str) -> None:
    """Run the phase-1 extraction-job bootstrap script against ``dsn``/``schema``.

    The script is executed from the repository root with its output captured.
    A non-zero exit raises ``subprocess.CalledProcessError`` (``check=True``).
    NOTE(review): the project notes say this puts ``feature_extraction_job``
    rows back to ``pending``; that behavior lives in the bootstrap script,
    not here.
    """
    connection_args = ['--dsn', dsn, '--schema', schema]
    subprocess.run(
        [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py', *connection_args],
        cwd=ROOT,
        check=True,
        capture_output=True,
        text=True,
    )
| 53 | |||
| 54 | |||
def run_job(dsn: str, schema: str, job: dict[str, Any]) -> dict[str, Any]:
    """Run the embedding worker for one job and summarize its attempt artifact.

    Args:
        dsn: Database connection string passed through to the worker.
        schema: Schema name passed through to the worker.
        job: A job dict as produced by ``load_semantic_jobs``; the keys
            ``extraction_job_id``, ``model_name``, ``model_version`` and
            ``vector_table`` are read.

    Returns:
        A flat summary row: job identity, the worker's return code, the
        resulting ``job_status``, and the failure details copied from the
        artifact's ``status_after_failed.metadata_json`` (``None`` for any
        field the artifact does not carry).

    Raises:
        FileNotFoundError: The worker exited without writing its artifact.
            The message includes the return code and the tail of stderr.
    """
    attempt_path = (
        ROOT / 'data' / 'pgvector_eval' / 'music20'
        / f"job{job['extraction_job_id']}_{job['model_name']}_preflight_attempt.json"
    )
    # Remove any artifact left by an earlier run. Otherwise a worker that
    # crashes before writing would be reported using stale data.
    if attempt_path.exists():
        attempt_path.unlink()

    cmd = [
        PYTHON_BIN,
        'workers/run_embedding_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', str(job['extraction_job_id']),
        '--model-name', job['model_name'],
        '--model-version', job['model_version'],
    ]
    if job['vector_table']:
        cmd.extend(['--vector-table', job['vector_table']])
    cmd.extend(['--output', str(attempt_path)])
    # No check=True: a failed preflight is an expected outcome that is
    # recorded in the summary rather than raised.
    proc = subprocess.run(cmd, cwd=ROOT, capture_output=True, text=True)

    if not attempt_path.is_file():
        stderr_tail = (proc.stderr or '').strip()[-2000:]
        raise FileNotFoundError(
            f"worker for job {job['extraction_job_id']} exited with code "
            f"{proc.returncode} without writing {attempt_path}; stderr tail: {stderr_tail}"
        )

    payload = json.loads(attempt_path.read_text(encoding='utf-8'))
    # The worker emits explicit nulls for the branch that did not happen, so
    # `or {}` is required; a `.get(key, {})` default would not cover null.
    status_after_failed = payload.get('status_after_failed') or {}
    status_after_complete = payload.get('status_after_complete') or {}
    metadata = status_after_failed.get('metadata_json') or {}
    runtime_report = metadata.get('runtime_report') or {}
    return {
        'extraction_job_id': job['extraction_job_id'],
        'model_name': job['model_name'],
        'model_version': job['model_version'],
        'vector_table': job['vector_table'],
        'returncode': proc.returncode,
        'job_status': status_after_failed.get('job_status') or status_after_complete.get('job_status'),
        'failure_reason': metadata.get('failure_reason'),
        'preflight_blockers': metadata.get('preflight_blockers'),
        'missing_window_count': metadata.get('missing_window_count'),
        'runtime_missing_dependencies': runtime_report.get('missing_dependencies'),
        'vector_table_report': metadata.get('vector_table_report'),
        'attempt_artifact': str(attempt_path.relative_to(ROOT)),
    }
| 87 | |||
| 88 | |||
def redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles URL-style DSNs (password in the userinfo part or in a
    ``password=`` query parameter) and keyword-style DSNs
    (``host=... password=...``). A DSN without a password is returned
    unchanged.
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    if '://' in dsn:
        parts = urlsplit(dsn)
        if parts.password is not None:
            userinfo, _, hostinfo = parts.netloc.rpartition('@')
            username = userinfo.split(':', 1)[0]
            dsn = urlunsplit(parts._replace(netloc=f'{username}:***@{hostinfo}'))
        return re.sub(r'([?&]password=)[^&#]*', r'\1***', dsn)
    # Keyword form: the value is either single-quoted or a run of non-spaces.
    return re.sub(r"(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)


def main() -> None:
    """Reset the extraction jobs, run every semantic job, and write the matrix report.

    Command-line options: ``--dsn`` (required), ``--schema`` (default
    ``acr_test``) and ``--output`` (default ``DEFAULT_OUTPUT``). The report is
    written to ``--output`` as indented JSON and also printed to stdout.
    """
    # psycopg is already a file-level dependency; only its sql helper is new.
    from psycopg import sql

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    reset_jobs(args.dsn, args.schema)
    with psycopg.connect(args.dsn, autocommit=True) as conn:
        # Compose the schema as a quoted identifier instead of formatting it
        # into the statement text. Quoting makes the name case-sensitive and
        # treats the whole value as a single schema.
        conn.execute(
            sql.SQL('SET search_path TO {}, public').format(sql.Identifier(args.schema))
        )
        jobs = load_semantic_jobs(conn)

    results = [run_job(args.dsn, args.schema, job) for job in jobs]
    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report cannot misstate its target.
        'dsn_redacted': redact_dsn(args.dsn),
        'semantic_job_count': len(results),
        'results': results,
        'summary': {
            'failed_jobs': sum(1 for item in results if item['job_status'] == 'failed'),
            'models': [item['model_name'] for item in results],
            'unique_blockers': sorted(
                {blocker for item in results for blocker in (item.get('preflight_blockers') or [])}
            ),
        },
    }
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Trailing newline keeps the committed artifact friendly to diffs and POSIX tools.
    out.write_text(rendered + '\n', encoding='utf-8')
    print(rendered)
| 117 | |||
| 118 | |||
| 119 | if __name__ == '__main__': | ||
| 120 | main() |
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 | ||
| 3 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 | 4 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 |
| 4 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 | 5 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 |
| 5 | - 新增 `phase1_worker_embedding_write_attempt.json`、`phase1_worker_embedding_write_guard_report.json` 与 `phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`。 | 6 | - 新增 `phase1_worker_embedding_write_attempt.json`、`phase1_worker_embedding_write_guard_report.json` 与 `phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`。 | ... | ... |
| ... | @@ -739,3 +739,38 @@ cd /workspace/acr-engine | ... | @@ -739,3 +739,38 @@ cd /workspace/acr-engine |
| 739 | 3. 不把幂等职责留给应用层“先查再写” | 739 | 3. 不把幂等职责留给应用层“先查再写” |
| 740 | 740 | ||
| 741 | 这一步对后续的 `MERT / MuQ / ECAPA` 都通用。 | 741 | 这一步对后续的 `MERT / MuQ / ECAPA` 都通用。 |
| 742 | |||
| 743 | |||
| 744 | ## 新增:Semantic preflight blocker matrix | ||
| 745 | |||
| 746 | 为了避免下次 session 继续手工逐个试,本轮又新增: | ||
| 747 | |||
| 748 | - `acr-engine/scripts/run_phase1_embedding_preflight_matrix_live.py` | ||
| 749 | - `acr-engine/data/pgvector_eval/music20/phase1_embedding_preflight_matrix_report.json` | ||
| 750 | |||
| 751 | 它会: | ||
| 752 | |||
| 753 | 1. 先把 `feature_extraction_job` 重置回 `pending` | ||
| 754 | 2. 顺序执行全部 semantic jobs(当前是 `mert 5s`、`mert 10s`、`muq 5s`、`ecapa 5s`) | ||
| 755 | 3. 归并输出每个 job 的: | ||
| 756 | - `failure_reason` | ||
| 757 | - `preflight_blockers` | ||
| 758 | - `runtime_missing_dependencies` | ||
| 759 | - `vector_table_report` | ||
| 760 | |||
| 761 | ### 当前矩阵结果 | ||
| 762 | |||
| 763 | | job | model | vector table | blockers | runtime missing | | ||
| 764 | |---|---|---|---|---| | ||
| 765 | | 2 | `mert v1-95m` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` | | ||
| 766 | | 3 | `mert v1-95m` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` | | ||
| 767 | | 4 | `muq large-msd-iter` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` | | ||
| 768 | | 5 | `ecapa acr-baseline-v1` | `audio_embedding_vector_192` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `speechbrain` | | ||
| 769 | |||
| 770 | 结论: | ||
| 771 | |||
| 772 | - 当前 semantic lane 的失败已经具有**稳定矩阵特征**,不是某一个 job 独有的偶发问题 | ||
| 773 | - `vector_table` 路径已全部通过 | ||
| 774 | - 当前真正阻塞 Phase-1 encoder-only 落地的是: | ||
| 775 | 1. `/workspace/downloads` 音频挂载 | ||
| 776 | 2. 模型 runtime 依赖安装 | ... | ... |
| ... | @@ -191,6 +191,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -191,6 +191,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 | 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 |
| 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed | 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed |
| 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 | 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 |
| 194 | - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 | ||
| 194 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 195 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 195 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 196 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 196 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 197 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment