Make semantic extraction failures auditable before model runtimes land
Constraint: Current container lacks /workspace/downloads and torch/torchaudio/transformers, so Phase-1 semantic work must prove honest failure semantics instead of pretending inference succeeded.
Rejected: Stub semantic embeddings | Would blur the contract between real model outputs and repo-local placeholders.
Confidence: high
Scope-risk: narrow
Directive: Keep the preflight blockers explicit until real MERT/MuQ/ECAPA adapters and asset-level embedding tests exist.
Tested: /usr/local/miniconda3/bin/python -m py_compile workers/run_embedding_job.py workers/run_chromaprint_job.py workers/_job_common.py scripts/bootstrap_phase1_extraction_jobs_live.py scripts/plan_phase1_extraction_jobs_live.py scripts/bootstrap_phase1_reference_members_live.py scripts/live_pgvector_music20_eval.py; git diff --check; /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test; /usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768 --output data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json
Not-tested: Real encoder inference and asset-level embedding upsert path remain unavailable in this container.
Showing 9 changed files with 491 additions and 11 deletions
| 1 | { | ||
| 2 | "audio_embedding_count": 20, | ||
| 3 | "audio_embedding_vector_768_count": 0, | ||
| 4 | "job_2": [ | ||
| 5 | 2, | ||
| 6 | "failed", | ||
| 7 | 20, | ||
| 8 | 0, | ||
| 9 | { | ||
| 10 | "lane": "semantic", | ||
| 11 | "role": "primary_baseline", | ||
| 12 | "phase": "phase1", | ||
| 13 | "worker": "run_embedding_job", | ||
| 14 | "dry_run": false, | ||
| 15 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 16 | "vector_table": "audio_embedding_vector_768", | ||
| 17 | "output_target": "audio_embedding", | ||
| 18 | "execution_mode": "preflight_failure", | ||
| 19 | "failure_reason": "preflight_failed", | ||
| 20 | "runtime_report": { | ||
| 21 | "ready": false, | ||
| 22 | "model_name": "mert", | ||
| 23 | "availability": { | ||
| 24 | "numpy": true, | ||
| 25 | "torch": false, | ||
| 26 | "torchaudio": false, | ||
| 27 | "transformers": false | ||
| 28 | }, | ||
| 29 | "requirements": [ | ||
| 30 | "numpy", | ||
| 31 | "torch", | ||
| 32 | "torchaudio", | ||
| 33 | "transformers" | ||
| 34 | ], | ||
| 35 | "missing_dependencies": [ | ||
| 36 | "torch", | ||
| 37 | "torchaudio", | ||
| 38 | "transformers" | ||
| 39 | ] | ||
| 40 | }, | ||
| 41 | "preflight_blockers": [ | ||
| 42 | "unreadable_audio_assets", | ||
| 43 | "model_runtime_unavailable" | ||
| 44 | ], | ||
| 45 | "scope_window_count": 20, | ||
| 46 | "write_target_table": "audio_embedding", | ||
| 47 | "vector_table_report": { | ||
| 48 | "reason": null, | ||
| 49 | "resolved": true, | ||
| 50 | "expected_dim": 768, | ||
| 51 | "table_exists": true, | ||
| 52 | "allowed_vector_tables": [ | ||
| 53 | "audio_embedding_vector_192", | ||
| 54 | "audio_embedding_vector_768" | ||
| 55 | ], | ||
| 56 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 57 | }, | ||
| 58 | "missing_window_count": 20, | ||
| 59 | "target_scope_summary": { | ||
| 60 | "scope_type": "reference_set", | ||
| 61 | "scope_value": "phase1_hot_reference_v1", | ||
| 62 | "recording_count": 20, | ||
| 63 | "reference_set_id": 2, | ||
| 64 | "ready_asset_count": 20, | ||
| 65 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 66 | "active_window_count": 20 | ||
| 67 | }, | ||
| 68 | "missing_window_samples": [ | ||
| 69 | { | ||
| 70 | "reason": "missing_audio", | ||
| 71 | "asset_id": 1, | ||
| 72 | "window_id": 1, | ||
| 73 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 74 | }, | ||
| 75 | { | ||
| 76 | "reason": "missing_audio", | ||
| 77 | "asset_id": 2, | ||
| 78 | "window_id": 2, | ||
| 79 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "reason": "missing_audio", | ||
| 83 | "asset_id": 3, | ||
| 84 | "window_id": 3, | ||
| 85 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 86 | }, | ||
| 87 | { | ||
| 88 | "reason": "missing_audio", | ||
| 89 | "asset_id": 4, | ||
| 90 | "window_id": 4, | ||
| 91 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 92 | }, | ||
| 93 | { | ||
| 94 | "reason": "missing_audio", | ||
| 95 | "asset_id": 5, | ||
| 96 | "window_id": 5, | ||
| 97 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 98 | } | ||
| 99 | ] | ||
| 100 | } | ||
| 101 | ] | ||
| 102 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:44:05.982252+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:44:05.982252+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:44:05.983441+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 768, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 213 | "expected_dim": 768, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "mert", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "transformers" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "transformers": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "transformers" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "command": "/usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768", | ||
| 3 | "returncode": 1, | ||
| 4 | "stdout": "", | ||
| 5 | "stderr": "failed to update feature_extraction_job=2 with expected_status=pending\n", | ||
| 6 | "expected_guard": "failed to update feature_extraction_job=2 with expected_status=pending", | ||
| 7 | "passed": true | ||
| 8 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -423,6 +423,14 @@ ON audio_window(canonical_song_id); | ... | @@ -423,6 +423,14 @@ ON audio_window(canonical_song_id); |
| 423 | CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index | 423 | CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index |
| 424 | ON audio_window(active_for_index); | 424 | ON audio_window(active_for_index); |
| 425 | 425 | ||
| 426 | CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_window | ||
| 427 | ON audio_embedding(feature_set_id, window_id) | ||
| 428 | WHERE window_id IS NOT NULL; | ||
| 429 | |||
| 430 | CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_asset | ||
| 431 | ON audio_embedding(feature_set_id, asset_id) | ||
| 432 | WHERE window_id IS NULL AND asset_id IS NOT NULL; | ||
| 433 | |||
| 426 | CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id | 434 | CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id |
| 427 | ON audio_embedding(feature_set_id); | 435 | ON audio_embedding(feature_set_id); |
| 428 | 436 | ... | ... |
This diff is collapsed.
Click to expand it.
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 | ||
| 4 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 | ||
| 5 | - 新增 `phase1_worker_embedding_write_attempt.json`、`phase1_worker_embedding_write_guard_report.json` 与 `phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`。 | ||
| 3 | - 更新 `run_chromaprint_job.py` 与 `src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。 | 6 | - 更新 `run_chromaprint_job.py` 与 `src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。 |
| 4 | - 给 `audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。 | 7 | - 给 `audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。 |
| 5 | - 新增 `phase1_worker_chromaprint_write_attempt.json` 与 `phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20` 但 `processed_assets=0`,job 被明确标记为 `failed` 且 `failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。 | 8 | - 新增 `phase1_worker_chromaprint_write_attempt.json` 与 `phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20` 但 `processed_assets=0`,job 被明确标记为 `failed` 且 `failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。 | ... | ... |
| ... | @@ -286,15 +286,75 @@ flowchart TD | ... | @@ -286,15 +286,75 @@ flowchart TD |
| 286 | 286 | ||
| 287 | ### 7.2 Embedding worker | 287 | ### 7.2 Embedding worker |
| 288 | 288 | ||
| 289 | 后续把下面逻辑塞进 `run_embedding_job.py`: | 289 | `run_embedding_job.py` 现在已经不再只是简单 dry-run。当前它已经具备: |
| 290 | 290 | ||
| 291 | 1. 读取 `audio_window` | 291 | 1. 真实读取 `reference_set -> audio_window -> recording_asset` scope |
| 292 | 2. 加载 `MERT` / `MuQ` / `ECAPA` | 292 | 2. 真实检查目标向量表是否存在且与维度匹配 |
| 293 | 3. 提取向量 | 293 | 3. 真实检查模型 runtime 依赖是否齐全 |
| 294 | 4. 写 `audio_embedding` | 294 | 4. 真实检查 source audio 是否存在 |
| 295 | 5. 写 `audio_embedding_vector_<dim>` | 295 | 5. 把 blocker 明确写回 `feature_extraction_job.metadata_json` |
| 296 | 6. 更新 `output_count` | 296 | 6. 在 blocker 存在时把 job 诚实标记为 `failed` |
| 297 | 7. 标记 `completed` | 297 | |
| 298 | ### 当前失败语义 | ||
| 299 | |||
| 300 | semantic lane 当前采用的是 **preflight all-or-nothing**: | ||
| 301 | |||
| 302 | - 只要 scope 内音频路径不可达 / 文件不存在,记为: | ||
| 303 | - `unreadable_audio_assets` | ||
| 304 | - 只要模型 runtime 依赖导入不满足,记为: | ||
| 305 | - `model_runtime_unavailable` | ||
| 306 | - 只要目标向量表非法 / 缺失 / 维度不匹配,记为对应 blocker | ||
| 307 | |||
| 308 | worker 会把这些 blocker 聚合到: | ||
| 309 | |||
| 310 | - `failure_reason = preflight_failed` | ||
| 311 | - `preflight_blockers = [...]` | ||
| 312 | |||
| 313 | 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。 | ||
| 314 | |||
| 315 | ### 当前 live 证据 | ||
| 316 | |||
| 317 | MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: | ||
| 318 | |||
| 319 | - `scope_window_count = 20` | ||
| 320 | - `job_status = failed` | ||
| 321 | - `output_count = 0` | ||
| 322 | - `preflight_blockers = ['unreadable_audio_assets', 'model_runtime_unavailable']` | ||
| 323 | - `runtime_report.missing_dependencies = ['torch', 'torchaudio', 'transformers']` | ||
| 324 | - `audio_embedding_vector_768` 已通过存在性与维度校验 | ||
| 325 | |||
| 326 | 对应产物: | ||
| 327 | |||
| 328 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json` | ||
| 329 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json` | ||
| 330 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json` | ||
| 331 | |||
| 332 | ### 当前幂等保护 | ||
| 333 | |||
| 334 | 为了服务后续真正的 window embedding upsert,`audio_embedding` 现在补了两条唯一键: | ||
| 335 | |||
| 336 | - `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` | ||
| 337 | - `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` | ||
| 338 | |||
| 339 | 这让后续真实 encoder 接入后可以直接做: | ||
| 340 | |||
| 341 | - window 级 embedding upsert | ||
| 342 | - asset 级 embedding upsert | ||
| 343 | |||
| 344 | 而不需要先查再写。 | ||
| 345 | |||
| 346 | ### 下一步替换点 | ||
| 347 | |||
| 348 | 当 runtime 与音频挂载到位后,只需要把 guarded failure path 替换成真实 inference: | ||
| 349 | |||
| 350 | 1. 加载 `MERT` / `MuQ` / `ECAPA` | ||
| 351 | 2. 提取向量 | ||
| 352 | 3. 写 `audio_embedding` | ||
| 353 | 4. 写 `audio_embedding_vector_<dim>` | ||
| 354 | 5. 更新 `output_count` | ||
| 355 | 6. 标记 `completed` | ||
| 356 | |||
| 357 | 也就是说,**PostgreSQL worker contract 已经固定,下一步换的是 encoder adapter,不是 orchestration 结构。** | ||
| 298 | 358 | ||
| 299 | --- | 359 | --- |
| 300 | 360 | ||
| ... | @@ -313,11 +373,11 @@ flowchart TD | ... | @@ -313,11 +373,11 @@ flowchart TD |
| 313 | 373 | ||
| 314 | 当前还没有完成的部分: | 374 | 当前还没有完成的部分: |
| 315 | 375 | ||
| 316 | - 真实 chromaprint 特征写入 | 376 | - exact lane 虽已有真实写入路径,但当前 live 环境仍被 `/workspace/downloads` 缺失阻塞 |
| 317 | - 真实 MERT / MuQ / ECAPA embedding 写入 | 377 | - semantic lane 已有真实 preflight failure contract,但还没有接上真正的 `MERT / MuQ / ECAPA` inference adapter |
| 318 | - `failed` 重试策略 | 378 | - `failed` 重试策略 |
| 319 | - job 分片执行器 | 379 | - job 分片执行器 |
| 320 | - 幂等去重写入策略 | 380 | - 更完整的 embedding artifact / checksum 治理策略 |
| 321 | 381 | ||
| 322 | 但现在已经足够支撑下一阶段: | 382 | 但现在已经足够支撑下一阶段: |
| 323 | 383 | ... | ... |
| ... | @@ -552,6 +552,8 @@ flowchart TD | ... | @@ -552,6 +552,8 @@ flowchart TD |
| 552 | - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘 | 552 | - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘 |
| 553 | - live pgvector 检索和现有 stand-in 逻辑一致 | 553 | - live pgvector 检索和现有 stand-in 逻辑一致 |
| 554 | - `retrieval_candidate` / `match_decision` 可以真实承载在线结果 | 554 | - `retrieval_candidate` / `match_decision` 可以真实承载在线结果 |
| 555 | - semantic worker 已真实验证 preflight failure 语义:既能识别 `/workspace/downloads` 缺失,也能识别 `torch/torchaudio/transformers` 缺失 | ||
| 556 | - `audio_embedding` 已补上 window / asset 双路幂等唯一键,为后续 encoder 真实 upsert 预留稳定主键 | ||
| 555 | 557 | ||
| 556 | ### 未验证 | 558 | ### 未验证 |
| 557 | 559 | ||
| ... | @@ -690,3 +692,50 @@ cd /workspace/acr-engine | ... | @@ -690,3 +692,50 @@ cd /workspace/acr-engine |
| 690 | 692 | ||
| 691 | > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。 | 693 | > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。 |
| 692 | > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。 | 694 | > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。 |
| 695 | |||
| 696 | |||
| 697 | ## 新增:Phase-1 semantic worker live 证据 | ||
| 698 | |||
| 699 | 本轮继续对 `run_embedding_job.py` 做 live PostgreSQL 验证,目标不是伪造 embedding,而是把 **失败语义先固定住**。 | ||
| 700 | |||
| 701 | ### 结果摘要 | ||
| 702 | |||
| 703 | 对 `extraction_job_id=2`(`mert v1-95m`, `5s/2.5s`)执行非 dry-run worker 后: | ||
| 704 | |||
| 705 | | 项 | 结果 | | ||
| 706 | |---|---| | ||
| 707 | | `scope_window_count` | `20` | | ||
| 708 | | `job_status` | `failed` | | ||
| 709 | | `output_count` | `0` | | ||
| 710 | | `failure_reason` | `preflight_failed` | | ||
| 711 | | `preflight_blockers` | `['unreadable_audio_assets', 'model_runtime_unavailable']` | | ||
| 712 | | `vector_table_report.resolved` | `true` | | ||
| 713 | | `audio_embedding_vector_768_count` | `0` | | ||
| 714 | |||
| 715 | 说明: | ||
| 716 | |||
| 717 | - 当前语义 lane 不是“没做事”,而是已经真实走到了 PostgreSQL job scope / runtime / vector table / asset 路径检查 | ||
| 718 | - 只是当前容器同时被两个外部条件挡住: | ||
| 719 | 1. `/workspace/downloads/...` 未挂载 | ||
| 720 | 2. `torch / torchaudio / transformers` 未安装 | ||
| 721 | |||
| 722 | ### 证据文件 | ||
| 723 | |||
| 724 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json` | ||
| 725 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json` | ||
| 726 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json` | ||
| 727 | |||
| 728 | ### 为什么要先补唯一键 | ||
| 729 | |||
| 730 | 当前 `audio_embedding` 已新增: | ||
| 731 | |||
| 732 | - `uq_audio_embedding_feature_window` | ||
| 733 | - `uq_audio_embedding_feature_asset` | ||
| 734 | |||
| 735 | 设计意图是: | ||
| 736 | |||
| 737 | 1. 同一 `feature_set_id + window_id` 的 embedding 重跑时可以稳定 upsert | ||
| 738 | 2. 将来如果有 asset-level embedding,也能独立幂等 | ||
| 739 | 3. 不把幂等职责留给应用层“先查再写” | ||
| 740 | |||
| 741 | 这一步对后续的 `MERT / MuQ / ECAPA` 都通用。 | ... | ... |
| ... | @@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 | 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 |
| 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` | 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` |
| 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 | 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 |
| 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed | ||
| 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 | ||
| 192 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 194 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 193 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 195 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 194 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 196 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment