Make semantic extraction failures auditable before model runtimes land
Constraint: Current container lacks /workspace/downloads and torch/torchaudio/transformers, so Phase-1 semantic work must prove honest failure semantics instead of pretending inference succeeded. Rejected: Stub semantic embeddings | Would blur the contract between real model outputs and repo-local placeholders. Confidence: high Scope-risk: narrow Directive: Keep the preflight blockers explicit until real MERT/MuQ/ECAPA adapters and asset-level embedding tests exist. Tested: /usr/local/miniconda3/bin/python -m py_compile workers/run_embedding_job.py workers/run_chromaprint_job.py workers/_job_common.py scripts/bootstrap_phase1_extraction_jobs_live.py scripts/plan_phase1_extraction_jobs_live.py scripts/bootstrap_phase1_reference_members_live.py scripts/live_pgvector_music20_eval.py; git diff --check; /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test; /usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768 --output data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json Not-tested: Real encoder inference and asset-level embedding upsert path remain unavailable in this container.
Showing
9 changed files
with
806 additions
and
17 deletions
| 1 | { | ||
| 2 | "audio_embedding_count": 20, | ||
| 3 | "audio_embedding_vector_768_count": 0, | ||
| 4 | "job_2": [ | ||
| 5 | 2, | ||
| 6 | "failed", | ||
| 7 | 20, | ||
| 8 | 0, | ||
| 9 | { | ||
| 10 | "lane": "semantic", | ||
| 11 | "role": "primary_baseline", | ||
| 12 | "phase": "phase1", | ||
| 13 | "worker": "run_embedding_job", | ||
| 14 | "dry_run": false, | ||
| 15 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 16 | "vector_table": "audio_embedding_vector_768", | ||
| 17 | "output_target": "audio_embedding", | ||
| 18 | "execution_mode": "preflight_failure", | ||
| 19 | "failure_reason": "preflight_failed", | ||
| 20 | "runtime_report": { | ||
| 21 | "ready": false, | ||
| 22 | "model_name": "mert", | ||
| 23 | "availability": { | ||
| 24 | "numpy": true, | ||
| 25 | "torch": false, | ||
| 26 | "torchaudio": false, | ||
| 27 | "transformers": false | ||
| 28 | }, | ||
| 29 | "requirements": [ | ||
| 30 | "numpy", | ||
| 31 | "torch", | ||
| 32 | "torchaudio", | ||
| 33 | "transformers" | ||
| 34 | ], | ||
| 35 | "missing_dependencies": [ | ||
| 36 | "torch", | ||
| 37 | "torchaudio", | ||
| 38 | "transformers" | ||
| 39 | ] | ||
| 40 | }, | ||
| 41 | "preflight_blockers": [ | ||
| 42 | "unreadable_audio_assets", | ||
| 43 | "model_runtime_unavailable" | ||
| 44 | ], | ||
| 45 | "scope_window_count": 20, | ||
| 46 | "write_target_table": "audio_embedding", | ||
| 47 | "vector_table_report": { | ||
| 48 | "reason": null, | ||
| 49 | "resolved": true, | ||
| 50 | "expected_dim": 768, | ||
| 51 | "table_exists": true, | ||
| 52 | "allowed_vector_tables": [ | ||
| 53 | "audio_embedding_vector_192", | ||
| 54 | "audio_embedding_vector_768" | ||
| 55 | ], | ||
| 56 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 57 | }, | ||
| 58 | "missing_window_count": 20, | ||
| 59 | "target_scope_summary": { | ||
| 60 | "scope_type": "reference_set", | ||
| 61 | "scope_value": "phase1_hot_reference_v1", | ||
| 62 | "recording_count": 20, | ||
| 63 | "reference_set_id": 2, | ||
| 64 | "ready_asset_count": 20, | ||
| 65 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 66 | "active_window_count": 20 | ||
| 67 | }, | ||
| 68 | "missing_window_samples": [ | ||
| 69 | { | ||
| 70 | "reason": "missing_audio", | ||
| 71 | "asset_id": 1, | ||
| 72 | "window_id": 1, | ||
| 73 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 74 | }, | ||
| 75 | { | ||
| 76 | "reason": "missing_audio", | ||
| 77 | "asset_id": 2, | ||
| 78 | "window_id": 2, | ||
| 79 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "reason": "missing_audio", | ||
| 83 | "asset_id": 3, | ||
| 84 | "window_id": 3, | ||
| 85 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 86 | }, | ||
| 87 | { | ||
| 88 | "reason": "missing_audio", | ||
| 89 | "asset_id": 4, | ||
| 90 | "window_id": 4, | ||
| 91 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 92 | }, | ||
| 93 | { | ||
| 94 | "reason": "missing_audio", | ||
| 95 | "asset_id": 5, | ||
| 96 | "window_id": 5, | ||
| 97 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 98 | } | ||
| 99 | ] | ||
| 100 | } | ||
| 101 | ] | ||
| 102 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 20, | ||
| 43 | "ready_asset_count": 20, | ||
| 44 | "active_window_count": 20 | ||
| 45 | }, | ||
| 46 | "scope_window_count": 20, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 2, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 20, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:44:05.982252+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "semantic", | ||
| 57 | "role": "primary_baseline", | ||
| 58 | "phase": "phase1", | ||
| 59 | "worker": "run_embedding_job", | ||
| 60 | "dry_run": false, | ||
| 61 | "vector_table": "audio_embedding_vector_768", | ||
| 62 | "output_target": "audio_embedding", | ||
| 63 | "execution_mode": "preflight", | ||
| 64 | "runtime_report": { | ||
| 65 | "ready": false, | ||
| 66 | "model_name": "mert", | ||
| 67 | "availability": { | ||
| 68 | "numpy": true, | ||
| 69 | "torch": false, | ||
| 70 | "torchaudio": false, | ||
| 71 | "transformers": false | ||
| 72 | }, | ||
| 73 | "requirements": [ | ||
| 74 | "numpy", | ||
| 75 | "torch", | ||
| 76 | "torchaudio", | ||
| 77 | "transformers" | ||
| 78 | ], | ||
| 79 | "missing_dependencies": [ | ||
| 80 | "torch", | ||
| 81 | "torchaudio", | ||
| 82 | "transformers" | ||
| 83 | ] | ||
| 84 | }, | ||
| 85 | "scope_window_count": 20, | ||
| 86 | "vector_table_report": { | ||
| 87 | "reason": null, | ||
| 88 | "resolved": true, | ||
| 89 | "expected_dim": 768, | ||
| 90 | "table_exists": true, | ||
| 91 | "allowed_vector_tables": [ | ||
| 92 | "audio_embedding_vector_192", | ||
| 93 | "audio_embedding_vector_768" | ||
| 94 | ], | ||
| 95 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 96 | }, | ||
| 97 | "target_scope_summary": { | ||
| 98 | "scope_type": "reference_set", | ||
| 99 | "scope_value": "phase1_hot_reference_v1", | ||
| 100 | "recording_count": 20, | ||
| 101 | "reference_set_id": 2, | ||
| 102 | "ready_asset_count": 20, | ||
| 103 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 104 | "active_window_count": 20 | ||
| 105 | } | ||
| 106 | } | ||
| 107 | }, | ||
| 108 | "status_after_complete": null, | ||
| 109 | "status_after_failed": { | ||
| 110 | "extraction_job_id": 2, | ||
| 111 | "job_status": "failed", | ||
| 112 | "input_count": 20, | ||
| 113 | "output_count": 0, | ||
| 114 | "started_at": "2026-06-04T13:44:05.982252+08:00", | ||
| 115 | "finished_at": "2026-06-04T13:44:05.983441+08:00", | ||
| 116 | "log_uri": null, | ||
| 117 | "metadata_json": { | ||
| 118 | "lane": "semantic", | ||
| 119 | "role": "primary_baseline", | ||
| 120 | "phase": "phase1", | ||
| 121 | "worker": "run_embedding_job", | ||
| 122 | "dry_run": false, | ||
| 123 | "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings", | ||
| 124 | "vector_table": "audio_embedding_vector_768", | ||
| 125 | "output_target": "audio_embedding", | ||
| 126 | "execution_mode": "preflight_failure", | ||
| 127 | "failure_reason": "preflight_failed", | ||
| 128 | "runtime_report": { | ||
| 129 | "ready": false, | ||
| 130 | "model_name": "mert", | ||
| 131 | "availability": { | ||
| 132 | "numpy": true, | ||
| 133 | "torch": false, | ||
| 134 | "torchaudio": false, | ||
| 135 | "transformers": false | ||
| 136 | }, | ||
| 137 | "requirements": [ | ||
| 138 | "numpy", | ||
| 139 | "torch", | ||
| 140 | "torchaudio", | ||
| 141 | "transformers" | ||
| 142 | ], | ||
| 143 | "missing_dependencies": [ | ||
| 144 | "torch", | ||
| 145 | "torchaudio", | ||
| 146 | "transformers" | ||
| 147 | ] | ||
| 148 | }, | ||
| 149 | "preflight_blockers": [ | ||
| 150 | "unreadable_audio_assets", | ||
| 151 | "model_runtime_unavailable" | ||
| 152 | ], | ||
| 153 | "scope_window_count": 20, | ||
| 154 | "write_target_table": "audio_embedding", | ||
| 155 | "vector_table_report": { | ||
| 156 | "reason": null, | ||
| 157 | "resolved": true, | ||
| 158 | "expected_dim": 768, | ||
| 159 | "table_exists": true, | ||
| 160 | "allowed_vector_tables": [ | ||
| 161 | "audio_embedding_vector_192", | ||
| 162 | "audio_embedding_vector_768" | ||
| 163 | ], | ||
| 164 | "requested_vector_table": "audio_embedding_vector_768" | ||
| 165 | }, | ||
| 166 | "missing_window_count": 20, | ||
| 167 | "target_scope_summary": { | ||
| 168 | "scope_type": "reference_set", | ||
| 169 | "scope_value": "phase1_hot_reference_v1", | ||
| 170 | "recording_count": 20, | ||
| 171 | "reference_set_id": 2, | ||
| 172 | "ready_asset_count": 20, | ||
| 173 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 174 | "active_window_count": 20 | ||
| 175 | }, | ||
| 176 | "missing_window_samples": [ | ||
| 177 | { | ||
| 178 | "reason": "missing_audio", | ||
| 179 | "asset_id": 1, | ||
| 180 | "window_id": 1, | ||
| 181 | "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "reason": "missing_audio", | ||
| 185 | "asset_id": 2, | ||
| 186 | "window_id": 2, | ||
| 187 | "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav" | ||
| 188 | }, | ||
| 189 | { | ||
| 190 | "reason": "missing_audio", | ||
| 191 | "asset_id": 3, | ||
| 192 | "window_id": 3, | ||
| 193 | "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav" | ||
| 194 | }, | ||
| 195 | { | ||
| 196 | "reason": "missing_audio", | ||
| 197 | "asset_id": 4, | ||
| 198 | "window_id": 4, | ||
| 199 | "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav" | ||
| 200 | }, | ||
| 201 | { | ||
| 202 | "reason": "missing_audio", | ||
| 203 | "asset_id": 5, | ||
| 204 | "window_id": 5, | ||
| 205 | "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav" | ||
| 206 | } | ||
| 207 | ] | ||
| 208 | } | ||
| 209 | }, | ||
| 210 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 211 | "vector_table_report": { | ||
| 212 | "requested_vector_table": "audio_embedding_vector_768", | ||
| 213 | "expected_dim": 768, | ||
| 214 | "allowed_vector_tables": [ | ||
| 215 | "audio_embedding_vector_192", | ||
| 216 | "audio_embedding_vector_768" | ||
| 217 | ], | ||
| 218 | "resolved": true, | ||
| 219 | "table_exists": true, | ||
| 220 | "reason": null | ||
| 221 | }, | ||
| 222 | "runtime_report": { | ||
| 223 | "model_name": "mert", | ||
| 224 | "requirements": [ | ||
| 225 | "numpy", | ||
| 226 | "torch", | ||
| 227 | "torchaudio", | ||
| 228 | "transformers" | ||
| 229 | ], | ||
| 230 | "availability": { | ||
| 231 | "numpy": true, | ||
| 232 | "torch": false, | ||
| 233 | "torchaudio": false, | ||
| 234 | "transformers": false | ||
| 235 | }, | ||
| 236 | "missing_dependencies": [ | ||
| 237 | "torch", | ||
| 238 | "torchaudio", | ||
| 239 | "transformers" | ||
| 240 | ], | ||
| 241 | "ready": false | ||
| 242 | }, | ||
| 243 | "processed_windows": [], | ||
| 244 | "notes": [ | ||
| 245 | "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics", | ||
| 246 | "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys" | ||
| 247 | ] | ||
| 248 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "command": "/usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768", | ||
| 3 | "returncode": 1, | ||
| 4 | "stdout": "", | ||
| 5 | "stderr": "failed to update feature_extraction_job=2 with expected_status=pending\n", | ||
| 6 | "expected_guard": "failed to update feature_extraction_job=2 with expected_status=pending", | ||
| 7 | "passed": true | ||
| 8 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -423,6 +423,14 @@ ON audio_window(canonical_song_id); | ... | @@ -423,6 +423,14 @@ ON audio_window(canonical_song_id); |
| 423 | CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index | 423 | CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index |
| 424 | ON audio_window(active_for_index); | 424 | ON audio_window(active_for_index); |
| 425 | 425 | ||
| 426 | CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_window | ||
| 427 | ON audio_embedding(feature_set_id, window_id) | ||
| 428 | WHERE window_id IS NOT NULL; | ||
| 429 | |||
| 430 | CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_asset | ||
| 431 | ON audio_embedding(feature_set_id, asset_id) | ||
| 432 | WHERE window_id IS NULL AND asset_id IS NOT NULL; | ||
| 433 | |||
| 426 | CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id | 434 | CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id |
| 427 | ON audio_embedding(feature_set_id); | 435 | ON audio_embedding(feature_set_id); |
| 428 | 436 | ... | ... |
| ... | @@ -2,7 +2,13 @@ | ... | @@ -2,7 +2,13 @@ |
| 2 | from __future__ import annotations | 2 | from __future__ import annotations |
| 3 | 3 | ||
| 4 | import argparse | 4 | import argparse |
| 5 | import json | ||
| 6 | import math | ||
| 5 | import os | 7 | import os |
| 8 | from pathlib import Path | ||
| 9 | from typing import Any | ||
| 10 | |||
| 11 | from psycopg import sql | ||
| 6 | 12 | ||
| 7 | from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status | 13 | from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status |
| 8 | 14 | ||
| ... | @@ -10,6 +16,229 @@ VECTOR_TABLE_BY_DIM = { | ... | @@ -10,6 +16,229 @@ VECTOR_TABLE_BY_DIM = { |
| 10 | 192: 'audio_embedding_vector_192', | 16 | 192: 'audio_embedding_vector_192', |
| 11 | 768: 'audio_embedding_vector_768', | 17 | 768: 'audio_embedding_vector_768', |
| 12 | } | 18 | } |
| 19 | MODEL_RUNTIME_REQUIREMENTS = { | ||
| 20 | 'mert': ('numpy', 'torch', 'torchaudio', 'transformers'), | ||
| 21 | 'muq': ('numpy', 'torch', 'torchaudio', 'transformers'), | ||
| 22 | 'ecapa': ('numpy', 'torch', 'torchaudio', 'speechbrain'), | ||
| 23 | } | ||
| 24 | ALLOWED_VECTOR_TABLES = set(VECTOR_TABLE_BY_DIM.values()) | ||
| 25 | |||
| 26 | |||
| 27 | def fetch_scope_windows(conn, target_scope: str) -> list[dict[str, object]]: | ||
| 28 | if not target_scope.startswith('reference_set:'): | ||
| 29 | raise SystemExit(f'unsupported target_scope for embedding worker: {target_scope}') | ||
| 30 | set_name = target_scope.split(':', 1)[1] | ||
| 31 | rows = conn.execute( | ||
| 32 | """ | ||
| 33 | SELECT | ||
| 34 | aw.window_id, | ||
| 35 | aw.asset_id, | ||
| 36 | aw.window_index, | ||
| 37 | aw.start_sec, | ||
| 38 | aw.end_sec, | ||
| 39 | aw.duration_sec, | ||
| 40 | aw.recording_id, | ||
| 41 | aw.work_id, | ||
| 42 | aw.canonical_song_id, | ||
| 43 | ra.storage_uri, | ||
| 44 | ra.ingest_status, | ||
| 45 | aw.active_for_index | ||
| 46 | FROM reference_set_registry rs | ||
| 47 | JOIN reference_set_member rsm ON rsm.reference_set_id = rs.reference_set_id | ||
| 48 | JOIN audio_window aw ON aw.recording_id = rsm.recording_id | ||
| 49 | JOIN recording_asset ra ON ra.asset_id = aw.asset_id | ||
| 50 | WHERE rs.set_name = %s | ||
| 51 | AND aw.active_for_index = TRUE | ||
| 52 | AND ra.ingest_status = 'ready' | ||
| 53 | ORDER BY aw.window_id; | ||
| 54 | """, | ||
| 55 | (set_name,), | ||
| 56 | ).fetchall() | ||
| 57 | return [ | ||
| 58 | { | ||
| 59 | 'window_id': int(row[0]), | ||
| 60 | 'asset_id': int(row[1]), | ||
| 61 | 'window_index': int(row[2]), | ||
| 62 | 'start_sec': float(row[3]), | ||
| 63 | 'end_sec': float(row[4]), | ||
| 64 | 'duration_sec': float(row[5]), | ||
| 65 | 'recording_id': int(row[6]), | ||
| 66 | 'work_id': int(row[7]), | ||
| 67 | 'canonical_song_id': int(row[8]), | ||
| 68 | 'storage_uri': row[9], | ||
| 69 | 'ingest_status': row[10], | ||
| 70 | 'active_for_index': bool(row[11]), | ||
| 71 | } | ||
| 72 | for row in rows | ||
| 73 | ] | ||
| 74 | |||
| 75 | |||
| 76 | def detect_runtime(model_name: str) -> dict[str, Any]: | ||
| 77 | checks: dict[str, Any] = {'model_name': model_name, 'requirements': list(MODEL_RUNTIME_REQUIREMENTS.get(model_name, ('numpy',)))} | ||
| 78 | availability: dict[str, bool] = {} | ||
| 79 | missing: list[str] = [] | ||
| 80 | for package_name in checks['requirements']: | ||
| 81 | try: | ||
| 82 | __import__(package_name) | ||
| 83 | availability[package_name] = True | ||
| 84 | except Exception: # noqa: BLE001 | ||
| 85 | availability[package_name] = False | ||
| 86 | missing.append(package_name) | ||
| 87 | checks['availability'] = availability | ||
| 88 | checks['missing_dependencies'] = missing | ||
| 89 | checks['ready'] = not missing | ||
| 90 | return checks | ||
| 91 | |||
| 92 | |||
| 93 | def validate_vector_table(conn, vector_table: str | None, expected_dim: int | None) -> dict[str, Any]: | ||
| 94 | payload = { | ||
| 95 | 'requested_vector_table': vector_table, | ||
| 96 | 'expected_dim': expected_dim, | ||
| 97 | 'allowed_vector_tables': sorted(ALLOWED_VECTOR_TABLES), | ||
| 98 | 'resolved': False, | ||
| 99 | 'table_exists': False, | ||
| 100 | 'reason': None, | ||
| 101 | } | ||
| 102 | if not vector_table: | ||
| 103 | payload['reason'] = 'missing_vector_table' | ||
| 104 | return payload | ||
| 105 | if vector_table not in ALLOWED_VECTOR_TABLES: | ||
| 106 | payload['reason'] = 'vector_table_not_allowlisted' | ||
| 107 | return payload | ||
| 108 | dim_from_table = 192 if vector_table.endswith('_192') else 768 if vector_table.endswith('_768') else None | ||
| 109 | if expected_dim is not None and dim_from_table is not None and dim_from_table != expected_dim: | ||
| 110 | payload['reason'] = 'vector_table_dim_mismatch' | ||
| 111 | return payload | ||
| 112 | row = conn.execute('SELECT to_regclass(%s);', (vector_table,)).fetchone() | ||
| 113 | payload['table_exists'] = bool(row and row[0]) | ||
| 114 | if not payload['table_exists']: | ||
| 115 | payload['reason'] = 'vector_table_missing_in_schema' | ||
| 116 | return payload | ||
| 117 | payload['resolved'] = True | ||
| 118 | return payload | ||
| 119 | |||
| 120 | |||
| 121 | def build_artifact_path(artifact_dir: Path, *, extraction_job_id: int, window_id: int) -> Path: | ||
| 122 | artifact_dir.mkdir(parents=True, exist_ok=True) | ||
| 123 | return artifact_dir / f'job{extraction_job_id}_window{window_id}.json' | ||
| 124 | |||
| 125 | |||
| 126 | def vector_literal(values: list[float]) -> str: | ||
| 127 | return '[' + ','.join(f'{value:.10f}' for value in values) + ']' | ||
| 128 | |||
| 129 | |||
| 130 | def compute_vector_norm(values: list[float]) -> float: | ||
| 131 | return math.sqrt(sum(value * value for value in values)) | ||
| 132 | |||
| 133 | |||
| 134 | def upsert_audio_embedding( | ||
| 135 | conn, | ||
| 136 | *, | ||
| 137 | feature_set_id: int, | ||
| 138 | extraction_job_id: int, | ||
| 139 | vector_table: str, | ||
| 140 | window: dict[str, object], | ||
| 141 | embedding_uri: str, | ||
| 142 | embedding: list[float], | ||
| 143 | checksum: str | None, | ||
| 144 | metadata_json: dict[str, object], | ||
| 145 | ) -> tuple[int, str]: | ||
| 146 | row = conn.execute( | ||
| 147 | """ | ||
| 148 | INSERT INTO audio_embedding ( | ||
| 149 | feature_set_id, extraction_job_id, asset_id, window_id, recording_id, work_id, | ||
| 150 | canonical_song_id, embedding_storage_mode, embedding_uri, vector_norm, checksum, | ||
| 151 | is_indexed, metadata_json | ||
| 152 | ) VALUES ( | ||
| 153 | %s, %s, %s, %s, %s, %s, | ||
| 154 | %s, %s, %s, %s, %s, | ||
| 155 | TRUE, %s::jsonb | ||
| 156 | ) | ||
| 157 | ON CONFLICT (feature_set_id, window_id) WHERE window_id IS NOT NULL | ||
| 158 | DO UPDATE SET | ||
| 159 | extraction_job_id = EXCLUDED.extraction_job_id, | ||
| 160 | asset_id = EXCLUDED.asset_id, | ||
| 161 | recording_id = EXCLUDED.recording_id, | ||
| 162 | work_id = EXCLUDED.work_id, | ||
| 163 | canonical_song_id = EXCLUDED.canonical_song_id, | ||
| 164 | embedding_storage_mode = EXCLUDED.embedding_storage_mode, | ||
| 165 | embedding_uri = EXCLUDED.embedding_uri, | ||
| 166 | vector_norm = EXCLUDED.vector_norm, | ||
| 167 | checksum = EXCLUDED.checksum, | ||
| 168 | is_indexed = EXCLUDED.is_indexed, | ||
| 169 | metadata_json = EXCLUDED.metadata_json | ||
| 170 | RETURNING embedding_id, xmax = 0 AS inserted; | ||
| 171 | """, | ||
| 172 | ( | ||
| 173 | feature_set_id, | ||
| 174 | extraction_job_id, | ||
| 175 | window['asset_id'], | ||
| 176 | window['window_id'], | ||
| 177 | window['recording_id'], | ||
| 178 | window['work_id'], | ||
| 179 | window['canonical_song_id'], | ||
| 180 | 'pgvector_inline', | ||
| 181 | embedding_uri, | ||
| 182 | compute_vector_norm(embedding), | ||
| 183 | checksum, | ||
| 184 | json.dumps(metadata_json, ensure_ascii=False), | ||
| 185 | ), | ||
| 186 | ).fetchone() | ||
| 187 | embedding_id = int(row[0]) | ||
| 188 | inserted = bool(row[1]) | ||
| 189 | conn.execute( | ||
| 190 | sql.SQL( | ||
| 191 | """ | ||
| 192 | INSERT INTO {vector_table} (embedding_id, embedding) | ||
| 193 | VALUES (%s, %s::vector) | ||
| 194 | ON CONFLICT (embedding_id) | ||
| 195 | DO UPDATE SET embedding = EXCLUDED.embedding; | ||
| 196 | """ | ||
| 197 | ).format(vector_table=sql.Identifier(vector_table)), | ||
| 198 | (embedding_id, vector_literal(embedding)), | ||
| 199 | ) | ||
| 200 | return embedding_id, 'inserted' if inserted else 'updated' | ||
| 201 | |||
| 202 | |||
| 203 | def fail_job( | ||
| 204 | conn, | ||
| 205 | *, | ||
| 206 | job, | ||
| 207 | blockers: list[str], | ||
| 208 | output_target: str, | ||
| 209 | resolved_vector_table: str | None, | ||
| 210 | artifact_dir: Path, | ||
| 211 | scope: dict[str, Any], | ||
| 212 | scope_windows: list[dict[str, object]], | ||
| 213 | missing_windows: list[dict[str, object]], | ||
| 214 | runtime_report: dict[str, Any], | ||
| 215 | vector_table_report: dict[str, Any], | ||
| 216 | ) -> dict[str, Any]: | ||
| 217 | return update_job_status( | ||
| 218 | conn, | ||
| 219 | job.extraction_job_id, | ||
| 220 | status='failed', | ||
| 221 | expected_status='running', | ||
| 222 | output_count=0, | ||
| 223 | metadata_patch={ | ||
| 224 | 'worker': 'run_embedding_job', | ||
| 225 | 'output_target': output_target, | ||
| 226 | 'vector_table': resolved_vector_table, | ||
| 227 | 'dry_run': False, | ||
| 228 | 'write_target_table': output_target, | ||
| 229 | 'artifact_dir': str(artifact_dir), | ||
| 230 | 'execution_mode': 'preflight_failure', | ||
| 231 | 'failure_reason': 'preflight_failed', | ||
| 232 | 'preflight_blockers': blockers, | ||
| 233 | 'scope_window_count': len(scope_windows), | ||
| 234 | 'missing_window_count': len(missing_windows), | ||
| 235 | 'missing_window_samples': missing_windows[:5], | ||
| 236 | 'runtime_report': runtime_report, | ||
| 237 | 'vector_table_report': vector_table_report, | ||
| 238 | 'target_scope_summary': scope, | ||
| 239 | }, | ||
| 240 | set_finished_at=True, | ||
| 241 | ) | ||
| 13 | 242 | ||
| 14 | 243 | ||
| 15 | def main() -> None: | 244 | def main() -> None: |
| ... | @@ -22,6 +251,7 @@ def main() -> None: | ... | @@ -22,6 +251,7 @@ def main() -> None: |
| 22 | ap.add_argument('--vector-table', default=os.environ.get('VECTOR_TABLE')) | 251 | ap.add_argument('--vector-table', default=os.environ.get('VECTOR_TABLE')) |
| 23 | ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_embedding')) | 252 | ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_embedding')) |
| 24 | ap.add_argument('--complete-dry-run', action='store_true') | 253 | ap.add_argument('--complete-dry-run', action='store_true') |
| 254 | ap.add_argument('--artifact-dir', default=os.environ.get('ARTIFACT_DIR', 'data/pgvector_eval/music20/phase1_embeddings')) | ||
| 25 | ap.add_argument('--output') | 255 | ap.add_argument('--output') |
| 26 | args = ap.parse_args() | 256 | args = ap.parse_args() |
| 27 | 257 | ||
| ... | @@ -30,6 +260,8 @@ def main() -> None: | ... | @@ -30,6 +260,8 @@ def main() -> None: |
| 30 | if not args.job_id: | 260 | if not args.job_id: |
| 31 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') | 261 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') |
| 32 | 262 | ||
| 263 | artifact_dir = Path(args.artifact_dir) | ||
| 264 | |||
| 33 | with connect(args.dsn, args.schema) as conn: | 265 | with connect(args.dsn, args.schema) as conn: |
| 34 | job = fetch_job_context(conn, args.job_id) | 266 | job = fetch_job_context(conn, args.job_id) |
| 35 | if job.model_name == 'chromaprint': | 267 | if job.model_name == 'chromaprint': |
| ... | @@ -43,25 +275,37 @@ def main() -> None: | ... | @@ -43,25 +275,37 @@ def main() -> None: |
| 43 | raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') | 275 | raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') |
| 44 | if args.model_version and job.model_version != args.model_version: | 276 | if args.model_version and job.model_version != args.model_version: |
| 45 | raise SystemExit(f'model version mismatch: job={job.model_version} cli={args.model_version}') | 277 | raise SystemExit(f'model version mismatch: job={job.model_version} cli={args.model_version}') |
| 46 | resolved_vector_table = args.vector_table or VECTOR_TABLE_BY_DIM.get(job.embedding_dim or job.output_embedding_dim or -1) | 278 | |
| 279 | expected_dim = job.embedding_dim or job.output_embedding_dim | ||
| 280 | resolved_vector_table = args.vector_table or VECTOR_TABLE_BY_DIM.get(expected_dim or -1) | ||
| 47 | scope = resolve_scope_summary(conn, job.target_scope) | 281 | scope = resolve_scope_summary(conn, job.target_scope) |
| 282 | scope_windows = fetch_scope_windows(conn, job.target_scope) | ||
| 283 | runtime_report = detect_runtime(job.model_name) | ||
| 284 | vector_table_report = validate_vector_table(conn, resolved_vector_table, expected_dim) | ||
| 285 | |||
| 48 | running = update_job_status( | 286 | running = update_job_status( |
| 49 | conn, | 287 | conn, |
| 50 | job.extraction_job_id, | 288 | job.extraction_job_id, |
| 51 | status='running', | 289 | status='running', |
| 52 | expected_status='pending', | 290 | expected_status='pending', |
| 53 | input_count=scope['active_window_count'] or scope['ready_asset_count'], | 291 | input_count=len(scope_windows), |
| 54 | metadata_patch={ | 292 | metadata_patch={ |
| 55 | 'worker': 'run_embedding_job', | 293 | 'worker': 'run_embedding_job', |
| 56 | 'output_target': args.output_target, | 294 | 'output_target': args.output_target, |
| 57 | 'vector_table': resolved_vector_table, | 295 | 'vector_table': resolved_vector_table, |
| 58 | 'dry_run': True, | 296 | 'dry_run': bool(args.complete_dry_run), |
| 59 | 'target_scope_summary': scope, | 297 | 'target_scope_summary': scope, |
| 60 | 'execution_mode': 'dry_run', | 298 | 'execution_mode': 'dry_run' if args.complete_dry_run else 'preflight', |
| 299 | 'runtime_report': runtime_report, | ||
| 300 | 'vector_table_report': vector_table_report, | ||
| 301 | 'scope_window_count': len(scope_windows), | ||
| 61 | }, | 302 | }, |
| 62 | set_started_at=True, | 303 | set_started_at=True, |
| 63 | ) | 304 | ) |
| 64 | completed = None | 305 | completed = None |
| 306 | failed = None | ||
| 307 | processed_windows: list[dict[str, object]] = [] | ||
| 308 | |||
| 65 | if args.complete_dry_run: | 309 | if args.complete_dry_run: |
| 66 | completed = update_job_status( | 310 | completed = update_job_status( |
| 67 | conn, | 311 | conn, |
| ... | @@ -76,9 +320,69 @@ def main() -> None: | ... | @@ -76,9 +320,69 @@ def main() -> None: |
| 76 | 'dry_run': True, | 320 | 'dry_run': True, |
| 77 | 'dry_run_result': 'completed_without_feature_write', | 321 | 'dry_run_result': 'completed_without_feature_write', |
| 78 | 'write_target_table': args.output_target, | 322 | 'write_target_table': args.output_target, |
| 323 | 'scope_window_count': len(scope_windows), | ||
| 324 | 'runtime_report': runtime_report, | ||
| 325 | 'vector_table_report': vector_table_report, | ||
| 79 | }, | 326 | }, |
| 80 | set_finished_at=True, | 327 | set_finished_at=True, |
| 81 | ) | 328 | ) |
| 329 | else: | ||
| 330 | missing_windows: list[dict[str, object]] = [] | ||
| 331 | for window in scope_windows: | ||
| 332 | asset_path = Path(str(window['storage_uri'])) | ||
| 333 | if not asset_path.exists(): | ||
| 334 | missing_windows.append({ | ||
| 335 | 'window_id': window['window_id'], | ||
| 336 | 'asset_id': window['asset_id'], | ||
| 337 | 'storage_uri': str(asset_path), | ||
| 338 | 'reason': 'missing_audio', | ||
| 339 | }) | ||
| 340 | |||
| 341 | blockers: list[str] = [] | ||
| 342 | if missing_windows: | ||
| 343 | blockers.append('unreadable_audio_assets') | ||
| 344 | if not vector_table_report['resolved']: | ||
| 345 | blockers.append(str(vector_table_report['reason'])) | ||
| 346 | if not runtime_report['ready']: | ||
| 347 | blockers.append('model_runtime_unavailable') | ||
| 348 | |||
| 349 | if blockers: | ||
| 350 | failed = fail_job( | ||
| 351 | conn, | ||
| 352 | job=job, | ||
| 353 | blockers=blockers, | ||
| 354 | output_target=args.output_target, | ||
| 355 | resolved_vector_table=resolved_vector_table, | ||
| 356 | artifact_dir=artifact_dir, | ||
| 357 | scope=scope, | ||
| 358 | scope_windows=scope_windows, | ||
| 359 | missing_windows=missing_windows, | ||
| 360 | runtime_report=runtime_report, | ||
| 361 | vector_table_report=vector_table_report, | ||
| 362 | ) | ||
| 363 | else: | ||
| 364 | failed = update_job_status( | ||
| 365 | conn, | ||
| 366 | job.extraction_job_id, | ||
| 367 | status='failed', | ||
| 368 | expected_status='running', | ||
| 369 | output_count=0, | ||
| 370 | metadata_patch={ | ||
| 371 | 'worker': 'run_embedding_job', | ||
| 372 | 'output_target': args.output_target, | ||
| 373 | 'vector_table': resolved_vector_table, | ||
| 374 | 'dry_run': False, | ||
| 375 | 'write_target_table': args.output_target, | ||
| 376 | 'artifact_dir': str(artifact_dir), | ||
| 377 | 'execution_mode': 'write_attempt', | ||
| 378 | 'failure_reason': 'encoder_inference_not_implemented', | ||
| 379 | 'scope_window_count': len(scope_windows), | ||
| 380 | 'runtime_report': runtime_report, | ||
| 381 | 'vector_table_report': vector_table_report, | ||
| 382 | 'next_expected_step': 'replace the guarded failure path with real model inference while keeping the same upsert contract', | ||
| 383 | }, | ||
| 384 | set_finished_at=True, | ||
| 385 | ) | ||
| 82 | 386 | ||
| 83 | emit_payload( | 387 | emit_payload( |
| 84 | { | 388 | { |
| ... | @@ -86,12 +390,17 @@ def main() -> None: | ... | @@ -86,12 +390,17 @@ def main() -> None: |
| 86 | 'schema': args.schema, | 390 | 'schema': args.schema, |
| 87 | 'job': job.__dict__, | 391 | 'job': job.__dict__, |
| 88 | 'target_scope_summary': scope, | 392 | 'target_scope_summary': scope, |
| 393 | 'scope_window_count': len(scope_windows), | ||
| 89 | 'status_after_start': running, | 394 | 'status_after_start': running, |
| 90 | 'status_after_complete': completed, | 395 | 'status_after_complete': completed, |
| 396 | 'status_after_failed': failed, | ||
| 91 | 'resolved_vector_table': resolved_vector_table, | 397 | 'resolved_vector_table': resolved_vector_table, |
| 398 | 'vector_table_report': vector_table_report, | ||
| 399 | 'runtime_report': runtime_report, | ||
| 400 | 'processed_windows': processed_windows, | ||
| 92 | 'notes': [ | 401 | 'notes': [ |
| 93 | 'this worker currently validates planner -> job -> PostgreSQL state flow', | 402 | 'this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics', |
| 94 | 'real encoder inference can replace dry_run while preserving the same job contract', | 403 | 'real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys', |
| 95 | ], | 404 | ], |
| 96 | }, | 405 | }, |
| 97 | args.output, | 406 | args.output, | ... | ... |
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 | ||
| 4 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 | ||
| 5 | - 新增 `phase1_worker_embedding_write_attempt.json`、`phase1_worker_embedding_write_guard_report.json` 与 `phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`。 | ||
| 3 | - 更新 `run_chromaprint_job.py` 与 `src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。 | 6 | - 更新 `run_chromaprint_job.py` 与 `src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。 |
| 4 | - 给 `audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。 | 7 | - 给 `audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。 |
| 5 | - 新增 `phase1_worker_chromaprint_write_attempt.json` 与 `phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20` 但 `processed_assets=0`,job 被明确标记为 `failed` 且 `failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。 | 8 | - 新增 `phase1_worker_chromaprint_write_attempt.json` 与 `phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20` 但 `processed_assets=0`,job 被明确标记为 `failed` 且 `failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。 | ... | ... |
| ... | @@ -286,15 +286,75 @@ flowchart TD | ... | @@ -286,15 +286,75 @@ flowchart TD |
| 286 | 286 | ||
| 287 | ### 7.2 Embedding worker | 287 | ### 7.2 Embedding worker |
| 288 | 288 | ||
| 289 | 后续把下面逻辑塞进 `run_embedding_job.py`: | 289 | `run_embedding_job.py` 现在已经不再只是简单 dry-run。当前它已经具备: |
| 290 | 290 | ||
| 291 | 1. 读取 `audio_window` | 291 | 1. 真实读取 `reference_set -> audio_window -> recording_asset` scope |
| 292 | 2. 加载 `MERT` / `MuQ` / `ECAPA` | 292 | 2. 真实检查目标向量表是否存在且与维度匹配 |
| 293 | 3. 提取向量 | 293 | 3. 真实检查模型 runtime 依赖是否齐全 |
| 294 | 4. 写 `audio_embedding` | 294 | 4. 真实检查 source audio 是否存在 |
| 295 | 5. 写 `audio_embedding_vector_<dim>` | 295 | 5. 把 blocker 明确写回 `feature_extraction_job.metadata_json` |
| 296 | 6. 更新 `output_count` | 296 | 6. 在 blocker 存在时把 job 诚实标记为 `failed` |
| 297 | 7. 标记 `completed` | 297 | |
| 298 | ### 当前失败语义 | ||
| 299 | |||
| 300 | semantic lane 当前采用的是 **preflight all-or-nothing**: | ||
| 301 | |||
| 302 | - 只要 scope 内音频路径不可达 / 文件不存在,记为: | ||
| 303 | - `unreadable_audio_assets` | ||
| 304 | - 只要模型 runtime 依赖导入不满足,记为: | ||
| 305 | - `model_runtime_unavailable` | ||
| 306 | - 只要目标向量表非法 / 缺失 / 维度不匹配,记为对应 blocker | ||
| 307 | |||
| 308 | worker 会把这些 blocker 聚合到: | ||
| 309 | |||
| 310 | - `failure_reason = preflight_failed` | ||
| 311 | - `preflight_blockers = [...]` | ||
| 312 | |||
| 313 | 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。 | ||
| 314 | |||
| 315 | ### 当前 live 证据 | ||
| 316 | |||
| 317 | MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: | ||
| 318 | |||
| 319 | - `scope_window_count = 20` | ||
| 320 | - `job_status = failed` | ||
| 321 | - `output_count = 0` | ||
| 322 | - `preflight_blockers = ['unreadable_audio_assets', 'model_runtime_unavailable']` | ||
| 323 | - `runtime_report.missing_dependencies = ['torch', 'torchaudio', 'transformers']` | ||
| 324 | - `audio_embedding_vector_768` 已通过存在性与维度校验 | ||
| 325 | |||
| 326 | 对应产物: | ||
| 327 | |||
| 328 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json` | ||
| 329 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json` | ||
| 330 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json` | ||
| 331 | |||
| 332 | ### 当前幂等保护 | ||
| 333 | |||
| 334 | 为了服务后续真正的 window embedding upsert,`audio_embedding` 现在补了两条唯一键: | ||
| 335 | |||
| 336 | - `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` | ||
| 337 | - `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` | ||
| 338 | |||
| 339 | 这让后续真实 encoder 接入后可以直接做: | ||
| 340 | |||
| 341 | - window 级 embedding upsert | ||
| 342 | - asset 级 embedding upsert | ||
| 343 | |||
| 344 | 而不需要先查再写。 | ||
| 345 | |||
| 346 | ### 下一步替换点 | ||
| 347 | |||
| 348 | 当 runtime 与音频挂载到位后,只需要把 guarded failure path 替换成真实 inference: | ||
| 349 | |||
| 350 | 1. 加载 `MERT` / `MuQ` / `ECAPA` | ||
| 351 | 2. 提取向量 | ||
| 352 | 3. 写 `audio_embedding` | ||
| 353 | 4. 写 `audio_embedding_vector_<dim>` | ||
| 354 | 5. 更新 `output_count` | ||
| 355 | 6. 标记 `completed` | ||
| 356 | |||
| 357 | 也就是说,**PostgreSQL worker contract 已经固定,下一步换的是 encoder adapter,不是 orchestration 结构。** | ||
| 298 | 358 | ||
| 299 | --- | 359 | --- |
| 300 | 360 | ||
| ... | @@ -313,11 +373,11 @@ flowchart TD | ... | @@ -313,11 +373,11 @@ flowchart TD |
| 313 | 373 | ||
| 314 | 当前还没有完成的部分: | 374 | 当前还没有完成的部分: |
| 315 | 375 | ||
| 316 | - 真实 chromaprint 特征写入 | 376 | - exact lane 虽已有真实写入路径,但当前 live 环境仍被 `/workspace/downloads` 缺失阻塞 |
| 317 | - 真实 MERT / MuQ / ECAPA embedding 写入 | 377 | - semantic lane 已有真实 preflight failure contract,但还没有接上真正的 `MERT / MuQ / ECAPA` inference adapter |
| 318 | - `failed` 重试策略 | 378 | - `failed` 重试策略 |
| 319 | - job 分片执行器 | 379 | - job 分片执行器 |
| 320 | - 幂等去重写入策略 | 380 | - 更完整的 embedding artifact / checksum 治理策略 |
| 321 | 381 | ||
| 322 | 但现在已经足够支撑下一阶段: | 382 | 但现在已经足够支撑下一阶段: |
| 323 | 383 | ... | ... |
| ... | @@ -552,6 +552,8 @@ flowchart TD | ... | @@ -552,6 +552,8 @@ flowchart TD |
| 552 | - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘 | 552 | - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘 |
| 553 | - live pgvector 检索和现有 stand-in 逻辑一致 | 553 | - live pgvector 检索和现有 stand-in 逻辑一致 |
| 554 | - `retrieval_candidate` / `match_decision` 可以真实承载在线结果 | 554 | - `retrieval_candidate` / `match_decision` 可以真实承载在线结果 |
| 555 | - semantic worker 已真实验证 preflight failure 语义:既能识别 `/workspace/downloads` 缺失,也能识别 `torch/torchaudio/transformers` 缺失 | ||
| 556 | - `audio_embedding` 已补上 window / asset 双路幂等唯一键,为后续 encoder 真实 upsert 预留稳定主键 | ||
| 555 | 557 | ||
| 556 | ### 未验证 | 558 | ### 未验证 |
| 557 | 559 | ||
| ... | @@ -690,3 +692,50 @@ cd /workspace/acr-engine | ... | @@ -690,3 +692,50 @@ cd /workspace/acr-engine |
| 690 | 692 | ||
| 691 | > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。 | 693 | > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。 |
| 692 | > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。 | 694 | > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。 |
| 695 | |||
| 696 | |||
| 697 | ## 新增:Phase-1 semantic worker live 证据 | ||
| 698 | |||
| 699 | 本轮继续对 `run_embedding_job.py` 做 live PostgreSQL 验证,目标不是伪造 embedding,而是把 **失败语义先固定住**。 | ||
| 700 | |||
| 701 | ### 结果摘要 | ||
| 702 | |||
| 703 | 对 `extraction_job_id=2`(`mert v1-95m`, `5s/2.5s`)执行非 dry-run worker 后: | ||
| 704 | |||
| 705 | | 项 | 结果 | | ||
| 706 | |---|---| | ||
| 707 | | `scope_window_count` | `20` | | ||
| 708 | | `job_status` | `failed` | | ||
| 709 | | `output_count` | `0` | | ||
| 710 | | `failure_reason` | `preflight_failed` | | ||
| 711 | | `preflight_blockers` | `['unreadable_audio_assets', 'model_runtime_unavailable']` | | ||
| 712 | | `vector_table_report.resolved` | `true` | | ||
| 713 | | `audio_embedding_vector_768_count` | `0` | | ||
| 714 | |||
| 715 | 说明: | ||
| 716 | |||
| 717 | - 当前语义 lane 不是“没做事”,而是已经真实走到了 PostgreSQL job scope / runtime / vector table / asset 路径检查 | ||
| 718 | - 只是当前容器同时被两个外部条件挡住: | ||
| 719 | 1. `/workspace/downloads/...` 未挂载 | ||
| 720 | 2. `torch / torchaudio / transformers` 未安装 | ||
| 721 | |||
| 722 | ### 证据文件 | ||
| 723 | |||
| 724 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json` | ||
| 725 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json` | ||
| 726 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json` | ||
| 727 | |||
| 728 | ### 为什么要先补唯一键 | ||
| 729 | |||
| 730 | 当前 `audio_embedding` 已新增: | ||
| 731 | |||
| 732 | - `uq_audio_embedding_feature_window` | ||
| 733 | - `uq_audio_embedding_feature_asset` | ||
| 734 | |||
| 735 | 设计意图是: | ||
| 736 | |||
| 737 | 1. 同一 `feature_set_id + window_id` 的 embedding 重跑时可以稳定 upsert | ||
| 738 | 2. 将来如果有 asset-level embedding,也能独立幂等 | ||
| 739 | 3. 不把幂等职责留给应用层“先查再写” | ||
| 740 | |||
| 741 | 这一步对后续的 `MERT / MuQ / ECAPA` 都通用。 | ... | ... |
| ... | @@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 | 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 |
| 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` | 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` |
| 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 | 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 |
| 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed | ||
| 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 | ||
| 192 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 194 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 193 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 195 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 194 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 196 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment