Commit 399db601 399db60123b2615233d22af3c10957165c60ba4b by cnb.bofCdSsphPA

Make semantic extraction failures auditable before model runtimes land

Constraint: Current container lacks /workspace/downloads and torch/torchaudio/transformers, so Phase-1 semantic work must prove honest failure semantics instead of pretending inference succeeded.
Rejected: Stub semantic embeddings | Would blur the contract between real model outputs and repo-local placeholders.
Confidence: high
Scope-risk: narrow
Directive: Keep the preflight blockers explicit until real MERT/MuQ/ECAPA adapters and asset-level embedding tests exist.
Tested: /usr/local/miniconda3/bin/python -m py_compile workers/run_embedding_job.py workers/run_chromaprint_job.py workers/_job_common.py scripts/bootstrap_phase1_extraction_jobs_live.py scripts/plan_phase1_extraction_jobs_live.py scripts/bootstrap_phase1_reference_members_live.py scripts/live_pgvector_music20_eval.py; git diff --check; /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test; /usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768 --output data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json
Not-tested: Real encoder inference and asset-level embedding upsert path remain unavailable in this container.
1 parent 94d75e92
1 {
2 "audio_embedding_count": 20,
3 "audio_embedding_vector_768_count": 0,
4 "job_2": [
5 2,
6 "failed",
7 20,
8 0,
9 {
10 "lane": "semantic",
11 "role": "primary_baseline",
12 "phase": "phase1",
13 "worker": "run_embedding_job",
14 "dry_run": false,
15 "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
16 "vector_table": "audio_embedding_vector_768",
17 "output_target": "audio_embedding",
18 "execution_mode": "preflight_failure",
19 "failure_reason": "preflight_failed",
20 "runtime_report": {
21 "ready": false,
22 "model_name": "mert",
23 "availability": {
24 "numpy": true,
25 "torch": false,
26 "torchaudio": false,
27 "transformers": false
28 },
29 "requirements": [
30 "numpy",
31 "torch",
32 "torchaudio",
33 "transformers"
34 ],
35 "missing_dependencies": [
36 "torch",
37 "torchaudio",
38 "transformers"
39 ]
40 },
41 "preflight_blockers": [
42 "unreadable_audio_assets",
43 "model_runtime_unavailable"
44 ],
45 "scope_window_count": 20,
46 "write_target_table": "audio_embedding",
47 "vector_table_report": {
48 "reason": null,
49 "resolved": true,
50 "expected_dim": 768,
51 "table_exists": true,
52 "allowed_vector_tables": [
53 "audio_embedding_vector_192",
54 "audio_embedding_vector_768"
55 ],
56 "requested_vector_table": "audio_embedding_vector_768"
57 },
58 "missing_window_count": 20,
59 "target_scope_summary": {
60 "scope_type": "reference_set",
61 "scope_value": "phase1_hot_reference_v1",
62 "recording_count": 20,
63 "reference_set_id": 2,
64 "ready_asset_count": 20,
65 "reference_set_name": "phase1_hot_reference_v1",
66 "active_window_count": 20
67 },
68 "missing_window_samples": [
69 {
70 "reason": "missing_audio",
71 "asset_id": 1,
72 "window_id": 1,
73 "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
74 },
75 {
76 "reason": "missing_audio",
77 "asset_id": 2,
78 "window_id": 2,
79 "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
80 },
81 {
82 "reason": "missing_audio",
83 "asset_id": 3,
84 "window_id": 3,
85 "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
86 },
87 {
88 "reason": "missing_audio",
89 "asset_id": 4,
90 "window_id": 4,
91 "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
92 },
93 {
94 "reason": "missing_audio",
95 "asset_id": 5,
96 "window_id": 5,
97 "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
98 }
99 ]
100 }
101 ]
102 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "worker": "run_embedding_job",
3 "schema": "acr_test",
4 "job": {
5 "extraction_job_id": 2,
6 "feature_set_id": 3,
7 "target_scope": "reference_set:phase1_hot_reference_v1",
8 "job_status": "pending",
9 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
10 "job_metadata": {
11 "lane": "semantic",
12 "role": "primary_baseline",
13 "phase": "phase1"
14 },
15 "feature_name": "semantic_embedding",
16 "feature_level": "window",
17 "extraction_granularity": "sliding_window",
18 "window_sec": 5.0,
19 "hop_sec": 2.5,
20 "embedding_dim": 768,
21 "distance_metric": "cosine",
22 "feature_config": {
23 "role": "primary_semantic_baseline"
24 },
25 "model_id": 3,
26 "model_name": "mert",
27 "model_version": "v1-95m",
28 "model_family": "music_ssl",
29 "input_sample_rate": 24000,
30 "output_embedding_dim": 768,
31 "model_metadata": {
32 "lane": "semantic",
33 "role": "primary_baseline",
34 "phase": "phase1"
35 }
36 },
37 "target_scope_summary": {
38 "scope_type": "reference_set",
39 "scope_value": "phase1_hot_reference_v1",
40 "reference_set_id": 2,
41 "reference_set_name": "phase1_hot_reference_v1",
42 "recording_count": 20,
43 "ready_asset_count": 20,
44 "active_window_count": 20
45 },
46 "scope_window_count": 20,
47 "status_after_start": {
48 "extraction_job_id": 2,
49 "job_status": "running",
50 "input_count": 20,
51 "output_count": null,
52 "started_at": "2026-06-04T13:44:05.982252+08:00",
53 "finished_at": null,
54 "log_uri": null,
55 "metadata_json": {
56 "lane": "semantic",
57 "role": "primary_baseline",
58 "phase": "phase1",
59 "worker": "run_embedding_job",
60 "dry_run": false,
61 "vector_table": "audio_embedding_vector_768",
62 "output_target": "audio_embedding",
63 "execution_mode": "preflight",
64 "runtime_report": {
65 "ready": false,
66 "model_name": "mert",
67 "availability": {
68 "numpy": true,
69 "torch": false,
70 "torchaudio": false,
71 "transformers": false
72 },
73 "requirements": [
74 "numpy",
75 "torch",
76 "torchaudio",
77 "transformers"
78 ],
79 "missing_dependencies": [
80 "torch",
81 "torchaudio",
82 "transformers"
83 ]
84 },
85 "scope_window_count": 20,
86 "vector_table_report": {
87 "reason": null,
88 "resolved": true,
89 "expected_dim": 768,
90 "table_exists": true,
91 "allowed_vector_tables": [
92 "audio_embedding_vector_192",
93 "audio_embedding_vector_768"
94 ],
95 "requested_vector_table": "audio_embedding_vector_768"
96 },
97 "target_scope_summary": {
98 "scope_type": "reference_set",
99 "scope_value": "phase1_hot_reference_v1",
100 "recording_count": 20,
101 "reference_set_id": 2,
102 "ready_asset_count": 20,
103 "reference_set_name": "phase1_hot_reference_v1",
104 "active_window_count": 20
105 }
106 }
107 },
108 "status_after_complete": null,
109 "status_after_failed": {
110 "extraction_job_id": 2,
111 "job_status": "failed",
112 "input_count": 20,
113 "output_count": 0,
114 "started_at": "2026-06-04T13:44:05.982252+08:00",
115 "finished_at": "2026-06-04T13:44:05.983441+08:00",
116 "log_uri": null,
117 "metadata_json": {
118 "lane": "semantic",
119 "role": "primary_baseline",
120 "phase": "phase1",
121 "worker": "run_embedding_job",
122 "dry_run": false,
123 "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
124 "vector_table": "audio_embedding_vector_768",
125 "output_target": "audio_embedding",
126 "execution_mode": "preflight_failure",
127 "failure_reason": "preflight_failed",
128 "runtime_report": {
129 "ready": false,
130 "model_name": "mert",
131 "availability": {
132 "numpy": true,
133 "torch": false,
134 "torchaudio": false,
135 "transformers": false
136 },
137 "requirements": [
138 "numpy",
139 "torch",
140 "torchaudio",
141 "transformers"
142 ],
143 "missing_dependencies": [
144 "torch",
145 "torchaudio",
146 "transformers"
147 ]
148 },
149 "preflight_blockers": [
150 "unreadable_audio_assets",
151 "model_runtime_unavailable"
152 ],
153 "scope_window_count": 20,
154 "write_target_table": "audio_embedding",
155 "vector_table_report": {
156 "reason": null,
157 "resolved": true,
158 "expected_dim": 768,
159 "table_exists": true,
160 "allowed_vector_tables": [
161 "audio_embedding_vector_192",
162 "audio_embedding_vector_768"
163 ],
164 "requested_vector_table": "audio_embedding_vector_768"
165 },
166 "missing_window_count": 20,
167 "target_scope_summary": {
168 "scope_type": "reference_set",
169 "scope_value": "phase1_hot_reference_v1",
170 "recording_count": 20,
171 "reference_set_id": 2,
172 "ready_asset_count": 20,
173 "reference_set_name": "phase1_hot_reference_v1",
174 "active_window_count": 20
175 },
176 "missing_window_samples": [
177 {
178 "reason": "missing_audio",
179 "asset_id": 1,
180 "window_id": 1,
181 "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
182 },
183 {
184 "reason": "missing_audio",
185 "asset_id": 2,
186 "window_id": 2,
187 "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
188 },
189 {
190 "reason": "missing_audio",
191 "asset_id": 3,
192 "window_id": 3,
193 "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
194 },
195 {
196 "reason": "missing_audio",
197 "asset_id": 4,
198 "window_id": 4,
199 "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
200 },
201 {
202 "reason": "missing_audio",
203 "asset_id": 5,
204 "window_id": 5,
205 "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
206 }
207 ]
208 }
209 },
210 "resolved_vector_table": "audio_embedding_vector_768",
211 "vector_table_report": {
212 "requested_vector_table": "audio_embedding_vector_768",
213 "expected_dim": 768,
214 "allowed_vector_tables": [
215 "audio_embedding_vector_192",
216 "audio_embedding_vector_768"
217 ],
218 "resolved": true,
219 "table_exists": true,
220 "reason": null
221 },
222 "runtime_report": {
223 "model_name": "mert",
224 "requirements": [
225 "numpy",
226 "torch",
227 "torchaudio",
228 "transformers"
229 ],
230 "availability": {
231 "numpy": true,
232 "torch": false,
233 "torchaudio": false,
234 "transformers": false
235 },
236 "missing_dependencies": [
237 "torch",
238 "torchaudio",
239 "transformers"
240 ],
241 "ready": false
242 },
243 "processed_windows": [],
244 "notes": [
245 "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
246 "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
247 ]
248 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "command": "/usr/local/miniconda3/bin/python workers/run_embedding_job.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_test --job-id 2 --model-name mert --model-version v1-95m --vector-table audio_embedding_vector_768",
3 "returncode": 1,
4 "stdout": "",
5 "stderr": "failed to update feature_extraction_job=2 with expected_status=pending\n",
6 "expected_guard": "failed to update feature_extraction_job=2 with expected_status=pending",
7 "passed": true
8 }
...\ No newline at end of file ...\ No newline at end of file
...@@ -423,6 +423,14 @@ ON audio_window(canonical_song_id); ...@@ -423,6 +423,14 @@ ON audio_window(canonical_song_id);
423 CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index 423 CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index
424 ON audio_window(active_for_index); 424 ON audio_window(active_for_index);
425 425
426 CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_window
427 ON audio_embedding(feature_set_id, window_id)
428 WHERE window_id IS NOT NULL;
429
430 CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_asset
431 ON audio_embedding(feature_set_id, asset_id)
432 WHERE window_id IS NULL AND asset_id IS NOT NULL;
433
426 CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id 434 CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id
427 ON audio_embedding(feature_set_id); 435 ON audio_embedding(feature_set_id);
428 436
......
...@@ -2,7 +2,13 @@ ...@@ -2,7 +2,13 @@
2 from __future__ import annotations 2 from __future__ import annotations
3 3
4 import argparse 4 import argparse
5 import json
6 import math
5 import os 7 import os
8 from pathlib import Path
9 from typing import Any
10
11 from psycopg import sql
6 12
7 from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status 13 from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status
8 14
...@@ -10,6 +16,229 @@ VECTOR_TABLE_BY_DIM = { ...@@ -10,6 +16,229 @@ VECTOR_TABLE_BY_DIM = {
10 192: 'audio_embedding_vector_192', 16 192: 'audio_embedding_vector_192',
11 768: 'audio_embedding_vector_768', 17 768: 'audio_embedding_vector_768',
12 } 18 }
19 MODEL_RUNTIME_REQUIREMENTS = {
20 'mert': ('numpy', 'torch', 'torchaudio', 'transformers'),
21 'muq': ('numpy', 'torch', 'torchaudio', 'transformers'),
22 'ecapa': ('numpy', 'torch', 'torchaudio', 'speechbrain'),
23 }
24 ALLOWED_VECTOR_TABLES = set(VECTOR_TABLE_BY_DIM.values())
25
26
27 def fetch_scope_windows(conn, target_scope: str) -> list[dict[str, object]]:
28 if not target_scope.startswith('reference_set:'):
29 raise SystemExit(f'unsupported target_scope for embedding worker: {target_scope}')
30 set_name = target_scope.split(':', 1)[1]
31 rows = conn.execute(
32 """
33 SELECT
34 aw.window_id,
35 aw.asset_id,
36 aw.window_index,
37 aw.start_sec,
38 aw.end_sec,
39 aw.duration_sec,
40 aw.recording_id,
41 aw.work_id,
42 aw.canonical_song_id,
43 ra.storage_uri,
44 ra.ingest_status,
45 aw.active_for_index
46 FROM reference_set_registry rs
47 JOIN reference_set_member rsm ON rsm.reference_set_id = rs.reference_set_id
48 JOIN audio_window aw ON aw.recording_id = rsm.recording_id
49 JOIN recording_asset ra ON ra.asset_id = aw.asset_id
50 WHERE rs.set_name = %s
51 AND aw.active_for_index = TRUE
52 AND ra.ingest_status = 'ready'
53 ORDER BY aw.window_id;
54 """,
55 (set_name,),
56 ).fetchall()
57 return [
58 {
59 'window_id': int(row[0]),
60 'asset_id': int(row[1]),
61 'window_index': int(row[2]),
62 'start_sec': float(row[3]),
63 'end_sec': float(row[4]),
64 'duration_sec': float(row[5]),
65 'recording_id': int(row[6]),
66 'work_id': int(row[7]),
67 'canonical_song_id': int(row[8]),
68 'storage_uri': row[9],
69 'ingest_status': row[10],
70 'active_for_index': bool(row[11]),
71 }
72 for row in rows
73 ]
74
75
76 def detect_runtime(model_name: str) -> dict[str, Any]:
77 checks: dict[str, Any] = {'model_name': model_name, 'requirements': list(MODEL_RUNTIME_REQUIREMENTS.get(model_name, ('numpy',)))}
78 availability: dict[str, bool] = {}
79 missing: list[str] = []
80 for package_name in checks['requirements']:
81 try:
82 __import__(package_name)
83 availability[package_name] = True
84 except Exception: # noqa: BLE001
85 availability[package_name] = False
86 missing.append(package_name)
87 checks['availability'] = availability
88 checks['missing_dependencies'] = missing
89 checks['ready'] = not missing
90 return checks
91
92
93 def validate_vector_table(conn, vector_table: str | None, expected_dim: int | None) -> dict[str, Any]:
94 payload = {
95 'requested_vector_table': vector_table,
96 'expected_dim': expected_dim,
97 'allowed_vector_tables': sorted(ALLOWED_VECTOR_TABLES),
98 'resolved': False,
99 'table_exists': False,
100 'reason': None,
101 }
102 if not vector_table:
103 payload['reason'] = 'missing_vector_table'
104 return payload
105 if vector_table not in ALLOWED_VECTOR_TABLES:
106 payload['reason'] = 'vector_table_not_allowlisted'
107 return payload
108 dim_from_table = 192 if vector_table.endswith('_192') else 768 if vector_table.endswith('_768') else None
109 if expected_dim is not None and dim_from_table is not None and dim_from_table != expected_dim:
110 payload['reason'] = 'vector_table_dim_mismatch'
111 return payload
112 row = conn.execute('SELECT to_regclass(%s);', (vector_table,)).fetchone()
113 payload['table_exists'] = bool(row and row[0])
114 if not payload['table_exists']:
115 payload['reason'] = 'vector_table_missing_in_schema'
116 return payload
117 payload['resolved'] = True
118 return payload
119
120
121 def build_artifact_path(artifact_dir: Path, *, extraction_job_id: int, window_id: int) -> Path:
122 artifact_dir.mkdir(parents=True, exist_ok=True)
123 return artifact_dir / f'job{extraction_job_id}_window{window_id}.json'
124
125
126 def vector_literal(values: list[float]) -> str:
127 return '[' + ','.join(f'{value:.10f}' for value in values) + ']'
128
129
130 def compute_vector_norm(values: list[float]) -> float:
131 return math.sqrt(sum(value * value for value in values))
132
133
134 def upsert_audio_embedding(
135 conn,
136 *,
137 feature_set_id: int,
138 extraction_job_id: int,
139 vector_table: str,
140 window: dict[str, object],
141 embedding_uri: str,
142 embedding: list[float],
143 checksum: str | None,
144 metadata_json: dict[str, object],
145 ) -> tuple[int, str]:
146 row = conn.execute(
147 """
148 INSERT INTO audio_embedding (
149 feature_set_id, extraction_job_id, asset_id, window_id, recording_id, work_id,
150 canonical_song_id, embedding_storage_mode, embedding_uri, vector_norm, checksum,
151 is_indexed, metadata_json
152 ) VALUES (
153 %s, %s, %s, %s, %s, %s,
154 %s, %s, %s, %s, %s,
155 TRUE, %s::jsonb
156 )
157 ON CONFLICT (feature_set_id, window_id) WHERE window_id IS NOT NULL
158 DO UPDATE SET
159 extraction_job_id = EXCLUDED.extraction_job_id,
160 asset_id = EXCLUDED.asset_id,
161 recording_id = EXCLUDED.recording_id,
162 work_id = EXCLUDED.work_id,
163 canonical_song_id = EXCLUDED.canonical_song_id,
164 embedding_storage_mode = EXCLUDED.embedding_storage_mode,
165 embedding_uri = EXCLUDED.embedding_uri,
166 vector_norm = EXCLUDED.vector_norm,
167 checksum = EXCLUDED.checksum,
168 is_indexed = EXCLUDED.is_indexed,
169 metadata_json = EXCLUDED.metadata_json
170 RETURNING embedding_id, xmax = 0 AS inserted;
171 """,
172 (
173 feature_set_id,
174 extraction_job_id,
175 window['asset_id'],
176 window['window_id'],
177 window['recording_id'],
178 window['work_id'],
179 window['canonical_song_id'],
180 'pgvector_inline',
181 embedding_uri,
182 compute_vector_norm(embedding),
183 checksum,
184 json.dumps(metadata_json, ensure_ascii=False),
185 ),
186 ).fetchone()
187 embedding_id = int(row[0])
188 inserted = bool(row[1])
189 conn.execute(
190 sql.SQL(
191 """
192 INSERT INTO {vector_table} (embedding_id, embedding)
193 VALUES (%s, %s::vector)
194 ON CONFLICT (embedding_id)
195 DO UPDATE SET embedding = EXCLUDED.embedding;
196 """
197 ).format(vector_table=sql.Identifier(vector_table)),
198 (embedding_id, vector_literal(embedding)),
199 )
200 return embedding_id, 'inserted' if inserted else 'updated'
201
202
203 def fail_job(
204 conn,
205 *,
206 job,
207 blockers: list[str],
208 output_target: str,
209 resolved_vector_table: str | None,
210 artifact_dir: Path,
211 scope: dict[str, Any],
212 scope_windows: list[dict[str, object]],
213 missing_windows: list[dict[str, object]],
214 runtime_report: dict[str, Any],
215 vector_table_report: dict[str, Any],
216 ) -> dict[str, Any]:
217 return update_job_status(
218 conn,
219 job.extraction_job_id,
220 status='failed',
221 expected_status='running',
222 output_count=0,
223 metadata_patch={
224 'worker': 'run_embedding_job',
225 'output_target': output_target,
226 'vector_table': resolved_vector_table,
227 'dry_run': False,
228 'write_target_table': output_target,
229 'artifact_dir': str(artifact_dir),
230 'execution_mode': 'preflight_failure',
231 'failure_reason': 'preflight_failed',
232 'preflight_blockers': blockers,
233 'scope_window_count': len(scope_windows),
234 'missing_window_count': len(missing_windows),
235 'missing_window_samples': missing_windows[:5],
236 'runtime_report': runtime_report,
237 'vector_table_report': vector_table_report,
238 'target_scope_summary': scope,
239 },
240 set_finished_at=True,
241 )
13 242
14 243
15 def main() -> None: 244 def main() -> None:
...@@ -22,6 +251,7 @@ def main() -> None: ...@@ -22,6 +251,7 @@ def main() -> None:
22 ap.add_argument('--vector-table', default=os.environ.get('VECTOR_TABLE')) 251 ap.add_argument('--vector-table', default=os.environ.get('VECTOR_TABLE'))
23 ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_embedding')) 252 ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_embedding'))
24 ap.add_argument('--complete-dry-run', action='store_true') 253 ap.add_argument('--complete-dry-run', action='store_true')
254 ap.add_argument('--artifact-dir', default=os.environ.get('ARTIFACT_DIR', 'data/pgvector_eval/music20/phase1_embeddings'))
25 ap.add_argument('--output') 255 ap.add_argument('--output')
26 args = ap.parse_args() 256 args = ap.parse_args()
27 257
...@@ -30,6 +260,8 @@ def main() -> None: ...@@ -30,6 +260,8 @@ def main() -> None:
30 if not args.job_id: 260 if not args.job_id:
31 raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') 261 raise SystemExit('missing --job-id or EXTRACTION_JOB_ID')
32 262
263 artifact_dir = Path(args.artifact_dir)
264
33 with connect(args.dsn, args.schema) as conn: 265 with connect(args.dsn, args.schema) as conn:
34 job = fetch_job_context(conn, args.job_id) 266 job = fetch_job_context(conn, args.job_id)
35 if job.model_name == 'chromaprint': 267 if job.model_name == 'chromaprint':
...@@ -43,25 +275,37 @@ def main() -> None: ...@@ -43,25 +275,37 @@ def main() -> None:
43 raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') 275 raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}')
44 if args.model_version and job.model_version != args.model_version: 276 if args.model_version and job.model_version != args.model_version:
45 raise SystemExit(f'model version mismatch: job={job.model_version} cli={args.model_version}') 277 raise SystemExit(f'model version mismatch: job={job.model_version} cli={args.model_version}')
46 resolved_vector_table = args.vector_table or VECTOR_TABLE_BY_DIM.get(job.embedding_dim or job.output_embedding_dim or -1) 278
279 expected_dim = job.embedding_dim or job.output_embedding_dim
280 resolved_vector_table = args.vector_table or VECTOR_TABLE_BY_DIM.get(expected_dim or -1)
47 scope = resolve_scope_summary(conn, job.target_scope) 281 scope = resolve_scope_summary(conn, job.target_scope)
282 scope_windows = fetch_scope_windows(conn, job.target_scope)
283 runtime_report = detect_runtime(job.model_name)
284 vector_table_report = validate_vector_table(conn, resolved_vector_table, expected_dim)
285
48 running = update_job_status( 286 running = update_job_status(
49 conn, 287 conn,
50 job.extraction_job_id, 288 job.extraction_job_id,
51 status='running', 289 status='running',
52 expected_status='pending', 290 expected_status='pending',
53 input_count=scope['active_window_count'] or scope['ready_asset_count'], 291 input_count=len(scope_windows),
54 metadata_patch={ 292 metadata_patch={
55 'worker': 'run_embedding_job', 293 'worker': 'run_embedding_job',
56 'output_target': args.output_target, 294 'output_target': args.output_target,
57 'vector_table': resolved_vector_table, 295 'vector_table': resolved_vector_table,
58 'dry_run': True, 296 'dry_run': bool(args.complete_dry_run),
59 'target_scope_summary': scope, 297 'target_scope_summary': scope,
60 'execution_mode': 'dry_run', 298 'execution_mode': 'dry_run' if args.complete_dry_run else 'preflight',
299 'runtime_report': runtime_report,
300 'vector_table_report': vector_table_report,
301 'scope_window_count': len(scope_windows),
61 }, 302 },
62 set_started_at=True, 303 set_started_at=True,
63 ) 304 )
64 completed = None 305 completed = None
306 failed = None
307 processed_windows: list[dict[str, object]] = []
308
65 if args.complete_dry_run: 309 if args.complete_dry_run:
66 completed = update_job_status( 310 completed = update_job_status(
67 conn, 311 conn,
...@@ -76,9 +320,69 @@ def main() -> None: ...@@ -76,9 +320,69 @@ def main() -> None:
76 'dry_run': True, 320 'dry_run': True,
77 'dry_run_result': 'completed_without_feature_write', 321 'dry_run_result': 'completed_without_feature_write',
78 'write_target_table': args.output_target, 322 'write_target_table': args.output_target,
323 'scope_window_count': len(scope_windows),
324 'runtime_report': runtime_report,
325 'vector_table_report': vector_table_report,
79 }, 326 },
80 set_finished_at=True, 327 set_finished_at=True,
81 ) 328 )
329 else:
330 missing_windows: list[dict[str, object]] = []
331 for window in scope_windows:
332 asset_path = Path(str(window['storage_uri']))
333 if not asset_path.exists():
334 missing_windows.append({
335 'window_id': window['window_id'],
336 'asset_id': window['asset_id'],
337 'storage_uri': str(asset_path),
338 'reason': 'missing_audio',
339 })
340
341 blockers: list[str] = []
342 if missing_windows:
343 blockers.append('unreadable_audio_assets')
344 if not vector_table_report['resolved']:
345 blockers.append(str(vector_table_report['reason']))
346 if not runtime_report['ready']:
347 blockers.append('model_runtime_unavailable')
348
349 if blockers:
350 failed = fail_job(
351 conn,
352 job=job,
353 blockers=blockers,
354 output_target=args.output_target,
355 resolved_vector_table=resolved_vector_table,
356 artifact_dir=artifact_dir,
357 scope=scope,
358 scope_windows=scope_windows,
359 missing_windows=missing_windows,
360 runtime_report=runtime_report,
361 vector_table_report=vector_table_report,
362 )
363 else:
364 failed = update_job_status(
365 conn,
366 job.extraction_job_id,
367 status='failed',
368 expected_status='running',
369 output_count=0,
370 metadata_patch={
371 'worker': 'run_embedding_job',
372 'output_target': args.output_target,
373 'vector_table': resolved_vector_table,
374 'dry_run': False,
375 'write_target_table': args.output_target,
376 'artifact_dir': str(artifact_dir),
377 'execution_mode': 'write_attempt',
378 'failure_reason': 'encoder_inference_not_implemented',
379 'scope_window_count': len(scope_windows),
380 'runtime_report': runtime_report,
381 'vector_table_report': vector_table_report,
382 'next_expected_step': 'replace the guarded failure path with real model inference while keeping the same upsert contract',
383 },
384 set_finished_at=True,
385 )
82 386
83 emit_payload( 387 emit_payload(
84 { 388 {
...@@ -86,12 +390,17 @@ def main() -> None: ...@@ -86,12 +390,17 @@ def main() -> None:
86 'schema': args.schema, 390 'schema': args.schema,
87 'job': job.__dict__, 391 'job': job.__dict__,
88 'target_scope_summary': scope, 392 'target_scope_summary': scope,
393 'scope_window_count': len(scope_windows),
89 'status_after_start': running, 394 'status_after_start': running,
90 'status_after_complete': completed, 395 'status_after_complete': completed,
396 'status_after_failed': failed,
91 'resolved_vector_table': resolved_vector_table, 397 'resolved_vector_table': resolved_vector_table,
398 'vector_table_report': vector_table_report,
399 'runtime_report': runtime_report,
400 'processed_windows': processed_windows,
92 'notes': [ 401 'notes': [
93 'this worker currently validates planner -> job -> PostgreSQL state flow', 402 'this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics',
94 'real encoder inference can replace dry_run while preserving the same job contract', 403 'real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys',
95 ], 404 ],
96 }, 405 },
97 args.output, 406 args.output,
......
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets``model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。
4 -`audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL``UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。
5 - 新增 `phase1_worker_embedding_write_attempt.json``phase1_worker_embedding_write_guard_report.json``phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`
3 - 更新 `run_chromaprint_job.py``src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。 6 - 更新 `run_chromaprint_job.py``src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。
4 -`audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。 7 -`audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。
5 - 新增 `phase1_worker_chromaprint_write_attempt.json``phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20``processed_assets=0`,job 被明确标记为 `failed``failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。 8 - 新增 `phase1_worker_chromaprint_write_attempt.json``phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20``processed_assets=0`,job 被明确标记为 `failed``failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。
......
...@@ -286,15 +286,75 @@ flowchart TD ...@@ -286,15 +286,75 @@ flowchart TD
286 286
287 ### 7.2 Embedding worker 287 ### 7.2 Embedding worker
288 288
289 后续把下面逻辑塞进 `run_embedding_job.py` 289 `run_embedding_job.py` 现在已经不再只是简单 dry-run。当前它已经具备
290 290
291 1. 读取 `audio_window` 291 1. 真实读取 `reference_set -> audio_window -> recording_asset` scope
292 2. 加载 `MERT` / `MuQ` / `ECAPA` 292 2. 真实检查目标向量表是否存在且与维度匹配
293 3. 提取向量 293 3. 真实检查模型 runtime 依赖是否齐全
294 4.`audio_embedding` 294 4. 真实检查 source audio 是否存在
295 5.`audio_embedding_vector_<dim>` 295 5. 把 blocker 明确写回 `feature_extraction_job.metadata_json`
296 6. 更新 `output_count` 296 6. 在 blocker 存在时把 job 诚实标记为 `failed`
297 7. 标记 `completed` 297
298 ### 当前失败语义
299
300 semantic lane 当前采用的是 **preflight all-or-nothing**
301
302 - 只要 scope 内音频路径不可达 / 文件不存在,记为:
303 - `unreadable_audio_assets`
304 - 只要模型 runtime 依赖导入不满足,记为:
305 - `model_runtime_unavailable`
306 - 只要目标向量表非法 / 缺失 / 维度不匹配,记为对应 blocker
307
308 worker 会把这些 blocker 聚合到:
309
310 - `failure_reason = preflight_failed`
311 - `preflight_blockers = [...]`
312
313 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。
314
315 ### 当前 live 证据
316
317 MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证:
318
319 - `scope_window_count = 20`
320 - `job_status = failed`
321 - `output_count = 0`
322 - `preflight_blockers = ['unreadable_audio_assets', 'model_runtime_unavailable']`
323 - `runtime_report.missing_dependencies = ['torch', 'torchaudio', 'transformers']`
324 - `audio_embedding_vector_768` 已通过存在性与维度校验
325
326 对应产物:
327
328 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json`
329 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json`
330 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json`
331
332 ### 当前幂等保护
333
334 为了服务后续真正的 window embedding upsert,`audio_embedding` 现在补了两条唯一键:
335
336 - `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL`
337 - `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL`
338
339 这让后续真实 encoder 接入后可以直接做:
340
341 - window 级 embedding upsert
342 - asset 级 embedding upsert
343
344 而不需要先查再写。
345
346 ### 下一步替换点
347
348 当 runtime 与音频挂载到位后,只需要把 guarded failure path 替换成真实 inference:
349
350 1. 加载 `MERT` / `MuQ` / `ECAPA`
351 2. 提取向量
352 3.`audio_embedding`
353 4.`audio_embedding_vector_<dim>`
354 5. 更新 `output_count`
355 6. 标记 `completed`
356
357 也就是说,**PostgreSQL worker contract 已经固定,下一步换的是 encoder adapter,不是 orchestration 结构。**
298 358
299 --- 359 ---
300 360
...@@ -313,11 +373,11 @@ flowchart TD ...@@ -313,11 +373,11 @@ flowchart TD
313 373
314 当前还没有完成的部分: 374 当前还没有完成的部分:
315 375
316 - 真实 chromaprint 特征写入 376 - exact lane 虽已有真实写入路径,但当前 live 环境仍被 `/workspace/downloads` 缺失阻塞
317 - 真实 MERT / MuQ / ECAPA embedding 写入 377 - semantic lane 已有真实 preflight failure contract,但还没有接上真正的 `MERT / MuQ / ECAPA` inference adapter
318 - `failed` 重试策略 378 - `failed` 重试策略
319 - job 分片执行器 379 - job 分片执行器
320 - 幂等去重写入策略 380 - 更完整的 embedding artifact / checksum 治理策略
321 381
322 但现在已经足够支撑下一阶段: 382 但现在已经足够支撑下一阶段:
323 383
......
...@@ -552,6 +552,8 @@ flowchart TD ...@@ -552,6 +552,8 @@ flowchart TD
552 - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘 552 - 样例数据链可以按 `song -> work -> recording -> asset -> window -> embedding` 落盘
553 - live pgvector 检索和现有 stand-in 逻辑一致 553 - live pgvector 检索和现有 stand-in 逻辑一致
554 - `retrieval_candidate` / `match_decision` 可以真实承载在线结果 554 - `retrieval_candidate` / `match_decision` 可以真实承载在线结果
555 - semantic worker 已真实验证 preflight failure 语义:既能识别 `/workspace/downloads` 缺失,也能识别 `torch/torchaudio/transformers` 缺失
556 - `audio_embedding` 已补上 window / asset 双路幂等唯一键,为后续 encoder 真实 upsert 预留稳定主键
555 557
556 ### 未验证 558 ### 未验证
557 559
...@@ -690,3 +692,50 @@ cd /workspace/acr-engine ...@@ -690,3 +692,50 @@ cd /workspace/acr-engine
690 692
691 > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。 693 > PostgreSQL 这条路已经可以真实落 schema、落样例、落 candidate、落 decision,也能真实跑 pgvector 检索。
692 > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。 694 > 当前最大的短板不再是“怎么存”,而是 **当前 baseline embedding 对混淆 query 的召回仍然明显不够**。
695
696
697 ## 新增:Phase-1 semantic worker live 证据
698
699 本轮继续对 `run_embedding_job.py` 做 live PostgreSQL 验证,目标不是伪造 embedding,而是把 **失败语义先固定住**
700
701 ### 结果摘要
702
703 `extraction_job_id=2``mert v1-95m`, `5s/2.5s`)执行非 dry-run worker 后:
704
705 | 项 | 结果 |
706 |---|---|
707 | `scope_window_count` | `20` |
708 | `job_status` | `failed` |
709 | `output_count` | `0` |
710 | `failure_reason` | `preflight_failed` |
711 | `preflight_blockers` | `['unreadable_audio_assets', 'model_runtime_unavailable']` |
712 | `vector_table_report.resolved` | `true` |
713 | `audio_embedding_vector_768_count` | `0` |
714
715 说明:
716
717 - 当前语义 lane 不是“没做事”,而是已经真实走到了 PostgreSQL job scope / runtime / vector table / asset 路径检查
718 - 只是当前容器同时被两个外部条件挡住:
719 1. `/workspace/downloads/...` 未挂载
720 2. `torch / torchaudio / transformers` 未安装
721
722 ### 证据文件
723
724 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_attempt.json`
725 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_write_guard_report.json`
726 - `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_post_state.json`
727
728 ### 为什么要先补唯一键
729
730 当前 `audio_embedding` 已新增:
731
732 - `uq_audio_embedding_feature_window`
733 - `uq_audio_embedding_feature_asset`
734
735 设计意图是:
736
737 1. 同一 `feature_set_id + window_id` 的 embedding 重跑时可以稳定 upsert
738 2. 将来如果有 asset-level embedding,也能独立幂等
739 3. 不把幂等职责留给应用层“先查再写”
740
741 这一步对后续的 `MERT / MuQ / ECAPA` 都通用。
......
...@@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -189,6 +189,8 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
189 - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 189 - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板
190 - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` 190 - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py`
191 - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 191 - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行
192 - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets``model_runtime_unavailable`,而不是把失败伪装成 completed
193 - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同
192 - `phase1_hot_reference_v1``acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` 194 - `phase1_hot_reference_v1``acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
193 - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` 195 - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
194 - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` 196 - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
......