Commit 71bbe76f 71bbe76f5d774b0445ca1ef453c8ca615510d4d6 by cnb.bofCdSsphPA

Make semantic vector-table misconfigurations fail with live evidence

Constraint: Phase-1 semantic jobs were already blocked by missing audio and model runtimes, so vector-table regressions needed their own isolated live proof to avoid being masked by the same environment failures.
Rejected: Infer vector-table coverage from code inspection only | It would not prove the worker writes the correct blocker reasons into PostgreSQL metadata.
Confidence: high
Scope-risk: narrow
Directive: When semantic extraction fails, inspect vector_table_report.reason before assuming the host is only missing mounts or model dependencies.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_embedding_vector_table_negative_matrix_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_embedding_vector_table_negative_matrix_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --output data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json
Not-tested: No successful semantic extraction path exists yet on this host; this commit validates negative preflight cases only.
1 parent 223f80ac
1 {
2 "worker": "run_embedding_job",
3 "schema": "acr_test",
4 "job": {
5 "extraction_job_id": 2,
6 "feature_set_id": 3,
7 "target_scope": "reference_set:phase1_hot_reference_v1",
8 "job_status": "pending",
9 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
10 "job_metadata": {
11 "lane": "semantic",
12 "role": "primary_baseline",
13 "phase": "phase1"
14 },
15 "feature_name": "semantic_embedding",
16 "feature_level": "window",
17 "extraction_granularity": "sliding_window",
18 "window_sec": 5.0,
19 "hop_sec": 2.5,
20 "embedding_dim": 768,
21 "distance_metric": "cosine",
22 "feature_config": {
23 "role": "primary_semantic_baseline"
24 },
25 "model_id": 3,
26 "model_name": "mert",
27 "model_version": "v1-95m",
28 "model_family": "music_ssl",
29 "input_sample_rate": 24000,
30 "output_embedding_dim": 768,
31 "model_metadata": {
32 "lane": "semantic",
33 "role": "primary_baseline",
34 "phase": "phase1"
35 }
36 },
37 "target_scope_summary": {
38 "scope_type": "reference_set",
39 "scope_value": "phase1_hot_reference_v1",
40 "reference_set_id": 2,
41 "reference_set_name": "phase1_hot_reference_v1",
42 "recording_count": 20,
43 "ready_asset_count": 20,
44 "active_window_count": 20
45 },
46 "scope_window_count": 20,
47 "status_after_start": {
48 "extraction_job_id": 2,
49 "job_status": "running",
50 "input_count": 20,
51 "output_count": null,
52 "started_at": "2026-06-04T14:00:28.270203+08:00",
53 "finished_at": null,
54 "log_uri": null,
55 "metadata_json": {
56 "lane": "semantic",
57 "role": "primary_baseline",
58 "phase": "phase1",
59 "worker": "run_embedding_job",
60 "dry_run": false,
61 "vector_table": "audio_embedding_vector_192",
62 "output_target": "audio_embedding",
63 "execution_mode": "preflight",
64 "runtime_report": {
65 "ready": false,
66 "model_name": "mert",
67 "availability": {
68 "numpy": true,
69 "torch": false,
70 "torchaudio": false,
71 "transformers": false
72 },
73 "requirements": [
74 "numpy",
75 "torch",
76 "torchaudio",
77 "transformers"
78 ],
79 "missing_dependencies": [
80 "torch",
81 "torchaudio",
82 "transformers"
83 ]
84 },
85 "scope_window_count": 20,
86 "vector_table_report": {
87 "reason": "vector_table_dim_mismatch",
88 "resolved": false,
89 "expected_dim": 768,
90 "table_exists": false,
91 "allowed_vector_tables": [
92 "audio_embedding_vector_192",
93 "audio_embedding_vector_768"
94 ],
95 "requested_vector_table": "audio_embedding_vector_192"
96 },
97 "target_scope_summary": {
98 "scope_type": "reference_set",
99 "scope_value": "phase1_hot_reference_v1",
100 "recording_count": 20,
101 "reference_set_id": 2,
102 "ready_asset_count": 20,
103 "reference_set_name": "phase1_hot_reference_v1",
104 "active_window_count": 20
105 }
106 }
107 },
108 "status_after_complete": null,
109 "status_after_failed": {
110 "extraction_job_id": 2,
111 "job_status": "failed",
112 "input_count": 20,
113 "output_count": 0,
114 "started_at": "2026-06-04T14:00:28.270203+08:00",
115 "finished_at": "2026-06-04T14:00:28.271729+08:00",
116 "log_uri": null,
117 "metadata_json": {
118 "lane": "semantic",
119 "role": "primary_baseline",
120 "phase": "phase1",
121 "worker": "run_embedding_job",
122 "dry_run": false,
123 "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
124 "vector_table": "audio_embedding_vector_192",
125 "output_target": "audio_embedding",
126 "execution_mode": "preflight_failure",
127 "failure_reason": "preflight_failed",
128 "runtime_report": {
129 "ready": false,
130 "model_name": "mert",
131 "availability": {
132 "numpy": true,
133 "torch": false,
134 "torchaudio": false,
135 "transformers": false
136 },
137 "requirements": [
138 "numpy",
139 "torch",
140 "torchaudio",
141 "transformers"
142 ],
143 "missing_dependencies": [
144 "torch",
145 "torchaudio",
146 "transformers"
147 ]
148 },
149 "preflight_blockers": [
150 "unreadable_audio_assets",
151 "vector_table_dim_mismatch",
152 "model_runtime_unavailable"
153 ],
154 "scope_window_count": 20,
155 "write_target_table": "audio_embedding",
156 "vector_table_report": {
157 "reason": "vector_table_dim_mismatch",
158 "resolved": false,
159 "expected_dim": 768,
160 "table_exists": false,
161 "allowed_vector_tables": [
162 "audio_embedding_vector_192",
163 "audio_embedding_vector_768"
164 ],
165 "requested_vector_table": "audio_embedding_vector_192"
166 },
167 "missing_window_count": 20,
168 "target_scope_summary": {
169 "scope_type": "reference_set",
170 "scope_value": "phase1_hot_reference_v1",
171 "recording_count": 20,
172 "reference_set_id": 2,
173 "ready_asset_count": 20,
174 "reference_set_name": "phase1_hot_reference_v1",
175 "active_window_count": 20
176 },
177 "missing_window_samples": [
178 {
179 "reason": "missing_audio",
180 "asset_id": 1,
181 "window_id": 1,
182 "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
183 },
184 {
185 "reason": "missing_audio",
186 "asset_id": 2,
187 "window_id": 2,
188 "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
189 },
190 {
191 "reason": "missing_audio",
192 "asset_id": 3,
193 "window_id": 3,
194 "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
195 },
196 {
197 "reason": "missing_audio",
198 "asset_id": 4,
199 "window_id": 4,
200 "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
201 },
202 {
203 "reason": "missing_audio",
204 "asset_id": 5,
205 "window_id": 5,
206 "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
207 }
208 ]
209 }
210 },
211 "resolved_vector_table": "audio_embedding_vector_192",
212 "vector_table_report": {
213 "requested_vector_table": "audio_embedding_vector_192",
214 "expected_dim": 768,
215 "allowed_vector_tables": [
216 "audio_embedding_vector_192",
217 "audio_embedding_vector_768"
218 ],
219 "resolved": false,
220 "table_exists": false,
221 "reason": "vector_table_dim_mismatch"
222 },
223 "runtime_report": {
224 "model_name": "mert",
225 "requirements": [
226 "numpy",
227 "torch",
228 "torchaudio",
229 "transformers"
230 ],
231 "availability": {
232 "numpy": true,
233 "torch": false,
234 "torchaudio": false,
235 "transformers": false
236 },
237 "missing_dependencies": [
238 "torch",
239 "torchaudio",
240 "transformers"
241 ],
242 "ready": false
243 },
244 "processed_windows": [],
245 "notes": [
246 "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
247 "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
248 ]
249 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "worker": "run_embedding_job",
3 "schema": "acr_vector_table_missing_test",
4 "job": {
5 "extraction_job_id": 2,
6 "feature_set_id": 3,
7 "target_scope": "reference_set:phase1_hot_reference_v1",
8 "job_status": "pending",
9 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
10 "job_metadata": {
11 "lane": "semantic",
12 "role": "primary_baseline",
13 "phase": "phase1"
14 },
15 "feature_name": "semantic_embedding",
16 "feature_level": "window",
17 "extraction_granularity": "sliding_window",
18 "window_sec": 5.0,
19 "hop_sec": 2.5,
20 "embedding_dim": 768,
21 "distance_metric": "cosine",
22 "feature_config": {
23 "role": "primary_semantic_baseline"
24 },
25 "model_id": 3,
26 "model_name": "mert",
27 "model_version": "v1-95m",
28 "model_family": "music_ssl",
29 "input_sample_rate": 24000,
30 "output_embedding_dim": 768,
31 "model_metadata": {
32 "lane": "semantic",
33 "role": "primary_baseline",
34 "phase": "phase1"
35 }
36 },
37 "target_scope_summary": {
38 "scope_type": "reference_set",
39 "scope_value": "phase1_hot_reference_v1",
40 "reference_set_id": 2,
41 "reference_set_name": "phase1_hot_reference_v1",
42 "recording_count": 20,
43 "ready_asset_count": 20,
44 "active_window_count": 20
45 },
46 "scope_window_count": 20,
47 "status_after_start": {
48 "extraction_job_id": 2,
49 "job_status": "running",
50 "input_count": 20,
51 "output_count": null,
52 "started_at": "2026-06-04T14:00:28.943358+08:00",
53 "finished_at": null,
54 "log_uri": null,
55 "metadata_json": {
56 "lane": "semantic",
57 "role": "primary_baseline",
58 "phase": "phase1",
59 "worker": "run_embedding_job",
60 "dry_run": false,
61 "vector_table": "audio_embedding_vector_768",
62 "output_target": "audio_embedding",
63 "execution_mode": "preflight",
64 "runtime_report": {
65 "ready": false,
66 "model_name": "mert",
67 "availability": {
68 "numpy": true,
69 "torch": false,
70 "torchaudio": false,
71 "transformers": false
72 },
73 "requirements": [
74 "numpy",
75 "torch",
76 "torchaudio",
77 "transformers"
78 ],
79 "missing_dependencies": [
80 "torch",
81 "torchaudio",
82 "transformers"
83 ]
84 },
85 "scope_window_count": 20,
86 "vector_table_report": {
87 "reason": "vector_table_missing_in_schema",
88 "resolved": false,
89 "expected_dim": 768,
90 "table_exists": false,
91 "allowed_vector_tables": [
92 "audio_embedding_vector_192",
93 "audio_embedding_vector_768"
94 ],
95 "requested_vector_table": "audio_embedding_vector_768"
96 },
97 "target_scope_summary": {
98 "scope_type": "reference_set",
99 "scope_value": "phase1_hot_reference_v1",
100 "recording_count": 20,
101 "reference_set_id": 2,
102 "ready_asset_count": 20,
103 "reference_set_name": "phase1_hot_reference_v1",
104 "active_window_count": 20
105 }
106 }
107 },
108 "status_after_complete": null,
109 "status_after_failed": {
110 "extraction_job_id": 2,
111 "job_status": "failed",
112 "input_count": 20,
113 "output_count": 0,
114 "started_at": "2026-06-04T14:00:28.943358+08:00",
115 "finished_at": "2026-06-04T14:00:28.944578+08:00",
116 "log_uri": null,
117 "metadata_json": {
118 "lane": "semantic",
119 "role": "primary_baseline",
120 "phase": "phase1",
121 "worker": "run_embedding_job",
122 "dry_run": false,
123 "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
124 "vector_table": "audio_embedding_vector_768",
125 "output_target": "audio_embedding",
126 "execution_mode": "preflight_failure",
127 "failure_reason": "preflight_failed",
128 "runtime_report": {
129 "ready": false,
130 "model_name": "mert",
131 "availability": {
132 "numpy": true,
133 "torch": false,
134 "torchaudio": false,
135 "transformers": false
136 },
137 "requirements": [
138 "numpy",
139 "torch",
140 "torchaudio",
141 "transformers"
142 ],
143 "missing_dependencies": [
144 "torch",
145 "torchaudio",
146 "transformers"
147 ]
148 },
149 "preflight_blockers": [
150 "unreadable_audio_assets",
151 "vector_table_missing_in_schema",
152 "model_runtime_unavailable"
153 ],
154 "scope_window_count": 20,
155 "write_target_table": "audio_embedding",
156 "vector_table_report": {
157 "reason": "vector_table_missing_in_schema",
158 "resolved": false,
159 "expected_dim": 768,
160 "table_exists": false,
161 "allowed_vector_tables": [
162 "audio_embedding_vector_192",
163 "audio_embedding_vector_768"
164 ],
165 "requested_vector_table": "audio_embedding_vector_768"
166 },
167 "missing_window_count": 20,
168 "target_scope_summary": {
169 "scope_type": "reference_set",
170 "scope_value": "phase1_hot_reference_v1",
171 "recording_count": 20,
172 "reference_set_id": 2,
173 "ready_asset_count": 20,
174 "reference_set_name": "phase1_hot_reference_v1",
175 "active_window_count": 20
176 },
177 "missing_window_samples": [
178 {
179 "reason": "missing_audio",
180 "asset_id": 1,
181 "window_id": 1,
182 "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
183 },
184 {
185 "reason": "missing_audio",
186 "asset_id": 2,
187 "window_id": 2,
188 "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
189 },
190 {
191 "reason": "missing_audio",
192 "asset_id": 3,
193 "window_id": 3,
194 "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
195 },
196 {
197 "reason": "missing_audio",
198 "asset_id": 4,
199 "window_id": 4,
200 "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
201 },
202 {
203 "reason": "missing_audio",
204 "asset_id": 5,
205 "window_id": 5,
206 "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
207 }
208 ]
209 }
210 },
211 "resolved_vector_table": "audio_embedding_vector_768",
212 "vector_table_report": {
213 "requested_vector_table": "audio_embedding_vector_768",
214 "expected_dim": 768,
215 "allowed_vector_tables": [
216 "audio_embedding_vector_192",
217 "audio_embedding_vector_768"
218 ],
219 "resolved": false,
220 "table_exists": false,
221 "reason": "vector_table_missing_in_schema"
222 },
223 "runtime_report": {
224 "model_name": "mert",
225 "requirements": [
226 "numpy",
227 "torch",
228 "torchaudio",
229 "transformers"
230 ],
231 "availability": {
232 "numpy": true,
233 "torch": false,
234 "torchaudio": false,
235 "transformers": false
236 },
237 "missing_dependencies": [
238 "torch",
239 "torchaudio",
240 "transformers"
241 ],
242 "ready": false
243 },
244 "processed_windows": [],
245 "notes": [
246 "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
247 "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
248 ]
249 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "source_schema": "acr_test",
3 "missing_table_schema": "acr_vector_table_missing_test",
4 "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
5 "cases": [
6 {
7 "case": "vector_table_dim_mismatch",
8 "schema": "acr_test",
9 "vector_table": "audio_embedding_vector_192",
10 "job_status": "failed",
11 "failure_reason": "preflight_failed",
12 "preflight_blockers": [
13 "unreadable_audio_assets",
14 "vector_table_dim_mismatch",
15 "model_runtime_unavailable"
16 ],
17 "vector_table_report": {
18 "reason": "vector_table_dim_mismatch",
19 "resolved": false,
20 "expected_dim": 768,
21 "table_exists": false,
22 "allowed_vector_tables": [
23 "audio_embedding_vector_192",
24 "audio_embedding_vector_768"
25 ],
26 "requested_vector_table": "audio_embedding_vector_192"
27 },
28 "artifact": "data/pgvector_eval/music20/embedding_vector_table_dim_mismatch_attempt.json"
29 },
30 {
31 "case": "vector_table_not_allowlisted",
32 "schema": "acr_test",
33 "vector_table": "audio_embedding_vector_1024",
34 "job_status": "failed",
35 "failure_reason": "preflight_failed",
36 "preflight_blockers": [
37 "unreadable_audio_assets",
38 "vector_table_not_allowlisted",
39 "model_runtime_unavailable"
40 ],
41 "vector_table_report": {
42 "reason": "vector_table_not_allowlisted",
43 "resolved": false,
44 "expected_dim": 768,
45 "table_exists": false,
46 "allowed_vector_tables": [
47 "audio_embedding_vector_192",
48 "audio_embedding_vector_768"
49 ],
50 "requested_vector_table": "audio_embedding_vector_1024"
51 },
52 "artifact": "data/pgvector_eval/music20/embedding_vector_table_not_allowlisted_attempt.json"
53 },
54 {
55 "case": "vector_table_missing_in_schema",
56 "schema": "acr_vector_table_missing_test",
57 "vector_table": "audio_embedding_vector_768",
58 "job_status": "failed",
59 "failure_reason": "preflight_failed",
60 "preflight_blockers": [
61 "unreadable_audio_assets",
62 "vector_table_missing_in_schema",
63 "model_runtime_unavailable"
64 ],
65 "vector_table_report": {
66 "reason": "vector_table_missing_in_schema",
67 "resolved": false,
68 "expected_dim": 768,
69 "table_exists": false,
70 "allowed_vector_tables": [
71 "audio_embedding_vector_192",
72 "audio_embedding_vector_768"
73 ],
74 "requested_vector_table": "audio_embedding_vector_768"
75 },
76 "artifact": "data/pgvector_eval/music20/embedding_vector_table_missing_in_schema_attempt.json"
77 }
78 ],
79 "summary": {
80 "expected_reasons": {
81 "vector_table_dim_mismatch": "vector_table_dim_mismatch",
82 "vector_table_not_allowlisted": "vector_table_not_allowlisted",
83 "vector_table_missing_in_schema": "vector_table_missing_in_schema"
84 },
85 "all_failed": true
86 }
87 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "worker": "run_embedding_job",
3 "schema": "acr_test",
4 "job": {
5 "extraction_job_id": 2,
6 "feature_set_id": 3,
7 "target_scope": "reference_set:phase1_hot_reference_v1",
8 "job_status": "pending",
9 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
10 "job_metadata": {
11 "lane": "semantic",
12 "role": "primary_baseline",
13 "phase": "phase1"
14 },
15 "feature_name": "semantic_embedding",
16 "feature_level": "window",
17 "extraction_granularity": "sliding_window",
18 "window_sec": 5.0,
19 "hop_sec": 2.5,
20 "embedding_dim": 768,
21 "distance_metric": "cosine",
22 "feature_config": {
23 "role": "primary_semantic_baseline"
24 },
25 "model_id": 3,
26 "model_name": "mert",
27 "model_version": "v1-95m",
28 "model_family": "music_ssl",
29 "input_sample_rate": 24000,
30 "output_embedding_dim": 768,
31 "model_metadata": {
32 "lane": "semantic",
33 "role": "primary_baseline",
34 "phase": "phase1"
35 }
36 },
37 "target_scope_summary": {
38 "scope_type": "reference_set",
39 "scope_value": "phase1_hot_reference_v1",
40 "reference_set_id": 2,
41 "reference_set_name": "phase1_hot_reference_v1",
42 "recording_count": 20,
43 "ready_asset_count": 20,
44 "active_window_count": 20
45 },
46 "scope_window_count": 20,
47 "status_after_start": {
48 "extraction_job_id": 2,
49 "job_status": "running",
50 "input_count": 20,
51 "output_count": null,
52 "started_at": "2026-06-04T14:00:28.602175+08:00",
53 "finished_at": null,
54 "log_uri": null,
55 "metadata_json": {
56 "lane": "semantic",
57 "role": "primary_baseline",
58 "phase": "phase1",
59 "worker": "run_embedding_job",
60 "dry_run": false,
61 "vector_table": "audio_embedding_vector_1024",
62 "output_target": "audio_embedding",
63 "execution_mode": "preflight",
64 "runtime_report": {
65 "ready": false,
66 "model_name": "mert",
67 "availability": {
68 "numpy": true,
69 "torch": false,
70 "torchaudio": false,
71 "transformers": false
72 },
73 "requirements": [
74 "numpy",
75 "torch",
76 "torchaudio",
77 "transformers"
78 ],
79 "missing_dependencies": [
80 "torch",
81 "torchaudio",
82 "transformers"
83 ]
84 },
85 "scope_window_count": 20,
86 "vector_table_report": {
87 "reason": "vector_table_not_allowlisted",
88 "resolved": false,
89 "expected_dim": 768,
90 "table_exists": false,
91 "allowed_vector_tables": [
92 "audio_embedding_vector_192",
93 "audio_embedding_vector_768"
94 ],
95 "requested_vector_table": "audio_embedding_vector_1024"
96 },
97 "target_scope_summary": {
98 "scope_type": "reference_set",
99 "scope_value": "phase1_hot_reference_v1",
100 "recording_count": 20,
101 "reference_set_id": 2,
102 "ready_asset_count": 20,
103 "reference_set_name": "phase1_hot_reference_v1",
104 "active_window_count": 20
105 }
106 }
107 },
108 "status_after_complete": null,
109 "status_after_failed": {
110 "extraction_job_id": 2,
111 "job_status": "failed",
112 "input_count": 20,
113 "output_count": 0,
114 "started_at": "2026-06-04T14:00:28.602175+08:00",
115 "finished_at": "2026-06-04T14:00:28.603652+08:00",
116 "log_uri": null,
117 "metadata_json": {
118 "lane": "semantic",
119 "role": "primary_baseline",
120 "phase": "phase1",
121 "worker": "run_embedding_job",
122 "dry_run": false,
123 "artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
124 "vector_table": "audio_embedding_vector_1024",
125 "output_target": "audio_embedding",
126 "execution_mode": "preflight_failure",
127 "failure_reason": "preflight_failed",
128 "runtime_report": {
129 "ready": false,
130 "model_name": "mert",
131 "availability": {
132 "numpy": true,
133 "torch": false,
134 "torchaudio": false,
135 "transformers": false
136 },
137 "requirements": [
138 "numpy",
139 "torch",
140 "torchaudio",
141 "transformers"
142 ],
143 "missing_dependencies": [
144 "torch",
145 "torchaudio",
146 "transformers"
147 ]
148 },
149 "preflight_blockers": [
150 "unreadable_audio_assets",
151 "vector_table_not_allowlisted",
152 "model_runtime_unavailable"
153 ],
154 "scope_window_count": 20,
155 "write_target_table": "audio_embedding",
156 "vector_table_report": {
157 "reason": "vector_table_not_allowlisted",
158 "resolved": false,
159 "expected_dim": 768,
160 "table_exists": false,
161 "allowed_vector_tables": [
162 "audio_embedding_vector_192",
163 "audio_embedding_vector_768"
164 ],
165 "requested_vector_table": "audio_embedding_vector_1024"
166 },
167 "missing_window_count": 20,
168 "target_scope_summary": {
169 "scope_type": "reference_set",
170 "scope_value": "phase1_hot_reference_v1",
171 "recording_count": 20,
172 "reference_set_id": 2,
173 "ready_asset_count": 20,
174 "reference_set_name": "phase1_hot_reference_v1",
175 "active_window_count": 20
176 },
177 "missing_window_samples": [
178 {
179 "reason": "missing_audio",
180 "asset_id": 1,
181 "window_id": 1,
182 "storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
183 },
184 {
185 "reason": "missing_audio",
186 "asset_id": 2,
187 "window_id": 2,
188 "storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
189 },
190 {
191 "reason": "missing_audio",
192 "asset_id": 3,
193 "window_id": 3,
194 "storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
195 },
196 {
197 "reason": "missing_audio",
198 "asset_id": 4,
199 "window_id": 4,
200 "storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
201 },
202 {
203 "reason": "missing_audio",
204 "asset_id": 5,
205 "window_id": 5,
206 "storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
207 }
208 ]
209 }
210 },
211 "resolved_vector_table": "audio_embedding_vector_1024",
212 "vector_table_report": {
213 "requested_vector_table": "audio_embedding_vector_1024",
214 "expected_dim": 768,
215 "allowed_vector_tables": [
216 "audio_embedding_vector_192",
217 "audio_embedding_vector_768"
218 ],
219 "resolved": false,
220 "table_exists": false,
221 "reason": "vector_table_not_allowlisted"
222 },
223 "runtime_report": {
224 "model_name": "mert",
225 "requirements": [
226 "numpy",
227 "torch",
228 "torchaudio",
229 "transformers"
230 ],
231 "availability": {
232 "numpy": true,
233 "torch": false,
234 "torchaudio": false,
235 "transformers": false
236 },
237 "missing_dependencies": [
238 "torch",
239 "torchaudio",
240 "transformers"
241 ],
242 "ready": false
243 },
244 "processed_windows": [],
245 "notes": [
246 "this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
247 "real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
248 ]
249 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 import subprocess
7 from pathlib import Path
8 import sys
9 from typing import Any
10
11 import psycopg
12
# Grandparent of this file's resolved path; subprocesses run from here and the
# relative script/worker paths used further down are resolved against it.
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT on the import path (once) so the `workers` package import below works.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from workers._job_common import validate_schema

# Interpreter used for every child process this script launches.
PYTHON_BIN = '/usr/local/miniconda3/bin/python'
# Report location used when --output is not supplied.
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'embedding_vector_table_negative_matrix_report.json',
)
# Schema that the cloned tables are copied from and the first two cases run in.
SOURCE_SCHEMA = 'acr_test'
# Tables copied into the isolated schema; no vector tables are listed here.
MINIMAL_TABLES = [
    'canonical_song', 'work',
    'recording', 'recording_asset',
    'audio_window',
    'model_registry', 'feature_set_registry', 'feature_extraction_job',
    'reference_set_registry', 'reference_set_member',
]
34
35
def run_cmd(cmd: list[str]) -> subprocess.CompletedProcess[str]:
    """Execute ``cmd`` with ROOT as the working directory and return the result.

    Both output streams are captured and decoded as text. A non-zero exit
    status is not raised here; callers check ``returncode`` themselves.
    """
    completed = subprocess.run(
        cmd,
        cwd=ROOT,
        text=True,
        capture_output=True,
    )
    return completed
38
39
def reset_source_jobs(dsn: str) -> None:
    """Run the phase-1 extraction-job bootstrap script against SOURCE_SCHEMA.

    Raises:
        SystemExit: when the bootstrap exits non-zero; the message is its
            stderr, or its stdout when stderr is empty.
    """
    bootstrap_cmd = [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py']
    bootstrap_cmd += ['--dsn', dsn]
    bootstrap_cmd += ['--schema', SOURCE_SCHEMA]

    result = run_cmd(bootstrap_cmd)
    if result.returncode == 0:
        return
    raise SystemExit(result.stderr or result.stdout)
49
50
def clone_minimal_schema_without_vectors(dsn: str, target_schema: str) -> None:
    """Recreate ``target_schema`` as a copy of the MINIMAL_TABLES from SOURCE_SCHEMA.

    The target schema is dropped (CASCADE) and rebuilt on every call, then each
    table in MINIMAL_TABLES is copied with its data. No vector table is in that
    list, so the resulting schema has none.

    Args:
        dsn: PostgreSQL connection string.
        target_schema: Name of the scratch schema to (re)create. It is passed
            through ``validate_schema`` before being interpolated into SQL.

    Raises:
        SystemExit: if ``target_schema`` names SOURCE_SCHEMA. Dropping it would
            destroy the data this function is about to copy from.
    """
    target_schema = validate_schema(target_schema)
    # The name is interpolated unquoted below, so PostgreSQL folds it to lower
    # case; compare case-insensitively to catch e.g. "ACR_TEST" as well.
    # NOTE(review): assumes validate_schema returns the schema name as a str.
    if str(target_schema).lower() == SOURCE_SCHEMA.lower():
        raise SystemExit(
            f'refusing to drop and recreate source schema {SOURCE_SCHEMA!r}; '
            'choose a different --missing-table-schema'
        )
    with psycopg.connect(dsn, autocommit=True) as conn:
        conn.execute(f'DROP SCHEMA IF EXISTS {target_schema} CASCADE;')
        conn.execute(f'CREATE SCHEMA {target_schema};')
        for table_name in MINIMAL_TABLES:
            conn.execute(f'CREATE TABLE {target_schema}.{table_name} AS TABLE {SOURCE_SCHEMA}.{table_name} WITH DATA;')
58
59
def run_worker_case(*, dsn: str, schema: str, vector_table: str, output_name: str) -> dict[str, Any]:
    """Run the embedding worker once for job 2 and summarise its failure record.

    Args:
        dsn: PostgreSQL connection string forwarded to the worker.
        schema: Schema the worker should operate on.
        vector_table: Value passed as ``--vector-table``.
        output_name: File name (under data/pgvector_eval/music20) the worker
            writes its JSON artifact to.

    Returns:
        A dict with the schema, vector table, the ``job_status`` from
        ``status_after_failed``, selected fields of its ``metadata_json``
        (``failure_reason``, ``preflight_blockers``, ``vector_table_report``)
        and the artifact path relative to ROOT. Fields the worker did not
        produce come back as ``None``.

    Raises:
        SystemExit: when the worker process exits non-zero.
        FileNotFoundError: when the worker exits zero without writing the
            artifact.
    """
    out = ROOT / 'data' / 'pgvector_eval' / 'music20' / output_name
    # Artifacts from earlier runs are kept on disk; remove the old one so a
    # worker that silently skips writing cannot be mistaken for a fresh result.
    out.unlink(missing_ok=True)
    proc = run_cmd([
        PYTHON_BIN,
        'workers/run_embedding_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', '2',
        '--model-name', 'mert',
        '--model-version', 'v1-95m',
        '--vector-table', vector_table,
        '--output', str(out),
    ])
    if proc.returncode != 0:
        raise SystemExit(proc.stderr or proc.stdout)
    payload = json.loads(out.read_text(encoding='utf-8'))
    # Both levels may be null in the artifact; fall back to empty dicts so the
    # lookups below yield None instead of raising.
    failed = payload.get('status_after_failed') or {}
    metadata = failed.get('metadata_json') or {}
    return {
        'schema': schema,
        'vector_table': vector_table,
        'job_status': failed.get('job_status'),
        'failure_reason': metadata.get('failure_reason'),
        'preflight_blockers': metadata.get('preflight_blockers'),
        'vector_table_report': metadata.get('vector_table_report'),
        'artifact': str(out.relative_to(ROOT)),
    }
87
88
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any embedded password replaced by ``***``."""
    import re

    # URI form: scheme://user:password@host...  (password ends at the first '@';
    # '/', '?' and '#' are excluded so a host:port without userinfo is untouched).
    redacted = re.sub(
        r'^([A-Za-z][A-Za-z0-9+.\-]*://[^:/?#@]*):[^@/?#]*@',
        r'\1:***@',
        dsn,
    )
    # keyword/value form and URI query parameters: password=secret / password='se cret'
    redacted = re.sub(
        r"(?i)(\bpassword\s*=\s*)('(?:[^'\\]|\\.)*'|[^&\s]+)",
        r'\1***',
        redacted,
    )
    return redacted


def main() -> None:
    """Run the three vector-table negative cases and write a combined report.

    Each case resets the source jobs first, then runs the worker with a vector
    table that should be rejected: a dimension mismatch, a table outside the
    allowlist, and an allowlisted table missing from a freshly cloned schema.
    The report is written to ``--output`` and echoed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    ap.add_argument('--missing-table-schema', default='acr_vector_table_missing_test')
    args = ap.parse_args()

    reset_source_jobs(args.dsn)
    dim_mismatch = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_192',
        output_name='embedding_vector_table_dim_mismatch_attempt.json',
    )

    reset_source_jobs(args.dsn)
    not_allowlisted = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_1024',
        output_name='embedding_vector_table_not_allowlisted_attempt.json',
    )

    reset_source_jobs(args.dsn)
    clone_minimal_schema_without_vectors(args.dsn, args.missing_table_schema)
    missing_table = run_worker_case(
        dsn=args.dsn,
        schema=args.missing_table_schema,
        vector_table='audio_embedding_vector_768',
        output_name='embedding_vector_table_missing_in_schema_attempt.json',
    )

    # Case name -> worker summary; each case name is also the reason the worker
    # is expected to report for that case.
    cases = {
        'vector_table_dim_mismatch': dim_mismatch,
        'vector_table_not_allowlisted': not_allowlisted,
        'vector_table_missing_in_schema': missing_table,
    }
    # vector_table_report is None when the worker recorded no failure metadata;
    # treat that as "no reason" instead of crashing before the report is written.
    observed_reasons = {
        name: (case.get('vector_table_report') or {}).get('reason')
        for name, case in cases.items()
    }

    payload = {
        'source_schema': SOURCE_SCHEMA,
        'missing_table_schema': args.missing_table_schema,
        # Derived from the DSN actually used, so the report never names a
        # database other than the one the cases ran against.
        'dsn_redacted': _redact_dsn(args.dsn),
        'cases': [{'case': name, **case} for name, case in cases.items()],
        'summary': {
            # Key kept for compatibility; the values are the reasons observed.
            'expected_reasons': observed_reasons,
            'all_failed': all(case['job_status'] == 'failed' for case in cases.values()),
            'all_reasons_match': all(reason == name for name, reason in observed_reasons.items()),
        },
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding='utf-8')
    print(json.dumps(payload, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `scripts/run_embedding_vector_table_negative_matrix_live.py` 与 `embedding_vector_table_negative_matrix_report.json`,在 live PostgreSQL 上补齐 semantic preflight 的三类向量表负例:维度不匹配、未 allowlist、schema 缺表;三类 case 都会稳定落到 `preflight_failed`,且 `vector_table_report.reason` 与预期一致。
3 - 新增 `scripts/run_phase1_worker_contract_smoke_live.py``phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。 4 - 新增 `scripts/run_phase1_worker_contract_smoke_live.py``phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。
4 - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py``audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1` 5 - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py``audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`
5 - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py``phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 6 - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py``phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。
......
...@@ -312,6 +312,21 @@ worker 会把这些 blocker 聚合到: ...@@ -312,6 +312,21 @@ worker 会把这些 blocker 聚合到:
312 312
313 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。 313 这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。
314 314
315 ### 当前 vector table 负例证据
316
317 除了正常 `audio_embedding_vector_768` 存在性校验外,本轮还对 semantic lane 补了 3 类 live 负例:
318
319 - `audio_embedding_vector_192` -> `vector_table_dim_mismatch`
320 - `audio_embedding_vector_1024` -> `vector_table_not_allowlisted`
321 - 缺失 `audio_embedding_vector_768` 的隔离 schema -> `vector_table_missing_in_schema`
322
323 对应产物:
324
325 - `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py`
326 - `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json`
327
328 这说明 semantic worker 当前不只是会在“环境缺依赖”时失败,也能把 **配置错误的向量表** 精确落账。
329
315 ### 当前 live 证据 330 ### 当前 live 证据
316 331
317 MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: 332 MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证:
......
...@@ -845,3 +845,30 @@ cd /workspace/acr-engine ...@@ -845,3 +845,30 @@ cd /workspace/acr-engine
845 - 当前阻塞已经非常明确,主要不是 orchestration,而是环境: 845 - 当前阻塞已经非常明确,主要不是 orchestration,而是环境:
846 - `/workspace/downloads` 未挂载 846 - `/workspace/downloads` 未挂载
847 - semantic model runtime 未安装 847 - semantic model runtime 未安装
848
849
850 ## 新增:semantic vector table 负例矩阵
851
852 为了避免后续把 semantic worker 的失败都误归因为“缺模型/缺音频”,本轮新增:
853
854 - `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py`
855 - `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json`
856
857 它真实验证了 3 类向量表配置错误:
858
859 | case | schema | vector table | reason |
860 |---|---|---|---|
861 | `vector_table_dim_mismatch` | `acr_test` | `audio_embedding_vector_192` | `vector_table_dim_mismatch` |
862 | `vector_table_not_allowlisted` | `acr_test` | `audio_embedding_vector_1024` | `vector_table_not_allowlisted` |
863 | `vector_table_missing_in_schema` | `acr_vector_table_missing_test` | `audio_embedding_vector_768` | `vector_table_missing_in_schema` |
864
865 共同点:
866
867 - 3 条 case 全部 `job_status = failed`
868 - `failure_reason = preflight_failed`
869 - `preflight_blockers` 中除了环境 blocker,还会额外带上精确的 vector-table blocker
870
871 这说明:
872
873 - 当前 semantic preflight 已经能够把“运行环境问题”和“配置错误问题”分层暴露
874 - 后续只要看 `vector_table_report.reason`,就能快速区分是 DDL/配置错误,还是模型 runtime/音频挂载错误
......
...@@ -194,6 +194,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -194,6 +194,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
194 - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 194 - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失
195 - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 195 - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据
196 - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失 196 - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失
197 - `scripts/run_embedding_vector_table_negative_matrix_live.py` 已在 live PostgreSQL 上补齐 semantic vector-table 负例矩阵:`vector_table_dim_mismatch`、`vector_table_not_allowlisted`、`vector_table_missing_in_schema` 三类错误都能被稳定写入 `vector_table_report.reason`
197 - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` 198 - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
198 - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` 199 - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
199 - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` 200 - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
......