Commit 71bbe76f 71bbe76f5d774b0445ca1ef453c8ca615510d4d6 by cnb.bofCdSsphPA

Make semantic vector-table misconfigurations fail with live evidence

Constraint: Phase-1 semantic jobs were already blocked by missing audio and model runtimes, so vector-table regressions needed their own isolated live proof to avoid being masked by the same environment failures.
Rejected: Infer vector-table coverage from code inspection only | It would not prove the worker writes the correct blocker reasons into PostgreSQL metadata.
Confidence: high
Scope-risk: narrow
Directive: When semantic extraction fails, inspect vector_table_report.reason before assuming the host is only missing mounts or model dependencies.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_embedding_vector_table_negative_matrix_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_embedding_vector_table_negative_matrix_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --output data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json
Not-tested: No successful semantic extraction path exists yet on this host; this commit validates negative preflight cases only.
1 parent 223f80ac
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 2,
"feature_set_id": 3,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "primary_semantic_baseline"
},
"model_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 2,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T14:00:28.270203+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_192",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": "vector_table_dim_mismatch",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 2,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T14:00:28.270203+08:00",
"finished_at": "2026-06-04T14:00:28.271729+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_192",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_dim_mismatch",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": "vector_table_dim_mismatch",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_192",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_192",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": false,
"table_exists": false,
"reason": "vector_table_dim_mismatch"
},
"runtime_report": {
"model_name": "mert",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"worker": "run_embedding_job",
"schema": "acr_vector_table_missing_test",
"job": {
"extraction_job_id": 2,
"feature_set_id": 3,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "primary_semantic_baseline"
},
"model_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 2,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T14:00:28.943358+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": "vector_table_missing_in_schema",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 2,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T14:00:28.943358+08:00",
"finished_at": "2026-06-04T14:00:28.944578+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_missing_in_schema",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": "vector_table_missing_in_schema",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_768",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_768",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": false,
"table_exists": false,
"reason": "vector_table_missing_in_schema"
},
"runtime_report": {
"model_name": "mert",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"source_schema": "acr_test",
"missing_table_schema": "acr_vector_table_missing_test",
"dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
"cases": [
{
"case": "vector_table_dim_mismatch",
"schema": "acr_test",
"vector_table": "audio_embedding_vector_192",
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_dim_mismatch",
"model_runtime_unavailable"
],
"vector_table_report": {
"reason": "vector_table_dim_mismatch",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"artifact": "data/pgvector_eval/music20/embedding_vector_table_dim_mismatch_attempt.json"
},
{
"case": "vector_table_not_allowlisted",
"schema": "acr_test",
"vector_table": "audio_embedding_vector_1024",
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_not_allowlisted",
"model_runtime_unavailable"
],
"vector_table_report": {
"reason": "vector_table_not_allowlisted",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_1024"
},
"artifact": "data/pgvector_eval/music20/embedding_vector_table_not_allowlisted_attempt.json"
},
{
"case": "vector_table_missing_in_schema",
"schema": "acr_vector_table_missing_test",
"vector_table": "audio_embedding_vector_768",
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_missing_in_schema",
"model_runtime_unavailable"
],
"vector_table_report": {
"reason": "vector_table_missing_in_schema",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"artifact": "data/pgvector_eval/music20/embedding_vector_table_missing_in_schema_attempt.json"
}
],
"summary": {
"expected_reasons": {
"vector_table_dim_mismatch": "vector_table_dim_mismatch",
"vector_table_not_allowlisted": "vector_table_not_allowlisted",
"vector_table_missing_in_schema": "vector_table_missing_in_schema"
},
"all_failed": true
}
}
\ No newline at end of file
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 2,
"feature_set_id": 3,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "primary_semantic_baseline"
},
"model_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 2,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T14:00:28.602175+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_1024",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": "vector_table_not_allowlisted",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_1024"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 2,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T14:00:28.602175+08:00",
"finished_at": "2026-06-04T14:00:28.603652+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_1024",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"vector_table_not_allowlisted",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": "vector_table_not_allowlisted",
"resolved": false,
"expected_dim": 768,
"table_exists": false,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_1024"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_1024",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_1024",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": false,
"table_exists": false,
"reason": "vector_table_not_allowlisted"
},
"runtime_report": {
"model_name": "mert",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
import sys
from typing import Any
import psycopg
# Directory one level above the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]

# Put ROOT on sys.path (once) so the `workers` package import below resolves
# when this file is executed directly as a script.
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

from workers._job_common import validate_schema  # noqa: E402  (needs the sys.path tweak above)

# Interpreter used for every child process started by this script.
PYTHON_BIN = '/usr/local/miniconda3/bin/python'

# Where the aggregated matrix report is written unless --output overrides it.
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'embedding_vector_table_negative_matrix_report.json',
)

# Schema that holds the live job rows and serves as the copy source for the clone.
SOURCE_SCHEMA = 'acr_test'

# Tables copied into the isolated schema; no vector table is listed, so the
# clone ends up without any of them.
MINIMAL_TABLES = [
    'canonical_song', 'work', 'recording', 'recording_asset', 'audio_window',
    'model_registry', 'feature_set_registry', 'feature_extraction_job',
    'reference_set_registry', 'reference_set_member',
]
def run_cmd(cmd: list[str]) -> subprocess.CompletedProcess[str]:
    """Execute ``cmd`` with the repository root as working directory.

    stdout and stderr are captured and decoded as text. A non-zero exit
    status does not raise here; callers inspect ``returncode`` themselves.
    """
    completed = subprocess.run(
        cmd,
        cwd=ROOT,
        text=True,
        capture_output=True,
    )
    return completed
def reset_source_jobs(dsn: str) -> None:
    """Run the phase-1 extraction-job bootstrap script against SOURCE_SCHEMA.

    Called before each worker case; presumably this puts the jobs back into
    a claimable state (confirm against the bootstrap script).

    Raises:
        SystemExit: if the bootstrap script exits non-zero. The message is
            its stderr, or its stdout when stderr is empty.
    """
    bootstrap_cmd = [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py']
    bootstrap_cmd += ['--dsn', dsn]
    bootstrap_cmd += ['--schema', SOURCE_SCHEMA]

    result = run_cmd(bootstrap_cmd)
    if result.returncode == 0:
        return
    raise SystemExit(result.stderr or result.stdout)
def clone_minimal_schema_without_vectors(dsn: str, target_schema: str) -> None:
    """Recreate ``target_schema`` as a copy of SOURCE_SCHEMA limited to MINIMAL_TABLES.

    The target schema is dropped (CASCADE) and rebuilt from scratch, then every
    table in MINIMAL_TABLES is copied with its data via ``CREATE TABLE ... AS
    TABLE``. No vector table is copied, which is what lets the worker hit the
    "vector table missing in schema" case.

    Args:
        dsn: PostgreSQL connection string passed to ``psycopg.connect``.
        target_schema: Name of the scratch schema; passed through
            ``validate_schema`` before being interpolated into SQL.

    Raises:
        SystemExit: if ``target_schema`` names the source schema itself, since
            the DROP below would destroy the data this function clones from.
    """
    target_schema = validate_schema(target_schema)

    # PostgreSQL folds unquoted identifiers to lower case, so `ACR_TEST` and
    # `acr_test` address the same schema; compare case-insensitively.
    if str(target_schema).lower() == SOURCE_SCHEMA.lower():
        raise SystemExit(
            f'refusing to drop and recreate the source schema {SOURCE_SCHEMA!r}; '
            'choose a different target schema'
        )

    # autocommit: each DDL statement takes effect immediately, no explicit commit.
    with psycopg.connect(dsn, autocommit=True) as conn:
        conn.execute(f'DROP SCHEMA IF EXISTS {target_schema} CASCADE;')
        conn.execute(f'CREATE SCHEMA {target_schema};')
        for table_name in MINIMAL_TABLES:
            conn.execute(
                f'CREATE TABLE {target_schema}.{table_name} '
                f'AS TABLE {SOURCE_SCHEMA}.{table_name} WITH DATA;'
            )
def run_worker_case(
    *,
    dsn: str,
    schema: str,
    vector_table: str,
    output_name: str,
    job_id: int = 2,
    model_name: str = 'mert',
    model_version: str = 'v1-95m',
) -> dict[str, Any]:
    """Run the embedding worker once and summarise the failure it recorded.

    Args:
        dsn: PostgreSQL connection string forwarded to the worker.
        schema: Schema the worker should operate on.
        vector_table: Value for the worker's ``--vector-table`` option.
        output_name: File name (under data/pgvector_eval/music20/) the worker
            writes its JSON artifact to.
        job_id: Extraction job to run; defaults to the previously hard-coded 2.
        model_name: Forwarded as ``--model-name``; defaults to ``'mert'``.
        model_version: Forwarded as ``--model-version``; defaults to ``'v1-95m'``.

    Returns:
        A dict with the schema, vector table, job status, failure reason,
        preflight blockers and vector-table report taken from the artifact's
        ``status_after_failed`` section, plus the artifact path relative to ROOT.
        Fields absent from the artifact come back as ``None``.

    Raises:
        SystemExit: if the worker process exits non-zero, or exits zero without
            writing the artifact.
    """
    out = ROOT / 'data' / 'pgvector_eval' / 'music20' / output_name

    # Remove any artifact left by an earlier run so that a worker which exits 0
    # without writing cannot make stale results look like fresh live evidence.
    out.unlink(missing_ok=True)

    proc = run_cmd([
        PYTHON_BIN,
        'workers/run_embedding_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', str(job_id),
        '--model-name', model_name,
        '--model-version', model_version,
        '--vector-table', vector_table,
        '--output', str(out),
    ])
    if proc.returncode != 0:
        raise SystemExit(proc.stderr or proc.stdout)
    if not out.is_file():
        raise SystemExit(f'worker exited 0 but did not write its artifact: {out}')

    payload = json.loads(out.read_text(encoding='utf-8'))
    # `status_after_failed` is null in the artifact when the job did not fail;
    # fall back to empty dicts so the summary fields simply come back as None.
    failed = payload.get('status_after_failed') or {}
    metadata = failed.get('metadata_json') or {}
    return {
        'schema': schema,
        'vector_table': vector_table,
        'job_status': failed.get('job_status'),
        'failure_reason': metadata.get('failure_reason'),
        'preflight_blockers': metadata.get('preflight_blockers'),
        'vector_table_report': metadata.get('vector_table_report'),
        'artifact': str(out.relative_to(ROOT)),
    }
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any embedded password replaced by ``***``."""
    import re

    # URI form: scheme://user:password@host... — the greedy match backtracks to
    # the last '@' inside the authority part, so a raw '@' in the password
    # cannot leak its tail.
    redacted = re.sub(r'(://[^/?#@:]*):[^/?#]*@', r'\1:***@', dsn, count=1)
    # keyword/value form (`password=secret`) and URI query parameters.
    redacted = re.sub(
        r"(password\s*=\s*)('(?:[^'\\]|\\.)*'|[^\s&]+)",
        r'\1***',
        redacted,
        flags=re.IGNORECASE,
    )
    return redacted


def main() -> None:
    """Run the three vector-table negative cases and write the matrix report.

    Order of operations (each worker case is preceded by a job reset):
      1. dimension mismatch   — ``audio_embedding_vector_192`` on SOURCE_SCHEMA
      2. not allowlisted      — ``audio_embedding_vector_1024`` on SOURCE_SCHEMA
      3. missing in schema    — ``audio_embedding_vector_768`` on a freshly
         cloned schema that contains no vector tables

    The report is written to ``--output`` and echoed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    ap.add_argument('--missing-table-schema', default='acr_vector_table_missing_test')
    args = ap.parse_args()

    reset_source_jobs(args.dsn)
    dim_mismatch = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_192',
        output_name='embedding_vector_table_dim_mismatch_attempt.json',
    )

    reset_source_jobs(args.dsn)
    not_allowlisted = run_worker_case(
        dsn=args.dsn,
        schema=SOURCE_SCHEMA,
        vector_table='audio_embedding_vector_1024',
        output_name='embedding_vector_table_not_allowlisted_attempt.json',
    )

    # Reset before cloning so the copied job rows are in the freshly reset state.
    reset_source_jobs(args.dsn)
    clone_minimal_schema_without_vectors(args.dsn, args.missing_table_schema)
    missing_table = run_worker_case(
        dsn=args.dsn,
        schema=args.missing_table_schema,
        vector_table='audio_embedding_vector_768',
        output_name='embedding_vector_table_missing_in_schema_attempt.json',
    )

    # Case name -> worker summary. Each case name doubles as the reason the
    # worker is expected to record in vector_table_report.reason.
    results = {
        'vector_table_dim_mismatch': dim_mismatch,
        'vector_table_not_allowlisted': not_allowlisted,
        'vector_table_missing_in_schema': missing_table,
    }
    # vector_table_report is None when the worker recorded no failure metadata;
    # treat that as "no reason observed" instead of crashing on None.get().
    observed_reasons = {
        case: (result.get('vector_table_report') or {}).get('reason')
        for case, result in results.items()
    }

    payload = {
        'source_schema': SOURCE_SCHEMA,
        'missing_table_schema': args.missing_table_schema,
        # Derived from --dsn so the report describes the database actually used.
        'dsn_redacted': _redact_dsn(args.dsn),
        'cases': [{'case': case, **result} for case, result in results.items()],
        'summary': {
            # Key name kept for existing report consumers; the values are the
            # reasons observed in this run, keyed by the expected reason.
            'expected_reasons': observed_reasons,
            'all_failed': all(result['job_status'] == 'failed' for result in results.values()),
            'all_reasons_match': all(
                reason == case for case, reason in observed_reasons.items()
            ),
        },
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
## 2026-06-04
- 新增 `scripts/run_embedding_vector_table_negative_matrix_live.py` 与 `embedding_vector_table_negative_matrix_report.json`,在 live PostgreSQL 上补齐 semantic preflight 的三类向量表负例:维度不匹配、未 allowlist、schema 缺表;三类 case 都会稳定落到 `preflight_failed`,且 `vector_table_report.reason` 与预期一致。
- 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。
- 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。
- 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。
......
......@@ -312,6 +312,21 @@ worker 会把这些 blocker 聚合到:
这样不会把“模型没法跑”误写成 completed,也不会只暴露第一个错误。
### 当前 vector table 负例证据
除了正常 `audio_embedding_vector_768` 存在性校验外,本轮还对 semantic lane 补了 3 类 live 负例:
- `audio_embedding_vector_192` -> `vector_table_dim_mismatch`
- `audio_embedding_vector_1024` -> `vector_table_not_allowlisted`
- 缺失 `audio_embedding_vector_768` 的隔离 schema -> `vector_table_missing_in_schema`
对应产物:
- `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py`
- `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json`
这说明 semantic worker 当前不只是会在“环境缺依赖”时失败,也能把 **配置错误的向量表** 精确落账。
### 当前 live 证据
MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证:
......
......@@ -845,3 +845,30 @@ cd /workspace/acr-engine
- 当前阻塞已经非常明确,主要不是 orchestration,而是环境:
- `/workspace/downloads` 未挂载
- semantic model runtime 未安装
## 新增:semantic vector table 负例矩阵
为了避免后续把 semantic worker 的失败都误归因为“缺模型/缺音频”,本轮新增:
- `acr-engine/scripts/run_embedding_vector_table_negative_matrix_live.py`
- `acr-engine/data/pgvector_eval/music20/embedding_vector_table_negative_matrix_report.json`
它真实验证了 3 类向量表配置错误:
| case | schema | vector table | reason |
|---|---|---|---|
| `vector_table_dim_mismatch` | `acr_test` | `audio_embedding_vector_192` | `vector_table_dim_mismatch` |
| `vector_table_not_allowlisted` | `acr_test` | `audio_embedding_vector_1024` | `vector_table_not_allowlisted` |
| `vector_table_missing_in_schema` | `acr_vector_table_missing_test` | `audio_embedding_vector_768` | `vector_table_missing_in_schema` |
共同点:
- 3 条 case 全部 `job_status = failed`
- `failure_reason = preflight_failed`
- `preflight_blockers` 中除了环境 blocker,还会额外带上精确的 vector-table blocker
这说明:
- 当前 semantic preflight 已经能够把“运行环境问题”和“配置错误问题”分层暴露
- 后续只要看 `vector_table_report.reason`,就能快速区分是 DDL/配置错误,还是模型 runtime/音频挂载错误
......
......@@ -194,6 +194,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
- `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失
- `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据
- `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失
- `scripts/run_embedding_vector_table_negative_matrix_live.py` 已在 live PostgreSQL 上补齐 semantic vector-table 负例矩阵:`vector_table_dim_mismatch`、`vector_table_not_allowlisted`、`vector_table_missing_in_schema` 三类错误都能被稳定写入 `vector_table_report.reason`
- `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
- worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
- exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
......