Commit 015e3261 015e3261ebda5f73356df75e52100bc4bcbd917a by cnb.bofCdSsphPA

Freeze a live blocker matrix for semantic extraction jobs

Constraint: The current container still lacks mounted source audio and the semantic model runtimes, so repeated manual spot-checks are noisy and wasteful.
Rejected: Ad-hoc one-job validation only | It would not show whether failures are contract-wide or model-specific.
Confidence: high
Scope-risk: narrow
Directive: Re-run the matrix before claiming any semantic worker progress so blocker drift across MERT/MuQ/ECAPA is visible.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_phase1_embedding_preflight_matrix_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_phase1_embedding_preflight_matrix_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_embedding_preflight_matrix_report.json
Not-tested: This matrix still cannot prove successful semantic inference until assets and runtime dependencies are available.
1 parent 399db601
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 2,
"feature_set_id": 3,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "primary_semantic_baseline"
},
"model_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 2,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:52:49.952665+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 2,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:52:49.952665+08:00",
"finished_at": "2026-06-04T13:52:49.954302+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_768",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_768",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": true,
"table_exists": true,
"reason": null
},
"runtime_report": {
"model_name": "mert",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 3,
"feature_set_id": 4,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/mert/v1-95m/10s_5s",
"job_metadata": {
"lane": "semantic",
"role": "long_context_validation",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 10.0,
"hop_sec": 5.0,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "long_context_validation"
},
"model_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "primary_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 3,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:52:50.210469+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "long_context_validation",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 3,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:52:50.210469+08:00",
"finished_at": "2026-06-04T13:52:50.211993+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "long_context_validation",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "mert",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_768",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_768",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": true,
"table_exists": true,
"reason": null
},
"runtime_report": {
"model_name": "mert",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 4,
"feature_set_id": 5,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "challenger",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 768,
"distance_metric": "cosine",
"feature_config": {
"role": "semantic_challenger"
},
"model_id": 4,
"model_name": "muq",
"model_version": "large-msd-iter",
"model_family": "music_ssl",
"input_sample_rate": 24000,
"output_embedding_dim": 768,
"model_metadata": {
"lane": "semantic",
"role": "challenger",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 4,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:52:50.434101+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "challenger",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "muq",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 4,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:52:50.434101+08:00",
"finished_at": "2026-06-04T13:52:50.435667+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "challenger",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_768",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "muq",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_768",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_768",
"expected_dim": 768,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": true,
"table_exists": true,
"reason": null
},
"runtime_report": {
"model_name": "muq",
"requirements": [
"numpy",
"torch",
"torchaudio",
"transformers"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"transformers": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"worker": "run_embedding_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 5,
"feature_set_id": 6,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s",
"job_metadata": {
"lane": "semantic",
"role": "historical_baseline",
"phase": "phase1"
},
"feature_name": "semantic_embedding",
"feature_level": "window",
"extraction_granularity": "sliding_window",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": 192,
"distance_metric": "cosine",
"feature_config": {
"role": "historical_baseline"
},
"model_id": 5,
"model_name": "ecapa",
"model_version": "acr-baseline-v1",
"model_family": "speech_derived",
"input_sample_rate": 16000,
"output_embedding_dim": 192,
"model_metadata": {
"lane": "semantic",
"role": "historical_baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_window_count": 20,
"status_after_start": {
"extraction_job_id": 5,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:52:50.702135+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "historical_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"vector_table": "audio_embedding_vector_192",
"output_target": "audio_embedding",
"execution_mode": "preflight",
"runtime_report": {
"ready": false,
"model_name": "ecapa",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"speechbrain": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"speechbrain"
],
"missing_dependencies": [
"torch",
"torchaudio",
"speechbrain"
]
},
"scope_window_count": 20,
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 192,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 5,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:52:50.702135+08:00",
"finished_at": "2026-06-04T13:52:50.703634+08:00",
"log_uri": null,
"metadata_json": {
"lane": "semantic",
"role": "historical_baseline",
"phase": "phase1",
"worker": "run_embedding_job",
"dry_run": false,
"artifact_dir": "data/pgvector_eval/music20/phase1_embeddings",
"vector_table": "audio_embedding_vector_192",
"output_target": "audio_embedding",
"execution_mode": "preflight_failure",
"failure_reason": "preflight_failed",
"runtime_report": {
"ready": false,
"model_name": "ecapa",
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"speechbrain": false
},
"requirements": [
"numpy",
"torch",
"torchaudio",
"speechbrain"
],
"missing_dependencies": [
"torch",
"torchaudio",
"speechbrain"
]
},
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"scope_window_count": 20,
"write_target_table": "audio_embedding",
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 192,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"missing_window_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_window_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"window_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"window_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"window_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"window_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"window_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"resolved_vector_table": "audio_embedding_vector_192",
"vector_table_report": {
"requested_vector_table": "audio_embedding_vector_192",
"expected_dim": 192,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"resolved": true,
"table_exists": true,
"reason": null
},
"runtime_report": {
"model_name": "ecapa",
"requirements": [
"numpy",
"torch",
"torchaudio",
"speechbrain"
],
"availability": {
"numpy": true,
"torch": false,
"torchaudio": false,
"speechbrain": false
},
"missing_dependencies": [
"torch",
"torchaudio",
"speechbrain"
],
"ready": false
},
"processed_windows": [],
"notes": [
"this worker now validates planner -> job -> scope windows -> PostgreSQL failure semantics",
"real model inference should replace the guarded failure path without changing the job contract or idempotent upsert keys"
]
}
\ No newline at end of file
{
"schema": "acr_test",
"dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
"semantic_job_count": 4,
"results": [
{
"extraction_job_id": 2,
"model_name": "mert",
"model_version": "v1-95m",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job2_mert_preflight_attempt.json"
},
{
"extraction_job_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job3_mert_preflight_attempt.json"
},
{
"extraction_job_id": 4,
"model_name": "muq",
"model_version": "large-msd-iter",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job4_muq_preflight_attempt.json"
},
{
"extraction_job_id": 5,
"model_name": "ecapa",
"model_version": "acr-baseline-v1",
"vector_table": "audio_embedding_vector_192",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"speechbrain"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 192,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"attempt_artifact": "data/pgvector_eval/music20/job5_ecapa_preflight_attempt.json"
}
],
"summary": {
"failed_jobs": 4,
"models": [
"mert",
"mert",
"muq",
"ecapa"
],
"unique_blockers": [
"model_runtime_unavailable",
"unreadable_audio_assets"
]
}
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
from typing import Any
import psycopg
# parents[0] is the directory holding this script; parents[1] is the directory above it.
ROOT = Path(__file__).resolve().parents[1]
# Destination of the aggregated matrix report when --output is not supplied.
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'phase1_embedding_preflight_matrix_report.json',
)
# Interpreter used to launch the bootstrap script and the embedding worker.
PYTHON_BIN = '/usr/local/miniconda3/bin/python'
def load_semantic_jobs(conn: psycopg.Connection) -> list[dict[str, Any]]:
    """List the window-level ``semantic_embedding`` extraction jobs, ordered by job id.

    Args:
        conn: Open connection; the unqualified table names in the query are
            resolved through whatever ``search_path`` the caller has set.

    Returns:
        One dict per job with the keys ``extraction_job_id``, ``model_name``,
        ``model_version``, ``embedding_dim`` and ``vector_table``.
        ``vector_table`` is ``audio_embedding_vector_<dim>`` when the dimension
        is 192 or 768 and ``None`` for any other (or missing) dimension.
    """
    query = """
SELECT
fej.extraction_job_id,
mr.model_name,
mr.model_version,
fs.embedding_dim
FROM feature_extraction_job fej
JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
JOIN model_registry mr ON mr.model_id = fs.model_id
WHERE fs.feature_name = 'semantic_embedding'
AND fs.feature_level = 'window'
ORDER BY fej.extraction_job_id;
"""
    jobs: list[dict[str, Any]] = []
    for job_id, model_name, model_version, dim in conn.execute(query).fetchall():
        # Only these two dimensions map onto a vector table name.
        has_vector_table = dim in (192, 768)
        jobs.append(
            {
                'extraction_job_id': int(job_id),
                'model_name': model_name,
                'model_version': model_version,
                'embedding_dim': None if dim is None else int(dim),
                'vector_table': f"audio_embedding_vector_{int(dim)}" if has_vector_table else None,
            }
        )
    return jobs
def reset_jobs(dsn: str, schema: str) -> None:
    """Run the Phase-1 extraction-job bootstrap script for ``schema``.

    Launches ``scripts/bootstrap_phase1_extraction_jobs_live.py`` from the
    repository root with the given DSN and schema. Output is captured rather
    than streamed.

    Raises:
        subprocess.CalledProcessError: If the bootstrap script exits non-zero.
    """
    bootstrap_args = ['--dsn', dsn, '--schema', schema]
    subprocess.run(
        [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py', *bootstrap_args],
        cwd=ROOT,
        check=True,
        capture_output=True,
        text=True,
    )
def run_job(dsn: str, schema: str, job: dict[str, Any]) -> dict[str, Any]:
    """Run one embedding job through the worker and condense its attempt artifact.

    The worker is launched as a subprocess and told to write its report to
    ``data/pgvector_eval/music20/job<id>_<model>_preflight_attempt.json``.
    That report is then reduced to the fields the matrix needs.

    Args:
        dsn: PostgreSQL connection string forwarded to the worker.
        schema: Schema name forwarded to the worker.
        job: One entry produced by ``load_semantic_jobs``.

    Returns:
        A dict with the job identity, the worker's exit code, the resulting
        job status, and the failure metadata (reason, blockers, missing window
        count, missing runtime dependencies, vector-table report) read from
        ``status_after_failed.metadata_json``.

    Raises:
        FileNotFoundError: If the worker finished without writing its report.
    """
    attempt_path = ROOT / 'data' / 'pgvector_eval' / 'music20' / f"job{job['extraction_job_id']}_{job['model_name']}_preflight_attempt.json"
    # Remove any report left by an earlier run: otherwise a worker that dies
    # before writing would be summarised from stale data and look like a fresh result.
    if attempt_path.exists():
        attempt_path.unlink()
    cmd = [
        PYTHON_BIN,
        'workers/run_embedding_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', str(job['extraction_job_id']),
        '--model-name', job['model_name'],
        '--model-version', job['model_version'],
    ]
    if job['vector_table']:
        cmd.extend(['--vector-table', job['vector_table']])
    cmd.extend(['--output', str(attempt_path)])
    # No check=True: a failing job is an expected outcome and is reported via 'returncode'.
    proc = subprocess.run(cmd, cwd=ROOT, capture_output=True, text=True)
    if not attempt_path.exists():
        stderr_tail = (proc.stderr or '').strip()[-2000:]
        raise FileNotFoundError(
            f"worker for job {job['extraction_job_id']} exited with code {proc.returncode} "
            f"without writing {attempt_path}; stderr: {stderr_tail}"
        )
    payload = json.loads(attempt_path.read_text(encoding='utf-8'))
    # Both status sections are serialised as null when they do not apply, so
    # normalise them to empty dicts before reading from them.
    status_after_failed = payload.get('status_after_failed') or {}
    status_after_complete = payload.get('status_after_complete') or {}
    metadata = status_after_failed.get('metadata_json') or {}
    runtime_report = metadata.get('runtime_report') or {}
    return {
        'extraction_job_id': job['extraction_job_id'],
        'model_name': job['model_name'],
        'model_version': job['model_version'],
        'vector_table': job['vector_table'],
        'returncode': proc.returncode,
        'job_status': status_after_failed.get('job_status') or status_after_complete.get('job_status'),
        'failure_reason': metadata.get('failure_reason'),
        'preflight_blockers': metadata.get('preflight_blockers'),
        'missing_window_count': metadata.get('missing_window_count'),
        'runtime_missing_dependencies': runtime_report.get('missing_dependencies'),
        'vector_table_report': metadata.get('vector_table_report'),
        'attempt_artifact': str(attempt_path.relative_to(ROOT)),
    }
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles both URI-style strings (``postgres://user:secret@host/db``,
    including a ``password=`` query parameter) and keyword/value strings
    (``host=... password=secret``).
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    if '://' in dsn:
        parts = urlsplit(dsn)
        userinfo, at, hostinfo = parts.netloc.rpartition('@')
        netloc = parts.netloc
        if at and ':' in userinfo:
            netloc = f"{userinfo.split(':', 1)[0]}:***@{hostinfo}"
        query = re.sub(r'(password=)[^&]*', r'\1***', parts.query)
        return urlunsplit(parts._replace(netloc=netloc, query=query))
    # Keyword/value form: the value is either single-quoted or a run of non-space characters.
    return re.sub(r"(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)


def main() -> None:
    """Reset the jobs, run every semantic job once, and write the matrix report.

    Command-line options:
        --dsn: PostgreSQL connection string (required).
        --schema: Schema to operate on (default ``acr_test``).
        --output: Report path (default ``DEFAULT_OUTPUT``).

    The report is written to ``--output`` and also printed to stdout.
    """
    from psycopg import sql

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()
    reset_jobs(args.dsn, args.schema)
    with psycopg.connect(args.dsn, autocommit=True) as conn:
        # Compose the schema as a quoted identifier instead of pasting raw text into SQL.
        # Quoting makes the name case-sensitive, so pass it exactly as it is stored.
        conn.execute(sql.SQL('SET search_path TO {}, public').format(sql.Identifier(args.schema)))
        jobs = load_semantic_jobs(conn)
    # The workers open their own connections, so this one is closed before they run.
    results = [run_job(args.dsn, args.schema, job) for job in jobs]
    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report cannot name a database it never touched.
        'dsn_redacted': _redact_dsn(args.dsn),
        'semantic_job_count': len(results),
        'results': results,
        'summary': {
            'failed_jobs': sum(1 for item in results if item['job_status'] == 'failed'),
            'models': [item['model_name'] for item in results],
            'unique_blockers': sorted({blocker for item in results for blocker in (item.get('preflight_blockers') or [])}),
        },
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
## 2026-06-04
- 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。
- 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。
- 为 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。
- 新增 `phase1_worker_embedding_write_attempt.json`、`phase1_worker_embedding_write_guard_report.json`、`phase1_worker_embedding_post_state.json`,在 live PostgreSQL `acr_test` 上验证 semantic lane 的非 dry-run 行为:当前 `scope_window_count=20`,但因 `/workspace/downloads/...` 未挂载且 `torch/torchaudio/transformers` 缺失,job 被诚实标记为 `failed`,同时 `audio_embedding_vector_768_count` 仍保持 `0`。
......
......@@ -739,3 +739,38 @@ cd /workspace/acr-engine
3. 不把幂等职责留给应用层“先查再写”
这一步对后续的 `MERT / MuQ / ECAPA` 都通用。
## 新增:Semantic preflight blocker matrix
为了避免下次 session 继续手工逐个试,本轮又新增:
- `acr-engine/scripts/run_phase1_embedding_preflight_matrix_live.py`
- `acr-engine/data/pgvector_eval/music20/phase1_embedding_preflight_matrix_report.json`
它会:
1. 先把 `feature_extraction_job` 重置回 `pending`
2. 顺序执行全部 semantic jobs(当前是 `mert 5s`、`mert 10s`、`muq 5s`、`ecapa 5s`)
3. 归并输出每个 job 的:
- `failure_reason`
- `preflight_blockers`
- `runtime_missing_dependencies`
- `vector_table_report`
### 当前矩阵结果
| job | model | vector table | blockers | runtime missing |
|---|---|---|---|---|
| 2 | `mert v1-95m` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` |
| 3 | `mert v1-95m` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` |
| 4 | `muq large-msd-iter` | `audio_embedding_vector_768` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `transformers` |
| 5 | `ecapa acr-baseline-v1` | `audio_embedding_vector_192` | `unreadable_audio_assets`, `model_runtime_unavailable` | `torch`, `torchaudio`, `speechbrain` |
结论:
- 当前 semantic lane 的失败已经具有**稳定矩阵特征**,不是某一个 job 独有的偶发问题
- `vector_table` 路径已全部通过
- 当前真正阻塞 Phase-1 encoder-only 落地的是:
1. `/workspace/downloads` 音频挂载
2. 模型 runtime 依赖安装
......
......@@ -191,6 +191,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
- 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行
- semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed
- `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同
- `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失
- `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
- worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
- exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
......