Commit 223f80ac 223f80ac85c128da194bface19cb17d93c72247a by cnb.bofCdSsphPA

Collapse Phase-1 worker validation into one live smoke entrypoint

Constraint: Phase-1 now has multiple lane-specific validation scripts, so without a single smoke entrypoint the next session must manually reconstruct the current blocker picture.
Rejected: Keep exact and semantic checks separate only | It would slow restart diagnosis and hide the shared environment blockers.
Confidence: high
Scope-risk: narrow
Directive: Use the smoke entrypoint first on future sessions to distinguish contract regressions from missing mounts/runtime prerequisites.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_phase1_worker_contract_smoke_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_phase1_worker_contract_smoke_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_worker_contract_smoke_report.json
Not-tested: This smoke still reflects an environment-blocked host and does not prove successful extraction.
1 parent 6ea7365b
{
"worker": "run_chromaprint_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 1,
"feature_set_id": 2,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/chromaprint/v1",
"job_metadata": {
"lane": "exact",
"phase": "phase1",
"priority": "p0"
},
"feature_name": "fingerprint_asset",
"feature_level": "asset",
"extraction_granularity": "full_asset",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": null,
"distance_metric": "hamming",
"feature_config": {
"lane": "exact",
"index_target": "audio_fingerprint"
},
"model_id": 2,
"model_name": "chromaprint",
"model_version": "v1",
"model_family": "fingerprint",
"input_sample_rate": 16000,
"output_embedding_dim": null,
"model_metadata": {
"lane": "exact",
"note": "exact fingerprint lane baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_asset_count": 20,
"processed_assets": [],
"missing_assets": [
{
"asset_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav",
"reason": "missing_audio"
},
{
"asset_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav",
"reason": "missing_audio"
},
{
"asset_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav",
"reason": "missing_audio"
},
{
"asset_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav",
"reason": "missing_audio"
},
{
"asset_id": 6,
"storage_uri": "/workspace/downloads/105/type_11/57e61cde-4410-4751-93e9-d7a4ecece5791650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 7,
"storage_uri": "/workspace/downloads/106/type_11/bf61426c-67b7-4cf1-a9e7-f78cf519a0021650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 8,
"storage_uri": "/workspace/downloads/107/type_11/296bbc25-617c-4368-9a69-357aeec394381650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 9,
"storage_uri": "/workspace/downloads/108/type_11/d7e28fe6-4ad6-4243-b66b-d90ff5ca1e491650256909.wav",
"reason": "missing_audio"
},
{
"asset_id": 10,
"storage_uri": "/workspace/downloads/109/type_11/84acef9b-2a74-44bc-9eff-5ca7969ac9b61650256909.wav",
"reason": "missing_audio"
},
{
"asset_id": 11,
"storage_uri": "/workspace/downloads/110/type_11/2197b39e-23e2-4a66-b07e-dd672eab214a1650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 12,
"storage_uri": "/workspace/downloads/111/type_11/7f5256e8-de5f-41c5-bf76-419e05df72d81650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 13,
"storage_uri": "/workspace/downloads/112/type_11/34acd523-3c01-443d-ac3d-4ad7b9e2246f1650256907.wav",
"reason": "missing_audio"
},
{
"asset_id": 14,
"storage_uri": "/workspace/downloads/113/type_11/6d9438af-5d83-434b-bb20-76e28d0bbc4e1650256907.wav",
"reason": "missing_audio"
},
{
"asset_id": 15,
"storage_uri": "/workspace/downloads/114/type_11/0238ecbf-b234-470e-82e4-f3b80a267d771650256906.wav",
"reason": "missing_audio"
},
{
"asset_id": 16,
"storage_uri": "/workspace/downloads/115/type_11/aabad0ff-13de-4786-aa9c-40e1f957ed9f1650256906.wav",
"reason": "missing_audio"
},
{
"asset_id": 17,
"storage_uri": "/workspace/downloads/116/type_11/da34f6ff-39e7-4dde-8265-e1bb01b6263e1650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 18,
"storage_uri": "/workspace/downloads/117/type_11/1e1599e6-ebbd-4ceb-a81d-a320331ef6e31650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 19,
"storage_uri": "/workspace/downloads/118/type_11/db64461e-d752-4cf3-ab1d-56ff9232823d1650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 20,
"storage_uri": "/workspace/downloads/119/type_11/180dfa7d-836a-449c-990f-a3bf39c11da11650256898.wav",
"reason": "missing_audio"
}
],
"status_after_start": {
"extraction_job_id": 1,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:57:50.652147+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "exact",
"phase": "phase1",
"worker": "run_chromaprint_job",
"dry_run": false,
"priority": "p0",
"output_target": "audio_fingerprint",
"execution_mode": "write_attempt",
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 1,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:57:50.652147+08:00",
"finished_at": "2026-06-04T13:57:50.653101+08:00",
"log_uri": null,
"metadata_json": {
"lane": "exact",
"phase": "phase1",
"worker": "run_chromaprint_job",
"dry_run": false,
"priority": "p0",
"artifact_dir": "/workspace/acr-engine/data/pgvector_eval/music20/phase1_fingerprints",
"output_target": "audio_fingerprint",
"execution_mode": "write_attempt",
"failure_reason": "unreadable_audio_assets",
"write_target_table": "audio_fingerprint",
"missing_asset_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_asset_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"next_write_target": "audio_fingerprint",
"notes": [
"dry-run preserves the verified planner -> job -> PostgreSQL state flow",
"non-dry-run now writes repo-local chromaprint-style hash artifacts plus audio_fingerprint rows when source audio is readable"
]
}
\ No newline at end of file
{
"schema": "acr_test",
"dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
"exact_lane": {
"job_id": 1,
"returncode": 0,
"job_status": "failed",
"failure_reason": "unreadable_audio_assets",
"missing_asset_count": 20,
"artifact": "data/pgvector_eval/music20/phase1_worker_contract_smoke_exact.json"
},
"semantic_lane": {
"returncode": 0,
"semantic_job_count": 4,
"failed_jobs": 4,
"unique_blockers": [
"model_runtime_unavailable",
"unreadable_audio_assets"
],
"artifact": "data/pgvector_eval/music20/phase1_worker_contract_smoke_semantic_matrix.json"
},
"summary": {
"exact_status": "failed",
"semantic_failed_jobs": 4,
"shared_environment_blockers": [
"missing /workspace/downloads mount",
"missing semantic model runtime dependencies"
]
}
}
\ No newline at end of file
{
"schema": "acr_test",
"dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
"semantic_job_count": 4,
"results": [
{
"extraction_job_id": 2,
"model_name": "mert",
"model_version": "v1-95m",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job2_mert_preflight_attempt.json"
},
{
"extraction_job_id": 3,
"model_name": "mert",
"model_version": "v1-95m",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job3_mert_preflight_attempt.json"
},
{
"extraction_job_id": 4,
"model_name": "muq",
"model_version": "large-msd-iter",
"vector_table": "audio_embedding_vector_768",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"transformers"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 768,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_768"
},
"attempt_artifact": "data/pgvector_eval/music20/job4_muq_preflight_attempt.json"
},
{
"extraction_job_id": 5,
"model_name": "ecapa",
"model_version": "acr-baseline-v1",
"vector_table": "audio_embedding_vector_192",
"returncode": 0,
"job_status": "failed",
"failure_reason": "preflight_failed",
"preflight_blockers": [
"unreadable_audio_assets",
"model_runtime_unavailable"
],
"missing_window_count": 20,
"runtime_missing_dependencies": [
"torch",
"torchaudio",
"speechbrain"
],
"vector_table_report": {
"reason": null,
"resolved": true,
"expected_dim": 192,
"table_exists": true,
"allowed_vector_tables": [
"audio_embedding_vector_192",
"audio_embedding_vector_768"
],
"requested_vector_table": "audio_embedding_vector_192"
},
"attempt_artifact": "data/pgvector_eval/music20/job5_ecapa_preflight_attempt.json"
}
],
"summary": {
"failed_jobs": 4,
"models": [
"mert",
"mert",
"muq",
"ecapa"
],
"unique_blockers": [
"model_runtime_unavailable",
"unreadable_audio_assets"
]
}
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
from typing import Any
# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Interpreter used to launch every child script/worker from this entrypoint.
PYTHON_BIN = '/usr/local/miniconda3/bin/python'
# Default path of the combined smoke report written by main() when --output is not given.
DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_worker_contract_smoke_report.json'
def run_cmd(cmd: list[str]) -> subprocess.CompletedProcess[str]:
    """Execute ``cmd`` with the repository root as working directory.

    stdout and stderr are captured as text. A non-zero exit status does not
    raise here; callers inspect ``returncode`` on the returned object.
    """
    completed = subprocess.run(
        cmd,
        text=True,
        capture_output=True,
        cwd=ROOT,
    )
    return completed
def reset_jobs(dsn: str, schema: str) -> None:
    """Run the Phase-1 extraction-job bootstrap script for ``schema``.

    Args:
        dsn: PostgreSQL connection string forwarded as ``--dsn``.
        schema: Target schema forwarded as ``--schema``.

    Raises:
        SystemExit: When the bootstrap script exits non-zero; the exit payload
            is its stderr, or its stdout when stderr is empty.
    """
    bootstrap_cmd = [PYTHON_BIN, 'scripts/bootstrap_phase1_extraction_jobs_live.py']
    bootstrap_cmd += ['--dsn', dsn]
    bootstrap_cmd += ['--schema', schema]
    result = run_cmd(bootstrap_cmd)
    if result.returncode == 0:
        return
    raise SystemExit(result.stderr or result.stdout)
def run_exact_lane(dsn: str, schema: str) -> dict[str, Any]:
    """Run the chromaprint worker for job 1 and summarize its report artifact.

    Args:
        dsn: PostgreSQL connection string forwarded as ``--dsn``.
        schema: Target schema forwarded as ``--schema``.

    Returns:
        A dict with the job id, worker return code, final job status, failure
        reason, missing-asset count, and the artifact path relative to the
        repository root.

    Raises:
        SystemExit: When the worker exits non-zero; the exit payload is its
            stderr, or its stdout when stderr is empty.
    """
    artifact_path = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_worker_contract_smoke_exact.json'
    worker_cmd = [
        PYTHON_BIN, 'workers/run_chromaprint_job.py',
        '--dsn', dsn,
        '--schema', schema,
        '--job-id', '1',
        '--output', str(artifact_path),
    ]
    result = run_cmd(worker_cmd)
    if result.returncode != 0:
        raise SystemExit(result.stderr or result.stdout)

    report = json.loads(artifact_path.read_text(encoding='utf-8'))

    # Prefer the failed snapshot, then the completed one; fall back to an
    # empty mapping so the lookups below yield None instead of raising.
    final_status = report.get('status_after_failed')
    if not final_status:
        final_status = report.get('status_after_complete')
    if not final_status:
        final_status = {}

    job_metadata = final_status.get('metadata_json')
    if not job_metadata:
        job_metadata = {}

    lane_summary: dict[str, Any] = {}
    lane_summary['job_id'] = 1
    lane_summary['returncode'] = result.returncode
    lane_summary['job_status'] = final_status.get('job_status')
    lane_summary['failure_reason'] = job_metadata.get('failure_reason')
    lane_summary['missing_asset_count'] = job_metadata.get('missing_asset_count')
    lane_summary['artifact'] = str(artifact_path.relative_to(ROOT))
    return lane_summary
def run_semantic_matrix(dsn: str, schema: str) -> dict[str, Any]:
    """Run the semantic preflight matrix script and summarize its report.

    Args:
        dsn: PostgreSQL connection string forwarded as ``--dsn``.
        schema: Target schema forwarded as ``--schema``.

    Returns:
        A dict with the script return code, the semantic job count, the number
        of failed jobs, the list of unique blocker codes, and the artifact
        path relative to the repository root. Summary fields are ``None`` when
        the artifact has no usable ``summary`` object.

    Raises:
        SystemExit: When the matrix script exits non-zero; the exit payload is
            its stderr, or its stdout when stderr is empty.
    """
    out = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_worker_contract_smoke_semantic_matrix.json'
    proc = run_cmd([
        PYTHON_BIN,
        'scripts/run_phase1_embedding_preflight_matrix_live.py',
        '--dsn', dsn,
        '--schema', schema,
        '--output', str(out),
    ])
    if proc.returncode != 0:
        raise SystemExit(proc.stderr or proc.stdout)
    payload = json.loads(out.read_text(encoding='utf-8'))
    # `or {}` also covers an explicit `"summary": null`, which a
    # `.get('summary', {})` default would pass through as None and then crash
    # on the chained `.get`. This mirrors the null-safe lookups in
    # run_exact_lane.
    summary = payload.get('summary') or {}
    return {
        'returncode': proc.returncode,
        'semantic_job_count': payload.get('semantic_job_count'),
        'failed_jobs': summary.get('failed_jobs'),
        'unique_blockers': summary.get('unique_blockers'),
        'artifact': str(out.relative_to(ROOT)),
    }
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles URI-style DSNs (``scheme://user:password@host/db``, including a
    ``password=`` query parameter) and keyword/value DSNs
    (``host=... password=...``). A DSN without a password is returned with its
    content unchanged.
    """
    # Local imports keep this block self-contained without touching the
    # file-level import section.
    import re
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(dsn)
    if parts.scheme and parts.netloc:
        netloc = parts.netloc
        # rpartition: the password itself may contain '@', the host cannot.
        userinfo, at, hostinfo = netloc.rpartition('@')
        if at and ':' in userinfo:
            user = userinfo.split(':', 1)[0]
            netloc = f'{user}:***@{hostinfo}'
        query = re.sub(r'(?i)((?:^|&)password=)[^&]*', r'\1***', parts.query)
        return urlunsplit((parts.scheme, netloc, parts.path, query, parts.fragment))
    # Keyword/value form: mask a quoted or bare password value.
    return re.sub(r"(?i)(\bpassword\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)


def _shared_environment_blockers(exact: dict[str, Any], semantic: dict[str, Any]) -> list[str]:
    """Map the blocker codes observed in both lanes to environment blockers.

    Only blockers backed by an observed failure code are reported, so a
    healthy host yields an empty list.
    """
    observed = set(semantic.get('unique_blockers') or [])
    if exact.get('failure_reason'):
        observed.add(exact['failure_reason'])

    blockers: list[str] = []
    # NOTE(review): attributing unreadable audio to the /workspace/downloads
    # mount keeps the wording this report has always used; confirm it still
    # holds if assets ever live elsewhere.
    if 'unreadable_audio_assets' in observed:
        blockers.append('missing /workspace/downloads mount')
    if 'model_runtime_unavailable' in observed:
        blockers.append('missing semantic model runtime dependencies')
    return blockers


def main() -> None:
    """Run the exact lane and the semantic matrix, then write a combined report.

    The job bootstrap is re-run before each lane. The combined report is
    written to ``--output`` (parent directories are created as needed) and
    also printed to stdout as indented JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    reset_jobs(args.dsn, args.schema)
    exact = run_exact_lane(args.dsn, args.schema)
    reset_jobs(args.dsn, args.schema)
    semantic = run_semantic_matrix(args.dsn, args.schema)

    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report never names a
        # database other than the one that was exercised.
        'dsn_redacted': _redact_dsn(args.dsn),
        'exact_lane': exact,
        'semantic_lane': semantic,
        'summary': {
            'exact_status': exact['job_status'],
            'semantic_failed_jobs': semantic['failed_jobs'],
            'shared_environment_blockers': _shared_environment_blockers(exact, semantic),
        },
    }
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Trailing newline keeps the version-controlled artifact POSIX-friendly.
    out.write_text(rendered + '\n', encoding='utf-8')
    print(rendered)
# Run the smoke only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
## 2026-06-04
- 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。
- 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。
- 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。
- 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。
......
......@@ -811,3 +811,37 @@ cd /workspace/acr-engine
- asset-level 唯一键不是“纸面存在”,而是已经在 live PostgreSQL 上真实生效
- 后续如果补 asset-level semantic writer,可以直接沿用同一个 `ON CONFLICT (feature_set_id, asset_id) ...` 合同
## 新增:Phase-1 worker contract smoke 总览
为了让下次启动不用分别手工跑 exact worker 与 semantic matrix,本轮新增:
- `acr-engine/scripts/run_phase1_worker_contract_smoke_live.py`
- `acr-engine/data/pgvector_eval/music20/phase1_worker_contract_smoke_report.json`
它会:
1. reset `feature_extraction_job`
2. 跑一次 exact lane 非 dry-run
3. 再 reset jobs
4. 跑完整 semantic preflight matrix
5. 输出一个总览 JSON
### 当前 smoke 总览结果
| lane | 结果 |
|---|---|
| exact | `failed` |
| exact failure reason | `unreadable_audio_assets` |
| exact missing assets | `20` |
| semantic jobs | `4` |
| semantic failed jobs | `4` |
| semantic blockers | `model_runtime_unavailable`, `unreadable_audio_assets` |
这说明:
- 当前 PostgreSQL worker contract 本身已经是**稳定的**
- 当前阻塞已经非常明确,主要不是 orchestration,而是环境:
- `/workspace/downloads` 未挂载
- semantic model runtime 未安装
......
......@@ -193,6 +193,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
- `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同
- `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失
- `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据
- `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失
- `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
- worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
- exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
......