Turn Phase-1 host prerequisites into a live audit artifact
Constraint: Worker-contract validation is now stable enough that the remaining uncertainty is host readiness, so the next blocker had to be made explicit instead of inferred from repeated failed runs. Rejected: Keep prerequisite knowledge only in prose | It would drift and force future sessions to rediscover the same missing mounts and packages. Confidence: high Scope-risk: narrow Directive: Run the prerequisite audit before retrying live extraction so host blockers are measured once and reused across lanes. Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/run_phase1_prereq_audit_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/run_phase1_prereq_audit_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_prereq_audit_report.json Not-tested: This audit does not install dependencies or mount assets; it only reports readiness.
Showing
5 changed files
with
281 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "downloads_root": "/workspace/downloads", | ||
| 5 | "downloads_root_exists": false, | ||
| 6 | "package_checks": { | ||
| 7 | "numpy": { | ||
| 8 | "package": "numpy", | ||
| 9 | "available": true | ||
| 10 | }, | ||
| 11 | "speechbrain": { | ||
| 12 | "package": "speechbrain", | ||
| 13 | "available": false, | ||
| 14 | "error_type": "ModuleNotFoundError", | ||
| 15 | "error": "No module named 'speechbrain'" | ||
| 16 | }, | ||
| 17 | "torch": { | ||
| 18 | "package": "torch", | ||
| 19 | "available": false, | ||
| 20 | "error_type": "ModuleNotFoundError", | ||
| 21 | "error": "No module named 'torch'" | ||
| 22 | }, | ||
| 23 | "torchaudio": { | ||
| 24 | "package": "torchaudio", | ||
| 25 | "available": false, | ||
| 26 | "error_type": "ModuleNotFoundError", | ||
| 27 | "error": "No module named 'torchaudio'" | ||
| 28 | }, | ||
| 29 | "transformers": { | ||
| 30 | "package": "transformers", | ||
| 31 | "available": false, | ||
| 32 | "error_type": "ModuleNotFoundError", | ||
| 33 | "error": "No module named 'transformers'" | ||
| 34 | } | ||
| 35 | }, | ||
| 36 | "jobs": [ | ||
| 37 | { | ||
| 38 | "extraction_job_id": 1, | ||
| 39 | "model_name": "chromaprint", | ||
| 40 | "model_version": "v1", | ||
| 41 | "embedding_dim": null, | ||
| 42 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 43 | "required_packages": [ | ||
| 44 | "numpy" | ||
| 45 | ], | ||
| 46 | "missing_packages": [], | ||
| 47 | "downloads_root_exists": false, | ||
| 48 | "ready_for_live_worker": false | ||
| 49 | }, | ||
| 50 | { | ||
| 51 | "extraction_job_id": 2, | ||
| 52 | "model_name": "mert", | ||
| 53 | "model_version": "v1-95m", | ||
| 54 | "embedding_dim": 768, | ||
| 55 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 56 | "required_packages": [ | ||
| 57 | "numpy", | ||
| 58 | "torch", | ||
| 59 | "torchaudio", | ||
| 60 | "transformers" | ||
| 61 | ], | ||
| 62 | "missing_packages": [ | ||
| 63 | "torch", | ||
| 64 | "torchaudio", | ||
| 65 | "transformers" | ||
| 66 | ], | ||
| 67 | "downloads_root_exists": false, | ||
| 68 | "ready_for_live_worker": false | ||
| 69 | }, | ||
| 70 | { | ||
| 71 | "extraction_job_id": 3, | ||
| 72 | "model_name": "mert", | ||
| 73 | "model_version": "v1-95m", | ||
| 74 | "embedding_dim": 768, | ||
| 75 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 76 | "required_packages": [ | ||
| 77 | "numpy", | ||
| 78 | "torch", | ||
| 79 | "torchaudio", | ||
| 80 | "transformers" | ||
| 81 | ], | ||
| 82 | "missing_packages": [ | ||
| 83 | "torch", | ||
| 84 | "torchaudio", | ||
| 85 | "transformers" | ||
| 86 | ], | ||
| 87 | "downloads_root_exists": false, | ||
| 88 | "ready_for_live_worker": false | ||
| 89 | }, | ||
| 90 | { | ||
| 91 | "extraction_job_id": 4, | ||
| 92 | "model_name": "muq", | ||
| 93 | "model_version": "large-msd-iter", | ||
| 94 | "embedding_dim": 768, | ||
| 95 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 96 | "required_packages": [ | ||
| 97 | "numpy", | ||
| 98 | "torch", | ||
| 99 | "torchaudio", | ||
| 100 | "transformers" | ||
| 101 | ], | ||
| 102 | "missing_packages": [ | ||
| 103 | "torch", | ||
| 104 | "torchaudio", | ||
| 105 | "transformers" | ||
| 106 | ], | ||
| 107 | "downloads_root_exists": false, | ||
| 108 | "ready_for_live_worker": false | ||
| 109 | }, | ||
| 110 | { | ||
| 111 | "extraction_job_id": 5, | ||
| 112 | "model_name": "ecapa", | ||
| 113 | "model_version": "acr-baseline-v1", | ||
| 114 | "embedding_dim": 192, | ||
| 115 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 116 | "required_packages": [ | ||
| 117 | "numpy", | ||
| 118 | "torch", | ||
| 119 | "torchaudio", | ||
| 120 | "speechbrain" | ||
| 121 | ], | ||
| 122 | "missing_packages": [ | ||
| 123 | "torch", | ||
| 124 | "torchaudio", | ||
| 125 | "speechbrain" | ||
| 126 | ], | ||
| 127 | "downloads_root_exists": false, | ||
| 128 | "ready_for_live_worker": false | ||
| 129 | } | ||
| 130 | ], | ||
| 131 | "summary": { | ||
| 132 | "total_jobs": 5, | ||
| 133 | "ready_jobs": 0, | ||
| 134 | "blocked_jobs": 5, | ||
| 135 | "missing_packages_union": [ | ||
| 136 | "speechbrain", | ||
| 137 | "torch", | ||
| 138 | "torchaudio", | ||
| 139 | "transformers" | ||
| 140 | ] | ||
| 141 | } | ||
| 142 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import importlib | ||
| 6 | import json | ||
| 7 | from pathlib import Path | ||
| 8 | import sys | ||
| 9 | from typing import Any | ||
| 10 | |||
| 11 | import psycopg | ||
| 12 | |||
| 13 | ROOT = Path(__file__).resolve().parents[1] | ||
| 14 | if str(ROOT) not in sys.path: | ||
| 15 | sys.path.insert(0, str(ROOT)) | ||
| 16 | |||
| 17 | from workers._job_common import validate_schema | ||
| 18 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_prereq_audit_report.json' | ||
# Packages shared by every torch-based extractor; listed once so the three
# model entries below cannot drift apart.
_TORCH_AUDIO_STACK = ['numpy', 'torch', 'torchaudio']

# Maps a model name to the importable packages its worker needs, in the
# order they are reported.
MODEL_REQUIREMENTS = {
    'mert': [*_TORCH_AUDIO_STACK, 'transformers'],
    'muq': [*_TORCH_AUDIO_STACK, 'transformers'],
    'ecapa': [*_TORCH_AUDIO_STACK, 'speechbrain'],
    'chromaprint': ['numpy'],
}
| 25 | |||
| 26 | |||
def check_import(name: str) -> dict[str, Any]:
    """Try to import *name* and describe the outcome as a JSON-friendly dict.

    Args:
        name: Module name passed to ``importlib.import_module``.

    Returns:
        ``{'package': name, 'available': True}`` when the import succeeds.
        Otherwise a dict with ``available`` set to ``False``, ``error_type``
        set to the exception class name and ``error`` set to the first line
        of the exception message (an empty string when the exception carries
        no message).
    """
    try:
        importlib.import_module(name)
    # Deliberately broad: any exception raised while importing is reported as
    # "unavailable" instead of aborting the audit.
    except Exception as exc:  # noqa: BLE001
        message_lines = str(exc).splitlines()
        return {
            'package': name,
            'available': False,
            'error_type': type(exc).__name__,
            # An exception raised without a message yields no lines at all;
            # indexing [0] unconditionally would raise IndexError here.
            'error': message_lines[0] if message_lines else '',
        }
    return {'package': name, 'available': True}
| 33 | |||
| 34 | |||
def load_jobs(conn: psycopg.Connection) -> list[dict[str, Any]]:
    """Fetch every feature-extraction job together with its model metadata.

    Joins ``feature_extraction_job`` to ``feature_set_registry`` and
    ``model_registry`` and orders the result by ``extraction_job_id``.

    Args:
        conn: Open connection; the tables are referenced unqualified, so the
            caller's ``search_path`` decides which schema is read.

    Returns:
        One dict per job with the keys ``extraction_job_id``, ``model_name``,
        ``model_version``, ``embedding_dim`` (``None`` when the column is
        NULL) and ``target_scope``.
    """
    query = """
        SELECT fej.extraction_job_id, mr.model_name, mr.model_version, fs.embedding_dim, fej.target_scope
        FROM feature_extraction_job fej
        JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
        JOIN model_registry mr ON mr.model_id = fs.model_id
        ORDER BY fej.extraction_job_id;
        """
    fetched = conn.execute(query).fetchall()

    jobs: list[dict[str, Any]] = []
    for job_id, model_name, model_version, embedding_dim, target_scope in fetched:
        # embedding_dim may be NULL in the database; keep it as None then.
        dim = None if embedding_dim is None else int(embedding_dim)
        jobs.append(
            {
                'extraction_job_id': int(job_id),
                'model_name': model_name,
                'model_version': model_version,
                'embedding_dim': dim,
                'target_scope': target_scope,
            }
        )
    return jobs
| 55 | |||
| 56 | |||
def redact_dsn(dsn: str) -> str:
    """Return *dsn* with any embedded password replaced by ``***``.

    Handles URI-style connection strings (``scheme://user:password@host/db``,
    including a ``password=`` query parameter) and keyword/value strings
    (``host=... password=...``). Strings without a password are returned
    unchanged.
    """
    import re  # local import keeps this block self-contained

    if '://' in dsn:
        # Mask the password part of the userinfo. The match is greedy up to
        # the last '@' before the path so an unescaped '@' inside the
        # password is still covered.
        masked = re.sub(r'(?<=://)([^:/?#@]*):[^/?#]*@', r'\1:***@', dsn, count=1)
        # A password may also be supplied as a URI query parameter.
        return re.sub(r'(?i)([?&]password=)[^&#]*', r'\1***', masked)
    # Keyword/value form; the value may be single-quoted with backslash escapes.
    return re.sub(r"(?i)(\bpassword\s*=\s*)('(?:\\.|[^'\\])*'|\S+)", r'\1***', dsn)


def main() -> None:
    """Audit host prerequisites for every extraction job and write a report.

    Parses ``--dsn``, ``--schema``, ``--downloads-root`` and ``--output``,
    loads the jobs from the database, import-checks each package required by
    the jobs' models, and marks a job ready only when the downloads root
    exists and none of its packages are missing. The report is written to
    ``--output`` as JSON and echoed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--downloads-root', default='/workspace/downloads')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    # The schema name is interpolated into SQL below, so it goes through
    # validate_schema first.
    schema = validate_schema(args.schema)
    downloads_root = Path(args.downloads_root)
    downloads_exists = downloads_root.exists()

    with psycopg.connect(args.dsn, autocommit=True) as conn:
        conn.execute(f'SET search_path TO {schema}, public;')
        jobs = load_jobs(conn)

    def required_for(job: dict) -> list:
        # Models without an entry fall back to the minimal numpy requirement.
        return MODEL_REQUIREMENTS.get(job['model_name'], ['numpy'])

    # Import-check each distinct package once, however many jobs need it.
    package_names = sorted({pkg for job in jobs for pkg in required_for(job)})
    package_checks = {name: check_import(name) for name in package_names}

    job_reports = []
    for job in jobs:
        required = required_for(job)
        missing = [name for name in required if not package_checks[name]['available']]
        job_reports.append(
            {
                **job,
                'required_packages': required,
                'missing_packages': missing,
                'downloads_root_exists': downloads_exists,
                'ready_for_live_worker': downloads_exists and not missing,
            }
        )

    ready_jobs = sum(1 for report in job_reports if report['ready_for_live_worker'])
    payload = {
        'schema': schema,
        # Derived from the DSN actually used, so the report identifies the
        # audited database without recording its password.
        'dsn_redacted': redact_dsn(args.dsn),
        'downloads_root': str(downloads_root),
        'downloads_root_exists': downloads_exists,
        'package_checks': package_checks,
        'jobs': job_reports,
        'summary': {
            'total_jobs': len(job_reports),
            'ready_jobs': ready_jobs,
            'blocked_jobs': len(job_reports) - ready_jobs,
            'missing_packages_union': sorted(
                {pkg for report in job_reports for pkg in report['missing_packages']}
            ),
        },
    }

    # Serialize once so the file and stdout carry the same text.
    report_text = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report_text, encoding='utf-8')
    print(report_text)
| 108 | |||
| 109 | |||
| 110 | if __name__ == '__main__': | ||
| 111 | main() |
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `scripts/run_phase1_prereq_audit_live.py` 与 `phase1_prereq_audit_report.json`,把 `/workspace/downloads` 挂载状态、`torch/torchaudio/transformers/speechbrain` 依赖状态与 5 条 Phase-1 jobs 的 readiness 汇总到一份 live 审计报告;当前结果为 `ready_jobs=0`、`blocked_jobs=5`。 | ||
| 3 | - 新增 `scripts/run_embedding_vector_table_negative_matrix_live.py` 与 `embedding_vector_table_negative_matrix_report.json`,在 live PostgreSQL 上补齐 semantic preflight 的三类向量表负例:维度不匹配、未 allowlist、schema 缺表;三类 case 都会稳定落到 `preflight_failed`,且 `vector_table_report.reason` 与预期一致。 | 4 | - 新增 `scripts/run_embedding_vector_table_negative_matrix_live.py` 与 `embedding_vector_table_negative_matrix_report.json`,在 live PostgreSQL 上补齐 semantic preflight 的三类向量表负例:维度不匹配、未 allowlist、schema 缺表;三类 case 都会稳定落到 `preflight_failed`,且 `vector_table_report.reason` 与预期一致。 |
| 4 | - 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。 | 5 | - 新增 `scripts/run_phase1_worker_contract_smoke_live.py` 与 `phase1_worker_contract_smoke_report.json`,把 exact lane 非 dry-run 验证与 semantic preflight matrix 合成一条 live smoke 命令;当前总览结果为 exact=`failed/unreadable_audio_assets`、semantic=`4/4 failed`,说明阻塞点已经收敛到环境挂载与模型 runtime,而不是 worker contract 本身。 |
| 5 | - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。 | 6 | - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。 | ... | ... |
| ... | @@ -872,3 +872,29 @@ cd /workspace/acr-engine | ... | @@ -872,3 +872,29 @@ cd /workspace/acr-engine |
| 872 | 872 | ||
| 873 | - 当前 semantic preflight 已经能够把“运行环境问题”和“配置错误问题”分层暴露 | 873 | - 当前 semantic preflight 已经能够把“运行环境问题”和“配置错误问题”分层暴露 |
| 874 | - 后续只要看 `vector_table_report.reason`,就能快速区分是 DDL/配置错误,还是模型 runtime/音频挂载错误 | 874 | - 后续只要看 `vector_table_report.reason`,就能快速区分是 DDL/配置错误,还是模型 runtime/音频挂载错误 |
| 875 | |||
| 876 | |||
| 877 | ## 新增:Phase-1 prerequisites audit | ||
| 878 | |||
| 879 | 为了避免每次都靠肉眼猜“到底是音频挂载缺失,还是模型 runtime 缺失”,本轮新增: | ||
| 880 | |||
| 881 | - `acr-engine/scripts/run_phase1_prereq_audit_live.py` | ||
| 882 | - `acr-engine/data/pgvector_eval/music20/phase1_prereq_audit_report.json` | ||
| 883 | |||
| 884 | ### 当前审计结果 | ||
| 885 | |||
| 886 | | 指标 | 结果 | | ||
| 887 | |---|---| | ||
| 888 | | `downloads_root_exists` | `false` | | ||
| 889 | | `total_jobs` | `5` | | ||
| 890 | | `ready_jobs` | `0` | | ||
| 891 | | `blocked_jobs` | `5` | | ||
| 892 | | 缺失依赖并集 | `speechbrain`, `torch`, `torchaudio`, `transformers` | | ||
| 893 | |||
| 894 | 按 job 看: | ||
| 895 | |||
| 896 | - `chromaprint`:依赖层面可跑,但被 `/workspace/downloads` 缺失阻塞 | ||
| 897 | - `mert / muq`:同时被 `/workspace/downloads` 缺失与 `torch/torchaudio/transformers` 缺失阻塞 | ||
| 898 | - `ecapa`:同时被 `/workspace/downloads` 缺失与 `torch/torchaudio/speechbrain` 缺失阻塞 | ||
| 899 | |||
| 900 | 这使得“当前为什么跑不通”已经可以通过单份 JSON 报告回答,而不必重新手工试跑。 | ... | ... |
| ... | @@ -195,6 +195,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -195,6 +195,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 195 | - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 | 195 | - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 |
| 196 | - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失 | 196 | - `scripts/run_phase1_worker_contract_smoke_live.py` 已提供一条命令的全局 smoke:当前 exact lane = `failed/unreadable_audio_assets`,semantic lane = `4/4 failed`,共性 blocker 已固化为音频挂载缺失 + 语义模型 runtime 缺失 |
| 197 | - `scripts/run_embedding_vector_table_negative_matrix_live.py` 已在 live PostgreSQL 上补齐 semantic vector-table 负例矩阵:`vector_table_dim_mismatch`、`vector_table_not_allowlisted`、`vector_table_missing_in_schema` 三类错误都能被稳定写入 `vector_table_report.reason` | 197 | - `scripts/run_embedding_vector_table_negative_matrix_live.py` 已在 live PostgreSQL 上补齐 semantic vector-table 负例矩阵:`vector_table_dim_mismatch`、`vector_table_not_allowlisted`、`vector_table_missing_in_schema` 三类错误都能被稳定写入 `vector_table_report.reason` |
| 198 | - `scripts/run_phase1_prereq_audit_live.py` 已给出当前 host 的先决条件审计:`downloads_root_exists=false`、`ready_jobs=0/5`,并把 `torch/torchaudio/transformers/speechbrain` 的缺失状态按 job 落成 JSON 报告 | ||
| 198 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 199 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 199 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 200 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 200 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 201 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment