Commit f13caa3e f13caa3e163e38b3dc795507128a084e42d2edc6 by cnb.bofCdSsphPA

Generate a live execution plan from pending extraction jobs

Constraint: Ralph must keep turning PostgreSQL state into concrete next-step artifacts rather than leaving implied manual steps
Rejected: Stop at creating pending jobs only | It still leaves future sessions to infer ordering and physical targets by hand
Confidence: high
Scope-risk: narrow
Directive: Treat the planner report as the canonical bridge between pending jobs and real extraction workers
Tested: /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-status pending --output data/pgvector_eval/music20/phase1_extraction_plan_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/plan_phase1_extraction_jobs_live.py; git diff --check -- acr-engine/scripts/plan_phase1_extraction_jobs_live.py acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Actual worker that consumes the plan to run MERT/MuQ/Chromaprint extraction end-to-end
1 parent 5be68c1d
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
import argparse
import json
import re
from pathlib import Path
from typing import Any

import psycopg
10
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Default path of the JSON plan report; used as the default for --output.
DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_plan_report.json'

# Sort rank per lane: lower ranks come first in the ordered plan.
# Lanes that are not listed here are ranked 99 by main().
LANE_PRIORITY = {
    'exact': 0,
    'semantic': 1,
    'cover': 2,
}
19
20
def parse_target_scope(target_scope: str) -> dict[str, Any]:
    """Split a ``<type>:<value>`` scope string into its two parts.

    Only the first ``:`` separates type from value, so the value may itself
    contain colons. A string without any ``:`` is reported with the scope
    type ``'unknown'`` and the whole string as its value.
    """
    kind, separator, remainder = target_scope.partition(':')
    if not separator:
        # No colon at all: the type cannot be determined.
        return {'scope_type': 'unknown', 'scope_value': target_scope}
    return {'scope_type': kind, 'scope_value': remainder}
26
27
def redact_dsn(dsn: str) -> str:
    """Return *dsn* with password values replaced by ``***``.

    Handles both connection-string spellings: the URI form
    (``scheme://user:password@host/db``, including ``password=`` /
    ``sslpassword=`` query parameters) and the ``key=value`` form.
    A DSN that carries no password is returned unchanged.
    """
    if '://' in dsn:
        # Mask the password part of the "user:password@" userinfo section.
        masked = re.sub(
            r'^([a-z][a-z0-9+.\-]*://[^:/?#@]*):[^@/?#]*@',
            r'\1:***@',
            dsn,
            flags=re.IGNORECASE,
        )
        # Mask secrets that were passed as URI query parameters instead.
        return re.sub(
            r'([?&](?:ssl)?password=)[^&#]*',
            r'\1***',
            masked,
            flags=re.IGNORECASE,
        )
    # key=value form; the value may be single-quoted and contain spaces.
    return re.sub(
        r"(\b(?:ssl)?password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)",
        r'\1***',
        dsn,
        flags=re.IGNORECASE,
    )


def coerce_metadata(value: Any) -> dict[str, Any]:
    """Normalise a ``metadata_json`` column value to a dict.

    ``None`` and empty values become ``{}``. JSON text (``str``/``bytes``) is
    decoded first, which covers the case where the column is stored as text
    rather than json/jsonb (assumption: the column type is not visible here).

    Raises:
        TypeError: if the value is not a JSON object.
        json.JSONDecodeError: if a textual value is not valid JSON.
    """
    if isinstance(value, (str, bytes, bytearray)):
        value = json.loads(value) if value else None
    if not value:
        return {}
    if not isinstance(value, dict):
        raise TypeError(f'metadata_json must be a JSON object, got {type(value).__name__}')
    return value


def main() -> None:
    """Build the Phase-1 extraction plan report from extraction jobs.

    Reads the jobs whose status equals ``--job-status`` from
    ``feature_extraction_job`` (joined with ``feature_set_registry`` and
    ``model_registry``), orders them by lane priority and then job id, writes
    the resulting plan as JSON to ``--output`` and prints the same JSON to
    stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--job-status', default='pending')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    with psycopg.connect(args.dsn) as conn:
        # Bind the schema as a parameter instead of formatting it into the
        # statement, so a crafted --schema value cannot inject SQL.
        conn.execute(
            "SELECT set_config('search_path', %s, false)",
            (f'{args.schema}, public',),
        )
        rows = conn.execute(
            """
            SELECT
                fej.extraction_job_id,
                fej.feature_set_id,
                fej.target_scope,
                fej.job_status,
                fej.shard_key,
                fej.metadata_json,
                fs.feature_name,
                fs.feature_level,
                fs.extraction_granularity,
                fs.window_sec,
                fs.hop_sec,
                fs.embedding_dim,
                fs.distance_metric,
                mr.model_name,
                mr.model_version,
                mr.model_family,
                mr.output_embedding_dim,
                mr.input_sample_rate,
                mr.default_window_sec,
                mr.default_hop_sec,
                mr.metadata_json
            FROM feature_extraction_job fej
            JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
            JOIN model_registry mr ON mr.model_id = fs.model_id
            WHERE fej.job_status = %s
            ORDER BY fej.extraction_job_id;
            """,
            (args.job_status,),
        ).fetchall()

    # Embedding dimension -> dedicated vector table; other dimensions have none.
    vector_tables = {
        192: 'audio_embedding_vector_192',
        768: 'audio_embedding_vector_768',
    }

    jobs: list[dict[str, Any]] = []
    by_lane: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        # Unpack in SELECT order so each column has a name; columns that the
        # plan does not use are prefixed with an underscore.
        (
            extraction_job_id,
            feature_set_id,
            target_scope,
            job_status,
            shard_key,
            job_metadata_raw,
            feature_name,
            feature_level,
            extraction_granularity,
            window_sec,
            hop_sec,
            embedding_dim,
            distance_metric,
            model_name,
            model_version,
            model_family,
            _output_embedding_dim,
            input_sample_rate,
            _default_window_sec,
            _default_hop_sec,
            model_metadata_raw,
        ) = row

        job_meta = coerce_metadata(job_metadata_raw)
        model_meta = coerce_metadata(model_metadata_raw)
        # The job-level lane wins over the model-level lane.
        lane = job_meta.get('lane') or model_meta.get('lane') or 'unknown'
        physical_target = 'audio_fingerprint' if feature_name == 'fingerprint_asset' else 'audio_embedding'
        vector_table = vector_tables.get(embedding_dim)

        item = {
            'priority_rank': LANE_PRIORITY.get(lane, 99),
            'lane': lane,
            'extraction_job_id': extraction_job_id,
            'feature_set_id': feature_set_id,
            'target_scope': target_scope,
            'scope': parse_target_scope(target_scope),
            'job_status': job_status,
            'shard_key': shard_key,
            'model_name': model_name,
            'model_version': model_version,
            'model_family': model_family,
            'input_sample_rate': input_sample_rate,
            'feature_name': feature_name,
            'feature_level': feature_level,
            'extraction_granularity': extraction_granularity,
            # Converted to float so the values are JSON serialisable.
            'window_sec': float(window_sec) if window_sec is not None else None,
            'hop_sec': float(hop_sec) if hop_sec is not None else None,
            'embedding_dim': embedding_dim,
            'distance_metric': distance_metric,
            'physical_target': physical_target,
            'vector_table': vector_table,
            'job_metadata': job_meta,
            'model_metadata': model_meta,
            'execution_notes': [
                f"run feature extraction for {model_name} {model_version}",
                f"write to {physical_target}" + (f" + {vector_table}" if vector_table else ''),
                f"target scope: {target_scope}",
            ],
        }
        jobs.append(item)
        by_lane.setdefault(lane, []).append(item)

    jobs.sort(key=lambda x: (x['priority_rank'], x['extraction_job_id']))
    for lane_jobs in by_lane.values():
        lane_jobs.sort(key=lambda x: x['extraction_job_id'])

    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, with secrets masked.
        'dsn_redacted': redact_dsn(args.dsn),
        'job_status_filter': args.job_status,
        'counts': {
            'jobs': len(jobs),
            'lanes': {lane: len(items) for lane, items in sorted(by_lane.items())},
        },
        'ordered_jobs': jobs,
        'by_lane': by_lane,
        'execution_order_summary': [
            {
                'order': order,
                'extraction_job_id': job['extraction_job_id'],
                'lane': job['lane'],
                'model_name': job['model_name'],
                'feature_name': job['feature_name'],
                'window_sec': job['window_sec'],
                'hop_sec': job['hop_sec'],
                'physical_target': job['physical_target'],
            }
            for order, job in enumerate(jobs, start=1)
        ],
    }

    # Serialise once and reuse the text for both the file and stdout.
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。
3 - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py``acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 4 - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py``acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。
4 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。 5 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。
5 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 6 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。
......
...@@ -344,3 +344,56 @@ cd /workspace/acr-engine ...@@ -344,3 +344,56 @@ cd /workspace/acr-engine
344 这意味着: 344 这意味着:
345 345
346 > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。 346 > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。
347
348 ---
349
350 ## 10. Phase-1 extraction plan(从 pending jobs 生成)
351
352 在 `feature_extraction_job` 已经存在后,下一步通常不是马上手敲命令,而是先从 PostgreSQL 生成一个**统一执行计划**。
353
354 本仓库现在已经提供:
355
356 - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py`
357
358 用途:
359 - 读取 `feature_extraction_job`
360 - 过滤 `job_status=pending`
361 - 联表 `feature_set_registry + model_registry`
362 - 生成按 lane / priority 排序的 execution plan
363
364 ### 10.1 执行命令
365
366 ```bash
367 cd /workspace/acr-engine
368 /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py \
369 --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \
370 --schema acr_test \
371 --job-status pending \
372 --output data/pgvector_eval/music20/phase1_extraction_plan_report.json
373 ```
374
375 ### 10.2 当前已验证结果(acr_test)
376
377 本轮已真实生成一份 ordered execution plan:
378
379 | order | lane | model | feature | physical_target |
380 |---|---|---|---|---|
381 | 1 | `exact` | `chromaprint` | `fingerprint_asset` | `audio_fingerprint` |
382 | 2 | `semantic` | `mert` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
383 | 3 | `semantic` | `mert` | `semantic_embedding 10s/5s` | `audio_embedding` |
384 | 4 | `semantic` | `muq` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
385 | 5 | `semantic` | `ecapa` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
386
387 其中 planner 还会自动给出:
388 - `vector_table`
389 - `audio_embedding_vector_768`
390 - `audio_embedding_vector_192`
391 - `target_scope`
392 - `execution_notes`
393
394 当前产物:
395 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`
396
397 结论:
398
399 > 现在 PostgreSQL 里已经不仅能描述“有哪些 job”,还可以直接生成**按执行顺序排好的抽特征计划**。
......
...@@ -68,6 +68,7 @@ ...@@ -68,6 +68,7 @@
68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | 68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` |
69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | 69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` |
70 | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | 70 | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` |
71 | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` |
71 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | 72 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` |
72 73
73 --- 74 ---
...@@ -416,6 +417,19 @@ flowchart LR ...@@ -416,6 +417,19 @@ flowchart LR
416 对应 live 报告: 417 对应 live 报告:
417 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` 418 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`
418 419
420 ### 本轮继续新增:pending jobs 已可生成 live execution plan
421
422 在 extraction jobs 之后,本轮又新增:
423
424 - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py`
425
426 它已经在 `acr_test` schema 上真实读取 5 条 `pending` jobs,并生成按执行顺序排列的 plan:
427 - `chromaprint exact lane` 优先
428 - 然后是 `mert / muq / ecapa` 的 semantic lanes
429
430 对应 live 报告:
431 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`
432
419 ### 路线 1:继续做 PostgreSQL 工程化 433 ### 路线 1:继续做 PostgreSQL 工程化
420 434
421 1. 把 `live_pgvector_music20_eval.py` 泛化成: 435 1. 把 `live_pgvector_music20_eval.py` 泛化成:
......
...@@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` 184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1`
185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变 185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变
186 - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 186 - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动
187 - PostgreSQL `acr_test` schema 上已真实生成 Phase-1 extraction execution plan,当前顺序是 `chromaprint -> mert -> mert-long -> muq -> ecapa`
187 188
188 ### 未验证 / 仍是缺口 189 ### 未验证 / 仍是缺口
189 - **未实际跑 MERT / MuQ encoder-only 特征抽取** 190 - **未实际跑 MERT / MuQ encoder-only 特征抽取**
......