Commit 5be68c1d 5be68c1d9db63369538b4ae82dc0aecb72ff6a6c by cnb.bofCdSsphPA

Create live Phase-1 extraction jobs in PostgreSQL

Constraint: Continue Phase-1 industrialization without waiting on missing audio mounts, and keep every Ralph step documented and pushed
Rejected: Leave extraction scheduling as an implicit next step after registry bootstrap | It forces future sessions to reconstruct pending jobs by hand
Confidence: high
Scope-risk: narrow
Directive: Use feature_extraction_job as the canonical handoff between registry bootstrap and actual encoder extraction runs
Tested: /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_extraction_jobs_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/bootstrap_phase1_extraction_jobs_live.py; git diff --check -- acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Downstream worker that consumes these pending jobs to run real MERT/MuQ extraction
1 parent f0c82687
1 {
2 "schema": "acr_test",
3 "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
4 "jobs": [
5 {
6 "extraction_job_id": 1,
7 "feature_set_id": 2,
8 "model_name": "chromaprint",
9 "model_version": "v1",
10 "feature_name": "fingerprint_asset",
11 "window_sec": 5.0,
12 "hop_sec": 2.5,
13 "target_scope": "reference_set:phase1_hot_reference_v1",
14 "job_status": "pending",
15 "operation": "inserted"
16 },
17 {
18 "extraction_job_id": 2,
19 "feature_set_id": 3,
20 "model_name": "mert",
21 "model_version": "v1-95m",
22 "feature_name": "semantic_embedding",
23 "window_sec": 5.0,
24 "hop_sec": 2.5,
25 "target_scope": "reference_set:phase1_hot_reference_v1",
26 "job_status": "pending",
27 "operation": "inserted"
28 },
29 {
30 "extraction_job_id": 3,
31 "feature_set_id": 4,
32 "model_name": "mert",
33 "model_version": "v1-95m",
34 "feature_name": "semantic_embedding",
35 "window_sec": 10.0,
36 "hop_sec": 5.0,
37 "target_scope": "reference_set:phase1_hot_reference_v1",
38 "job_status": "pending",
39 "operation": "inserted"
40 },
41 {
42 "extraction_job_id": 4,
43 "feature_set_id": 5,
44 "model_name": "muq",
45 "model_version": "large-msd-iter",
46 "feature_name": "semantic_embedding",
47 "window_sec": 5.0,
48 "hop_sec": 2.5,
49 "target_scope": "reference_set:phase1_hot_reference_v1",
50 "job_status": "pending",
51 "operation": "inserted"
52 },
53 {
54 "extraction_job_id": 5,
55 "feature_set_id": 6,
56 "model_name": "ecapa",
57 "model_version": "acr-baseline-v1",
58 "feature_name": "semantic_embedding",
59 "window_sec": 5.0,
60 "hop_sec": 2.5,
61 "target_scope": "reference_set:phase1_hot_reference_v1",
62 "job_status": "pending",
63 "operation": "inserted"
64 }
65 ],
66 "counts": {
67 "feature_extraction_job": 5,
68 "pending_jobs": 5
69 }
70 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 from pathlib import Path
7 from typing import Any
8
9 import psycopg
10
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Report path used when --output is not given on the command line.
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'phase1_extraction_jobs_report.json',
)
13
def _phase1_job(
    model_name: str,
    model_version: str,
    feature_name: str,
    window_sec: float,
    hop_sec: float,
    shard_key: str,
    metadata_json: dict,
) -> dict:
    """Build one extraction-job spec.

    Every Phase-1 spec targets the same reference set and starts out as
    ``pending``; only the model, windowing, shard key and metadata vary.
    """
    return {
        'model_name': model_name,
        'model_version': model_version,
        'feature_name': feature_name,
        'window_sec': window_sec,
        'hop_sec': hop_sec,
        'target_scope': 'reference_set:phase1_hot_reference_v1',
        'job_status': 'pending',
        'shard_key': shard_key,
        'metadata_json': metadata_json,
    }


# One entry per extraction job that the bootstrap ensures exists.
JOB_SPECS = [
    _phase1_job(
        'chromaprint', 'v1', 'fingerprint_asset', 5.0, 2.5,
        'phase1/reference/chromaprint/v1',
        {'lane': 'exact', 'phase': 'phase1', 'priority': 'p0'},
    ),
    _phase1_job(
        'mert', 'v1-95m', 'semantic_embedding', 5.0, 2.5,
        'phase1/reference/mert/v1-95m/5s_2.5s',
        {'lane': 'semantic', 'role': 'primary_baseline', 'phase': 'phase1'},
    ),
    _phase1_job(
        'mert', 'v1-95m', 'semantic_embedding', 10.0, 5.0,
        'phase1/reference/mert/v1-95m/10s_5s',
        {'lane': 'semantic', 'role': 'long_context_validation', 'phase': 'phase1'},
    ),
    _phase1_job(
        'muq', 'large-msd-iter', 'semantic_embedding', 5.0, 2.5,
        'phase1/reference/muq/large-msd-iter/5s_2.5s',
        {'lane': 'semantic', 'role': 'challenger', 'phase': 'phase1'},
    ),
    _phase1_job(
        'ecapa', 'acr-baseline-v1', 'semantic_embedding', 5.0, 2.5,
        'phase1/reference/ecapa/acr-baseline-v1/5s_2.5s',
        {'lane': 'semantic', 'role': 'historical_baseline', 'phase': 'phase1'},
    ),
]
71
72
def resolve_feature_set_id(conn: psycopg.Connection, job: dict[str, Any]) -> int:
    """Return the ``feature_set_id`` that a job spec refers to.

    The feature set is looked up in ``feature_set_registry`` joined to
    ``model_registry`` by model name, model version, feature name, window and
    hop. A NULL window/hop in the registry matches ``None`` in the spec (both
    sides are coalesced to ``-1``). When several rows match, the one with the
    lowest ``feature_set_id`` is used.

    Raises:
        RuntimeError: if no registry row matches the spec.
    """
    lookup_sql = """
        SELECT fs.feature_set_id
        FROM feature_set_registry fs
        JOIN model_registry mr ON mr.model_id = fs.model_id
        WHERE mr.model_name = %s
          AND mr.model_version = %s
          AND fs.feature_name = %s
          AND coalesce(fs.window_sec, -1) = coalesce(%s, -1)
          AND coalesce(fs.hop_sec, -1) = coalesce(%s, -1)
        ORDER BY fs.feature_set_id
        LIMIT 1;
        """
    # Placeholder order in the query mirrors this key order.
    lookup_keys = ('model_name', 'model_version', 'feature_name', 'window_sec', 'hop_sec')
    lookup_params = tuple(job[key] for key in lookup_keys)

    match = conn.execute(lookup_sql, lookup_params).fetchone()
    if match:
        return int(match[0])

    raise RuntimeError(
        f"Feature set not found for {job['model_name']} {job['model_version']} {job['feature_name']} {job['window_sec']}/{job['hop_sec']}"
    )
100
101
def ensure_job(conn: psycopg.Connection, feature_set_id: int, job: dict[str, Any]) -> tuple[int, str]:
    """Make sure a ``feature_extraction_job`` row exists for the given spec.

    A job is identified by ``(feature_set_id, target_scope, shard_key)``; a
    NULL shard key is treated the same as an empty string. If such a row
    exists (lowest ``extraction_job_id`` wins), its ``job_status`` and
    ``metadata_json`` are overwritten with the spec's values. Otherwise a new
    row is inserted with the count, timestamp and log columns left NULL.

    Returns:
        ``(extraction_job_id, operation)`` where ``operation`` is ``'reused'``
        for an existing row and ``'inserted'`` for a new one.
    """
    find_sql = """
        SELECT extraction_job_id
        FROM feature_extraction_job
        WHERE feature_set_id = %s
          AND target_scope = %s
          AND coalesce(shard_key, '') = coalesce(%s, '')
        ORDER BY extraction_job_id
        LIMIT 1;
        """
    found = conn.execute(
        find_sql,
        (feature_set_id, job['target_scope'], job['shard_key']),
    ).fetchone()
    metadata_payload = json.dumps(job['metadata_json'])

    if not found:
        insert_sql = """
            INSERT INTO feature_extraction_job (
                feature_set_id, target_scope, job_status, shard_key,
                input_count, output_count, started_at, finished_at,
                log_uri, metadata_json
            ) VALUES (
                %s, %s, %s, %s,
                NULL, NULL, NULL, NULL,
                NULL, %s::jsonb
            )
            RETURNING extraction_job_id;
            """
        created = conn.execute(
            insert_sql,
            (feature_set_id, job['target_scope'], job['job_status'], job['shard_key'], metadata_payload),
        ).fetchone()
        return int(created[0]), 'inserted'

    # NOTE(review): this overwrites the stored status with the spec's status
    # whatever the row currently holds -- confirm that is intended once a
    # worker starts updating job_status.
    update_sql = """
        UPDATE feature_extraction_job
        SET job_status = %s,
            metadata_json = %s::jsonb
        WHERE extraction_job_id = %s;
        """
    conn.execute(update_sql, (job['job_status'], metadata_payload, found[0]))
    return int(found[0]), 'reused'
143
144
def redact_dsn(dsn: str) -> str:
    """Return *dsn* with any password replaced by ``***``.

    Handles both connection URIs (``postgres://user:secret@host/db``,
    including a ``password=`` query parameter) and keyword/value strings
    (``host=... password=secret``). A DSN without a password is returned
    with its content unchanged.
    """
    # Imported here so the helper does not depend on the file's import block.
    import re
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(dsn)
    if parts.scheme in ('postgres', 'postgresql'):
        # Split on the LAST '@' so a password containing '@' stays in userinfo.
        userinfo, at, hostinfo = parts.netloc.rpartition('@')
        if at:
            user, colon, _secret = userinfo.partition(':')
            if colon:
                parts = parts._replace(netloc=f'{user}:***@{hostinfo}')
        masked_query = re.sub(r'(?i)((?:^|&)password=)[^&]*', r'\g<1>***', parts.query)
        return urlunsplit(parts._replace(query=masked_query))

    # Keyword/value form; the value may be single-quoted and contain spaces.
    return re.sub(r"(?i)(\bpassword\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\g<1>***', dsn)


def main() -> None:
    """Ensure every job in ``JOB_SPECS`` exists, then write and print a report.

    Connects to PostgreSQL with ``--dsn`` in autocommit mode, points
    ``search_path`` at ``--schema`` (then ``public``), resolves the feature
    set for each spec, creates or refreshes its ``feature_extraction_job``
    row, and finally writes a JSON summary to ``--output`` and stdout.

    Raises:
        RuntimeError: if a spec has no matching feature set in the registry.
    """
    # psycopg is already a dependency of this script; sql is only needed here.
    from psycopg import sql

    ap = argparse.ArgumentParser(
        description='Create or refresh the Phase-1 feature_extraction_job rows.',
    )
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    summary: dict[str, Any] = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report never misstates
        # which database was touched and never contains the password.
        'dsn_redacted': redact_dsn(args.dsn),
        'jobs': [],
    }
    with psycopg.connect(args.dsn, autocommit=True) as conn:
        # Quote the schema as an identifier instead of pasting it into SQL.
        # Quoting makes the name case-sensitive; the default is all lowercase.
        conn.execute(
            sql.SQL('SET search_path TO {}, public;').format(sql.Identifier(args.schema))
        )
        for job in JOB_SPECS:
            feature_set_id = resolve_feature_set_id(conn, job)
            extraction_job_id, operation = ensure_job(conn, feature_set_id, job)
            summary['jobs'].append({
                'extraction_job_id': extraction_job_id,
                'feature_set_id': feature_set_id,
                'model_name': job['model_name'],
                'model_version': job['model_version'],
                'feature_name': job['feature_name'],
                'window_sec': job['window_sec'],
                'hop_sec': job['hop_sec'],
                'target_scope': job['target_scope'],
                'job_status': job['job_status'],
                'operation': operation,
            })
        # Counts are read back from the table, so they include rows that
        # were not created by this script.
        summary['counts'] = {
            'feature_extraction_job': int(conn.execute('SELECT count(*) FROM feature_extraction_job;').fetchone()[0]),
            'pending_jobs': int(conn.execute("SELECT count(*) FROM feature_extraction_job WHERE job_status = 'pending';").fetchone()[0]),
        }

    report_text = json.dumps(summary, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report_text, encoding='utf-8')
    print(report_text)


if __name__ == '__main__':
    main()
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。
3 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。 4 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。
4 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 5 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。
5 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。 6 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。
......
...@@ -300,3 +300,47 @@ cd /workspace/acr-engine ...@@ -300,3 +300,47 @@ cd /workspace/acr-engine
300 结论: 300 结论:
301 301
302 > 当前 bootstrap 脚本可重复执行,不会把 Phase-1 registry 数据重复灌爆。 302 > 当前 bootstrap 脚本可重复执行,不会把 Phase-1 registry 数据重复灌爆。
303
304 ---
305
306 ## 9. Phase-1 extraction job bootstrap
307
308 `model_registry / feature_set_registry / reference_set_registry` 都已经存在后,下一步不是立刻手工跑抽特征,而是先把 **待执行 job** 写到 `feature_extraction_job` 表。
309
310 本仓库现在已经提供:
311
312 - `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py`
313
314 用途:
315 - 根据已存在的 `feature_set_registry`
316 - 为 `phase1_hot_reference_v1` 生成待执行 extraction jobs
317 - 把 Phase-1 的 exact / semantic lanes 统一放进 PostgreSQL job 表
318
319 ### 9.1 执行命令
320
321 ```bash
322 cd /workspace/acr-engine
323 /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_extraction_jobs_live.py \
324 --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \
325 --schema acr_test \
326 --output data/pgvector_eval/music20/phase1_extraction_jobs_report.json
327 ```
328
329 ### 9.2 当前已验证结果(acr_test)
330
331 本轮已真实创建 5 条待执行 job:
332
333 | lane | model | feature | target_scope | status |
334 |---|---|---|---|---|
335 | exact | `chromaprint` | `fingerprint_asset` | `reference_set:phase1_hot_reference_v1` | `pending` |
336 | semantic | `mert` | `semantic_embedding` 5s/2.5s | `reference_set:phase1_hot_reference_v1` | `pending` |
337 | semantic | `mert` | `semantic_embedding` 10s/5s | `reference_set:phase1_hot_reference_v1` | `pending` |
338 | semantic | `muq` | `semantic_embedding` 5s/2.5s | `reference_set:phase1_hot_reference_v1` | `pending` |
339 | semantic | `ecapa` | `semantic_embedding` 5s/2.5s | `reference_set:phase1_hot_reference_v1` | `pending` |
340
341 对应 live 报告:
342 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`
343
344 这意味着:
345
346 > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。
......
...@@ -67,6 +67,7 @@ ...@@ -67,6 +67,7 @@
67 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` | 67 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` |
68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | 68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` |
69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | 69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` |
70 | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` |
70 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | 71 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` |
71 72
72 --- 73 ---
...@@ -399,6 +400,22 @@ flowchart LR ...@@ -399,6 +400,22 @@ flowchart LR
399 对应 live 报告: 400 对应 live 报告:
400 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` 401 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`
401 402
403 ### 本轮继续新增:Phase-1 extraction jobs 已可 live bootstrap
404
405 在 registry bootstrap 之后,本轮又新增:
406
407 - `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py`
408
409 它已经在 `acr_test` schema 上真实创建了 5 条 `feature_extraction_job`:
410 - `chromaprint`
411 - `mert 5s/2.5s`
412 - `mert 10s/5s`
413 - `muq 5s/2.5s`
414 - `ecapa 5s/2.5s`
415
416 对应 live 报告:
417 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`
418
402 ### 路线 1:继续做 PostgreSQL 工程化 419 ### 路线 1:继续做 PostgreSQL 工程化
403 420
404 1.`live_pgvector_music20_eval.py` 泛化成: 421 1.`live_pgvector_music20_eval.py` 泛化成:
......
...@@ -183,6 +183,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -183,6 +183,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
183 - 机械校验已补齐:`live_pgvector_music20_eval.py``py_compile` 通过,相关变更 `diff --check` 通过 183 - 机械校验已补齐:`live_pgvector_music20_eval.py``py_compile` 通过,相关变更 `diff --check` 通过
184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` 184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1`
185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变 185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变
186 - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动
186 187
187 ### 未验证 / 仍是缺口 188 ### 未验证 / 仍是缺口
188 - **未实际跑 MERT / MuQ encoder-only 特征抽取** 189 - **未实际跑 MERT / MuQ encoder-only 特征抽取**
......