Commit f13caa3e f13caa3e163e38b3dc795507128a084e42d2edc6 by cnb.bofCdSsphPA

Generate a live execution plan from pending extraction jobs

Constraint: Ralph must keep turning PostgreSQL state into concrete next-step artifacts rather than leaving implied manual steps
Rejected: Stop at creating pending jobs only | It still leaves future sessions to infer ordering and physical targets by hand
Confidence: high
Scope-risk: narrow
Directive: Treat the planner report as the canonical bridge between pending jobs and real extraction workers
Tested: /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-status pending --output data/pgvector_eval/music20/phase1_extraction_plan_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/plan_phase1_extraction_jobs_live.py; git diff --check -- acr-engine/scripts/plan_phase1_extraction_jobs_live.py acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Actual worker that consumes the plan to run MERT/MuQ/Chromaprint extraction end-to-end
1 parent 5be68c1d
1 {
2 "schema": "acr_test",
3 "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
4 "job_status_filter": "pending",
5 "counts": {
6 "jobs": 5,
7 "lanes": {
8 "exact": 1,
9 "semantic": 4
10 }
11 },
12 "ordered_jobs": [
13 {
14 "priority_rank": 0,
15 "lane": "exact",
16 "extraction_job_id": 1,
17 "feature_set_id": 2,
18 "target_scope": "reference_set:phase1_hot_reference_v1",
19 "scope": {
20 "scope_type": "reference_set",
21 "scope_value": "phase1_hot_reference_v1"
22 },
23 "job_status": "pending",
24 "shard_key": "phase1/reference/chromaprint/v1",
25 "model_name": "chromaprint",
26 "model_version": "v1",
27 "model_family": "fingerprint",
28 "input_sample_rate": 16000,
29 "feature_name": "fingerprint_asset",
30 "feature_level": "asset",
31 "extraction_granularity": "full_asset",
32 "window_sec": 5.0,
33 "hop_sec": 2.5,
34 "embedding_dim": null,
35 "distance_metric": "hamming",
36 "physical_target": "audio_fingerprint",
37 "vector_table": null,
38 "job_metadata": {
39 "lane": "exact",
40 "phase": "phase1",
41 "priority": "p0"
42 },
43 "model_metadata": {
44 "lane": "exact",
45 "note": "exact fingerprint lane baseline",
46 "phase": "phase1"
47 },
48 "execution_notes": [
49 "run feature extraction for chromaprint v1",
50 "write to audio_fingerprint",
51 "target scope: reference_set:phase1_hot_reference_v1"
52 ]
53 },
54 {
55 "priority_rank": 1,
56 "lane": "semantic",
57 "extraction_job_id": 2,
58 "feature_set_id": 3,
59 "target_scope": "reference_set:phase1_hot_reference_v1",
60 "scope": {
61 "scope_type": "reference_set",
62 "scope_value": "phase1_hot_reference_v1"
63 },
64 "job_status": "pending",
65 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
66 "model_name": "mert",
67 "model_version": "v1-95m",
68 "model_family": "music_ssl",
69 "input_sample_rate": 24000,
70 "feature_name": "semantic_embedding",
71 "feature_level": "window",
72 "extraction_granularity": "sliding_window",
73 "window_sec": 5.0,
74 "hop_sec": 2.5,
75 "embedding_dim": 768,
76 "distance_metric": "cosine",
77 "physical_target": "audio_embedding",
78 "vector_table": "audio_embedding_vector_768",
79 "job_metadata": {
80 "lane": "semantic",
81 "role": "primary_baseline",
82 "phase": "phase1"
83 },
84 "model_metadata": {
85 "lane": "semantic",
86 "role": "primary_baseline",
87 "phase": "phase1"
88 },
89 "execution_notes": [
90 "run feature extraction for mert v1-95m",
91 "write to audio_embedding + audio_embedding_vector_768",
92 "target scope: reference_set:phase1_hot_reference_v1"
93 ]
94 },
95 {
96 "priority_rank": 1,
97 "lane": "semantic",
98 "extraction_job_id": 3,
99 "feature_set_id": 4,
100 "target_scope": "reference_set:phase1_hot_reference_v1",
101 "scope": {
102 "scope_type": "reference_set",
103 "scope_value": "phase1_hot_reference_v1"
104 },
105 "job_status": "pending",
106 "shard_key": "phase1/reference/mert/v1-95m/10s_5s",
107 "model_name": "mert",
108 "model_version": "v1-95m",
109 "model_family": "music_ssl",
110 "input_sample_rate": 24000,
111 "feature_name": "semantic_embedding",
112 "feature_level": "window",
113 "extraction_granularity": "sliding_window",
114 "window_sec": 10.0,
115 "hop_sec": 5.0,
116 "embedding_dim": 768,
117 "distance_metric": "cosine",
118 "physical_target": "audio_embedding",
119 "vector_table": "audio_embedding_vector_768",
120 "job_metadata": {
121 "lane": "semantic",
122 "role": "long_context_validation",
123 "phase": "phase1"
124 },
125 "model_metadata": {
126 "lane": "semantic",
127 "role": "primary_baseline",
128 "phase": "phase1"
129 },
130 "execution_notes": [
131 "run feature extraction for mert v1-95m",
132 "write to audio_embedding + audio_embedding_vector_768",
133 "target scope: reference_set:phase1_hot_reference_v1"
134 ]
135 },
136 {
137 "priority_rank": 1,
138 "lane": "semantic",
139 "extraction_job_id": 4,
140 "feature_set_id": 5,
141 "target_scope": "reference_set:phase1_hot_reference_v1",
142 "scope": {
143 "scope_type": "reference_set",
144 "scope_value": "phase1_hot_reference_v1"
145 },
146 "job_status": "pending",
147 "shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s",
148 "model_name": "muq",
149 "model_version": "large-msd-iter",
150 "model_family": "music_ssl",
151 "input_sample_rate": 24000,
152 "feature_name": "semantic_embedding",
153 "feature_level": "window",
154 "extraction_granularity": "sliding_window",
155 "window_sec": 5.0,
156 "hop_sec": 2.5,
157 "embedding_dim": 768,
158 "distance_metric": "cosine",
159 "physical_target": "audio_embedding",
160 "vector_table": "audio_embedding_vector_768",
161 "job_metadata": {
162 "lane": "semantic",
163 "role": "challenger",
164 "phase": "phase1"
165 },
166 "model_metadata": {
167 "lane": "semantic",
168 "role": "challenger",
169 "phase": "phase1"
170 },
171 "execution_notes": [
172 "run feature extraction for muq large-msd-iter",
173 "write to audio_embedding + audio_embedding_vector_768",
174 "target scope: reference_set:phase1_hot_reference_v1"
175 ]
176 },
177 {
178 "priority_rank": 1,
179 "lane": "semantic",
180 "extraction_job_id": 5,
181 "feature_set_id": 6,
182 "target_scope": "reference_set:phase1_hot_reference_v1",
183 "scope": {
184 "scope_type": "reference_set",
185 "scope_value": "phase1_hot_reference_v1"
186 },
187 "job_status": "pending",
188 "shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s",
189 "model_name": "ecapa",
190 "model_version": "acr-baseline-v1",
191 "model_family": "speech_derived",
192 "input_sample_rate": 16000,
193 "feature_name": "semantic_embedding",
194 "feature_level": "window",
195 "extraction_granularity": "sliding_window",
196 "window_sec": 5.0,
197 "hop_sec": 2.5,
198 "embedding_dim": 192,
199 "distance_metric": "cosine",
200 "physical_target": "audio_embedding",
201 "vector_table": "audio_embedding_vector_192",
202 "job_metadata": {
203 "lane": "semantic",
204 "role": "historical_baseline",
205 "phase": "phase1"
206 },
207 "model_metadata": {
208 "lane": "semantic",
209 "role": "historical_baseline",
210 "phase": "phase1"
211 },
212 "execution_notes": [
213 "run feature extraction for ecapa acr-baseline-v1",
214 "write to audio_embedding + audio_embedding_vector_192",
215 "target scope: reference_set:phase1_hot_reference_v1"
216 ]
217 }
218 ],
219 "by_lane": {
220 "exact": [
221 {
222 "priority_rank": 0,
223 "lane": "exact",
224 "extraction_job_id": 1,
225 "feature_set_id": 2,
226 "target_scope": "reference_set:phase1_hot_reference_v1",
227 "scope": {
228 "scope_type": "reference_set",
229 "scope_value": "phase1_hot_reference_v1"
230 },
231 "job_status": "pending",
232 "shard_key": "phase1/reference/chromaprint/v1",
233 "model_name": "chromaprint",
234 "model_version": "v1",
235 "model_family": "fingerprint",
236 "input_sample_rate": 16000,
237 "feature_name": "fingerprint_asset",
238 "feature_level": "asset",
239 "extraction_granularity": "full_asset",
240 "window_sec": 5.0,
241 "hop_sec": 2.5,
242 "embedding_dim": null,
243 "distance_metric": "hamming",
244 "physical_target": "audio_fingerprint",
245 "vector_table": null,
246 "job_metadata": {
247 "lane": "exact",
248 "phase": "phase1",
249 "priority": "p0"
250 },
251 "model_metadata": {
252 "lane": "exact",
253 "note": "exact fingerprint lane baseline",
254 "phase": "phase1"
255 },
256 "execution_notes": [
257 "run feature extraction for chromaprint v1",
258 "write to audio_fingerprint",
259 "target scope: reference_set:phase1_hot_reference_v1"
260 ]
261 }
262 ],
263 "semantic": [
264 {
265 "priority_rank": 1,
266 "lane": "semantic",
267 "extraction_job_id": 2,
268 "feature_set_id": 3,
269 "target_scope": "reference_set:phase1_hot_reference_v1",
270 "scope": {
271 "scope_type": "reference_set",
272 "scope_value": "phase1_hot_reference_v1"
273 },
274 "job_status": "pending",
275 "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s",
276 "model_name": "mert",
277 "model_version": "v1-95m",
278 "model_family": "music_ssl",
279 "input_sample_rate": 24000,
280 "feature_name": "semantic_embedding",
281 "feature_level": "window",
282 "extraction_granularity": "sliding_window",
283 "window_sec": 5.0,
284 "hop_sec": 2.5,
285 "embedding_dim": 768,
286 "distance_metric": "cosine",
287 "physical_target": "audio_embedding",
288 "vector_table": "audio_embedding_vector_768",
289 "job_metadata": {
290 "lane": "semantic",
291 "role": "primary_baseline",
292 "phase": "phase1"
293 },
294 "model_metadata": {
295 "lane": "semantic",
296 "role": "primary_baseline",
297 "phase": "phase1"
298 },
299 "execution_notes": [
300 "run feature extraction for mert v1-95m",
301 "write to audio_embedding + audio_embedding_vector_768",
302 "target scope: reference_set:phase1_hot_reference_v1"
303 ]
304 },
305 {
306 "priority_rank": 1,
307 "lane": "semantic",
308 "extraction_job_id": 3,
309 "feature_set_id": 4,
310 "target_scope": "reference_set:phase1_hot_reference_v1",
311 "scope": {
312 "scope_type": "reference_set",
313 "scope_value": "phase1_hot_reference_v1"
314 },
315 "job_status": "pending",
316 "shard_key": "phase1/reference/mert/v1-95m/10s_5s",
317 "model_name": "mert",
318 "model_version": "v1-95m",
319 "model_family": "music_ssl",
320 "input_sample_rate": 24000,
321 "feature_name": "semantic_embedding",
322 "feature_level": "window",
323 "extraction_granularity": "sliding_window",
324 "window_sec": 10.0,
325 "hop_sec": 5.0,
326 "embedding_dim": 768,
327 "distance_metric": "cosine",
328 "physical_target": "audio_embedding",
329 "vector_table": "audio_embedding_vector_768",
330 "job_metadata": {
331 "lane": "semantic",
332 "role": "long_context_validation",
333 "phase": "phase1"
334 },
335 "model_metadata": {
336 "lane": "semantic",
337 "role": "primary_baseline",
338 "phase": "phase1"
339 },
340 "execution_notes": [
341 "run feature extraction for mert v1-95m",
342 "write to audio_embedding + audio_embedding_vector_768",
343 "target scope: reference_set:phase1_hot_reference_v1"
344 ]
345 },
346 {
347 "priority_rank": 1,
348 "lane": "semantic",
349 "extraction_job_id": 4,
350 "feature_set_id": 5,
351 "target_scope": "reference_set:phase1_hot_reference_v1",
352 "scope": {
353 "scope_type": "reference_set",
354 "scope_value": "phase1_hot_reference_v1"
355 },
356 "job_status": "pending",
357 "shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s",
358 "model_name": "muq",
359 "model_version": "large-msd-iter",
360 "model_family": "music_ssl",
361 "input_sample_rate": 24000,
362 "feature_name": "semantic_embedding",
363 "feature_level": "window",
364 "extraction_granularity": "sliding_window",
365 "window_sec": 5.0,
366 "hop_sec": 2.5,
367 "embedding_dim": 768,
368 "distance_metric": "cosine",
369 "physical_target": "audio_embedding",
370 "vector_table": "audio_embedding_vector_768",
371 "job_metadata": {
372 "lane": "semantic",
373 "role": "challenger",
374 "phase": "phase1"
375 },
376 "model_metadata": {
377 "lane": "semantic",
378 "role": "challenger",
379 "phase": "phase1"
380 },
381 "execution_notes": [
382 "run feature extraction for muq large-msd-iter",
383 "write to audio_embedding + audio_embedding_vector_768",
384 "target scope: reference_set:phase1_hot_reference_v1"
385 ]
386 },
387 {
388 "priority_rank": 1,
389 "lane": "semantic",
390 "extraction_job_id": 5,
391 "feature_set_id": 6,
392 "target_scope": "reference_set:phase1_hot_reference_v1",
393 "scope": {
394 "scope_type": "reference_set",
395 "scope_value": "phase1_hot_reference_v1"
396 },
397 "job_status": "pending",
398 "shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s",
399 "model_name": "ecapa",
400 "model_version": "acr-baseline-v1",
401 "model_family": "speech_derived",
402 "input_sample_rate": 16000,
403 "feature_name": "semantic_embedding",
404 "feature_level": "window",
405 "extraction_granularity": "sliding_window",
406 "window_sec": 5.0,
407 "hop_sec": 2.5,
408 "embedding_dim": 192,
409 "distance_metric": "cosine",
410 "physical_target": "audio_embedding",
411 "vector_table": "audio_embedding_vector_192",
412 "job_metadata": {
413 "lane": "semantic",
414 "role": "historical_baseline",
415 "phase": "phase1"
416 },
417 "model_metadata": {
418 "lane": "semantic",
419 "role": "historical_baseline",
420 "phase": "phase1"
421 },
422 "execution_notes": [
423 "run feature extraction for ecapa acr-baseline-v1",
424 "write to audio_embedding + audio_embedding_vector_192",
425 "target scope: reference_set:phase1_hot_reference_v1"
426 ]
427 }
428 ]
429 },
430 "execution_order_summary": [
431 {
432 "order": 1,
433 "extraction_job_id": 1,
434 "lane": "exact",
435 "model_name": "chromaprint",
436 "feature_name": "fingerprint_asset",
437 "window_sec": 5.0,
438 "hop_sec": 2.5,
439 "physical_target": "audio_fingerprint"
440 },
441 {
442 "order": 2,
443 "extraction_job_id": 2,
444 "lane": "semantic",
445 "model_name": "mert",
446 "feature_name": "semantic_embedding",
447 "window_sec": 5.0,
448 "hop_sec": 2.5,
449 "physical_target": "audio_embedding"
450 },
451 {
452 "order": 3,
453 "extraction_job_id": 3,
454 "lane": "semantic",
455 "model_name": "mert",
456 "feature_name": "semantic_embedding",
457 "window_sec": 10.0,
458 "hop_sec": 5.0,
459 "physical_target": "audio_embedding"
460 },
461 {
462 "order": 4,
463 "extraction_job_id": 4,
464 "lane": "semantic",
465 "model_name": "muq",
466 "feature_name": "semantic_embedding",
467 "window_sec": 5.0,
468 "hop_sec": 2.5,
469 "physical_target": "audio_embedding"
470 },
471 {
472 "order": 5,
473 "extraction_job_id": 5,
474 "lane": "semantic",
475 "model_name": "ecapa",
476 "feature_name": "semantic_embedding",
477 "window_sec": 5.0,
478 "hop_sec": 2.5,
479 "physical_target": "audio_embedding"
480 }
481 ]
482 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 from pathlib import Path
7 from typing import Any
8
9 import psycopg
10
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Default path of the generated plan report; can be overridden with --output.
DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_plan_report.json'

# Scheduling rank per lane: lower values sort earlier in the plan.
# Lanes that are not listed here are ranked 99 by main().
LANE_PRIORITY = {
    'exact': 0,
    'semantic': 1,
    'cover': 2,
}
19
20
def parse_target_scope(target_scope: str) -> dict[str, Any]:
    """Split a ``"<type>:<value>"`` scope string into its two parts.

    Only the first colon is treated as the separator, so the value may itself
    contain colons. A string without any colon is reported with the scope type
    ``'unknown'`` and the whole input as the value.

    Args:
        target_scope: Raw scope string, e.g. ``'reference_set:phase1_hot_reference_v1'``.

    Returns:
        A dict with the keys ``scope_type`` and ``scope_value``.
    """
    kind, separator, remainder = target_scope.partition(':')
    if not separator:
        # No colon at all: nothing to split on.
        return {'scope_type': 'unknown', 'scope_value': target_scope}
    return {'scope_type': kind, 'scope_value': remainder}
26
27
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Covers the URI form (``postgres://user:secret@host/db``), the
    keyword/value form (``host=... password=secret``) and a ``password=``
    query parameter on a URI. Everything else is returned unchanged.
    """
    import re  # function-scope import keeps this block self-contained

    # URI userinfo: scheme://user:password@...
    masked = re.sub(
        r'^([A-Za-z][A-Za-z0-9+.\-]*://[^:/?#@]*):[^@/?#]*@',
        r'\1:***@',
        dsn,
    )
    # keyword/value or query-string password, quoted or bare.
    return re.sub(
        r"\b((?:ssl)?password\s*=\s*)('(?:[^'\\]|\\.)*'|[^\s&]+)",
        r'\1***',
        masked,
    )


def main() -> None:
    """Build the Phase-1 extraction plan report from jobs stored in PostgreSQL.

    Command-line arguments:
        --dsn         Connection string handed to ``psycopg.connect`` (required).
        --schema      Schema placed first on ``search_path`` (default ``acr_test``).
        --job-status  Value matched against ``feature_extraction_job.job_status``
                      (default ``pending``).
        --output      Path of the JSON report (default ``DEFAULT_OUTPUT``).

    The jobs are joined with ``feature_set_registry`` and ``model_registry``,
    ordered by lane priority and then by job id, and the resulting plan is both
    written to ``--output`` and printed to stdout.

    Exits through ``argparse`` (status 2) when ``--schema`` is not a plain SQL
    identifier.
    """
    import re  # function-scope import keeps this block self-contained

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--job-status', default='pending')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    # The schema name is spliced into a SET statement, which cannot take bind
    # parameters. Accept only a plain identifier so the value cannot smuggle in
    # extra SQL; valid names behave exactly as before (unquoted, case-folded).
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_$]*', args.schema):
        ap.error(f'invalid --schema {args.schema!r}: expected a plain SQL identifier')

    with psycopg.connect(args.dsn) as conn:
        conn.execute(f'SET search_path TO {args.schema}, public;')
        rows = conn.execute(
            """
            SELECT
                fej.extraction_job_id,
                fej.feature_set_id,
                fej.target_scope,
                fej.job_status,
                fej.shard_key,
                fej.metadata_json,
                fs.feature_name,
                fs.feature_level,
                fs.extraction_granularity,
                fs.window_sec,
                fs.hop_sec,
                fs.embedding_dim,
                fs.distance_metric,
                mr.model_name,
                mr.model_version,
                mr.model_family,
                mr.output_embedding_dim,
                mr.input_sample_rate,
                mr.default_window_sec,
                mr.default_hop_sec,
                mr.metadata_json
            FROM feature_extraction_job fej
            JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
            JOIN model_registry mr ON mr.model_id = fs.model_id
            WHERE fej.job_status = %s
            ORDER BY fej.extraction_job_id;
            """,
            (args.job_status,),
        ).fetchall()

    # Embedding width -> physical vector table; other widths have no table.
    vector_table_by_dim = {
        192: 'audio_embedding_vector_192',
        768: 'audio_embedding_vector_768',
    }

    jobs = []
    by_lane: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        # Name the columns once, in SELECT order, instead of indexing by number.
        (
            extraction_job_id,
            feature_set_id,
            target_scope,
            job_status,
            shard_key,
            raw_job_meta,
            feature_name,
            feature_level,
            extraction_granularity,
            window_sec,
            hop_sec,
            embedding_dim,
            distance_metric,
            model_name,
            model_version,
            model_family,
            _output_embedding_dim,  # selected but not part of the report
            input_sample_rate,
            _default_window_sec,  # selected but not part of the report
            _default_hop_sec,  # selected but not part of the report
            raw_model_meta,
        ) = row

        # NOTE(review): assumes the driver hands back metadata_json already
        # decoded into a dict (json/jsonb column) - confirm against the schema.
        job_meta = raw_job_meta or {}
        model_meta = raw_model_meta or {}
        # Job-level lane wins over the model-level one.
        lane = job_meta.get('lane') or model_meta.get('lane') or 'unknown'
        physical_target = 'audio_fingerprint' if feature_name == 'fingerprint_asset' else 'audio_embedding'
        vector_table = vector_table_by_dim.get(embedding_dim)

        item = {
            'priority_rank': LANE_PRIORITY.get(lane, 99),
            'lane': lane,
            'extraction_job_id': extraction_job_id,
            'feature_set_id': feature_set_id,
            'target_scope': target_scope,
            'scope': parse_target_scope(target_scope),
            'job_status': job_status,
            'shard_key': shard_key,
            'model_name': model_name,
            'model_version': model_version,
            'model_family': model_family,
            'input_sample_rate': input_sample_rate,
            'feature_name': feature_name,
            'feature_level': feature_level,
            'extraction_granularity': extraction_granularity,
            # float() makes the value JSON-serialisable whatever numeric type the driver returns.
            'window_sec': float(window_sec) if window_sec is not None else None,
            'hop_sec': float(hop_sec) if hop_sec is not None else None,
            'embedding_dim': embedding_dim,
            'distance_metric': distance_metric,
            'physical_target': physical_target,
            'vector_table': vector_table,
            'job_metadata': job_meta,
            'model_metadata': model_meta,
            'execution_notes': [
                f"run feature extraction for {model_name} {model_version}",
                f"write to {physical_target}" + (f" + {vector_table}" if vector_table else ''),
                f"target scope: {target_scope}",
            ],
        }
        jobs.append(item)
        by_lane.setdefault(lane, []).append(item)

    jobs.sort(key=lambda x: (x['priority_rank'], x['extraction_job_id']))
    for lane_jobs in by_lane.values():
        lane_jobs.sort(key=lambda x: x['extraction_job_id'])

    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, with the password masked.
        'dsn_redacted': _redact_dsn(args.dsn),
        'job_status_filter': args.job_status,
        'counts': {
            'jobs': len(jobs),
            'lanes': {lane: len(items) for lane, items in sorted(by_lane.items())},
        },
        'ordered_jobs': jobs,
        'by_lane': by_lane,
        'execution_order_summary': [
            {
                'order': idx + 1,
                'extraction_job_id': job['extraction_job_id'],
                'lane': job['lane'],
                'model_name': job['model_name'],
                'feature_name': job['feature_name'],
                'window_sec': job['window_sec'],
                'hop_sec': job['hop_sec'],
                'physical_target': job['physical_target'],
            }
            for idx, job in enumerate(jobs)
        ],
    }

    # Serialise once and reuse the text for both the file and stdout.
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Trailing newline keeps the committed report friendly to diffs and editors.
    out.write_text(rendered + '\n', encoding='utf-8')
    print(rendered)
151
152
# Run the planner only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。
3 - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py``acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 4 - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py``acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。
4 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。 5 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py``acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。
5 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 6 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py``acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。
......
...@@ -344,3 +344,56 @@ cd /workspace/acr-engine ...@@ -344,3 +344,56 @@ cd /workspace/acr-engine
344 这意味着: 344 这意味着:
345 345
346 > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。 346 > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。
347
348 ---
349
350 ## 10. Phase-1 extraction plan(从 pending jobs 生成)
351
352 在 `feature_extraction_job` 已经存在后,下一步通常不是马上手敲命令,而是先从 PostgreSQL 生成一个**统一执行计划**。
353
354 本仓库现在已经提供:
355
356 - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py`
357
358 用途:
359 - 读取 `feature_extraction_job`
360 - 过滤 `job_status=pending`
361 - 联表 `feature_set_registry + model_registry`
362 - 生成按 lane / priority 排序的 execution plan
363
364 ### 10.1 执行命令
365
366 ```bash
367 cd /workspace/acr-engine
368 /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py \
369 --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \
370 --schema acr_test \
371 --job-status pending \
372 --output data/pgvector_eval/music20/phase1_extraction_plan_report.json
373 ```
374
375 ### 10.2 当前已验证结果(acr_test)
376
377 本轮已真实生成一份 ordered execution plan:
378
379 | order | lane | model | feature | physical_target |
380 |---|---|---|---|---|
381 | 1 | `exact` | `chromaprint` | `fingerprint_asset` | `audio_fingerprint` |
382 | 2 | `semantic` | `mert` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
383 | 3 | `semantic` | `mert` | `semantic_embedding 10s/5s` | `audio_embedding` |
384 | 4 | `semantic` | `muq` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
385 | 5 | `semantic` | `ecapa` | `semantic_embedding 5s/2.5s` | `audio_embedding` |
386
387 其中 planner 还会自动给出:
388 - `vector_table`
389 - `audio_embedding_vector_768`
390 - `audio_embedding_vector_192`
391 - `target_scope`
392 - `execution_notes`
393
394 当前产物:
395 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`
396
397 结论:
398
399 > 现在 PostgreSQL 里已经不仅能描述“有哪些 job”,还可以直接生成**按执行顺序排好的抽特征计划**。
......
...@@ -68,6 +68,7 @@ ...@@ -68,6 +68,7 @@
68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | 68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` |
69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | 69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` |
70 | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | 70 | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` |
71 | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` |
71 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | 72 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` |
72 73
73 --- 74 ---
...@@ -416,6 +417,19 @@ flowchart LR ...@@ -416,6 +417,19 @@ flowchart LR
416 对应 live 报告: 417 对应 live 报告:
417 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` 418 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`
418 419
420 ### 本轮继续新增:pending jobs 已可生成 live execution plan
421
422 在 extraction jobs 之后,本轮又新增:
423
424 - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py`
425
426 它已经在 `acr_test` schema 上真实读取 5 条 `pending` jobs,并生成按执行顺序排列的 plan:
427 - `chromaprint exact lane` 优先
428 - 然后是 `mert / muq / ecapa` 的 semantic lanes
429
430 对应 live 报告:
431 - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`
432
419 ### 路线 1:继续做 PostgreSQL 工程化 433 ### 路线 1:继续做 PostgreSQL 工程化
420 434
421 1.`live_pgvector_music20_eval.py` 泛化成: 435 1.`live_pgvector_music20_eval.py` 泛化成:
......
...@@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` 184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1`
185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变 185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变
186 - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 186 - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动
187 - PostgreSQL `acr_test` schema 上已真实生成 Phase-1 extraction execution plan,当前顺序是 `chromaprint -> mert -> mert-long -> muq -> ecapa`
187 188
188 ### 未验证 / 仍是缺口 189 ### 未验证 / 仍是缺口
189 - **未实际跑 MERT / MuQ encoder-only 特征抽取** 190 - **未实际跑 MERT / MuQ encoder-only 特征抽取**
......