Commit 44bbfcb5 44bbfcb50895dad287867165c5a5c15943dc6ec6 by cnb.bofCdSsphPA

Bridge pgvector exports toward actual PostgreSQL bulk ingestion

Constraint: Schema and manifest-export templates are useful, but practical adoption still needs an explicit handoff into database load order and SQL shapes
Rejected: Stop at export JSON only | Leaves later sessions to redesign the bulk-ingest bridge from scratch
Confidence: high
Scope-risk: narrow
Directive: Keep bulk-load templates declarative until a real database target is available, then add a live loader without changing manifest semantics
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/pgvector_bulk_load_template.py; /usr/local/miniconda3/bin/python acr-engine/scripts/pgvector_bulk_load_template.py --input acr-engine/reports/pgvector_manifest_export_test.json --output acr-engine/reports/pgvector_bulk_load_plan_test.json
Not-tested: Live PostgreSQL execution remains pending a database environment
1 parent 528cc473
1 {
2 "counts": {
3 "songs": 24,
4 "references": 24,
5 "segments": 20
6 },
7 "sql": {
8 "songs": "INSERT INTO songs (song_id, title, artist, version_id, source_dataset, license)\nVALUES (%(song_id)s, %(title)s, %(artist)s, %(version_id)s, %(source_dataset)s, %(license)s)\nON CONFLICT (song_id) DO UPDATE SET\n title = EXCLUDED.title,\n artist = EXCLUDED.artist,\n version_id = EXCLUDED.version_id,\n source_dataset = EXCLUDED.source_dataset,\n license = EXCLUDED.license;",
9 "references": "INSERT INTO \"references\" (song_id, audio_uri, duration_sec, sample_rate)\nVALUES (%(song_id)s, %(audio_uri)s, %(duration_sec)s, %(sample_rate)s);",
10 "segments": "INSERT INTO segments (song_id, audio_uri, offset_sec, duration_sec, split, type, segment_type, source_dataset)\nVALUES (%(song_id)s, %(audio_uri)s, %(offset_sec)s, %(duration_sec)s, %(split)s, %(type)s, %(segment_type)s, %(source_dataset)s);"
11 },
12 "rows": {
13 "songs": [
14 {
15 "song_id": "song_0000",
16 "title": "song_0000",
17 "artist": null,
18 "version_id": null,
19 "source_dataset": "synthetic_v2",
20 "license": null
21 },
22 {
23 "song_id": "song_0001",
24 "title": "song_0001",
25 "artist": null,
26 "version_id": null,
27 "source_dataset": "synthetic_v2",
28 "license": null
29 },
30 {
31 "song_id": "song_0002",
32 "title": "song_0002",
33 "artist": null,
34 "version_id": null,
35 "source_dataset": "synthetic_v2",
36 "license": null
37 },
38 {
39 "song_id": "song_0003",
40 "title": "song_0003",
41 "artist": null,
42 "version_id": null,
43 "source_dataset": "synthetic_v2",
44 "license": null
45 },
46 {
47 "song_id": "song_0004",
48 "title": "song_0004",
49 "artist": null,
50 "version_id": null,
51 "source_dataset": "synthetic_v2",
52 "license": null
53 },
54 {
55 "song_id": "song_0005",
56 "title": "song_0005",
57 "artist": null,
58 "version_id": null,
59 "source_dataset": "synthetic_v2",
60 "license": null
61 },
62 {
63 "song_id": "song_0006",
64 "title": "song_0006",
65 "artist": null,
66 "version_id": null,
67 "source_dataset": "synthetic_v2",
68 "license": null
69 },
70 {
71 "song_id": "song_0007",
72 "title": "song_0007",
73 "artist": null,
74 "version_id": null,
75 "source_dataset": "synthetic_v2",
76 "license": null
77 },
78 {
79 "song_id": "song_0008",
80 "title": "song_0008",
81 "artist": null,
82 "version_id": null,
83 "source_dataset": "synthetic_v2",
84 "license": null
85 },
86 {
87 "song_id": "song_0009",
88 "title": "song_0009",
89 "artist": null,
90 "version_id": null,
91 "source_dataset": "synthetic_v2",
92 "license": null
93 },
94 {
95 "song_id": "song_0010",
96 "title": "song_0010",
97 "artist": null,
98 "version_id": null,
99 "source_dataset": "synthetic_v2",
100 "license": null
101 },
102 {
103 "song_id": "song_0011",
104 "title": "song_0011",
105 "artist": null,
106 "version_id": null,
107 "source_dataset": "synthetic_v2",
108 "license": null
109 },
110 {
111 "song_id": "song_0012",
112 "title": "song_0012",
113 "artist": null,
114 "version_id": null,
115 "source_dataset": "synthetic_v2",
116 "license": null
117 },
118 {
119 "song_id": "song_0013",
120 "title": "song_0013",
121 "artist": null,
122 "version_id": null,
123 "source_dataset": "synthetic_v2",
124 "license": null
125 },
126 {
127 "song_id": "song_0014",
128 "title": "song_0014",
129 "artist": null,
130 "version_id": null,
131 "source_dataset": "synthetic_v2",
132 "license": null
133 },
134 {
135 "song_id": "song_0015",
136 "title": "song_0015",
137 "artist": null,
138 "version_id": null,
139 "source_dataset": "synthetic_v2",
140 "license": null
141 },
142 {
143 "song_id": "song_0016",
144 "title": "song_0016",
145 "artist": null,
146 "version_id": null,
147 "source_dataset": "synthetic_v2",
148 "license": null
149 },
150 {
151 "song_id": "song_0017",
152 "title": "song_0017",
153 "artist": null,
154 "version_id": null,
155 "source_dataset": "synthetic_v2",
156 "license": null
157 },
158 {
159 "song_id": "song_0018",
160 "title": "song_0018",
161 "artist": null,
162 "version_id": null,
163 "source_dataset": "synthetic_v2",
164 "license": null
165 },
166 {
167 "song_id": "song_0019",
168 "title": "song_0019",
169 "artist": null,
170 "version_id": null,
171 "source_dataset": "synthetic_v2",
172 "license": null
173 },
174 {
175 "song_id": "song_0020",
176 "title": "song_0020",
177 "artist": null,
178 "version_id": null,
179 "source_dataset": "synthetic_v2",
180 "license": null
181 },
182 {
183 "song_id": "song_0021",
184 "title": "song_0021",
185 "artist": null,
186 "version_id": null,
187 "source_dataset": "synthetic_v2",
188 "license": null
189 },
190 {
191 "song_id": "song_0022",
192 "title": "song_0022",
193 "artist": null,
194 "version_id": null,
195 "source_dataset": "synthetic_v2",
196 "license": null
197 },
198 {
199 "song_id": "song_0023",
200 "title": "song_0023",
201 "artist": null,
202 "version_id": null,
203 "source_dataset": "synthetic_v2",
204 "license": null
205 }
206 ],
207 "references": [
208 {
209 "song_id": "song_0000",
210 "audio_uri": "songs/song_0000.wav",
211 "duration_sec": 15.0,
212 "sample_rate": 16000
213 },
214 {
215 "song_id": "song_0001",
216 "audio_uri": "songs/song_0001.wav",
217 "duration_sec": 15.0,
218 "sample_rate": 16000
219 },
220 {
221 "song_id": "song_0002",
222 "audio_uri": "songs/song_0002.wav",
223 "duration_sec": 15.0,
224 "sample_rate": 16000
225 },
226 {
227 "song_id": "song_0003",
228 "audio_uri": "songs/song_0003.wav",
229 "duration_sec": 15.0,
230 "sample_rate": 16000
231 },
232 {
233 "song_id": "song_0004",
234 "audio_uri": "songs/song_0004.wav",
235 "duration_sec": 15.0,
236 "sample_rate": 16000
237 },
238 {
239 "song_id": "song_0005",
240 "audio_uri": "songs/song_0005.wav",
241 "duration_sec": 15.0,
242 "sample_rate": 16000
243 },
244 {
245 "song_id": "song_0006",
246 "audio_uri": "songs/song_0006.wav",
247 "duration_sec": 15.0,
248 "sample_rate": 16000
249 },
250 {
251 "song_id": "song_0007",
252 "audio_uri": "songs/song_0007.wav",
253 "duration_sec": 15.0,
254 "sample_rate": 16000
255 },
256 {
257 "song_id": "song_0008",
258 "audio_uri": "songs/song_0008.wav",
259 "duration_sec": 15.0,
260 "sample_rate": 16000
261 },
262 {
263 "song_id": "song_0009",
264 "audio_uri": "songs/song_0009.wav",
265 "duration_sec": 15.0,
266 "sample_rate": 16000
267 },
268 {
269 "song_id": "song_0010",
270 "audio_uri": "songs/song_0010.wav",
271 "duration_sec": 15.0,
272 "sample_rate": 16000
273 },
274 {
275 "song_id": "song_0011",
276 "audio_uri": "songs/song_0011.wav",
277 "duration_sec": 15.0,
278 "sample_rate": 16000
279 },
280 {
281 "song_id": "song_0012",
282 "audio_uri": "songs/song_0012.wav",
283 "duration_sec": 15.0,
284 "sample_rate": 16000
285 },
286 {
287 "song_id": "song_0013",
288 "audio_uri": "songs/song_0013.wav",
289 "duration_sec": 15.0,
290 "sample_rate": 16000
291 },
292 {
293 "song_id": "song_0014",
294 "audio_uri": "songs/song_0014.wav",
295 "duration_sec": 15.0,
296 "sample_rate": 16000
297 },
298 {
299 "song_id": "song_0015",
300 "audio_uri": "songs/song_0015.wav",
301 "duration_sec": 15.0,
302 "sample_rate": 16000
303 },
304 {
305 "song_id": "song_0016",
306 "audio_uri": "songs/song_0016.wav",
307 "duration_sec": 15.0,
308 "sample_rate": 16000
309 },
310 {
311 "song_id": "song_0017",
312 "audio_uri": "songs/song_0017.wav",
313 "duration_sec": 15.0,
314 "sample_rate": 16000
315 },
316 {
317 "song_id": "song_0018",
318 "audio_uri": "songs/song_0018.wav",
319 "duration_sec": 15.0,
320 "sample_rate": 16000
321 },
322 {
323 "song_id": "song_0019",
324 "audio_uri": "songs/song_0019.wav",
325 "duration_sec": 15.0,
326 "sample_rate": 16000
327 },
328 {
329 "song_id": "song_0020",
330 "audio_uri": "songs/song_0020.wav",
331 "duration_sec": 15.0,
332 "sample_rate": 16000
333 },
334 {
335 "song_id": "song_0021",
336 "audio_uri": "songs/song_0021.wav",
337 "duration_sec": 15.0,
338 "sample_rate": 16000
339 },
340 {
341 "song_id": "song_0022",
342 "audio_uri": "songs/song_0022.wav",
343 "duration_sec": 15.0,
344 "sample_rate": 16000
345 },
346 {
347 "song_id": "song_0023",
348 "audio_uri": "songs/song_0023.wav",
349 "duration_sec": 15.0,
350 "sample_rate": 16000
351 }
352 ],
353 "segments": [
354 {
355 "song_id": "song_0020",
356 "audio_uri": "segments/song_0020_seg_00.wav",
357 "offset_sec": 4.349828784349853,
358 "duration_sec": 5.0,
359 "split": "test",
360 "type": "clean",
361 "segment_type": "mid",
362 "source_dataset": "synthetic_v2"
363 },
364 {
365 "song_id": "song_0020",
366 "audio_uri": "segments/song_0020_seg_01.wav",
367 "offset_sec": 9.642182747327407,
368 "duration_sec": 5.0,
369 "split": "test",
370 "type": "clean",
371 "segment_type": "mid",
372 "source_dataset": "synthetic_v2"
373 },
374 {
375 "song_id": "song_0020",
376 "audio_uri": "segments/song_0020_seg_02_augmented.wav",
377 "offset_sec": 2.367717347418965,
378 "duration_sec": 5.0,
379 "split": "test",
380 "type": "augmented",
381 "segment_type": "intro",
382 "source_dataset": "synthetic_v2"
383 },
384 {
385 "song_id": "song_0020",
386 "audio_uri": "segments/song_0020_seg_03_humming_like.wav",
387 "offset_sec": 3.180577192661006,
388 "duration_sec": 5.0,
389 "split": "test",
390 "type": "humming_like",
391 "segment_type": "mid",
392 "source_dataset": "synthetic_v2"
393 },
394 {
395 "song_id": "song_0020",
396 "audio_uri": "segments/song_0020_seg_04_confused.wav",
397 "offset_sec": 4.660551124366617,
398 "duration_sec": 5.0,
399 "split": "test",
400 "type": "confused",
401 "segment_type": "mid",
402 "source_dataset": "synthetic_v2"
403 },
404 {
405 "song_id": "song_0021",
406 "audio_uri": "segments/song_0021_seg_00.wav",
407 "offset_sec": 5.631088908640184,
408 "duration_sec": 5.0,
409 "split": "test",
410 "type": "clean",
411 "segment_type": "mid",
412 "source_dataset": "synthetic_v2"
413 },
414 {
415 "song_id": "song_0021",
416 "audio_uri": "segments/song_0021_seg_01.wav",
417 "offset_sec": 1.8823366490525628,
418 "duration_sec": 5.0,
419 "split": "test",
420 "type": "clean",
421 "segment_type": "intro",
422 "source_dataset": "synthetic_v2"
423 },
424 {
425 "song_id": "song_0021",
426 "audio_uri": "segments/song_0021_seg_02_augmented.wav",
427 "offset_sec": 9.88006210404643,
428 "duration_sec": 5.0,
429 "split": "test",
430 "type": "augmented",
431 "segment_type": "mid",
432 "source_dataset": "synthetic_v2"
433 },
434 {
435 "song_id": "song_0021",
436 "audio_uri": "segments/song_0021_seg_03_humming_like.wav",
437 "offset_sec": 0.9025737685090285,
438 "duration_sec": 5.0,
439 "split": "test",
440 "type": "humming_like",
441 "segment_type": "intro",
442 "source_dataset": "synthetic_v2"
443 },
444 {
445 "song_id": "song_0021",
446 "audio_uri": "segments/song_0021_seg_04_confused.wav",
447 "offset_sec": 1.3048954561918258,
448 "duration_sec": 5.0,
449 "split": "test",
450 "type": "confused",
451 "segment_type": "intro",
452 "source_dataset": "synthetic_v2"
453 },
454 {
455 "song_id": "song_0022",
456 "audio_uri": "segments/song_0022_seg_00.wav",
457 "offset_sec": 3.9746734850812295,
458 "duration_sec": 5.0,
459 "split": "test",
460 "type": "clean",
461 "segment_type": "mid",
462 "source_dataset": "synthetic_v2"
463 },
464 {
465 "song_id": "song_0022",
466 "audio_uri": "segments/song_0022_seg_01.wav",
467 "offset_sec": 4.890968121206573,
468 "duration_sec": 5.0,
469 "split": "test",
470 "type": "clean",
471 "segment_type": "mid",
472 "source_dataset": "synthetic_v2"
473 },
474 {
475 "song_id": "song_0022",
476 "audio_uri": "segments/song_0022_seg_02_augmented.wav",
477 "offset_sec": 6.610400547460049,
478 "duration_sec": 5.0,
479 "split": "test",
480 "type": "augmented",
481 "segment_type": "mid",
482 "source_dataset": "synthetic_v2"
483 },
484 {
485 "song_id": "song_0022",
486 "audio_uri": "segments/song_0022_seg_03_humming_like.wav",
487 "offset_sec": 2.6329596668288424,
488 "duration_sec": 5.0,
489 "split": "test",
490 "type": "humming_like",
491 "segment_type": "intro",
492 "source_dataset": "synthetic_v2"
493 },
494 {
495 "song_id": "song_0022",
496 "audio_uri": "segments/song_0022_seg_04_confused.wav",
497 "offset_sec": 0.8570731183991709,
498 "duration_sec": 5.0,
499 "split": "test",
500 "type": "confused",
501 "segment_type": "intro",
502 "source_dataset": "synthetic_v2"
503 },
504 {
505 "song_id": "song_0023",
506 "audio_uri": "segments/song_0023_seg_00.wav",
507 "offset_sec": 4.461034326075292,
508 "duration_sec": 5.0,
509 "split": "test",
510 "type": "clean",
511 "segment_type": "mid",
512 "source_dataset": "synthetic_v2"
513 },
514 {
515 "song_id": "song_0023",
516 "audio_uri": "segments/song_0023_seg_01.wav",
517 "offset_sec": 9.605203782802876,
518 "duration_sec": 5.0,
519 "split": "test",
520 "type": "clean",
521 "segment_type": "mid",
522 "source_dataset": "synthetic_v2"
523 },
524 {
525 "song_id": "song_0023",
526 "audio_uri": "segments/song_0023_seg_02_augmented.wav",
527 "offset_sec": 4.7458228906154805,
528 "duration_sec": 5.0,
529 "split": "test",
530 "type": "augmented",
531 "segment_type": "mid",
532 "source_dataset": "synthetic_v2"
533 },
534 {
535 "song_id": "song_0023",
536 "audio_uri": "segments/song_0023_seg_03_humming_like.wav",
537 "offset_sec": 8.308702013555955,
538 "duration_sec": 5.0,
539 "split": "test",
540 "type": "humming_like",
541 "segment_type": "mid",
542 "source_dataset": "synthetic_v2"
543 },
544 {
545 "song_id": "song_0023",
546 "audio_uri": "segments/song_0023_seg_04_confused.wav",
547 "offset_sec": 2.213510770155481,
548 "duration_sec": 5.0,
549 "split": "test",
550 "type": "confused",
551 "segment_type": "intro",
552 "source_dataset": "synthetic_v2"
553 }
554 ]
555 },
556 "notes": [
557 "Execute songs before references and segments.",
558 "Embedding rows should be loaded only after reference_id/segment_id resolution.",
559 "A live loader can replace row-wise inserts with COPY/execute_batch."
560 ]
561 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env python3
2 """Template bulk loader for pgvector-related metadata tables.
3
4 This script intentionally avoids requiring psycopg at runtime for now.
5 It produces the SQL statements and row payloads that a future live loader can
6 execute via COPY or execute_batch.
7 """
8
9 from __future__ import annotations
10
11 import argparse
12 import json
13 from pathlib import Path
14
15
# Parameterized SQL templates, keyed by target table name.
#
# Placeholders use the pyformat named style (``%(name)s``); each placeholder
# name matches a key of the row dicts emitted under ``rows`` in the plan.
# Per the plan notes, "songs" must be executed before "references" and
# "segments".
SQL_STATEMENTS = {
    # Upsert: re-running the load refreshes the metadata of an existing song.
    "songs": """
INSERT INTO songs (song_id, title, artist, version_id, source_dataset, license)
VALUES (%(song_id)s, %(title)s, %(artist)s, %(version_id)s, %(source_dataset)s, %(license)s)
ON CONFLICT (song_id) DO UPDATE SET
    title = EXCLUDED.title,
    artist = EXCLUDED.artist,
    version_id = EXCLUDED.version_id,
    source_dataset = EXCLUDED.source_dataset,
    license = EXCLUDED.license;
""".strip(),
    # REFERENCES is a reserved key word in PostgreSQL, so the table name must
    # be double-quoted; unquoted, this statement is rejected as a syntax error.
    # NOTE(review): plain INSERT, so re-running the plan presumably duplicates
    # rows unless the schema enforces uniqueness -- confirm against the schema.
    "references": """
INSERT INTO "references" (song_id, audio_uri, duration_sec, sample_rate)
VALUES (%(song_id)s, %(audio_uri)s, %(duration_sec)s, %(sample_rate)s);
""".strip(),
    # NOTE(review): plain INSERT as well; same idempotency caveat as above.
    "segments": """
INSERT INTO segments (song_id, audio_uri, offset_sec, duration_sec, split, type, segment_type, source_dataset)
VALUES (%(song_id)s, %(audio_uri)s, %(offset_sec)s, %(duration_sec)s, %(split)s, %(type)s, %(segment_type)s, %(source_dataset)s);
""".strip(),
}
36
37
def _build_plan(payload):
    """Assemble the declarative bulk-load plan from an exported manifest payload.

    Args:
        payload: Parsed JSON object that may carry "songs", "references" and
            "segments" arrays.

    Returns:
        A dict with per-table row counts, the SQL templates, the row payloads
        and the load-order notes.

    Raises:
        ValueError: If ``payload`` is not a JSON object.
    """
    if not isinstance(payload, dict):
        raise ValueError(
            "input JSON must be an object with songs/references/segments arrays"
        )

    # A table that is missing or explicitly null is treated as having no rows,
    # so the counts below never call len() on None.
    rows = {
        table: payload.get(table) or []
        for table in ("songs", "references", "segments")
    }
    return {
        "counts": {table: len(items) for table, items in rows.items()},
        "sql": SQL_STATEMENTS,
        "rows": rows,
        "notes": [
            "Execute songs before references and segments.",
            "Embedding rows should be loaded only after reference_id/segment_id resolution.",
            "A live loader can replace row-wise inserts with COPY/execute_batch.",
        ],
    }


def main(argv=None):
    """Read an exported manifest JSON and write the bulk-load plan JSON.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads ``sys.argv[1:]``.

    Side effects:
        Creates the output file's parent directories if needed, writes the
        plan to ``--output`` and prints a short JSON status summary to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="JSON exported by export_manifest_to_pgvector_json.py")
    parser.add_argument("--output", required=True, help="Output JSON plan for later DB execution")
    args = parser.parse_args(argv)

    # Pin UTF-8 explicitly: the plan is serialized with ensure_ascii=False, so
    # relying on the locale's default encoding can fail or produce non-UTF-8
    # JSON on platforms whose default is not UTF-8.
    payload = json.loads(Path(args.input).read_text(encoding="utf-8"))
    plan = _build_plan(payload)

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # End the file with a newline so it is a well-formed text file.
    out.write_text(json.dumps(plan, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        **plan["counts"],
    }, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
...@@ -235,6 +235,29 @@ ...@@ -235,6 +235,29 @@
235 235
236 236
237 237
238
239 ### Stage: pgvector bulk load plan 模板
240
241 完成项:
242 - 新增 [acr-engine/scripts/pgvector_bulk_load_template.py](../acr-engine/scripts/pgvector_bulk_load_template.py)
243 - 为 pgvector 导出结果补充 PostgreSQL bulk-load plan 模板
244 - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充对应说明
245
246 验证结果:
247 - `/usr/local/miniconda3/bin/python -m py_compile scripts/pgvector_bulk_load_template.py` 成功
248 - `/usr/local/miniconda3/bin/python scripts/pgvector_bulk_load_template.py --input reports/pgvector_manifest_export_test.json --output reports/pgvector_bulk_load_plan_test.json` 成功
249 - 当前结果:
250 - `songs=24`
251 - `references=24`
252 - `segments=20`
253
254 结论:
255 - pgvector 方向现在已经具备:
256 - schema 模板
257 - manifest 导出模板
258 - bulk-load plan 模板
259 - 后续接真实 PostgreSQL 时,只差 live loader,而不是从零设计数据入口
260
238 ### Stage: pgvector 落库模板 261 ### Stage: pgvector 落库模板
239 262
240 完成项: 263 完成项:
......
...@@ -539,6 +539,40 @@ cd acr-engine ...@@ -539,6 +539,40 @@ cd acr-engine
539 2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL 539 2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL
540 3. embedding 生成后再写入 `vector(192)` 540 3. embedding 生成后再写入 `vector(192)`
541 541
542
543 ### Bulk load plan 模板
544
545 仓库里现在还新增了:
546
547 - [acr-engine/scripts/pgvector_bulk_load_template.py](../acr-engine/scripts/pgvector_bulk_load_template.py)
548
549 它会把前一步导出的 manifest-friendly JSON,进一步整理成:
550
551 - SQL 语句模板
552 - songs / references / segments 行数据
553 - 导入顺序说明
554
555 示例:
556
557 ```bash
558 cd acr-engine
559 /usr/local/miniconda3/bin/python scripts/pgvector_bulk_load_template.py \
560 --input reports/pgvector_manifest_export_test.json \
561 --output reports/pgvector_bulk_load_plan_test.json
562 ```
563
564 当前已验证结果:
565
566 - `songs=24`
567 - `references=24`
568 - `segments=20`
569
570 这样后续如果你们接真实 PostgreSQL,可以分三步走:
571
572 1. manifest -> pgvector-friendly JSON
573 2. JSON -> bulk load plan
574 3. bulk load plan -> PostgreSQL / pgvector 实际写入
575
542 ## Sources 576 ## Sources
543 577
544 - Current code behavior from: 578 - Current code behavior from:
......