Commit 44bbfcb5 44bbfcb50895dad287867165c5a5c15943dc6ec6 by cnb.bofCdSsphPA

Bridge pgvector exports toward actual PostgreSQL bulk ingestion

Constraint: Schema and manifest-export templates are useful, but practical adoption still needs an explicit handoff into database load order and SQL shapes
Rejected: Stop at export JSON only | Leaves later sessions to redesign the bulk-ingest bridge from scratch
Confidence: high
Scope-risk: narrow
Directive: Keep bulk-load templates declarative until a real database target is available, then add a live loader without changing manifest semantics
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/pgvector_bulk_load_template.py; /usr/local/miniconda3/bin/python acr-engine/scripts/pgvector_bulk_load_template.py --input acr-engine/reports/pgvector_manifest_export_test.json --output acr-engine/reports/pgvector_bulk_load_plan_test.json
Not-tested: Live PostgreSQL execution remains pending a database environment
1 parent 528cc473
{
"counts": {
"songs": 24,
"references": 24,
"segments": 20
},
"sql": {
"songs": "INSERT INTO songs (song_id, title, artist, version_id, source_dataset, license)\nVALUES (%(song_id)s, %(title)s, %(artist)s, %(version_id)s, %(source_dataset)s, %(license)s)\nON CONFLICT (song_id) DO UPDATE SET\n title = EXCLUDED.title,\n artist = EXCLUDED.artist,\n version_id = EXCLUDED.version_id,\n source_dataset = EXCLUDED.source_dataset,\n license = EXCLUDED.license;",
"references": "INSERT INTO references (song_id, audio_uri, duration_sec, sample_rate)\nVALUES (%(song_id)s, %(audio_uri)s, %(duration_sec)s, %(sample_rate)s);",
"segments": "INSERT INTO segments (song_id, audio_uri, offset_sec, duration_sec, split, type, segment_type, source_dataset)\nVALUES (%(song_id)s, %(audio_uri)s, %(offset_sec)s, %(duration_sec)s, %(split)s, %(type)s, %(segment_type)s, %(source_dataset)s);"
},
"rows": {
"songs": [
{
"song_id": "song_0000",
"title": "song_0000",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0001",
"title": "song_0001",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0002",
"title": "song_0002",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0003",
"title": "song_0003",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0004",
"title": "song_0004",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0005",
"title": "song_0005",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0006",
"title": "song_0006",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0007",
"title": "song_0007",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0008",
"title": "song_0008",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0009",
"title": "song_0009",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0010",
"title": "song_0010",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0011",
"title": "song_0011",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0012",
"title": "song_0012",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0013",
"title": "song_0013",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0014",
"title": "song_0014",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0015",
"title": "song_0015",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0016",
"title": "song_0016",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0017",
"title": "song_0017",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0018",
"title": "song_0018",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0019",
"title": "song_0019",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0020",
"title": "song_0020",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0021",
"title": "song_0021",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0022",
"title": "song_0022",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0023",
"title": "song_0023",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
}
],
"references": [
{
"song_id": "song_0000",
"audio_uri": "songs/song_0000.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0001",
"audio_uri": "songs/song_0001.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0002",
"audio_uri": "songs/song_0002.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0003",
"audio_uri": "songs/song_0003.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0004",
"audio_uri": "songs/song_0004.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0005",
"audio_uri": "songs/song_0005.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0006",
"audio_uri": "songs/song_0006.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0007",
"audio_uri": "songs/song_0007.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0008",
"audio_uri": "songs/song_0008.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0009",
"audio_uri": "songs/song_0009.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0010",
"audio_uri": "songs/song_0010.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0011",
"audio_uri": "songs/song_0011.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0012",
"audio_uri": "songs/song_0012.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0013",
"audio_uri": "songs/song_0013.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0014",
"audio_uri": "songs/song_0014.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0015",
"audio_uri": "songs/song_0015.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0016",
"audio_uri": "songs/song_0016.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0017",
"audio_uri": "songs/song_0017.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0018",
"audio_uri": "songs/song_0018.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0019",
"audio_uri": "songs/song_0019.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0020",
"audio_uri": "songs/song_0020.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0021",
"audio_uri": "songs/song_0021.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0022",
"audio_uri": "songs/song_0022.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0023",
"audio_uri": "songs/song_0023.wav",
"duration_sec": 15.0,
"sample_rate": 16000
}
],
"segments": [
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_00.wav",
"offset_sec": 4.349828784349853,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_01.wav",
"offset_sec": 9.642182747327407,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_02_augmented.wav",
"offset_sec": 2.367717347418965,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_03_humming_like.wav",
"offset_sec": 3.180577192661006,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_04_confused.wav",
"offset_sec": 4.660551124366617,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_00.wav",
"offset_sec": 5.631088908640184,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_01.wav",
"offset_sec": 1.8823366490525628,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_02_augmented.wav",
"offset_sec": 9.88006210404643,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_03_humming_like.wav",
"offset_sec": 0.9025737685090285,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_04_confused.wav",
"offset_sec": 1.3048954561918258,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_00.wav",
"offset_sec": 3.9746734850812295,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_01.wav",
"offset_sec": 4.890968121206573,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_02_augmented.wav",
"offset_sec": 6.610400547460049,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_03_humming_like.wav",
"offset_sec": 2.6329596668288424,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_04_confused.wav",
"offset_sec": 0.8570731183991709,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_00.wav",
"offset_sec": 4.461034326075292,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_01.wav",
"offset_sec": 9.605203782802876,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_02_augmented.wav",
"offset_sec": 4.7458228906154805,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_03_humming_like.wav",
"offset_sec": 8.308702013555955,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_04_confused.wav",
"offset_sec": 2.213510770155481,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
}
]
},
"notes": [
"Execute songs before references and segments.",
"Embedding rows should be loaded only after reference_id/segment_id resolution.",
"A live loader can replace row-wise inserts with COPY/execute_batch."
]
}
\ No newline at end of file
#!/usr/bin/env python3
"""Template bulk loader for pgvector-related metadata tables.
This script intentionally avoids requiring psycopg at runtime for now.
It produces the SQL statements and row payloads that a future live loader can
execute via COPY or execute_batch.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
# Parameterised SQL templates, keyed by the table they target.
# Placeholders use the named "pyformat" style (%(name)s), so each row dict in
# the generated plan can be passed to the statement as its parameter mapping.
SQL_STATEMENTS = {
    # Upsert keyed on song_id: on conflict every non-key column is overwritten
    # with the incoming value.
    "songs": """
INSERT INTO songs (song_id, title, artist, version_id, source_dataset, license)
VALUES (%(song_id)s, %(title)s, %(artist)s, %(version_id)s, %(source_dataset)s, %(license)s)
ON CONFLICT (song_id) DO UPDATE SET
    title = EXCLUDED.title,
    artist = EXCLUDED.artist,
    version_id = EXCLUDED.version_id,
    source_dataset = EXCLUDED.source_dataset,
    license = EXCLUDED.license;
""".strip(),
    # REFERENCES is a reserved key word in PostgreSQL, so the table name has to
    # be double-quoted; unquoted, the statement is rejected with a syntax error.
    # Plain insert: no conflict handling is defined for this table here.
    "references": """
INSERT INTO "references" (song_id, audio_uri, duration_sec, sample_rate)
VALUES (%(song_id)s, %(audio_uri)s, %(duration_sec)s, %(sample_rate)s);
""".strip(),
    # Plain insert: no conflict handling is defined for this table here.
    "segments": """
INSERT INTO segments (song_id, audio_uri, offset_sec, duration_sec, split, type, segment_type, source_dataset)
VALUES (%(song_id)s, %(audio_uri)s, %(offset_sec)s, %(duration_sec)s, %(split)s, %(type)s, %(segment_type)s, %(source_dataset)s);
""".strip(),
}
def main():
    """Turn an exported JSON payload into a declarative bulk-load plan.

    Command-line arguments:
        --input:  path to the JSON file to read (per the help text, the file
                  exported by export_manifest_to_pgvector_json.py).
        --output: path of the JSON plan to write; missing parent directories
                  are created.

    The plan contains per-table row counts, the SQL templates from
    SQL_STATEMENTS, the rows themselves and load-order notes. Nothing is
    executed against a database. A short JSON status summary is printed to
    stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="JSON exported by export_manifest_to_pgvector_json.py")
    parser.add_argument("--output", required=True, help="Output JSON plan for later DB execution")
    args = parser.parse_args()

    # Read as UTF-8 explicitly: the default would be the locale encoding, which
    # is not guaranteed to be UTF-8 on every platform.
    payload = json.loads(Path(args.input).read_text(encoding="utf-8"))

    # A missing key or an explicit null both become an empty list, so the
    # counts and the "rows" section always hold lists.
    rows = {
        table: payload.get(table) or []
        for table in ("songs", "references", "segments")
    }

    plan = {
        "counts": {table: len(table_rows) for table, table_rows in rows.items()},
        "sql": SQL_STATEMENTS,
        "rows": rows,
        "notes": [
            "Execute songs before references and segments.",
            "Embedding rows should be loaded only after reference_id/segment_id resolution.",
            "A live loader can replace row-wise inserts with COPY/execute_batch.",
        ],
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be written as UTF-8 regardless of locale. The trailing newline keeps the
    # artifact friendly to diffs and version control.
    out.write_text(
        json.dumps(plan, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        **plan["counts"],
    }, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
......@@ -235,6 +235,29 @@
### Stage: pgvector bulk load plan 模板
完成项:
- 新增 [acr-engine/scripts/pgvector_bulk_load_template.py](../acr-engine/scripts/pgvector_bulk_load_template.py)
- 为 pgvector 导出结果补充 PostgreSQL bulk-load plan 模板
- 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充对应说明
验证结果:
- `/usr/local/miniconda3/bin/python -m py_compile scripts/pgvector_bulk_load_template.py` 成功
- `/usr/local/miniconda3/bin/python scripts/pgvector_bulk_load_template.py --input reports/pgvector_manifest_export_test.json --output reports/pgvector_bulk_load_plan_test.json` 成功
- 当前结果:
- `songs=24`
- `references=24`
- `segments=20`
结论:
- pgvector 方向现在已经具备:
- schema 模板
- manifest 导出模板
- bulk-load plan 模板
- 后续接真实 PostgreSQL 时,只差 live loader,而不是从零设计数据入口
### Stage: pgvector 落库模板
完成项:
......
......@@ -539,6 +539,40 @@ cd acr-engine
2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL
3. embedding 生成后再写入 `vector(192)`
### Bulk load plan 模板
仓库里现在还新增了:
- [acr-engine/scripts/pgvector_bulk_load_template.py](../acr-engine/scripts/pgvector_bulk_load_template.py)
它会把前一步从 manifest 导出的 pgvector-friendly JSON,进一步整理成:
- SQL 语句模板
- songs / references / segments 行数据
- 导入顺序说明
示例:
```bash
cd acr-engine
/usr/local/miniconda3/bin/python scripts/pgvector_bulk_load_template.py \
--input reports/pgvector_manifest_export_test.json \
--output reports/pgvector_bulk_load_plan_test.json
```
当前已验证结果:
- `songs=24`
- `references=24`
- `segments=20`
这样后续如果你们接真实 PostgreSQL,可以分三步走:
1. manifest -> pgvector-friendly JSON
2. JSON -> bulk load plan
3. bulk load plan -> PostgreSQL / pgvector 实际写入
## Sources
- Current code behavior from:
......