Commit 528cc473 528cc473ba8e53a0df86001f564f101a2fc126a1 by cnb.bofCdSsphPA

Turn pgvector planning into repo-native ingestion templates

Constraint: The user needs concrete downstream data handling guidance now, and future vector retrieval work should not start from abstract docs alone
Rejected: Leave pgvector support at prose-only guidance | Delays integration by forcing later sessions to reinvent schema and export bridges
Confidence: high
Scope-risk: narrow
Directive: Keep schema/export templates aligned with actual manifest semantics before adding live database loaders
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/export_manifest_to_pgvector_json.py; /usr/local/miniconda3/bin/python acr-engine/scripts/export_manifest_to_pgvector_json.py --data acr-engine/data/synthetic_v2 --split test --source-dataset synthetic_v2 --output acr-engine/reports/pgvector_manifest_export_test.json
Not-tested: Live PostgreSQL/pgvector ingestion remains pending a real database target
1 parent d6d67893
1 #!/usr/bin/env python3
2 """Export project manifests into a pgvector-friendly JSON payload.
3
4 This does not require PostgreSQL at runtime. It prepares normalized rows so a
5 future loader can bulk ingest them into Postgres/pgvector safely.
6 """
7
8 from __future__ import annotations
9
10 import argparse
11 import json
12 from pathlib import Path
13
14
def load_json(path: Path):
    """Read *path* as UTF-8 text and return the decoded JSON document.

    The encoding is pinned to UTF-8 instead of the platform's locale default,
    so manifests containing non-ASCII text load the same way on every
    machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.loads`` produces for the file's content.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
17
18
def _song_record(row, default_dataset):
    """Build the placeholder ``songs`` row for one manifest *row*.

    Only ``song_id`` and ``source_dataset`` come from the manifest; the title
    falls back to the song id and the remaining columns are left as ``None``.
    """
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        "title": song_id,
        "artist": None,
        "version_id": None,
        "source_dataset": row.get("source_dataset", default_dataset),
        "license": None,
    }


def main(argv: list[str] | None = None) -> None:
    """Export ``catalog.json`` and one split manifest as a single JSON payload.

    Reads ``<data>/catalog.json`` and ``<data>/<split>.json``, then writes a
    payload with three lists:

    * ``songs`` - one placeholder row per distinct ``song_id`` (first
      occurrence wins),
    * ``references`` - catalog rows whose ``type`` is ``"reference"``,
    * ``segments`` - split rows whose ``type`` is not ``"reference"``.

    A short JSON summary (status, resolved output path, row counts) is
    printed to stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Raises:
        KeyError: If a row lacks ``song_id``, or an exported row lacks
            ``audio_path`` or ``duration``.
        OSError: If a manifest cannot be read or the output cannot be written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="manifest directory")
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--source-dataset", default="unknown")
    args = parser.parse_args(argv)

    data_dir = Path(args.data)
    catalog = load_json(data_dir / "catalog.json")
    split_rows = load_json(data_dir / f"{args.split}.json")

    songs = {}
    references = []
    segments = []

    for row in catalog:
        song_id = row["song_id"]
        # Every catalog row registers its song, whatever its type.
        if song_id not in songs:
            songs[song_id] = _song_record(row, args.source_dataset)
        if row.get("type") == "reference":
            references.append({
                "song_id": song_id,
                "audio_uri": row["audio_path"],
                "duration_sec": row["duration"],
                # Fixed value: the manifest row is not consulted for a
                # sample rate.
                "sample_rate": 16000,
            })

    for row in split_rows:
        # Reference rows are exported from the catalog only; skip them here.
        if row.get("type") == "reference":
            continue
        song_id = row["song_id"]
        if song_id not in songs:
            songs[song_id] = _song_record(row, args.source_dataset)
        segments.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": row.get("offset", 0.0),
            "duration_sec": row["duration"],
            "split": args.split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", args.source_dataset),
        })

    payload = {
        "songs": list(songs.values()),
        "references": references,
        "segments": segments,
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be explicit rather than left to the platform's locale default.
    out.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        "songs": len(payload["songs"]),
        "references": len(payload["references"]),
        "segments": len(payload["segments"]),
    }, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
-- pgvector schema template: song metadata, reference and segment audio rows,
-- and one embedding table for each of the two audio row kinds.

-- Provides the vector column type and the ivfflat index access method.
CREATE EXTENSION IF NOT EXISTS vector;

-- One row per song; song_id is the natural key shared by all other tables.
CREATE TABLE IF NOT EXISTS songs (
    song_id TEXT PRIMARY KEY,
    title TEXT,
    artist TEXT,
    version_id TEXT,
    source_dataset TEXT,
    license TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- REFERENCES is a reserved key word in PostgreSQL, so this table name must be
-- double-quoted wherever it is used as an identifier; unquoted it is a
-- syntax error.
CREATE TABLE IF NOT EXISTS "references" (
    reference_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    duration_sec DOUBLE PRECISION NOT NULL,
    sample_rate INTEGER DEFAULT 16000,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Non-reference audio rows, tagged with the split they came from.
CREATE TABLE IF NOT EXISTS segments (
    segment_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    offset_sec DOUBLE PRECISION DEFAULT 0,
    duration_sec DOUBLE PRECISION NOT NULL,
    split TEXT,
    type TEXT NOT NULL,
    segment_type TEXT,
    source_dataset TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- 192-dimensional embeddings of reference audio; song_id is duplicated here
-- so rows can be filtered by song without joining through "references".
CREATE TABLE IF NOT EXISTS reference_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    reference_id BIGINT NOT NULL REFERENCES "references"(reference_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- 192-dimensional embeddings of segment audio.
CREATE TABLE IF NOT EXISTS query_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    segment_id BIGINT NOT NULL REFERENCES segments(segment_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- B-tree indexes for per-song lookups.
CREATE INDEX IF NOT EXISTS idx_segments_song_id ON segments(song_id);
CREATE INDEX IF NOT EXISTS idx_references_song_id ON "references"(song_id);
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_song_id ON reference_embeddings(song_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_song_id ON query_embeddings(song_id);

-- Approximate nearest-neighbour indexes using cosine distance.
-- NOTE(review): pgvector's documentation recommends building IVFFlat indexes
-- after the table already holds data and sizing `lists` from the row count;
-- consider running the two statements below after the bulk load and
-- revisiting lists = 100 once real volumes are known.
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_vector_cosine
    ON reference_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector_cosine
    ON query_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
...@@ -234,6 +234,26 @@ ...@@ -234,6 +234,26 @@
234 234
235 235
236 236
237
238 ### Stage: pgvector 落库模板
239
240 完成项:
241 - 新增 [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
242 - 新增 [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
243 - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充可执行模板说明
244
245 验证结果:
246 - `/usr/local/miniconda3/bin/python -m py_compile scripts/export_manifest_to_pgvector_json.py` 成功
247 - `/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py --data data/synthetic_v2 --split test --source-dataset synthetic_v2 --output reports/pgvector_manifest_export_test.json` 成功
248 - 当前导出结果:
249 - `songs=24`
250 - `references=24`
251 - `segments=20`
252
253 结论:
254 - pgvector 方向现在不仅有概念文档,还有可直接复用的 schema 和 manifest 导出桥接脚本
255 - 后续接 PostgreSQL 时返工成本会显著降低
256
237 ### Stage: FMA 下载自动守护 257 ### Stage: FMA 下载自动守护
238 258
239 完成项: 259 完成项:
......
...@@ -508,6 +508,37 @@ val.json ...@@ -508,6 +508,37 @@ val.json
508 - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) 508 - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md)
509 - [session-handoff.md](./session-handoff.md) 509 - [session-handoff.md](./session-handoff.md)
510 510
511
512 ## 12. 可直接落地的 pgvector 模板
513
514 仓库里现在已经补了两个可直接参考的模板:
515
516 - SQL schema: [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
517 - manifest 导出桥接脚本: [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
518
519 ### 导出示例
520
521 ```bash
522 cd acr-engine
523 /usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py \
524 --data data/synthetic_v2 \
525 --split test \
526 --source-dataset synthetic_v2 \
527 --output reports/pgvector_manifest_export_test.json
528 ```
529
530 ### 当前已验证结果
531
532 - `songs=24`
533 - `references=24`
534 - `segments=20`
535
536 这一步还不会直接写 PostgreSQL,作用是:
537
538 1. 先把项目现有 manifest 规范转换成 pgvector-friendly 结构化 JSON
539 2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL
540 3. embedding 生成后再写入 `vector(192)`
541
511 ## Sources 542 ## Sources
512 543
513 - Current code behavior from: 544 - Current code behavior from:
......