Turn pgvector planning into repo-native ingestion templates
Constraint: The user needs concrete downstream data handling guidance now, and future vector retrieval work should not start from abstract docs alone
Rejected: Leave pgvector support at prose-only guidance | Delays integration by forcing later sessions to reinvent schema and export bridges
Confidence: high
Scope-risk: narrow
Directive: Keep schema/export templates aligned with actual manifest semantics before adding live database loaders
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/export_manifest_to_pgvector_json.py; /usr/local/miniconda3/bin/python acr-engine/scripts/export_manifest_to_pgvector_json.py --data acr-engine/data/synthetic_v2 --split test --source-dataset synthetic_v2 --output acr-engine/reports/pgvector_manifest_export_test.json
Not-tested: Live PostgreSQL/pgvector ingestion remains pending a real database target
Showing
5 changed files
with
212 additions
and
0 deletions
This diff is collapsed.
Click to expand it.
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Export project manifests into a pgvector-friendly JSON payload. | ||
| 3 | |||
| 4 | This does not require PostgreSQL at runtime. It prepares normalized rows so a | ||
| 5 | future loader can bulk ingest them into Postgres/pgvector safely. | ||
| 6 | """ | ||
| 7 | |||
| 8 | from __future__ import annotations | ||
| 9 | |||
| 10 | import argparse | ||
| 11 | import json | ||
| 12 | from pathlib import Path | ||
| 13 | |||
| 14 | |||
def load_json(path: Path):
    """Parse the JSON document stored at *path* and return the decoded value.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so manifests containing non-ASCII text load the same way on
    every machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.loads`` produces for the file's content.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
| 17 | |||
| 18 | |||
def _song_row(row, fallback_dataset):
    """Build the placeholder ``songs`` record for one manifest row."""
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        # The song id doubles as the title; no other title is read here.
        "title": song_id,
        "artist": None,
        "version_id": None,
        "source_dataset": row.get("source_dataset", fallback_dataset),
        "license": None,
    }


def main(argv: list[str] | None = None) -> None:
    """Convert a manifest directory into one JSON payload of normalized rows.

    Reads ``catalog.json`` and ``<split>.json`` from the ``--data`` directory
    and writes a JSON object with three lists to ``--output``:

    * ``songs``: one record per distinct ``song_id`` seen in either file;
      the first row seen for an id decides its ``source_dataset``.
    * ``references``: one record per catalog row whose ``type`` is
      ``"reference"``.
    * ``segments``: one record per split row whose ``type`` is not
      ``"reference"``.

    A short JSON summary (status, resolved output path, row counts) is
    printed to stdout afterwards.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv``, which is the original behavior.

    Raises:
        KeyError: If a row lacks ``song_id``, or a row that is exported
            lacks ``audio_path`` or ``duration``.
        OSError: If a manifest cannot be read or the output cannot be
            written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="manifest directory")
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--source-dataset", default="unknown")
    args = parser.parse_args(argv)

    data_dir = Path(args.data)
    catalog = load_json(data_dir / "catalog.json")
    split_rows = load_json(data_dir / f"{args.split}.json")

    songs = {}
    references = []
    segments = []

    for row in catalog:
        song_id = row["song_id"]
        # Every catalog row registers its song, even if it is not a reference.
        songs.setdefault(song_id, _song_row(row, args.source_dataset))
        if row.get("type") == "reference":
            references.append({
                "song_id": song_id,
                "audio_uri": row["audio_path"],
                "duration_sec": row["duration"],
                # The manifest is not consulted for the sample rate.
                # NOTE(review): confirm all reference audio is 16 kHz.
                "sample_rate": 16000,
            })

    for row in split_rows:
        # Reference rows are exported from the catalog, not from the split.
        if row.get("type") == "reference":
            continue
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_row(row, args.source_dataset))
        segments.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": row.get("offset", 0.0),
            "duration_sec": row["duration"],
            "split": args.split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", args.source_dataset),
        })

    payload = {
        "songs": list(songs.values()),
        "references": references,
        "segments": segments,
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly rather than in the locale's encoding.
    out.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        "songs": len(payload["songs"]),
        "references": len(payload["references"]),
        "segments": len(payload["segments"]),
    }, indent=2, ensure_ascii=False))
| 92 | |||
| 93 | |||
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
acr-engine/sql/pgvector_schema.sql
0 → 100644
-- Schema template for storing song metadata, audio rows and their embeddings
-- in PostgreSQL with the pgvector extension.
--
-- NOTE: REFERENCES is a reserved key word in PostgreSQL, so the table named
-- "references" must be written as a double-quoted identifier everywhere it
-- appears (DDL here, and any later INSERT/SELECT/COPY statements).

CREATE EXTENSION IF NOT EXISTS vector;

-- One row per song; every other table points back here through song_id.
CREATE TABLE IF NOT EXISTS songs (
    song_id TEXT PRIMARY KEY,
    title TEXT,
    artist TEXT,
    version_id TEXT,
    source_dataset TEXT,
    license TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Reference audio rows for a song.
CREATE TABLE IF NOT EXISTS "references" (
    reference_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    duration_sec DOUBLE PRECISION NOT NULL,
    sample_rate INTEGER DEFAULT 16000,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Non-reference audio rows, tagged with the split they came from.
CREATE TABLE IF NOT EXISTS segments (
    segment_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    offset_sec DOUBLE PRECISION DEFAULT 0,
    duration_sec DOUBLE PRECISION NOT NULL,
    split TEXT,
    type TEXT NOT NULL,
    segment_type TEXT,
    source_dataset TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings of reference audio. Inserted vectors must have exactly 192
-- dimensions to fit the vector(192) column.
CREATE TABLE IF NOT EXISTS reference_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    reference_id BIGINT NOT NULL REFERENCES "references"(reference_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings of segment audio, same 192-dimension layout as above.
CREATE TABLE IF NOT EXISTS query_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    segment_id BIGINT NOT NULL REFERENCES segments(segment_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- B-tree indexes for lookups and joins by song.
CREATE INDEX IF NOT EXISTS idx_segments_song_id ON segments(song_id);
CREATE INDEX IF NOT EXISTS idx_references_song_id ON "references"(song_id);
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_song_id ON reference_embeddings(song_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_song_id ON query_embeddings(song_id);

-- Approximate nearest-neighbour indexes for cosine distance.
-- NOTE(review): pgvector recommends building ivfflat indexes after the table
-- already holds data, because the lists are derived from the rows present at
-- build time. When run against empty tables, plan to REINDEX (or drop and
-- recreate these two indexes) after the bulk load, and revisit lists = 100
-- against the real row count.
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_vector_cosine
    ON reference_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector_cosine
    ON query_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
| ... | @@ -234,6 +234,26 @@ | ... | @@ -234,6 +234,26 @@ |
| 234 | 234 | ||
| 235 | 235 | ||
| 236 | 236 | ||
| 237 | |||
| 238 | ### Stage: pgvector 落库模板 | ||
| 239 | |||
| 240 | 完成项: | ||
| 241 | - 新增 [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql) | ||
| 242 | - 新增 [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py) | ||
| 243 | - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充可执行模板说明 | ||
| 244 | |||
| 245 | 验证结果: | ||
| 246 | - `/usr/local/miniconda3/bin/python -m py_compile scripts/export_manifest_to_pgvector_json.py` 成功 | ||
| 247 | - `/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py --data data/synthetic_v2 --split test --source-dataset synthetic_v2 --output reports/pgvector_manifest_export_test.json` 成功 | ||
| 248 | - 当前导出结果: | ||
| 249 | - `songs=24` | ||
| 250 | - `references=24` | ||
| 251 | - `segments=20` | ||
| 252 | |||
| 253 | 结论: | ||
| 254 | - pgvector 方向现在不仅有概念文档,还有可直接复用的 schema 和 manifest 导出桥接脚本 | ||
| 255 | - 后续接 PostgreSQL 时返工成本会显著降低 | ||
| 256 | |||
| 237 | ### Stage: FMA 下载自动守护 | 257 | ### Stage: FMA 下载自动守护 |
| 238 | 258 | ||
| 239 | 完成项: | 259 | 完成项: | ... | ... |
| ... | @@ -508,6 +508,37 @@ val.json | ... | @@ -508,6 +508,37 @@ val.json |
| 508 | - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) | 508 | - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) |
| 509 | - [session-handoff.md](./session-handoff.md) | 509 | - [session-handoff.md](./session-handoff.md) |
| 510 | 510 | ||
| 511 | |||
| 512 | ## 12. 可直接落地的 pgvector 模板 | ||
| 513 | |||
| 514 | 仓库里现在已经补了两个可直接参考的模板: | ||
| 515 | |||
| 516 | - SQL schema: [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql) | ||
| 517 | - manifest 导出桥接脚本: [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py) | ||
| 518 | |||
| 519 | ### 导出示例 | ||
| 520 | |||
| 521 | ```bash | ||
| 522 | cd acr-engine | ||
| 523 | /usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py \ | ||
| 524 | --data data/synthetic_v2 \ | ||
| 525 | --split test \ | ||
| 526 | --source-dataset synthetic_v2 \ | ||
| 527 | --output reports/pgvector_manifest_export_test.json | ||
| 528 | ``` | ||
| 529 | |||
| 530 | ### 当前已验证结果 | ||
| 531 | |||
| 532 | - `songs=24` | ||
| 533 | - `references=24` | ||
| 534 | - `segments=20` | ||
| 535 | |||
| 536 | 这一步还不会直接写 PostgreSQL,作用是: | ||
| 537 | |||
| 538 | 1. 先把项目现有 manifest 规范转换成 pgvector-friendly 结构化 JSON | ||
| 539 | 2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL | ||
| 540 | 3. embedding 生成后再写入 `vector(192)` 列 | ||
| 541 | |||
| 511 | ## Sources | 542 | ## Sources |
| 512 | 543 | ||
| 513 | - Current code behavior from: | 544 | - Current code behavior from: | ... | ... |
-
Please register or sign in to post a comment