Commit 528cc473 528cc473ba8e53a0df86001f564f101a2fc126a1 by cnb.bofCdSsphPA

Turn pgvector planning into repo-native ingestion templates

Constraint: The user needs concrete downstream data handling guidance now, and future vector retrieval work should not start from abstract docs alone
Rejected: Leave pgvector support at prose-only guidance | Delays integration by forcing later sessions to reinvent schema and export bridges
Confidence: high
Scope-risk: narrow
Directive: Keep schema/export templates aligned with actual manifest semantics before adding live database loaders
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/export_manifest_to_pgvector_json.py; /usr/local/miniconda3/bin/python acr-engine/scripts/export_manifest_to_pgvector_json.py --data acr-engine/data/synthetic_v2 --split test --source-dataset synthetic_v2 --output acr-engine/reports/pgvector_manifest_export_test.json
Not-tested: Live PostgreSQL/pgvector ingestion remains pending a real database target
1 parent d6d67893
#!/usr/bin/env python3
"""Export project manifests into a pgvector-friendly JSON payload.
This does not require PostgreSQL at runtime. It prepares normalized rows so a
future loader can bulk ingest them into Postgres/pgvector safely.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
def load_json(path: Path):
    """Read the file at *path* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 (the standard JSON interchange
    encoding) instead of the platform's locale encoding, so manifests that
    contain non-ASCII text load identically on every machine.

    Args:
        path: Location of the JSON document to read.

    Returns:
        Whatever ``json.loads`` produces for the document (list, dict, ...).

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist).
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
def _song_record(row: dict, default_source: str) -> dict:
    """Build the placeholder ``songs`` row for one manifest entry.

    Only ``song_id`` and (optionally) ``source_dataset`` are read from the
    row; the title falls back to the song id and the remaining metadata
    columns are emitted as ``None``.
    """
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        "title": song_id,
        "artist": None,
        "version_id": None,
        "source_dataset": row.get("source_dataset", default_source),
        "license": None,
    }


def main(argv: list[str] | None = None) -> None:
    """Convert ``catalog.json`` plus one split manifest into a JSON payload.

    Reads ``<data>/catalog.json`` and ``<data>/<split>.json``, groups their
    rows into ``songs``, ``references`` and ``segments`` lists, writes the
    payload to ``--output`` and prints a small JSON summary to stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse fall back to ``sys.argv[1:]``, which is the behavior of
            the original zero-argument call.

    Raises:
        OSError: If a manifest cannot be read or the output cannot be written.
        KeyError: If a row lacks ``song_id``, or a row that is exported lacks
            ``audio_path`` or ``duration``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="manifest directory")
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--source-dataset", default="unknown")
    args = parser.parse_args(argv)

    data_dir = Path(args.data)
    catalog = load_json(data_dir / "catalog.json")
    split_rows = load_json(data_dir / f"{args.split}.json")

    # Keyed by song_id so each song is emitted once; the first row seen for a
    # song decides its record (setdefault never overwrites).
    songs = {}
    references = []
    segments = []

    for row in catalog:
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_record(row, args.source_dataset))
        if row.get("type") == "reference":
            references.append({
                "song_id": song_id,
                "audio_uri": row["audio_path"],
                "duration_sec": row["duration"],
                # Fixed value, not read from the manifest.
                # NOTE(review): presumably the project's audio sample rate; confirm.
                "sample_rate": 16000,
            })

    for row in split_rows:
        # Reference rows are exported from the catalog above; skip them here
        # so they are not also exported as query segments.
        if row.get("type") == "reference":
            continue
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_record(row, args.source_dataset))
        segments.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": row.get("offset", 0.0),
            "duration_sec": row["duration"],
            "split": args.split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", args.source_dataset),
        })

    payload = {
        "songs": list(songs.values()),
        "references": references,
        "segments": segments,
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII characters verbatim, so the file must
    # be written as UTF-8 explicitly; the locale default encoding may be
    # unable to represent them.
    out.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        "songs": len(payload["songs"]),
        "references": len(payload["references"]),
        "segments": len(payload["segments"]),
    }, indent=2, ensure_ascii=False))


# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
-- pgvector schema template for songs, reference audio, query segments and
-- their 192-dimensional embeddings.
--
-- NOTE: REFERENCES is a reserved key word in PostgreSQL, so the table of that
-- name must be written as the quoted identifier "references" everywhere it
-- appears (CREATE TABLE, foreign keys, indexes and any later queries).
-- Unquoted, these statements fail with a syntax error.

CREATE EXTENSION IF NOT EXISTS vector;

-- One row per song; the primary key is the manifest's song_id.
CREATE TABLE IF NOT EXISTS songs (
    song_id TEXT PRIMARY KEY,
    title TEXT,
    artist TEXT,
    version_id TEXT,
    source_dataset TEXT,
    license TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Reference audio rows, each tied to a song.
CREATE TABLE IF NOT EXISTS "references" (
    reference_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    duration_sec DOUBLE PRECISION NOT NULL,
    sample_rate INTEGER DEFAULT 16000,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Query/segment audio rows, each tied to a song.
CREATE TABLE IF NOT EXISTS segments (
    segment_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    offset_sec DOUBLE PRECISION DEFAULT 0,
    duration_sec DOUBLE PRECISION NOT NULL,
    split TEXT,
    type TEXT NOT NULL,
    segment_type TEXT,
    source_dataset TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings computed for rows of "references".
CREATE TABLE IF NOT EXISTS reference_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    reference_id BIGINT NOT NULL REFERENCES "references"(reference_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings computed for rows of segments.
CREATE TABLE IF NOT EXISTS query_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    segment_id BIGINT NOT NULL REFERENCES segments(segment_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- B-tree indexes for lookups and joins by song.
CREATE INDEX IF NOT EXISTS idx_segments_song_id ON segments(song_id);
CREATE INDEX IF NOT EXISTS idx_references_song_id ON "references"(song_id);
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_song_id ON reference_embeddings(song_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_song_id ON query_embeddings(song_id);

-- Approximate nearest-neighbour indexes for cosine distance.
-- NOTE(review): pgvector recommends building IVFFlat indexes after the table
-- already holds data, and sizing `lists` from the row count; consider
-- re-creating these indexes after the bulk load — confirm against the
-- pgvector documentation for the deployed version.
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_vector_cosine
    ON reference_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector_cosine
    ON query_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
......@@ -234,6 +234,26 @@
### Stage: pgvector 落库模板
完成项:
- 新增 [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
- 新增 [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
- 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充可执行模板说明
验证结果:
- `/usr/local/miniconda3/bin/python -m py_compile scripts/export_manifest_to_pgvector_json.py` 成功
- `/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py --data data/synthetic_v2 --split test --source-dataset synthetic_v2 --output reports/pgvector_manifest_export_test.json` 成功
- 当前导出结果:
- `songs=24`
- `references=24`
- `segments=20`
结论:
- pgvector 方向现在不仅有概念文档,还有可直接复用的 schema 和 manifest 导出桥接脚本
- 后续接 PostgreSQL 时返工成本会显著降低
### Stage: FMA 下载自动守护
完成项:
......
......@@ -508,6 +508,37 @@ val.json
- [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md)
- [session-handoff.md](./session-handoff.md)
## 12. 可直接落地的 pgvector 模板
仓库里现在已经补了两个可直接参考的模板:
- SQL schema: [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
- manifest 导出桥接脚本: [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
### 导出示例
```bash
cd acr-engine
/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py \
--data data/synthetic_v2 \
--split test \
--source-dataset synthetic_v2 \
--output reports/pgvector_manifest_export_test.json
```
### 当前已验证结果
- `songs=24`
- `references=24`
- `segments=20`
这一步还不会直接写 PostgreSQL,作用是:
1. 先把项目现有 manifest 规范转换成 pgvector-friendly 结构化 JSON
2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL
3. embedding 生成后再写入 `vector(192)`
## Sources
- Current code behavior from:
......