Commit 528cc473 528cc473ba8e53a0df86001f564f101a2fc126a1 by cnb.bofCdSsphPA

Turn pgvector planning into repo-native ingestion templates

Constraint: The user needs concrete downstream data handling guidance now, and future vector retrieval work should not start from abstract docs alone
Rejected: Leave pgvector support at prose-only guidance | Delays integration by forcing later sessions to reinvent schema and export bridges
Confidence: high
Scope-risk: narrow
Directive: Keep schema/export templates aligned with actual manifest semantics before adding live database loaders
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/export_manifest_to_pgvector_json.py; /usr/local/miniconda3/bin/python acr-engine/scripts/export_manifest_to_pgvector_json.py --data acr-engine/data/synthetic_v2 --split test --source-dataset synthetic_v2 --output acr-engine/reports/pgvector_manifest_export_test.json
Not-tested: Live PostgreSQL/pgvector ingestion remains pending a real database target
1 parent d6d67893
{
"songs": [
{
"song_id": "song_0000",
"title": "song_0000",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0001",
"title": "song_0001",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0002",
"title": "song_0002",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0003",
"title": "song_0003",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0004",
"title": "song_0004",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0005",
"title": "song_0005",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0006",
"title": "song_0006",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0007",
"title": "song_0007",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0008",
"title": "song_0008",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0009",
"title": "song_0009",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0010",
"title": "song_0010",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0011",
"title": "song_0011",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0012",
"title": "song_0012",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0013",
"title": "song_0013",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0014",
"title": "song_0014",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0015",
"title": "song_0015",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0016",
"title": "song_0016",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0017",
"title": "song_0017",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0018",
"title": "song_0018",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0019",
"title": "song_0019",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0020",
"title": "song_0020",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0021",
"title": "song_0021",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0022",
"title": "song_0022",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
},
{
"song_id": "song_0023",
"title": "song_0023",
"artist": null,
"version_id": null,
"source_dataset": "synthetic_v2",
"license": null
}
],
"references": [
{
"song_id": "song_0000",
"audio_uri": "songs/song_0000.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0001",
"audio_uri": "songs/song_0001.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0002",
"audio_uri": "songs/song_0002.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0003",
"audio_uri": "songs/song_0003.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0004",
"audio_uri": "songs/song_0004.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0005",
"audio_uri": "songs/song_0005.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0006",
"audio_uri": "songs/song_0006.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0007",
"audio_uri": "songs/song_0007.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0008",
"audio_uri": "songs/song_0008.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0009",
"audio_uri": "songs/song_0009.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0010",
"audio_uri": "songs/song_0010.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0011",
"audio_uri": "songs/song_0011.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0012",
"audio_uri": "songs/song_0012.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0013",
"audio_uri": "songs/song_0013.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0014",
"audio_uri": "songs/song_0014.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0015",
"audio_uri": "songs/song_0015.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0016",
"audio_uri": "songs/song_0016.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0017",
"audio_uri": "songs/song_0017.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0018",
"audio_uri": "songs/song_0018.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0019",
"audio_uri": "songs/song_0019.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0020",
"audio_uri": "songs/song_0020.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0021",
"audio_uri": "songs/song_0021.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0022",
"audio_uri": "songs/song_0022.wav",
"duration_sec": 15.0,
"sample_rate": 16000
},
{
"song_id": "song_0023",
"audio_uri": "songs/song_0023.wav",
"duration_sec": 15.0,
"sample_rate": 16000
}
],
"segments": [
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_00.wav",
"offset_sec": 4.349828784349853,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_01.wav",
"offset_sec": 9.642182747327407,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_02_augmented.wav",
"offset_sec": 2.367717347418965,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_03_humming_like.wav",
"offset_sec": 3.180577192661006,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0020",
"audio_uri": "segments/song_0020_seg_04_confused.wav",
"offset_sec": 4.660551124366617,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_00.wav",
"offset_sec": 5.631088908640184,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_01.wav",
"offset_sec": 1.8823366490525628,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_02_augmented.wav",
"offset_sec": 9.88006210404643,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_03_humming_like.wav",
"offset_sec": 0.9025737685090285,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0021",
"audio_uri": "segments/song_0021_seg_04_confused.wav",
"offset_sec": 1.3048954561918258,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_00.wav",
"offset_sec": 3.9746734850812295,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_01.wav",
"offset_sec": 4.890968121206573,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_02_augmented.wav",
"offset_sec": 6.610400547460049,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_03_humming_like.wav",
"offset_sec": 2.6329596668288424,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0022",
"audio_uri": "segments/song_0022_seg_04_confused.wav",
"offset_sec": 0.8570731183991709,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_00.wav",
"offset_sec": 4.461034326075292,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_01.wav",
"offset_sec": 9.605203782802876,
"duration_sec": 5.0,
"split": "test",
"type": "clean",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_02_augmented.wav",
"offset_sec": 4.7458228906154805,
"duration_sec": 5.0,
"split": "test",
"type": "augmented",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_03_humming_like.wav",
"offset_sec": 8.308702013555955,
"duration_sec": 5.0,
"split": "test",
"type": "humming_like",
"segment_type": "mid",
"source_dataset": "synthetic_v2"
},
{
"song_id": "song_0023",
"audio_uri": "segments/song_0023_seg_04_confused.wav",
"offset_sec": 2.213510770155481,
"duration_sec": 5.0,
"split": "test",
"type": "confused",
"segment_type": "intro",
"source_dataset": "synthetic_v2"
}
]
}
\ No newline at end of file
#!/usr/bin/env python3
"""Export project manifests into a pgvector-friendly JSON payload.
This does not require PostgreSQL at runtime. It prepares normalized rows so a
future loader can bulk ingest them into Postgres/pgvector safely.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
def load_json(path: Path):
    """Read ``path`` and return its parsed JSON content.

    The file is always decoded as UTF-8 rather than with the platform's
    locale encoding, so manifests containing non-ASCII text load the same
    way on every machine.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
def _song_record(row, default_dataset):
    """Build the normalized ``songs`` row for one manifest entry."""
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        # The manifests carry no separate title, so the id doubles as the title.
        "title": song_id,
        "artist": None,
        "version_id": None,
        "source_dataset": row.get("source_dataset", default_dataset),
        "license": None,
    }


def main(argv=None):
    """Export ``catalog.json`` plus one split manifest into a single JSON payload.

    Reads ``<data>/catalog.json`` and ``<data>/<split>.json``, then writes a
    JSON document with three lists (``songs``, ``references``, ``segments``)
    to ``--output`` and prints a short summary to stdout.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the original
            behavior.

    Raises:
        KeyError: if a manifest row lacks ``song_id``, or a row that is
            exported lacks ``audio_path`` or ``duration``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="manifest directory")
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--source-dataset", default="unknown")
    args = parser.parse_args(argv)

    data_dir = Path(args.data)
    catalog = load_json(data_dir / "catalog.json")
    split_rows = load_json(data_dir / f"{args.split}.json")

    songs = {}  # song_id -> song row; the first occurrence wins
    references = []
    segments = []

    for row in catalog:
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_record(row, args.source_dataset))
        if row.get("type") == "reference":
            references.append({
                "song_id": song_id,
                "audio_uri": row["audio_path"],
                "duration_sec": row["duration"],
                # Hard-coded: the sample rate is not read from the manifest.
                "sample_rate": 16000,
            })

    for row in split_rows:
        # Reference rows are exported from the catalog only.
        if row.get("type") == "reference":
            continue
        song_id = row["song_id"]
        # A split may mention a song the catalog does not list.
        songs.setdefault(song_id, _song_record(row, args.source_dataset))
        segments.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": row.get("offset", 0.0),
            "duration_sec": row["duration"],
            "split": args.split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", args.source_dataset),
        })

    payload = {
        "songs": list(songs.values()),
        "references": references,
        "segments": segments,
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly instead of relying on the locale encoding.
    out.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        "songs": len(payload["songs"]),
        "references": len(payload["references"]),
        "segments": len(payload["segments"]),
    }, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
-- pgvector schema template: songs, their reference audio, query segments,
-- and one embedding table each for references and segments.
-- Every statement uses IF NOT EXISTS, so the script can be re-run safely.

CREATE EXTENSION IF NOT EXISTS vector;

-- One row per song; song_id is the key used across all other tables.
CREATE TABLE IF NOT EXISTS songs (
    song_id TEXT PRIMARY KEY,
    title TEXT,
    artist TEXT,
    version_id TEXT,
    source_dataset TEXT,
    license TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- REFERENCES is a reserved key word in PostgreSQL, so this table name must be
-- double-quoted wherever it is used as an identifier; unquoted it is a syntax
-- error. Loaders must write "references" (with quotes) as well.
CREATE TABLE IF NOT EXISTS "references" (
    reference_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    duration_sec DOUBLE PRECISION NOT NULL,
    sample_rate INTEGER DEFAULT 16000,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Query/evaluation clips cut from a song; offset_sec and duration_sec are in seconds.
CREATE TABLE IF NOT EXISTS segments (
    segment_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    offset_sec DOUBLE PRECISION DEFAULT 0,
    duration_sec DOUBLE PRECISION NOT NULL,
    split TEXT,
    type TEXT NOT NULL,
    segment_type TEXT,
    source_dataset TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings are fixed at 192 dimensions; vectors of any other length are rejected.
CREATE TABLE IF NOT EXISTS reference_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    reference_id BIGINT NOT NULL REFERENCES "references"(reference_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE TABLE IF NOT EXISTS query_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    segment_id BIGINT NOT NULL REFERENCES segments(segment_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- B-tree indexes for lookups and joins by song.
CREATE INDEX IF NOT EXISTS idx_segments_song_id ON segments(song_id);
CREATE INDEX IF NOT EXISTS idx_references_song_id ON "references"(song_id);
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_song_id ON reference_embeddings(song_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_song_id ON query_embeddings(song_id);

-- Approximate nearest-neighbour indexes for cosine distance.
-- NOTE(review): pgvector advises building ivfflat indexes after the table
-- holds data, because the lists are derived from existing rows. Consider
-- REINDEX after the first bulk load, and revisit lists = 100 once real row
-- counts are known.
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_vector_cosine
    ON reference_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector_cosine
    ON query_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
......@@ -234,6 +234,26 @@
### Stage: pgvector 落库模板
完成项:
- 新增 [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
- 新增 [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
- 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充可执行模板说明
验证结果:
- `/usr/local/miniconda3/bin/python -m py_compile scripts/export_manifest_to_pgvector_json.py` 成功
- `/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py --data data/synthetic_v2 --split test --source-dataset synthetic_v2 --output reports/pgvector_manifest_export_test.json` 成功
- 当前导出结果:
- `songs=24`
- `references=24`
- `segments=20`
结论:
- pgvector 方向现在不仅有概念文档,还有可直接复用的 schema 和 manifest 导出桥接脚本
- 后续接 PostgreSQL 时返工成本会显著降低
### Stage: FMA 下载自动守护
完成项:
......
......@@ -508,6 +508,37 @@ val.json
- [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md)
- [session-handoff.md](./session-handoff.md)
## 12. 可直接落地的 pgvector 模板
仓库里现在已经补了两个可直接参考的模板:
- SQL schema: [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql)
- manifest 导出桥接脚本: [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py)
### 导出示例
```bash
cd acr-engine
/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py \
--data data/synthetic_v2 \
--split test \
--source-dataset synthetic_v2 \
--output reports/pgvector_manifest_export_test.json
```
### 当前已验证结果
- `songs=24`
- `references=24`
- `segments=20`
这一步还不会直接写 PostgreSQL,作用是:
1. 先把项目现有 manifest 规范转换成 pgvector-friendly 结构化 JSON
2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL
3. embedding 生成后再写入 `vector(192)`
## Sources
- Current code behavior from:
......