Commit 58041e10 58041e10b6c2c0709ad738224363a0d0d36334c7 by cnb.bofCdSsphPA

Connect internal asset exports to pgvector preparation early

Constraint: Internal CSV ingestion should reach a pgvector-ready payload without requiring a second custom export path
Rejected: Limit the mapper to manifest outputs only | Forces another transformation layer before database loading
Confidence: high
Scope-risk: narrow
Directive: Keep pgvector payloads aligned with the shared songs/references/segments contract while preserving internal asset metadata fields
Tested: internal_asset_type_mapper.py with --emit-pgvector-json produced songs=2 references=2 segments=2 and included audio_role/asset_type_code/validation_status in sample rows
Not-tested: Direct bulk load into PostgreSQL using a live pgvector database
1 parent 5334df1f
...@@ -176,6 +176,68 @@ def build_manifest_bundle( ...@@ -176,6 +176,68 @@ def build_manifest_bundle(
176 } 176 }
177 177
178 178
def _pgvector_song_row(row: Dict) -> Dict:
    """Build the ``songs`` entry for the song that *row* belongs to.

    Raises:
        KeyError: if *row* has no ``"song_id"`` key.
    """
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        # The song id is reused as the title.
        "title": song_id,
        "artist": None,
        "version_id": row.get("version_id"),
        "source_dataset": row.get("source_dataset", "internal_assets"),
        "license": None,
    }


def _pgvector_asset_fields(row: Dict) -> Dict:
    """Return the internal-asset metadata fields copied onto every row."""
    return {
        "audio_role": row.get("audio_role"),
        "asset_type_code": row.get("asset_type_code"),
        "audio_exists": row.get("audio_exists"),
        "validation_status": row.get("validation_status"),
    }


def build_pgvector_payload(
    references: List[Dict],
    queries: List[Dict],
    split: str,
    sample_rate: int = 16000,
) -> Dict[str, List[Dict]]:
    """Convert mapped reference/query rows into a songs/references/segments payload.

    Args:
        references: Rows that become ``references`` entries. Each row must
            have ``"song_id"`` and ``"audio_path"``; other keys are optional.
        queries: Rows that become ``segments`` entries, with the same
            required keys as *references*.
        split: Value written to the ``"split"`` field of every segment row.
        sample_rate: Value written to the ``"sample_rate"`` field of every
            reference row. Defaults to 16000, the previously hard-coded value.

    Returns:
        A dict with three lists:

        * ``"songs"`` - one entry per distinct ``song_id``, in first-seen
          order (references are scanned before queries). The entry is built
          from the first row seen for that song.
        * ``"references"`` - one entry per row of *references*.
        * ``"segments"`` - one entry per row of *queries*.

    Raises:
        KeyError: if any row lacks ``"song_id"`` or ``"audio_path"``.
    """
    songs: Dict[str, Dict] = {}
    reference_rows: List[Dict] = []
    segment_rows: List[Dict] = []

    for row in references:
        song_id = row["song_id"]
        # setdefault keeps the first song entry seen for a given id.
        songs.setdefault(song_id, _pgvector_song_row(row))
        reference_rows.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "duration_sec": row.get("duration", 0.0),
            "sample_rate": sample_rate,
            **_pgvector_asset_fields(row),
        })

    for row in queries:
        song_id = row["song_id"]
        songs.setdefault(song_id, _pgvector_song_row(row))
        # A missing offset and an explicit None both become 0.0.
        offset = row.get("offset")
        segment_rows.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": 0.0 if offset is None else offset,
            "duration_sec": row.get("duration", 0.0),
            "split": split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", "internal_assets"),
            **_pgvector_asset_fields(row),
        })

    return {
        "songs": list(songs.values()),
        "references": reference_rows,
        "segments": segment_rows,
    }
239
240
179 def main(): 241 def main():
180 parser = argparse.ArgumentParser() 242 parser = argparse.ArgumentParser()
181 parser.add_argument("csv_path") 243 parser.add_argument("csv_path")
...@@ -192,6 +254,8 @@ def main(): ...@@ -192,6 +254,8 @@ def main():
192 parser.add_argument("--audio-root", default=None) 254 parser.add_argument("--audio-root", default=None)
193 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") 255 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
194 parser.add_argument("--emit-manifests", action="store_true") 256 parser.add_argument("--emit-manifests", action="store_true")
257 parser.add_argument("--emit-pgvector-json", action="store_true")
258 parser.add_argument("--pgvector-split", default="train")
195 parser.add_argument("--eval-ratio", type=float, default=0.2) 259 parser.add_argument("--eval-ratio", type=float, default=0.2)
196 parser.add_argument("--seed", type=int, default=42) 260 parser.add_argument("--seed", type=int, default=42)
197 args = parser.parse_args() 261 args = parser.parse_args()
...@@ -242,6 +306,19 @@ def main(): ...@@ -242,6 +306,19 @@ def main():
242 summary["manifest_test_rows"] = len(bundle["test"]) 306 summary["manifest_test_rows"] = len(bundle["test"])
243 summary["manifest_val_rows"] = len(bundle["val"]) 307 summary["manifest_val_rows"] = len(bundle["val"])
244 308
309 if args.emit_pgvector_json:
310 pgvector_payload = build_pgvector_payload(
311 references=references,
312 queries=queries,
313 split=args.pgvector_split,
314 )
315 pgvector_path = out_dir / "pgvector_payload.json"
316 pgvector_path.write_text(json.dumps(pgvector_payload, indent=2, ensure_ascii=False))
317 summary["pgvector_payload"] = str(pgvector_path)
318 summary["pgvector_songs"] = len(pgvector_payload["songs"])
319 summary["pgvector_references"] = len(pgvector_payload["references"])
320 summary["pgvector_segments"] = len(pgvector_payload["segments"])
321
245 for name, payload in outputs.items(): 322 for name, payload in outputs.items():
246 (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) 323 (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False))
247 324
......
...@@ -2,6 +2,39 @@ ...@@ -2,6 +2,39 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出
6
7 完成项:
8 - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py`
9 - 新增 `--emit-pgvector-json`
10 - 新增 `--pgvector-split`
11 - 可直接导出:
12 - `pgvector_payload.json`
13 - 导出结构与现有 pgvector 导出工具兼容,包含:
14 - `songs`
15 - `references`
16 - `segments`
17 - 同时额外保留:
18 - `audio_role`
19 - `asset_type_code`
20 - `audio_exists`
21 - `validation_status`
22
23 验证结果:
24 - 运行:
25 - `internal_asset_type_mapper.py ... --emit-pgvector-json --pgvector-split train`
26 - 输出摘要:
27 - `pgvector_songs = 2`
28 - `pgvector_references = 2`
29 - `pgvector_segments = 2`
30 - 抽样检查:
31 - reference 行含 `duration_sec/sample_rate/audio_role/asset_type_code`
32 - segment 行含 `offset_sec/split/type/segment_type/audio_role`
33
34 结论:
35 - 现在内部素材 CSV 已经可以直接桥接到 pgvector 入库准备阶段
36 - 后续再补 loader 或数据库直写时,不需要重新设计内部素材导出结构
37
5 ### Stage: 为内部素材映射脚本增加音频存在性与时长校验 38 ### Stage: 为内部素材映射脚本增加音频存在性与时长校验
6 39
7 完成项: 40 完成项:
......
...@@ -495,6 +495,8 @@ query: ...@@ -495,6 +495,8 @@ query:
495 - `manifest_bundle/train.json` 495 - `manifest_bundle/train.json`
496 - `manifest_bundle/test.json` 496 - `manifest_bundle/test.json`
497 - `manifest_bundle/val.json` 497 - `manifest_bundle/val.json`
498 - 可选直接生成:
499 - `pgvector_payload.json`
498 - 可选做音频校验: 500 - 可选做音频校验:
499 - `audio_exists` 501 - `audio_exists`
500 - `duration_sec` 502 - `duration_sec`
...@@ -523,6 +525,23 @@ query: ...@@ -523,6 +525,23 @@ query:
523 - `duration` 525 - `duration`
524 - `missing_audio` 汇总 526 - `missing_audio` 汇总
525 527
528 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出:
529
530 ```bash
531 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train
532 ```
533
534 输出会包含:
535 - `songs`
536 - `references`
537 - `segments`
538
539 并额外带上:
540 - `audio_role`
541 - `asset_type_code`
542 - `audio_exists`
543 - `validation_status`
544
526 如果你想临时把伴奏类也纳入导出,可用: 545 如果你想临时把伴奏类也纳入导出,可用:
527 546
528 ```bash 547 ```bash
......