Connect internal asset exports to pgvector preparation early
Constraint: Internal CSV ingestion should reach a pgvector-ready payload without requiring a second custom export path
Rejected: Limit the mapper to manifest outputs only | Forces another transformation layer before database loading
Confidence: high
Scope-risk: narrow
Directive: Keep pgvector payloads aligned with the shared songs/references/segments contract while preserving internal asset metadata fields
Tested: internal_asset_type_mapper.py with --emit-pgvector-json produced songs=2 references=2 segments=2 and included audio_role/asset_type_code/validation_status in sample rows
Not-tested: Direct bulk load into PostgreSQL using a live pgvector database
Showing 3 changed files with 129 additions and 0 deletions
| ... | @@ -176,6 +176,68 @@ def build_manifest_bundle( | ... | @@ -176,6 +176,68 @@ def build_manifest_bundle( |
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | 178 | ||
def _song_record(row: Dict) -> Dict:
    """Build the ``songs`` entry for the song that *row* belongs to."""
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        # No title is read from the row, so the id doubles as the title.
        "title": song_id,
        "artist": None,
        "version_id": row.get("version_id"),
        "source_dataset": row.get("source_dataset", "internal_assets"),
        "license": None,
    }


def _asset_metadata(row: Dict) -> Dict:
    """Return the asset fields that are copied verbatim onto every row."""
    return {
        "audio_role": row.get("audio_role"),
        "asset_type_code": row.get("asset_type_code"),
        "audio_exists": row.get("audio_exists"),
        "validation_status": row.get("validation_status"),
    }


def _value_or_zero(row: Dict, key: str):
    """Return ``row[key]``, or ``0.0`` when the key is absent or ``None``."""
    value = row.get(key)
    return 0.0 if value is None else value


def build_pgvector_payload(
    references: List[Dict],
    queries: List[Dict],
    split: str,
    *,
    sample_rate: int = 16000,
) -> Dict[str, List[Dict]]:
    """Convert mapped reference/query rows into a songs/references/segments payload.

    Args:
        references: Rows that become ``references`` entries. Each row must
            provide ``song_id`` and ``audio_path``; every other field is
            optional.
        queries: Rows that become ``segments`` entries, with the same
            required keys as ``references``.
        split: Value written to the ``split`` field of every segment row.
        sample_rate: Value written to the ``sample_rate`` field of every
            reference row. Defaults to 16000.

    Returns:
        A dict with three lists:

        * ``songs`` - one entry per distinct ``song_id``, in first-seen
          order (references are scanned before queries). The first row seen
          for a song supplies its ``version_id`` and ``source_dataset``.
        * ``references`` - one entry per row of ``references``.
        * ``segments`` - one entry per row of ``queries``.

    Raises:
        KeyError: If a row lacks ``song_id`` or ``audio_path``.
    """
    songs: Dict[str, Dict] = {}
    reference_rows: List[Dict] = []
    segment_rows: List[Dict] = []

    for row in references:
        song_id = row["song_id"]
        # Only the first row seen for a song defines its song record.
        if song_id not in songs:
            songs[song_id] = _song_record(row)
        reference_rows.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            # A missing or None duration falls back to 0.0, matching the
            # fallback used for segment offsets below.
            "duration_sec": _value_or_zero(row, "duration"),
            "sample_rate": sample_rate,
            **_asset_metadata(row),
        })

    for row in queries:
        song_id = row["song_id"]
        # A query may reference a song that has no reference row.
        if song_id not in songs:
            songs[song_id] = _song_record(row)
        segment_rows.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": _value_or_zero(row, "offset"),
            "duration_sec": _value_or_zero(row, "duration"),
            "split": split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", "internal_assets"),
            **_asset_metadata(row),
        })

    return {
        "songs": list(songs.values()),
        "references": reference_rows,
        "segments": segment_rows,
    }
| 239 | |||
| 240 | |||
| 179 | def main(): | 241 | def main(): |
| 180 | parser = argparse.ArgumentParser() | 242 | parser = argparse.ArgumentParser() |
| 181 | parser.add_argument("csv_path") | 243 | parser.add_argument("csv_path") |
| ... | @@ -192,6 +254,8 @@ def main(): | ... | @@ -192,6 +254,8 @@ def main(): |
| 192 | parser.add_argument("--audio-root", default=None) | 254 | parser.add_argument("--audio-root", default=None) |
| 193 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") | 255 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") |
| 194 | parser.add_argument("--emit-manifests", action="store_true") | 256 | parser.add_argument("--emit-manifests", action="store_true") |
| 257 | parser.add_argument("--emit-pgvector-json", action="store_true") | ||
| 258 | parser.add_argument("--pgvector-split", default="train") | ||
| 195 | parser.add_argument("--eval-ratio", type=float, default=0.2) | 259 | parser.add_argument("--eval-ratio", type=float, default=0.2) |
| 196 | parser.add_argument("--seed", type=int, default=42) | 260 | parser.add_argument("--seed", type=int, default=42) |
| 197 | args = parser.parse_args() | 261 | args = parser.parse_args() |
| ... | @@ -242,6 +306,19 @@ def main(): | ... | @@ -242,6 +306,19 @@ def main(): |
| 242 | summary["manifest_test_rows"] = len(bundle["test"]) | 306 | summary["manifest_test_rows"] = len(bundle["test"]) |
| 243 | summary["manifest_val_rows"] = len(bundle["val"]) | 307 | summary["manifest_val_rows"] = len(bundle["val"]) |
| 244 | 308 | ||
| 309 | if args.emit_pgvector_json: | ||
| 310 | pgvector_payload = build_pgvector_payload( | ||
| 311 | references=references, | ||
| 312 | queries=queries, | ||
| 313 | split=args.pgvector_split, | ||
| 314 | ) | ||
| 315 | pgvector_path = out_dir / "pgvector_payload.json" | ||
| 316 | pgvector_path.write_text(json.dumps(pgvector_payload, indent=2, ensure_ascii=False)) | ||
| 317 | summary["pgvector_payload"] = str(pgvector_path) | ||
| 318 | summary["pgvector_songs"] = len(pgvector_payload["songs"]) | ||
| 319 | summary["pgvector_references"] = len(pgvector_payload["references"]) | ||
| 320 | summary["pgvector_segments"] = len(pgvector_payload["segments"]) | ||
| 321 | |||
| 245 | for name, payload in outputs.items(): | 322 | for name, payload in outputs.items(): |
| 246 | (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) | 323 | (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) |
| 247 | 324 | ... | ... |
| ... | @@ -2,6 +2,39 @@ | ... | @@ -2,6 +2,39 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py` | ||
| 9 | - 新增 `--emit-pgvector-json` | ||
| 10 | - 新增 `--pgvector-split` | ||
| 11 | - 可直接导出: | ||
| 12 | - `pgvector_payload.json` | ||
| 13 | - 导出结构与现有 pgvector 导出工具兼容,包含: | ||
| 14 | - `songs` | ||
| 15 | - `references` | ||
| 16 | - `segments` | ||
| 17 | - 同时额外保留: | ||
| 18 | - `audio_role` | ||
| 19 | - `asset_type_code` | ||
| 20 | - `audio_exists` | ||
| 21 | - `validation_status` | ||
| 22 | |||
| 23 | 验证结果: | ||
| 24 | - 运行: | ||
| 25 | - `internal_asset_type_mapper.py ... --emit-pgvector-json --pgvector-split train` | ||
| 26 | - 输出摘要: | ||
| 27 | - `pgvector_songs = 2` | ||
| 28 | - `pgvector_references = 2` | ||
| 29 | - `pgvector_segments = 2` | ||
| 30 | - 抽样检查: | ||
| 31 | - reference 行含 `duration_sec/sample_rate/audio_role/asset_type_code` | ||
| 32 | - segment 行含 `offset_sec/split/type/segment_type/audio_role` | ||
| 33 | |||
| 34 | 结论: | ||
| 35 | - 现在内部素材 CSV 已经可以直接桥接到 pgvector 入库准备阶段 | ||
| 36 | - 后续再补 loader 或数据库直写时,不需要重新设计内部素材导出结构 | ||
| 37 | |||
| 5 | ### Stage: 为内部素材映射脚本增加音频存在性与时长校验 | 38 | ### Stage: 为内部素材映射脚本增加音频存在性与时长校验 |
| 6 | 39 | ||
| 7 | 完成项: | 40 | 完成项: | ... | ... |
| ... | @@ -495,6 +495,8 @@ query: | ... | @@ -495,6 +495,8 @@ query: |
| 495 | - `manifest_bundle/train.json` | 495 | - `manifest_bundle/train.json` |
| 496 | - `manifest_bundle/test.json` | 496 | - `manifest_bundle/test.json` |
| 497 | - `manifest_bundle/val.json` | 497 | - `manifest_bundle/val.json` |
| 498 | - 可选直接生成: | ||
| 499 | - `pgvector_payload.json` | ||
| 498 | - 可选做音频校验: | 500 | - 可选做音频校验: |
| 499 | - `audio_exists` | 501 | - `audio_exists` |
| 500 | - `duration_sec` | 502 | - `duration_sec` |
| ... | @@ -523,6 +525,23 @@ query: | ... | @@ -523,6 +525,23 @@ query: |
| 523 | - `duration` | 525 | - `duration` |
| 524 | - `missing_audio` 汇总 | 526 | - `missing_audio` 汇总 |
| 525 | 527 | ||
| 528 | 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: | ||
| 529 | |||
| 530 | ```bash | ||
| 531 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train | ||
| 532 | ``` | ||
| 533 | |||
| 534 | 输出会包含: | ||
| 535 | - `songs` | ||
| 536 | - `references` | ||
| 537 | - `segments` | ||
| 538 | |||
| 539 | 并额外带上: | ||
| 540 | - `audio_role` | ||
| 541 | - `asset_type_code` | ||
| 542 | - `audio_exists` | ||
| 543 | - `validation_status` | ||
| 544 | |||
| 526 | 如果你想临时把伴奏类也纳入导出,可用: | 545 | 如果你想临时把伴奏类也纳入导出,可用: |
| 527 | 546 | ||
| 528 | ```bash | 547 | ```bash | ... | ... |
-
Please register or sign in to post a comment