Bridge internal CSV exports into manifest bundles before ingestion at scale
Constraint: Internal asset exports should reach train/test-ready manifests without repeated manual reshaping
Rejected: Stop at references/queries JSON only | Still leaves each import needing custom bundle assembly and split logic
Confidence: high
Scope-risk: narrow
Directive: Keep internal manifest emission conservative and deterministic; preserve train/test query presence even on tiny exports
Tested: internal_asset_type_mapper.py sample run with --emit-manifests produced catalog/train/test/val manifests, with 1 query each in train and test
Not-tested: Duration/offset enrichment from live source metadata and audio-path existence checks on production exports
Showing 3 changed files with 118 additions and 8 deletions
| ... | @@ -10,6 +10,7 @@ from __future__ import annotations | ... | @@ -10,6 +10,7 @@ from __future__ import annotations |
| 10 | import argparse | 10 | import argparse |
| 11 | import csv | 11 | import csv |
| 12 | import json | 12 | import json |
| 13 | import random | ||
| 13 | from pathlib import Path | 14 | from pathlib import Path |
| 14 | from typing import Dict, List, Tuple | 15 | from typing import Dict, List, Tuple |
| 15 | 16 | ||
| ... | @@ -107,6 +108,50 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ | ... | @@ -107,6 +108,50 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ |
| 107 | return references, queries, metadata_only, excluded | 108 | return references, queries, metadata_only, excluded |
| 108 | 109 | ||
| 109 | 110 | ||
| 111 | def build_manifest_bundle( | ||
| 112 | references: List[Dict], | ||
| 113 | queries: List[Dict], | ||
| 114 | eval_ratio: float, | ||
| 115 | seed: int, | ||
| 116 | ) -> Dict[str, List[Dict]]: | ||
| 117 | rng = random.Random(seed) | ||
| 118 | grouped_queries: Dict[str, List[Dict]] = {} | ||
| 119 | for row in queries: | ||
| 120 | grouped_queries.setdefault(row["song_id"], []).append(row) | ||
| 121 | |||
| 122 | train_queries: List[Dict] = [] | ||
| 123 | test_queries: List[Dict] = [] | ||
| 124 | val_queries: List[Dict] = [] | ||
| 125 | |||
| 126 | for song_id, items in grouped_queries.items(): | ||
| 127 | items = list(items) | ||
| 128 | rng.shuffle(items) | ||
| 129 | if len(items) == 1: | ||
| 130 | train_queries.extend(items) | ||
| 131 | continue | ||
| 132 | |||
| 133 | num_test = max(1, round(len(items) * eval_ratio)) | ||
| 134 | num_test = min(num_test, len(items) - 1) | ||
| 135 | test_part = items[:num_test] | ||
| 136 | train_part = items[num_test:] | ||
| 137 | if not train_part and test_part: | ||
| 138 | train_part.append(test_part.pop()) | ||
| 139 | train_queries.extend(train_part) | ||
| 140 | test_queries.extend(test_part) | ||
| 141 | |||
| 142 | if len(queries) >= 2 and not test_queries and train_queries: | ||
| 143 | test_queries.append(train_queries.pop()) | ||
| 144 | if len(queries) >= 2 and not train_queries and test_queries: | ||
| 145 | train_queries.append(test_queries.pop()) | ||
| 146 | |||
| 147 | return { | ||
| 148 | "catalog": references, | ||
| 149 | "train": train_queries + references, | ||
| 150 | "test": test_queries + references, | ||
| 151 | "val": val_queries, | ||
| 152 | } | ||
| 153 | |||
| 154 | |||
| 110 | def main(): | 155 | def main(): |
| 111 | parser = argparse.ArgumentParser() | 156 | parser = argparse.ArgumentParser() |
| 112 | parser.add_argument("csv_path") | 157 | parser.add_argument("csv_path") |
| ... | @@ -121,6 +166,9 @@ def main(): | ... | @@ -121,6 +166,9 @@ def main(): |
| 121 | parser.add_argument("--artist-field", default="artist") | 166 | parser.add_argument("--artist-field", default="artist") |
| 122 | parser.add_argument("--platform-field", default="source_platform") | 167 | parser.add_argument("--platform-field", default="source_platform") |
| 123 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") | 168 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") |
| 169 | parser.add_argument("--emit-manifests", action="store_true") | ||
| 170 | parser.add_argument("--eval-ratio", type=float, default=0.2) | ||
| 171 | parser.add_argument("--seed", type=int, default=42) | ||
| 124 | args = parser.parse_args() | 172 | args = parser.parse_args() |
| 125 | 173 | ||
| 126 | rows = [] | 174 | rows = [] |
| ... | @@ -133,20 +181,38 @@ def main(): | ... | @@ -133,20 +181,38 @@ def main(): |
| 133 | 181 | ||
| 134 | out_dir = Path(args.output_dir) | 182 | out_dir = Path(args.output_dir) |
| 135 | out_dir.mkdir(parents=True, exist_ok=True) | 183 | out_dir.mkdir(parents=True, exist_ok=True) |
| 184 | summary = { | ||
| 185 | "input_rows": len(rows), | ||
| 186 | "references": len(references), | ||
| 187 | "queries": len(queries), | ||
| 188 | "metadata_only": len(metadata_only), | ||
| 189 | "excluded": len(excluded), | ||
| 190 | "include_conditionals_as": args.include_conditionals_as, | ||
| 191 | } | ||
| 136 | outputs = { | 192 | outputs = { |
| 137 | "references.json": references, | 193 | "references.json": references, |
| 138 | "queries.json": queries, | 194 | "queries.json": queries, |
| 139 | "metadata_only.json": metadata_only, | 195 | "metadata_only.json": metadata_only, |
| 140 | "excluded.json": excluded, | 196 | "excluded.json": excluded, |
| 141 | "summary.json": { | 197 | "summary.json": summary, |
| 142 | "input_rows": len(rows), | ||
| 143 | "references": len(references), | ||
| 144 | "queries": len(queries), | ||
| 145 | "metadata_only": len(metadata_only), | ||
| 146 | "excluded": len(excluded), | ||
| 147 | "include_conditionals_as": args.include_conditionals_as, | ||
| 148 | }, | ||
| 149 | } | 198 | } |
| 199 | |||
| 200 | if args.emit_manifests: | ||
| 201 | manifest_dir = out_dir / "manifest_bundle" | ||
| 202 | manifest_dir.mkdir(parents=True, exist_ok=True) | ||
| 203 | bundle = build_manifest_bundle( | ||
| 204 | references=references, | ||
| 205 | queries=queries, | ||
| 206 | eval_ratio=args.eval_ratio, | ||
| 207 | seed=args.seed, | ||
| 208 | ) | ||
| 209 | for split, payload in bundle.items(): | ||
| 210 | (manifest_dir / f"{split}.json").write_text(json.dumps(payload, indent=2, ensure_ascii=False)) | ||
| 211 | summary["manifest_bundle"] = str(manifest_dir) | ||
| 212 | summary["manifest_train_rows"] = len(bundle["train"]) | ||
| 213 | summary["manifest_test_rows"] = len(bundle["test"]) | ||
| 214 | summary["manifest_val_rows"] = len(bundle["val"]) | ||
| 215 | |||
| 150 | for name, payload in outputs.items(): | 216 | for name, payload in outputs.items(): |
| 151 | (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) | 217 | (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) |
| 152 | 218 | ... | ... |
| ... | @@ -2,6 +2,39 @@ | ... | @@ -2,6 +2,39 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 让内部素材映射脚本直接输出 train/test manifests | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py` | ||
| 9 | - 新增 `--emit-manifests` | ||
| 10 | - 新增 `--eval-ratio` | ||
| 11 | - 新增 `--seed` | ||
| 12 | - 在原有 `references/queries/metadata_only/excluded` 基础上,新增: | ||
| 13 | - `manifest_bundle/catalog.json` | ||
| 14 | - `manifest_bundle/train.json` | ||
| 15 | - `manifest_bundle/test.json` | ||
| 16 | - `manifest_bundle/val.json` | ||
| 17 | - 增加小样本保护: | ||
| 18 | - 即使 query 很少,也尽量保证 `train/test` 都有 query | ||
| 19 | - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) | ||
| 20 | |||
| 21 | 验证结果: | ||
| 22 | - 使用 6 行样例 CSV 执行: | ||
| 23 | - `internal_asset_type_mapper.py ... --emit-manifests --eval-ratio 0.5 --seed 42` | ||
| 24 | - 输出摘要: | ||
| 25 | - `manifest_bundle` 已生成 | ||
| 26 | - `manifest_train_rows = 3` | ||
| 27 | - `manifest_test_rows = 3` | ||
| 28 | - `manifest_val_rows = 0` | ||
| 29 | - manifest 检查: | ||
| 30 | - `catalog`:2 references | ||
| 31 | - `train`:1 query + 2 references | ||
| 32 | - `test`:1 query + 2 references | ||
| 33 | |||
| 34 | 结论: | ||
| 35 | - 现在内部素材 CSV 已经可以一步变成接近可训练的 manifest bundle | ||
| 36 | - 后续如果再补充 duration/offset/audio existence 校验,就能更平滑接入正式训练链路 | ||
| 37 | |||
| 5 | ### Stage: 将内部素材 type 策略落成可执行映射脚本 | 38 | ### Stage: 将内部素材 type 策略落成可执行映射脚本 |
| 6 | 39 | ||
| 7 | 完成项: | 40 | 完成项: | ... | ... |
| ... | @@ -490,6 +490,11 @@ query: | ... | @@ -490,6 +490,11 @@ query: |
| 490 | - `queries.json` | 490 | - `queries.json` |
| 491 | - `metadata_only.json` | 491 | - `metadata_only.json` |
| 492 | - `excluded.json` | 492 | - `excluded.json` |
| 493 | - 可选直接生成: | ||
| 494 | - `manifest_bundle/catalog.json` | ||
| 495 | - `manifest_bundle/train.json` | ||
| 496 | - `manifest_bundle/test.json` | ||
| 497 | - `manifest_bundle/val.json` | ||
| 493 | 498 | ||
| 494 | 最短示例: | 499 | 最短示例: |
| 495 | 500 | ||
| ... | @@ -497,6 +502,12 @@ query: | ... | @@ -497,6 +502,12 @@ query: |
| 497 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map | 502 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map |
| 498 | ``` | 503 | ``` |
| 499 | 504 | ||
| 505 | 如果你希望直接产出可训练 manifest: | ||
| 506 | |||
| 507 | ```bash | ||
| 508 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2 | ||
| 509 | ``` | ||
| 510 | |||
| 500 | 如果你想临时把伴奏类也纳入导出,可用: | 511 | 如果你想临时把伴奏类也纳入导出,可用: |
| 501 | 512 | ||
| 502 | ```bash | 513 | ```bash | ... | ... |
-
Please register or sign in to post a comment