Commit f048e400 f048e4001b0dbce0e395517e4cbf564cb94bc2f3 by cnb.bofCdSsphPA

Bridge internal CSV exports into manifest bundles before ingestion at scale

Constraint: Internal asset exports should reach train/test-ready manifests without repeated manual reshaping
Rejected: Stop at references/queries JSON only | Still leaves each import needing custom bundle assembly and split logic
Confidence: high
Scope-risk: narrow
Directive: Keep internal manifest emission conservative and deterministic; preserve train/test query presence even on tiny exports
Tested: internal_asset_type_mapper.py sample run with --emit-manifests produced catalog/train/test/val and balanced 1 query in both train and test
Not-tested: Duration/offset enrichment from live source metadata and audio-path existence checks on production exports
1 parent 728ef117
...@@ -10,6 +10,7 @@ from __future__ import annotations ...@@ -10,6 +10,7 @@ from __future__ import annotations
10 import argparse 10 import argparse
11 import csv 11 import csv
12 import json 12 import json
13 import random
13 from pathlib import Path 14 from pathlib import Path
14 from typing import Dict, List, Tuple 15 from typing import Dict, List, Tuple
15 16
...@@ -107,6 +108,50 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ ...@@ -107,6 +108,50 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[
107 return references, queries, metadata_only, excluded 108 return references, queries, metadata_only, excluded
108 109
109 110
def build_manifest_bundle(
    references: List[Dict],
    queries: List[Dict],
    eval_ratio: float,
    seed: int,
) -> Dict[str, List[Dict]]:
    """Split queries into train/test per song and assemble the manifest bundle.

    Queries are grouped by their ``"song_id"`` value and each group is
    shuffled with a ``random.Random(seed)`` instance, so the split is
    reproducible for a given input order and seed.

    Split rules:
      * A song with a single query keeps that query in train.
      * A song with two or more queries sends
        ``round(len(group) * eval_ratio)`` of them to test, clamped to the
        range ``[1, len(group) - 1]`` so both sides get at least one query
        from that song.
      * If no query reached test but there are at least two queries overall
        (i.e. every song had exactly one query), the last train query is
        moved to test so both splits contain a query.

    Args:
        references: Reference rows; emitted as the catalog and appended to
            both the train and test splits.
        queries: Query rows; each must contain a ``"song_id"`` key.
        eval_ratio: Target fraction of each song's queries routed to test.
        seed: Seed for the per-call random number generator.

    Returns:
        A dict with keys ``"catalog"``, ``"train"``, ``"test"`` and ``"val"``.
        ``"val"`` is always an empty list in the current implementation.

    Raises:
        KeyError: If a query row has no ``"song_id"`` key.
    """
    rng = random.Random(seed)

    # Group by song, preserving first-seen order so the shuffle sequence (and
    # therefore the split) is deterministic for a given input order.
    grouped_queries: Dict[str, List[Dict]] = {}
    for row in queries:
        grouped_queries.setdefault(row["song_id"], []).append(row)

    train_queries: List[Dict] = []
    test_queries: List[Dict] = []
    val_queries: List[Dict] = []

    for items in grouped_queries.values():
        # The grouped lists are local to this call, so shuffling in place
        # never reorders the caller's ``queries`` list.
        rng.shuffle(items)
        if len(items) == 1:
            train_queries.extend(items)
            continue

        # At least one test query per multi-query song, but always leave at
        # least one behind for train.
        num_test = max(1, round(len(items) * eval_ratio))
        num_test = min(num_test, len(items) - 1)
        test_queries.extend(items[:num_test])
        train_queries.extend(items[num_test:])

    # Tiny-export protection: test can only be empty here when every song had
    # exactly one query, in which case train holds all of them (>= 2), so
    # moving one across leaves both splits populated. The reverse case (empty
    # train with non-empty test) cannot occur because every song contributes
    # at least one train query.
    if len(queries) >= 2 and not test_queries:
        test_queries.append(train_queries.pop())

    return {
        "catalog": references,
        "train": train_queries + references,
        "test": test_queries + references,
        "val": val_queries,
    }
153
154
110 def main(): 155 def main():
111 parser = argparse.ArgumentParser() 156 parser = argparse.ArgumentParser()
112 parser.add_argument("csv_path") 157 parser.add_argument("csv_path")
...@@ -121,6 +166,9 @@ def main(): ...@@ -121,6 +166,9 @@ def main():
121 parser.add_argument("--artist-field", default="artist") 166 parser.add_argument("--artist-field", default="artist")
122 parser.add_argument("--platform-field", default="source_platform") 167 parser.add_argument("--platform-field", default="source_platform")
123 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") 168 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
169 parser.add_argument("--emit-manifests", action="store_true")
170 parser.add_argument("--eval-ratio", type=float, default=0.2)
171 parser.add_argument("--seed", type=int, default=42)
124 args = parser.parse_args() 172 args = parser.parse_args()
125 173
126 rows = [] 174 rows = []
...@@ -133,20 +181,38 @@ def main(): ...@@ -133,20 +181,38 @@ def main():
133 181
134 out_dir = Path(args.output_dir) 182 out_dir = Path(args.output_dir)
135 out_dir.mkdir(parents=True, exist_ok=True) 183 out_dir.mkdir(parents=True, exist_ok=True)
136 outputs = { 184 summary = {
137 "references.json": references,
138 "queries.json": queries,
139 "metadata_only.json": metadata_only,
140 "excluded.json": excluded,
141 "summary.json": {
142 "input_rows": len(rows), 185 "input_rows": len(rows),
143 "references": len(references), 186 "references": len(references),
144 "queries": len(queries), 187 "queries": len(queries),
145 "metadata_only": len(metadata_only), 188 "metadata_only": len(metadata_only),
146 "excluded": len(excluded), 189 "excluded": len(excluded),
147 "include_conditionals_as": args.include_conditionals_as, 190 "include_conditionals_as": args.include_conditionals_as,
148 },
149 } 191 }
192 outputs = {
193 "references.json": references,
194 "queries.json": queries,
195 "metadata_only.json": metadata_only,
196 "excluded.json": excluded,
197 "summary.json": summary,
198 }
199
200 if args.emit_manifests:
201 manifest_dir = out_dir / "manifest_bundle"
202 manifest_dir.mkdir(parents=True, exist_ok=True)
203 bundle = build_manifest_bundle(
204 references=references,
205 queries=queries,
206 eval_ratio=args.eval_ratio,
207 seed=args.seed,
208 )
209 for split, payload in bundle.items():
210 (manifest_dir / f"{split}.json").write_text(json.dumps(payload, indent=2, ensure_ascii=False))
211 summary["manifest_bundle"] = str(manifest_dir)
212 summary["manifest_train_rows"] = len(bundle["train"])
213 summary["manifest_test_rows"] = len(bundle["test"])
214 summary["manifest_val_rows"] = len(bundle["val"])
215
150 for name, payload in outputs.items(): 216 for name, payload in outputs.items():
151 (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False)) 217 (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False))
152 218
......
...@@ -2,6 +2,39 @@ ...@@ -2,6 +2,39 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 让内部素材映射脚本直接输出 train/test manifests
6
7 完成项:
8 - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py`
9 - 新增 `--emit-manifests`
10 - 新增 `--eval-ratio`
11 - 新增 `--seed`
12 - 在原有 `references/queries/metadata_only/excluded` 基础上,新增:
13 - `manifest_bundle/catalog.json`
14 - `manifest_bundle/train.json`
15 - `manifest_bundle/test.json`
16 - `manifest_bundle/val.json`
17 - 增加小样本保护:
18 - 只要 query 总数不少于 2 条,即使 query 很少,也保证 `train/test` 都有 query(仅 1 条 query 时只进入 `train`)
19 - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md)
20
21 验证结果:
22 - 使用 6 行样例 CSV 执行:
23 - `internal_asset_type_mapper.py ... --emit-manifests --eval-ratio 0.5 --seed 42`
24 - 输出摘要:
25 - `manifest_bundle` 已生成
26 - `manifest_train_rows = 3`
27 - `manifest_test_rows = 3`
28 - `manifest_val_rows = 0`
29 - manifest 检查:
30 - `catalog`:2 references
31 - `train`:1 query + 2 references
32 - `test`:1 query + 2 references
33
34 结论:
35 - 现在内部素材 CSV 已经可以一步变成接近可训练的 manifest bundle
36 - 后续如果再补充 duration/offset/audio existence 校验,就能更平滑接入正式训练链路
37
5 ### Stage: 将内部素材 type 策略落成可执行映射脚本 38 ### Stage: 将内部素材 type 策略落成可执行映射脚本
6 39
7 完成项: 40 完成项:
......
...@@ -490,6 +490,11 @@ query: ...@@ -490,6 +490,11 @@ query:
490 - `queries.json` 490 - `queries.json`
491 - `metadata_only.json` 491 - `metadata_only.json`
492 - `excluded.json` 492 - `excluded.json`
493 - 可选直接生成:
494 - `manifest_bundle/catalog.json`
495 - `manifest_bundle/train.json`
496 - `manifest_bundle/test.json`
497 - `manifest_bundle/val.json`
493 498
494 最短示例: 499 最短示例:
495 500
...@@ -497,6 +502,12 @@ query: ...@@ -497,6 +502,12 @@ query:
497 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map 502 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map
498 ``` 503 ```
499 504
505 如果你希望直接产出可训练 manifest:
506
507 ```bash
508 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2
509 ```
510
500 如果你想临时把伴奏类也纳入导出,可用: 511 如果你想临时把伴奏类也纳入导出,可用:
501 512
502 ```bash 513 ```bash
......