Commit 5334df1f 5334df1fb3faf970851523337c0603f43726305b by cnb.bofCdSsphPA

Validate internal audio assets before manifest-scale training

Constraint: Internal CSV exports should expose missing audio and usable durations before they are treated as train-ready manifests
Rejected: Defer path and duration checks to later training failures | Would make ingestion debugging slow and noisy
Confidence: high
Scope-risk: narrow
Directive: Keep internal asset validation lightweight at mapping time; surface existence and duration early, then layer richer QC rules incrementally
Tested: internal_asset_type_mapper.py with --audio-root on a 6-row sample detected missing_audio=2 and emitted durations for existing reference/query assets
Not-tested: Production-scale scans over the full internal asset repository
1 parent f048e400
...@@ -13,6 +13,7 @@ import json ...@@ -13,6 +13,7 @@ import json
13 import random 13 import random
14 from pathlib import Path 14 from pathlib import Path
15 from typing import Dict, List, Tuple 15 from typing import Dict, List, Tuple
16 import soundfile as sf
16 17
17 REFERENCE = "reference" 18 REFERENCE = "reference"
18 QUERY = "query" 19 QUERY = "query"
...@@ -43,11 +44,29 @@ TYPE_POLICY: Dict[int, Dict[str, str]] = { ...@@ -43,11 +44,29 @@ TYPE_POLICY: Dict[int, Dict[str, str]] = {
43 } 44 }
44 45
45 46
47 def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool, float | None]:
48 if not asset_path:
49 return False, None
50 path = Path(asset_path)
51 if audio_root and not path.is_absolute():
52 path = audio_root / path
53 if not path.exists():
54 return False, None
55 try:
56 info = sf.info(str(path))
57 return True, float(info.duration)
58 except Exception:
59 return True, None
60
61
46 def normalize_row(row: Dict[str, str], args) -> Dict: 62 def normalize_row(row: Dict[str, str], args) -> Dict:
47 type_code = int(row[args.type_field]) 63 type_code = int(row[args.type_field])
48 policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) 64 policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"})
49 canonical_song_id = row.get(args.song_field) or row.get(args.canonical_song_field) or row.get(args.asset_id_field) or "unknown_song" 65 canonical_song_id = row.get(args.song_field) or row.get(args.canonical_song_field) or row.get(args.asset_id_field) or "unknown_song"
50 version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_code}" 66 version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_code}"
67 audio_path = row.get(args.path_field)
68 audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None)
69 validation_status = "ok" if audio_exists else "missing_audio"
51 record = { 70 record = {
52 "asset_id": row.get(args.asset_id_field), 71 "asset_id": row.get(args.asset_id_field),
53 "canonical_song_id": canonical_song_id, 72 "canonical_song_id": canonical_song_id,
...@@ -57,7 +76,10 @@ def normalize_row(row: Dict[str, str], args) -> Dict: ...@@ -57,7 +76,10 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
57 "recommended_train_type": policy["train_type"], 76 "recommended_train_type": policy["train_type"],
58 "priority": policy["priority"], 77 "priority": policy["priority"],
59 "bucket": policy["bucket"], 78 "bucket": policy["bucket"],
60 "audio_path": row.get(args.path_field), 79 "audio_path": audio_path,
80 "audio_exists": audio_exists,
81 "duration_sec": duration_sec,
82 "validation_status": validation_status,
61 "title": row.get(args.title_field), 83 "title": row.get(args.title_field),
62 "artist": row.get(args.artist_field), 84 "artist": row.get(args.artist_field),
63 "source_platform": row.get(args.platform_field) or "internal", 85 "source_platform": row.get(args.platform_field) or "internal",
...@@ -72,6 +94,8 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: ...@@ -72,6 +94,8 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict:
72 "asset_type_code": record["asset_type_code"], 94 "asset_type_code": record["asset_type_code"],
73 "audio_role": record["audio_role"], 95 "audio_role": record["audio_role"],
74 "audio_path": record["audio_path"], 96 "audio_path": record["audio_path"],
97 "audio_exists": record["audio_exists"],
98 "validation_status": record["validation_status"],
75 "source_dataset": "internal_assets", 99 "source_dataset": "internal_assets",
76 "source_platform": record["source_platform"], 100 "source_platform": record["source_platform"],
77 } 101 }
...@@ -79,12 +103,12 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: ...@@ -79,12 +103,12 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict:
79 return { 103 return {
80 **base, 104 **base,
81 "type": "reference", 105 "type": "reference",
82 "duration": 0.0, 106 "duration": record["duration_sec"] or 0.0,
83 } 107 }
84 return { 108 return {
85 **base, 109 **base,
86 "type": record["recommended_train_type"], 110 "type": record["recommended_train_type"],
87 "duration": 0.0, 111 "duration": record["duration_sec"] or 0.0,
88 "offset": None, 112 "offset": None,
89 "segment_type": "external_query", 113 "segment_type": "external_query",
90 } 114 }
...@@ -165,6 +189,7 @@ def main(): ...@@ -165,6 +189,7 @@ def main():
165 parser.add_argument("--title-field", default="title") 189 parser.add_argument("--title-field", default="title")
166 parser.add_argument("--artist-field", default="artist") 190 parser.add_argument("--artist-field", default="artist")
167 parser.add_argument("--platform-field", default="source_platform") 191 parser.add_argument("--platform-field", default="source_platform")
192 parser.add_argument("--audio-root", default=None)
168 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") 193 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
169 parser.add_argument("--emit-manifests", action="store_true") 194 parser.add_argument("--emit-manifests", action="store_true")
170 parser.add_argument("--eval-ratio", type=float, default=0.2) 195 parser.add_argument("--eval-ratio", type=float, default=0.2)
...@@ -178,6 +203,8 @@ def main(): ...@@ -178,6 +203,8 @@ def main():
178 rows.append(normalize_row(row, args)) 203 rows.append(normalize_row(row, args))
179 204
180 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as) 205 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as)
206 missing_audio = sum(1 for row in rows if not row["audio_exists"])
207 trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL})
181 208
182 out_dir = Path(args.output_dir) 209 out_dir = Path(args.output_dir)
183 out_dir.mkdir(parents=True, exist_ok=True) 210 out_dir.mkdir(parents=True, exist_ok=True)
...@@ -187,6 +214,8 @@ def main(): ...@@ -187,6 +214,8 @@ def main():
187 "queries": len(queries), 214 "queries": len(queries),
188 "metadata_only": len(metadata_only), 215 "metadata_only": len(metadata_only),
189 "excluded": len(excluded), 216 "excluded": len(excluded),
217 "missing_audio": missing_audio,
218 "trainable_audio_rows": trainable_audio_rows,
190 "include_conditionals_as": args.include_conditionals_as, 219 "include_conditionals_as": args.include_conditionals_as,
191 } 220 }
192 outputs = { 221 outputs = {
......
...@@ -2,6 +2,35 @@ ...@@ -2,6 +2,35 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 为内部素材映射脚本增加音频存在性与时长校验
6
7 完成项:
8 - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py`
9 - 新增 `--audio-root`
10 - 自动探测 `audio_exists`
11 - 自动探测 `duration_sec`
12 - 自动写入 `validation_status`
13 - 在 summary 中新增:
14 - `missing_audio`
15 - `trainable_audio_rows`
16 - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md)
17
18 验证结果:
19 - 构造了 6 行样例 CSV,其中 4 个真实音频、2 个缺失路径
20 - 运行:
21 - `internal_asset_type_mapper.py ... --audio-root /tmp/internal_assets_audio --emit-manifests`
22 - 输出摘要:
23 - `missing_audio = 2`
24 - `trainable_audio_rows = 4`
25 - 生成的 reference/query 记录已带:
26 - `audio_exists = true`
27 - `validation_status = ok`
28 - 正确的 `duration`
29
30 结论:
31 - 现在内部素材 CSV 到 manifest 的链路已经具备最基础的训练前质量校验
32 - 后续再补 offset / 更细粒度质量规则时,不需要推翻现有脚本结构
33
5 ### Stage: 让内部素材映射脚本直接输出 train/test manifests 34 ### Stage: 让内部素材映射脚本直接输出 train/test manifests
6 35
7 完成项: 36 完成项:
......
...@@ -495,6 +495,10 @@ query: ...@@ -495,6 +495,10 @@ query:
495 - `manifest_bundle/train.json` 495 - `manifest_bundle/train.json`
496 - `manifest_bundle/test.json` 496 - `manifest_bundle/test.json`
497 - `manifest_bundle/val.json` 497 - `manifest_bundle/val.json`
498 - 可选做音频校验:
499 - `audio_exists`
500 - `duration_sec`
501 - `validation_status`
498 502
499 最短示例: 503 最短示例:
500 504
...@@ -508,6 +512,17 @@ query: ...@@ -508,6 +512,17 @@ query:
508 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2 512 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2
509 ``` 513 ```
510 514
515 如果你们的 CSV 里是相对路径,推荐加上音频根目录:
516
517 ```bash
518 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-manifests
519 ```
520
521 这样脚本会自动补:
522 - `audio_exists`
523 - `duration`(由探测到的 `duration_sec` 填充;探测不到时长时写为 `0.0`)
524 - `missing_audio` 汇总
525
511 如果你想临时把伴奏类也纳入导出,可用: 526 如果你想临时把伴奏类也纳入导出,可用:
512 527
513 ```bash 528 ```bash
......