Validate internal audio assets before manifest-scale training
Constraint: Internal CSV exports should expose missing audio and usable durations before they are treated as train-ready manifests
Rejected: Defer path and duration checks to later training failures | Would make ingestion debugging slow and noisy
Confidence: high
Scope-risk: narrow
Directive: Keep internal asset validation lightweight at mapping time; surface existence and duration early, then layer richer QC rules incrementally
Tested: internal_asset_type_mapper.py with --audio-root on a 6-row sample detected missing_audio=2 and emitted durations for existing reference/query assets
Not-tested: Production-scale scans over the full internal asset repository
Showing
3 changed files
with
76 additions
and
3 deletions
| ... | @@ -13,6 +13,7 @@ import json | ... | @@ -13,6 +13,7 @@ import json |
| 13 | import random | 13 | import random |
| 14 | from pathlib import Path | 14 | from pathlib import Path |
| 15 | from typing import Dict, List, Tuple | 15 | from typing import Dict, List, Tuple |
| 16 | import soundfile as sf | ||
| 16 | 17 | ||
| 17 | REFERENCE = "reference" | 18 | REFERENCE = "reference" |
| 18 | QUERY = "query" | 19 | QUERY = "query" |
| ... | @@ -43,11 +44,29 @@ TYPE_POLICY: Dict[int, Dict[str, str]] = { | ... | @@ -43,11 +44,29 @@ TYPE_POLICY: Dict[int, Dict[str, str]] = { |
| 43 | } | 44 | } |
| 44 | 45 | ||
| 45 | 46 | ||
| 47 | def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool, float | None]: | ||
| 48 | if not asset_path: | ||
| 49 | return False, None | ||
| 50 | path = Path(asset_path) | ||
| 51 | if audio_root and not path.is_absolute(): | ||
| 52 | path = audio_root / path | ||
| 53 | if not path.exists(): | ||
| 54 | return False, None | ||
| 55 | try: | ||
| 56 | info = sf.info(str(path)) | ||
| 57 | return True, float(info.duration) | ||
| 58 | except Exception: | ||
| 59 | return True, None | ||
| 60 | |||
| 61 | |||
| 46 | def normalize_row(row: Dict[str, str], args) -> Dict: | 62 | def normalize_row(row: Dict[str, str], args) -> Dict: |
| 47 | type_code = int(row[args.type_field]) | 63 | type_code = int(row[args.type_field]) |
| 48 | policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) | 64 | policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) |
| 49 | canonical_song_id = row.get(args.song_field) or row.get(args.canonical_song_field) or row.get(args.asset_id_field) or "unknown_song" | 65 | canonical_song_id = row.get(args.song_field) or row.get(args.canonical_song_field) or row.get(args.asset_id_field) or "unknown_song" |
| 50 | version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_code}" | 66 | version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_code}" |
| 67 | audio_path = row.get(args.path_field) | ||
| 68 | audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None) | ||
| 69 | validation_status = "ok" if audio_exists else "missing_audio" | ||
| 51 | record = { | 70 | record = { |
| 52 | "asset_id": row.get(args.asset_id_field), | 71 | "asset_id": row.get(args.asset_id_field), |
| 53 | "canonical_song_id": canonical_song_id, | 72 | "canonical_song_id": canonical_song_id, |
| ... | @@ -57,7 +76,10 @@ def normalize_row(row: Dict[str, str], args) -> Dict: | ... | @@ -57,7 +76,10 @@ def normalize_row(row: Dict[str, str], args) -> Dict: |
| 57 | "recommended_train_type": policy["train_type"], | 76 | "recommended_train_type": policy["train_type"], |
| 58 | "priority": policy["priority"], | 77 | "priority": policy["priority"], |
| 59 | "bucket": policy["bucket"], | 78 | "bucket": policy["bucket"], |
| 60 | "audio_path": row.get(args.path_field), | 79 | "audio_path": audio_path, |
| 80 | "audio_exists": audio_exists, | ||
| 81 | "duration_sec": duration_sec, | ||
| 82 | "validation_status": validation_status, | ||
| 61 | "title": row.get(args.title_field), | 83 | "title": row.get(args.title_field), |
| 62 | "artist": row.get(args.artist_field), | 84 | "artist": row.get(args.artist_field), |
| 63 | "source_platform": row.get(args.platform_field) or "internal", | 85 | "source_platform": row.get(args.platform_field) or "internal", |
| ... | @@ -72,6 +94,8 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: | ... | @@ -72,6 +94,8 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: |
| 72 | "asset_type_code": record["asset_type_code"], | 94 | "asset_type_code": record["asset_type_code"], |
| 73 | "audio_role": record["audio_role"], | 95 | "audio_role": record["audio_role"], |
| 74 | "audio_path": record["audio_path"], | 96 | "audio_path": record["audio_path"], |
| 97 | "audio_exists": record["audio_exists"], | ||
| 98 | "validation_status": record["validation_status"], | ||
| 75 | "source_dataset": "internal_assets", | 99 | "source_dataset": "internal_assets", |
| 76 | "source_platform": record["source_platform"], | 100 | "source_platform": record["source_platform"], |
| 77 | } | 101 | } |
| ... | @@ -79,12 +103,12 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: | ... | @@ -79,12 +103,12 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: |
| 79 | return { | 103 | return { |
| 80 | **base, | 104 | **base, |
| 81 | "type": "reference", | 105 | "type": "reference", |
| 82 | "duration": 0.0, | 106 | "duration": record["duration_sec"] or 0.0, |
| 83 | } | 107 | } |
| 84 | return { | 108 | return { |
| 85 | **base, | 109 | **base, |
| 86 | "type": record["recommended_train_type"], | 110 | "type": record["recommended_train_type"], |
| 87 | "duration": 0.0, | 111 | "duration": record["duration_sec"] or 0.0, |
| 88 | "offset": None, | 112 | "offset": None, |
| 89 | "segment_type": "external_query", | 113 | "segment_type": "external_query", |
| 90 | } | 114 | } |
| ... | @@ -165,6 +189,7 @@ def main(): | ... | @@ -165,6 +189,7 @@ def main(): |
| 165 | parser.add_argument("--title-field", default="title") | 189 | parser.add_argument("--title-field", default="title") |
| 166 | parser.add_argument("--artist-field", default="artist") | 190 | parser.add_argument("--artist-field", default="artist") |
| 167 | parser.add_argument("--platform-field", default="source_platform") | 191 | parser.add_argument("--platform-field", default="source_platform") |
| 192 | parser.add_argument("--audio-root", default=None) | ||
| 168 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") | 193 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") |
| 169 | parser.add_argument("--emit-manifests", action="store_true") | 194 | parser.add_argument("--emit-manifests", action="store_true") |
| 170 | parser.add_argument("--eval-ratio", type=float, default=0.2) | 195 | parser.add_argument("--eval-ratio", type=float, default=0.2) |
| ... | @@ -178,6 +203,8 @@ def main(): | ... | @@ -178,6 +203,8 @@ def main(): |
| 178 | rows.append(normalize_row(row, args)) | 203 | rows.append(normalize_row(row, args)) |
| 179 | 204 | ||
| 180 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as) | 205 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as) |
| 206 | missing_audio = sum(1 for row in rows if not row["audio_exists"]) | ||
| 207 | trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) | ||
| 181 | 208 | ||
| 182 | out_dir = Path(args.output_dir) | 209 | out_dir = Path(args.output_dir) |
| 183 | out_dir.mkdir(parents=True, exist_ok=True) | 210 | out_dir.mkdir(parents=True, exist_ok=True) |
| ... | @@ -187,6 +214,8 @@ def main(): | ... | @@ -187,6 +214,8 @@ def main(): |
| 187 | "queries": len(queries), | 214 | "queries": len(queries), |
| 188 | "metadata_only": len(metadata_only), | 215 | "metadata_only": len(metadata_only), |
| 189 | "excluded": len(excluded), | 216 | "excluded": len(excluded), |
| 217 | "missing_audio": missing_audio, | ||
| 218 | "trainable_audio_rows": trainable_audio_rows, | ||
| 190 | "include_conditionals_as": args.include_conditionals_as, | 219 | "include_conditionals_as": args.include_conditionals_as, |
| 191 | } | 220 | } |
| 192 | outputs = { | 221 | outputs = { | ... | ... |
| ... | @@ -2,6 +2,35 @@ | ... | @@ -2,6 +2,35 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 为内部素材映射脚本增加音频存在性与时长校验 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py` | ||
| 9 | - 新增 `--audio-root` | ||
| 10 | - 自动探测 `audio_exists` | ||
| 11 | - 自动探测 `duration_sec` | ||
| 12 | - 自动写入 `validation_status` | ||
| 13 | - 在 summary 中新增: | ||
| 14 | - `missing_audio` | ||
| 15 | - `trainable_audio_rows` | ||
| 16 | - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) | ||
| 17 | |||
| 18 | 验证结果: | ||
| 19 | - 构造了 6 行样例 CSV,其中 4 个真实音频、2 个缺失路径 | ||
| 20 | - 运行: | ||
| 21 | - `internal_asset_type_mapper.py ... --audio-root /tmp/internal_assets_audio --emit-manifests` | ||
| 22 | - 输出摘要: | ||
| 23 | - `missing_audio = 2` | ||
| 24 | - `trainable_audio_rows = 4` | ||
| 25 | - 生成的 reference/query 记录已带: | ||
| 26 | - `audio_exists = true` | ||
| 27 | - `validation_status = ok` | ||
| 28 | - 正确的 `duration` | ||
| 29 | |||
| 30 | 结论: | ||
| 31 | - 现在内部素材 CSV 到 manifest 的链路已经具备最基础的训练前质量校验 | ||
| 32 | - 后续再补 offset / 更细粒度质量规则时,不需要推翻现有脚本结构 | ||
| 33 | |||
| 5 | ### Stage: 让内部素材映射脚本直接输出 train/test manifests | 34 | ### Stage: 让内部素材映射脚本直接输出 train/test manifests |
| 6 | 35 | ||
| 7 | 完成项: | 36 | 完成项: | ... | ... |
| ... | @@ -495,6 +495,10 @@ query: | ... | @@ -495,6 +495,10 @@ query: |
| 495 | - `manifest_bundle/train.json` | 495 | - `manifest_bundle/train.json` |
| 496 | - `manifest_bundle/test.json` | 496 | - `manifest_bundle/test.json` |
| 497 | - `manifest_bundle/val.json` | 497 | - `manifest_bundle/val.json` |
| 498 | - 可选做音频校验: | ||
| 499 | - `audio_exists` | ||
| 500 | - `duration_sec` | ||
| 501 | - `validation_status` | ||
| 498 | 502 | ||
| 499 | 最短示例: | 503 | 最短示例: |
| 500 | 504 | ||
| ... | @@ -508,6 +512,17 @@ query: | ... | @@ -508,6 +512,17 @@ query: |
| 508 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2 | 512 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --emit-manifests --eval-ratio 0.2 |
| 509 | ``` | 513 | ``` |
| 510 | 514 | ||
| 515 | 如果你们的 CSV 里是相对路径,推荐加上音频根目录: | ||
| 516 | |||
| 517 | ```bash | ||
| 518 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-manifests | ||
| 519 | ``` | ||
| 520 | |||
| 521 | 这样脚本会自动补: | ||
| 522 | - `audio_exists` | ||
| 523 | - `duration` | ||
| 524 | - `missing_audio` 汇总 | ||
| 525 | |||
| 511 | 如果你想临时把伴奏类也纳入导出,可用: | 526 | 如果你想临时把伴奏类也纳入导出,可用: |
| 512 | 527 | ||
| 513 | ```bash | 528 | ```bash | ... | ... |
-
Please register or sign in to post a comment