Commit 728ef117 728ef117d8f8c43aead3e468dfeaf8b5788849e4 by cnb.bofCdSsphPA

Make internal asset policies executable before DB-scale import

Constraint: Internal type enums need a repeatable mapping path into manifest-ready buckets before bulk database exports begin
Rejected: Leave type handling as documentation only | Would force repeated manual filtering and inconsistent ingestion decisions
Confidence: high
Scope-risk: narrow
Directive: Keep internal asset mapping defaults conservative; conditional instrumental variants should stay opt-in until version-aware training is ready
Tested: internal_asset_type_mapper.py on a 6-row sample CSV produced references=2 queries=2 metadata_only=1 excluded=1 with expected type routing
Not-tested: Direct SQL export integration against the live source database
1 parent bf098870
1 #!/usr/bin/env python3
2 """Map internal asset type codes into manifest-ready ACR roles.
3
4 Input: CSV exported from an internal asset table.
5 Output: JSON bundles for references, queries, metadata-only assets, and excluded assets.
6 """
7
8 from __future__ import annotations
9
10 import argparse
11 import csv
12 import json
13 from pathlib import Path
14 from typing import Dict, List, Tuple
15
# Bucket labels used to route normalized asset records.
REFERENCE, QUERY = "reference", "query"
METADATA, EXCLUDED = "metadata_only", "excluded"
CONDITIONAL = "conditional"

# Field names of one policy entry, in the order used by the rows below.
_POLICY_KEYS = ("bucket", "audio_role", "train_type", "priority")

# One row per internal asset type code: (code, bucket, audio_role, train_type, priority).
# NOTE(review): code 15 has no row here; confirm that the gap is intentional.
_POLICY_TABLE = (
    (1, REFERENCE, "original_lossy", "reference", "secondary_reference"),
    (2, CONDITIONAL, "inst_with_harmony_lossy", "hard_negative", "conditional"),
    (3, METADATA, "lyrics_txt", "none", "metadata"),
    (4, METADATA, "cover_image", "none", "metadata"),
    (5, METADATA, "license_doc", "none", "metadata"),
    (6, METADATA, "album_info", "none", "metadata"),
    (7, QUERY, "short_video_clip", "clean", "high_value_query"),
    (8, QUERY, "chorus_clip", "clean", "high_value_query"),
    (9, CONDITIONAL, "inst_no_harmony_lossy", "hard_negative", "conditional"),
    (10, CONDITIONAL, "inst_no_harmony_lossless", "hard_negative", "conditional"),
    (11, REFERENCE, "original_lossless", "reference", "primary_reference"),
    (12, CONDITIONAL, "inst_with_harmony_lossless", "hard_negative", "conditional"),
    (13, METADATA, "lyrics_lrc", "none", "metadata"),
    (14, METADATA, "cover_source", "none", "metadata"),
    (16, QUERY, "short_video_clip", "clean", "high_value_query"),
    (17, METADATA, "archive_package", "none", "metadata"),
    (18, QUERY, "demo_audio", "clean", "screen_before_use"),
    (19, METADATA, "sheet_image", "none", "metadata"),
    (20, METADATA, "lyrics_translation", "none", "metadata"),
)

# Maps an internal asset type code to its routing policy dict.
TYPE_POLICY: Dict[int, Dict[str, str]] = {
    code: dict(zip(_POLICY_KEYS, fields)) for code, *fields in _POLICY_TABLE
}
43
44
def normalize_row(row: Dict[str, str], args) -> Dict:
    """Turn one raw CSV row into a normalized asset record carrying its routing policy.

    Args:
        row: One row of the input CSV, keyed by column name.
        args: Parsed CLI namespace; its ``*_field`` attributes name the columns to read.

    Returns:
        A dict with the asset identifiers, the policy fields (``audio_role``,
        ``recommended_train_type``, ``priority``, ``bucket``) and the descriptive
        columns. A row whose type cell is empty, ``None`` or not an integer is not
        raised on: it receives the ``EXCLUDED`` fallback policy, and
        ``asset_type_code`` keeps the raw cell value so the bad input stays visible.

    Raises:
        KeyError: If the type column is absent from the row altogether, which
            points at a wrong ``--type-field`` rather than a bad data row.
    """
    fallback = {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}

    raw_type = row[args.type_field]
    try:
        type_code = int(raw_type)
    except (TypeError, ValueError):
        # One malformed cell must not abort a bulk import; route the row to
        # the excluded bucket instead.
        type_code = None

    if type_code is None:
        policy = fallback
        reported_code = raw_type
        type_label = "unknown"
    else:
        policy = TYPE_POLICY.get(type_code, fallback)
        reported_code = type_code
        type_label = type_code

    # First non-empty identifier wins: song id, canonical song id, then asset id.
    canonical_song_id = (
        row.get(args.song_field)
        or row.get(args.canonical_song_field)
        or row.get(args.asset_id_field)
        or "unknown_song"
    )
    version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_label}"

    return {
        "asset_id": row.get(args.asset_id_field),
        "canonical_song_id": canonical_song_id,
        "version_id": version_id,
        "asset_type_code": reported_code,
        "audio_role": policy["audio_role"],
        "recommended_train_type": policy["train_type"],
        "priority": policy["priority"],
        "bucket": policy["bucket"],
        "audio_path": row.get(args.path_field),
        "title": row.get(args.title_field),
        "artist": row.get(args.artist_field),
        "source_platform": row.get(args.platform_field) or "internal",
    }
65
66
def to_manifest_record(record: Dict, bucket: str) -> Dict:
    """Build the manifest entry for a normalized record.

    Args:
        record: A normalized asset record (as produced by ``normalize_row``).
        bucket: Target bucket; ``REFERENCE`` yields a reference entry, any
            other value yields a query-style entry.

    Returns:
        A new dict. Reference entries carry ``type="reference"``; all other
        entries use the record's ``recommended_train_type`` and also carry
        ``offset`` and ``segment_type``. ``duration`` is always written as 0.0.
    """
    is_reference = bucket == REFERENCE

    manifest = {
        "song_id": record["canonical_song_id"],
        "version_id": record["version_id"],
        "asset_type_code": record["asset_type_code"],
        "audio_role": record["audio_role"],
        "audio_path": record["audio_path"],
        "source_dataset": "internal_assets",
        "source_platform": record["source_platform"],
    }
    manifest["type"] = "reference" if is_reference else record["recommended_train_type"]
    manifest["duration"] = 0.0

    if not is_reference:
        manifest["offset"] = None
        manifest["segment_type"] = "external_query"
    return manifest
90
91
def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]:
    """Split normalized records into the four output groups.

    Args:
        rows: Normalized asset records, each with a ``bucket`` key.
        include_conditionals_as: Bucket that ``CONDITIONAL`` records are
            re-routed to; ``"skip"`` sends them to the excluded group.

    Returns:
        ``(references, queries, metadata_only, excluded)``. The first two hold
        manifest entries; the last two hold the normalized records unchanged.
    """
    references: List[Dict] = []
    queries: List[Dict] = []
    metadata_only: List[Dict] = []
    excluded: List[Dict] = []

    # Resolve once where conditional assets go for this run.
    conditional_target = EXCLUDED if include_conditionals_as == "skip" else include_conditionals_as

    for item in rows:
        destination = item["bucket"]
        if destination == CONDITIONAL:
            destination = conditional_target

        if destination == REFERENCE:
            references.append(to_manifest_record(item, REFERENCE))
            continue
        if destination == QUERY:
            queries.append(to_manifest_record(item, QUERY))
            continue
        if destination == METADATA:
            metadata_only.append(item)
            continue
        # Anything else, including unknown buckets, is kept for inspection.
        excluded.append(item)

    return references, queries, metadata_only, excluded
108
109
def main():
    """CLI entry point: read the asset CSV, route every row, write the JSON bundles.

    Writes ``references.json``, ``queries.json``, ``metadata_only.json``,
    ``excluded.json`` and ``summary.json`` into ``--output-dir`` (created if
    missing) and prints the summary to stdout.

    Encodings are explicit so the result does not depend on the machine's
    locale: the CSV is decoded with ``--input-encoding`` (default
    ``utf-8-sig``, which accepts UTF-8 with or without a BOM), and the JSON
    files are always written as UTF-8 because they are dumped with
    ``ensure_ascii=False``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_path")
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--asset-id-field", default="id")
    parser.add_argument("--song-field", default="song_id")
    parser.add_argument("--canonical-song-field", default="canonical_song_id")
    parser.add_argument("--version-field", default="version_id")
    parser.add_argument("--type-field", default="type")
    parser.add_argument("--path-field", default="audio_path")
    parser.add_argument("--title-field", default="title")
    parser.add_argument("--artist-field", default="artist")
    parser.add_argument("--platform-field", default="source_platform")
    parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
    parser.add_argument(
        "--input-encoding",
        default="utf-8-sig",
        help="Text encoding of the input CSV (default: utf-8-sig, i.e. UTF-8 with optional BOM).",
    )
    args = parser.parse_args()

    # newline="" is what the csv module expects so quoted line breaks survive.
    with open(args.csv_path, newline="", encoding=args.input_encoding) as f:
        rows = [normalize_row(row, args) for row in csv.DictReader(f)]

    references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as)

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    summary = {
        "input_rows": len(rows),
        "references": len(references),
        "queries": len(queries),
        "metadata_only": len(metadata_only),
        "excluded": len(excluded),
        "include_conditionals_as": args.include_conditionals_as,
    }
    outputs = {
        "references.json": references,
        "queries.json": queries,
        "metadata_only.json": metadata_only,
        "excluded.json": excluded,
        "summary.json": summary,
    }
    for name, payload in outputs.items():
        # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
        (out_dir / name).write_text(
            json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    print(json.dumps(summary, indent=2, ensure_ascii=False))
154
155
156 if __name__ == "__main__":
157 main()
...@@ -2,6 +2,34 @@ ...@@ -2,6 +2,34 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 将内部素材 type 策略落成可执行映射脚本
6
7 完成项:
8 - 新增 [acr-engine/scripts/internal_asset_type_mapper.py](../acr-engine/scripts/internal_asset_type_mapper.py)
9 - 支持从内部素材 CSV 自动分流到:
10 - `references.json`
11 - `queries.json`
12 - `metadata_only.json`
13 - `excluded.json`
14 - 支持 `--include-conditionals-as`,可选把伴奏类临时导出成 `query` 或 `reference`
15 - 在 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 增加脚本入口说明
16
17 验证结果:
18 - 使用 6 行样例 CSV 验证:
19 - `11/1 -> references`
20 - `7/18 -> queries`
21 - `3 -> metadata_only`
22 - `12 -> excluded`
23 - 摘要输出:
24 - `references = 2`
25 - `queries = 2`
26 - `metadata_only = 1`
27 - `excluded = 1`
28
29 结论:
30 - 现在内部素材 type 策略已经不只在文档里,而是可以直接作为批量清洗入口使用
31 - 后续如果要从数据库导出 CSV 再转 manifest,已经有第一版可执行桥接脚本
32
5 ### Stage: 为内部素材 type 枚举补齐训练参与策略文档 33 ### Stage: 为内部素材 type 枚举补齐训练参与策略文档
6 34
7 完成项: 35 完成项:
......
...@@ -478,5 +478,35 @@ query: ...@@ -478,5 +478,35 @@ query:
478 | 18 | `demo_audio` | `clean` / `augmented` | 478 | 18 | `demo_audio` | `clean` / `augmented` |
479 | 2/9/10/12 | `instrumental_variant` | 先不进主训练,或做 hard negative | 479 | 2/9/10/12 | `instrumental_variant` | 先不进主训练,或做 hard negative |
480 480
481 ## 12.6 现在仓库里已经有可执行映射脚本
482
483 脚本:
484 - [acr-engine/scripts/internal_asset_type_mapper.py](../acr-engine/scripts/internal_asset_type_mapper.py)
485
486 作用:
487 - 读取内部素材 CSV
488 - 按 `type` 枚举自动分流成:
489 - `references.json`
490 - `queries.json`
491 - `metadata_only.json`
492 - `excluded.json`
493
494 最短示例:
495
496 ```bash
497 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map
498 ```
499
500 如果你想临时把伴奏类也纳入导出,可用:
501
502 ```bash
503 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --include-conditionals-as query
504 ```
505
506 但默认仍建议:
507 - `--include-conditionals-as skip`
508
509 这样更符合当前主任务“先把原曲识别打稳,再逐步纳入伴奏版本”的策略。
510
481 ## Sources 511 ## Sources
482 - 当前代码事实来自 [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py), [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py), [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py), [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py), [acr-engine/train.py](../acr-engine/train.py) 512 - 当前代码事实来自 [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py), [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py), [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py), [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py), [acr-engine/train.py](../acr-engine/train.py)
......