Commit fb1d00b6 fb1d00b69c8f8285193b0160a582d922bfa342f3 by cnb.bofCdSsphPA

Unify open dataset preparation behind adapter commands

Constraint: Personal-use experimentation needs a single entrypoint from local open-audio directories to train/eval manifests
Rejected: Separate manual manifest generation per dataset | Too error-prone and slows iterative training/evaluation
Confidence: high
Scope-risk: narrow
Directive: Point real FMA or MTG-Jamendo local download folders at prepare-local before expanding training runs
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma tmp/open_music_demo --output-root data/external_ingested/demo_via_adapter --eval-ratio 0.5 --query-duration 5.0
Not-tested: Full upstream corpus import and large-scale training
1 parent 167aa6e5
1 [
2 {
3 "song_id": "fma_00000",
4 "audio_path": "open_music_demo/song_0000.wav",
5 "duration": 15.0,
6 "type": "reference",
7 "source_dataset": "fma"
8 },
9 {
10 "song_id": "fma_00001",
11 "audio_path": "open_music_demo/song_0001.wav",
12 "duration": 15.0,
13 "type": "reference",
14 "source_dataset": "fma"
15 }
16 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "fma_00000",
4 "audio_path": "open_music_demo/song_0000.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 6.394,
8 "segment_type": "external_query",
9 "source_dataset": "fma"
10 },
11 {
12 "song_id": "fma_00000",
13 "audio_path": "open_music_demo/song_0000.wav",
14 "duration": 15.0,
15 "type": "reference",
16 "source_dataset": "fma"
17 },
18 {
19 "song_id": "fma_00001",
20 "audio_path": "open_music_demo/song_0001.wav",
21 "duration": 15.0,
22 "type": "reference",
23 "source_dataset": "fma"
24 }
25 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "fma_00001",
4 "audio_path": "open_music_demo/song_0001.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 2.75,
8 "segment_type": "external_query",
9 "source_dataset": "fma"
10 },
11 {
12 "song_id": "fma_00000",
13 "audio_path": "open_music_demo/song_0000.wav",
14 "duration": 15.0,
15 "type": "reference",
16 "source_dataset": "fma"
17 },
18 {
19 "song_id": "fma_00001",
20 "audio_path": "open_music_demo/song_0001.wav",
21 "duration": 15.0,
22 "type": "reference",
23 "source_dataset": "fma"
24 }
25 ]
...\ No newline at end of file ...\ No newline at end of file
1 []
...\ No newline at end of file ...\ No newline at end of file
...@@ -7,6 +7,7 @@ from pathlib import Path ...@@ -7,6 +7,7 @@ from pathlib import Path
7 from typing import Dict, List 7 from typing import Dict, List
8 import argparse 8 import argparse
9 import json 9 import json
10 import subprocess
10 11
11 12
12 @dataclass 13 @dataclass
...@@ -42,6 +43,36 @@ class BaseAdapter: ...@@ -42,6 +43,36 @@ class BaseAdapter:
42 json.dump(manifest, f, indent=2, ensure_ascii=False) 43 json.dump(manifest, f, indent=2, ensure_ascii=False)
43 return manifest 44 return manifest
44 45
def prepare_local_audio(
    self,
    input_dir: Path,
    output_root: Path,
    eval_ratio: float = 0.2,
    query_duration: float = 8.0,
    seed: int = 42,
) -> Dict:
    """Split a local audio directory into manifests via ``manifest_tools.py``.

    Runs the ``audio-dir-to-splits`` sub-command of ``manifest_tools.py`` in a
    child process, tagging the run with this adapter's ``name`` as the source
    dataset, and returns the JSON summary the tool prints on stdout.

    Args:
        input_dir: Directory containing the local audio files.
        output_root: Directory passed to the tool as its output location;
            created (with parents) if it does not exist.
        eval_ratio: Forwarded to the tool as ``--eval-ratio``.
        query_duration: Forwarded to the tool as ``--query-duration``.
        seed: Forwarded to the tool as ``--seed``.

    Returns:
        The tool's JSON summary, with ``input_dir`` and ``dataset`` keys added.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero status.
        json.JSONDecodeError: If the tool's stdout is not valid JSON.
    """
    # Local import: the file's top-level import block is not modified here.
    import sys

    output_root.mkdir(parents=True, exist_ok=True)

    # Prefer the copy of manifest_tools.py that sits next to this module so the
    # command works from any working directory; fall back to the repo-relative
    # path used previously if that sibling cannot be found.
    sibling = Path(__file__).resolve().with_name("manifest_tools.py")
    script = sibling if sibling.is_file() else Path("src/data/manifest_tools.py")

    cmd = [
        # Use the interpreter running this process instead of a machine-specific
        # absolute path, so the child sees the same environment and packages.
        sys.executable or "python",
        str(script),
        "audio-dir-to-splits",
        str(input_dir),
        str(output_root),
        "--source-dataset",
        self.name,
        "--eval-ratio",
        str(eval_ratio),
        "--query-duration",
        str(query_duration),
        "--seed",
        str(seed),
    ]
    result = subprocess.check_output(cmd, text=True)
    summary = json.loads(result)
    summary["input_dir"] = str(input_dir)
    summary["dataset"] = self.name
    return summary

75
45 76
46 class FMAAdapter(BaseAdapter): 77 class FMAAdapter(BaseAdapter):
47 name = "fma" 78 name = "fma"
...@@ -156,6 +187,14 @@ def main(): ...@@ -156,6 +187,14 @@ def main():
156 p = sub.add_parser("describe") 187 p = sub.add_parser("describe")
157 p.add_argument("dataset", choices=sorted(ADAPTERS)) 188 p.add_argument("dataset", choices=sorted(ADAPTERS))
158 189
190 p = sub.add_parser("prepare-local")
191 p.add_argument("dataset", choices=sorted(ADAPTERS))
192 p.add_argument("input_dir")
193 p.add_argument("--output-root", default="data/external_ingested")
194 p.add_argument("--eval-ratio", type=float, default=0.2)
195 p.add_argument("--query-duration", type=float, default=8.0)
196 p.add_argument("--seed", type=int, default=42)
197
159 args = parser.parse_args() 198 args = parser.parse_args()
160 if args.cmd == "registry": 199 if args.cmd == "registry":
161 path = write_registry(args.output) 200 path = write_registry(args.output)
...@@ -165,6 +204,16 @@ def main(): ...@@ -165,6 +204,16 @@ def main():
165 print(json.dumps(ADAPTERS[args.dataset].init_layout(root), indent=2, ensure_ascii=False)) 204 print(json.dumps(ADAPTERS[args.dataset].init_layout(root), indent=2, ensure_ascii=False))
166 elif args.cmd == "describe": 205 elif args.cmd == "describe":
167 print(json.dumps(ADAPTERS[args.dataset].describe(), indent=2, ensure_ascii=False)) 206 print(json.dumps(ADAPTERS[args.dataset].describe(), indent=2, ensure_ascii=False))
207 elif args.cmd == "prepare-local":
208 root = Path(args.output_root) / args.dataset
209 summary = ADAPTERS[args.dataset].prepare_local_audio(
210 Path(args.input_dir),
211 root,
212 eval_ratio=args.eval_ratio,
213 query_duration=args.query_duration,
214 seed=args.seed,
215 )
216 print(json.dumps(summary, indent=2, ensure_ascii=False))
168 217
169 218
170 if __name__ == "__main__": 219 if __name__ == "__main__":
......
...@@ -97,6 +97,29 @@ ...@@ -97,6 +97,29 @@
97 - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力 97 - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力
98 - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录 98 - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录
99 99
100 ### Stage: adapter-level 本地开源目录接入
101
102 完成项:
103 - 扩展 `src/data/external_adapters.py`
104 - 新增 `prepare-local` 命令
105 - 支持通过 adapter 入口直接把本地开源音频目录转成:
106 - `catalog.json`
107 - `train.json`
108 - `test.json`
109 - `val.json`
110
111 验证结果:
112 - `python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py` 成功
113 - `python src/data/external_adapters.py prepare-local fma tmp/open_music_demo --output-root data/external_ingested/demo_via_adapter --eval-ratio 0.5 --query-duration 5.0` 成功
114 - 输出结果:
115 - `catalog=2`
116 - `train_queries=1`
117 - `test_queries=1`
118
119 结论:
120 - 现在接入真实 FMA / MTG-Jamendo 目录时,不需要再手动拼 manifests
121 - adapter 已经能作为统一入口管理开放数据集的训练/评估切分
122
100 ## 2026-06-02 123 ## 2026-06-02
101 124
102 ### Stage: 文档补全 + ACR 最小可运行链路 125 ### Stage: 文档补全 + ACR 最小可运行链路
......
...@@ -18,6 +18,12 @@ ...@@ -18,6 +18,12 @@
18 - CCMusic / ModelScope:优先当补充评估或探索来源 18 - CCMusic / ModelScope:优先当补充评估或探索来源
19 - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞 19 - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞
20 20
21 建议接入顺序:
22 1. 下载/准备 FMA 或 MTG-Jamendo 的本地音频目录
23 2. 运行 `external_adapters.py prepare-local ...`
24 3. 生成 `catalog/train/test/val` manifests
25 4. 将 `train.json` 用于训练,将 `test.json` 用于固定评估
26
21 --- 27 ---
22 28
23 ## 1. 来源分层图 29 ## 1. 来源分层图
......
...@@ -142,6 +142,10 @@ flowchart LR ...@@ -142,6 +142,10 @@ flowchart LR
142 - 至少固定一部分曲目只进 `test.json`,不要同时参与训练 142 - 至少固定一部分曲目只进 `test.json`,不要同时参与训练
143 - 小数据集也要保证至少 1 个 train query + 1 个 test query 143 - 小数据集也要保证至少 1 个 train query + 1 个 test query
144 144
145 CLI 入口:
146 - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits`
147 - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>`
148
145 ## 5. 文字说明 149 ## 5. 文字说明
146 150
147 ### 5.1 为什么必须分离 catalog 和 query 151 ### 5.1 为什么必须分离 catalog 和 query
......