Unify open dataset preparation behind adapter commands
Constraint: Personal-use experimentation needs a single entrypoint from local open-audio directories to train/eval manifests
Rejected: Separate manual manifest generation per dataset | Too error-prone and slows iterative training/evaluation
Confidence: high
Scope-risk: narrow
Directive: Point real FMA or MTG-Jamendo local download folders at prepare-local before expanding training runs
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma tmp/open_music_demo --output-root data/external_ingested/demo_via_adapter --eval-ratio 0.5 --query-duration 5.0
Not-tested: Full upstream corpus import and large-scale training
Showing 8 changed files with 149 additions and 0 deletions
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "fma_00000", | ||
| 4 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 5 | "duration": 15.0, | ||
| 6 | "type": "reference", | ||
| 7 | "source_dataset": "fma" | ||
| 8 | }, | ||
| 9 | { | ||
| 10 | "song_id": "fma_00001", | ||
| 11 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 12 | "duration": 15.0, | ||
| 13 | "type": "reference", | ||
| 14 | "source_dataset": "fma" | ||
| 15 | } | ||
| 16 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "fma_00000", | ||
| 4 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 6.394, | ||
| 8 | "segment_type": "external_query", | ||
| 9 | "source_dataset": "fma" | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "song_id": "fma_00000", | ||
| 13 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 14 | "duration": 15.0, | ||
| 15 | "type": "reference", | ||
| 16 | "source_dataset": "fma" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "fma_00001", | ||
| 20 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 21 | "duration": 15.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "fma" | ||
| 24 | } | ||
| 25 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "fma_00001", | ||
| 4 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 2.75, | ||
| 8 | "segment_type": "external_query", | ||
| 9 | "source_dataset": "fma" | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "song_id": "fma_00000", | ||
| 13 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 14 | "duration": 15.0, | ||
| 15 | "type": "reference", | ||
| 16 | "source_dataset": "fma" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "fma_00001", | ||
| 20 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 21 | "duration": 15.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "fma" | ||
| 24 | } | ||
| 25 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -7,6 +7,7 @@ from pathlib import Path | ... | @@ -7,6 +7,7 @@ from pathlib import Path |
| 7 | from typing import Dict, List | 7 | from typing import Dict, List |
| 8 | import argparse | 8 | import argparse |
| 9 | import json | 9 | import json |
| 10 | import subprocess | ||
| 10 | 11 | ||
| 11 | 12 | ||
| 12 | @dataclass | 13 | @dataclass |
| ... | @@ -42,6 +43,36 @@ class BaseAdapter: | ... | @@ -42,6 +43,36 @@ class BaseAdapter: |
| 42 | json.dump(manifest, f, indent=2, ensure_ascii=False) | 43 | json.dump(manifest, f, indent=2, ensure_ascii=False) |
| 43 | return manifest | 44 | return manifest |
| 44 | 45 | ||
| 46 | def prepare_local_audio( | ||
| 47 | self, | ||
| 48 | input_dir: Path, | ||
| 49 | output_root: Path, | ||
| 50 | eval_ratio: float = 0.2, | ||
| 51 | query_duration: float = 8.0, | ||
| 52 | seed: int = 42, | ||
| 53 | ) -> Dict: | ||
| 54 | output_root.mkdir(parents=True, exist_ok=True) | ||
| 55 | cmd = [ | ||
| 56 | "/usr/local/miniconda3/bin/python", | ||
| 57 | "src/data/manifest_tools.py", | ||
| 58 | "audio-dir-to-splits", | ||
| 59 | str(input_dir), | ||
| 60 | str(output_root), | ||
| 61 | "--source-dataset", | ||
| 62 | self.name, | ||
| 63 | "--eval-ratio", | ||
| 64 | str(eval_ratio), | ||
| 65 | "--query-duration", | ||
| 66 | str(query_duration), | ||
| 67 | "--seed", | ||
| 68 | str(seed), | ||
| 69 | ] | ||
| 70 | result = subprocess.check_output(cmd, text=True) | ||
| 71 | summary = json.loads(result) | ||
| 72 | summary["input_dir"] = str(input_dir) | ||
| 73 | summary["dataset"] = self.name | ||
| 74 | return summary | ||
| 75 | |||
| 45 | 76 | ||
| 46 | class FMAAdapter(BaseAdapter): | 77 | class FMAAdapter(BaseAdapter): |
| 47 | name = "fma" | 78 | name = "fma" |
| ... | @@ -156,6 +187,14 @@ def main(): | ... | @@ -156,6 +187,14 @@ def main(): |
| 156 | p = sub.add_parser("describe") | 187 | p = sub.add_parser("describe") |
| 157 | p.add_argument("dataset", choices=sorted(ADAPTERS)) | 188 | p.add_argument("dataset", choices=sorted(ADAPTERS)) |
| 158 | 189 | ||
| 190 | p = sub.add_parser("prepare-local") | ||
| 191 | p.add_argument("dataset", choices=sorted(ADAPTERS)) | ||
| 192 | p.add_argument("input_dir") | ||
| 193 | p.add_argument("--output-root", default="data/external_ingested") | ||
| 194 | p.add_argument("--eval-ratio", type=float, default=0.2) | ||
| 195 | p.add_argument("--query-duration", type=float, default=8.0) | ||
| 196 | p.add_argument("--seed", type=int, default=42) | ||
| 197 | |||
| 159 | args = parser.parse_args() | 198 | args = parser.parse_args() |
| 160 | if args.cmd == "registry": | 199 | if args.cmd == "registry": |
| 161 | path = write_registry(args.output) | 200 | path = write_registry(args.output) |
| ... | @@ -165,6 +204,16 @@ def main(): | ... | @@ -165,6 +204,16 @@ def main(): |
| 165 | print(json.dumps(ADAPTERS[args.dataset].init_layout(root), indent=2, ensure_ascii=False)) | 204 | print(json.dumps(ADAPTERS[args.dataset].init_layout(root), indent=2, ensure_ascii=False)) |
| 166 | elif args.cmd == "describe": | 205 | elif args.cmd == "describe": |
| 167 | print(json.dumps(ADAPTERS[args.dataset].describe(), indent=2, ensure_ascii=False)) | 206 | print(json.dumps(ADAPTERS[args.dataset].describe(), indent=2, ensure_ascii=False)) |
| 207 | elif args.cmd == "prepare-local": | ||
| 208 | root = Path(args.output_root) / args.dataset | ||
| 209 | summary = ADAPTERS[args.dataset].prepare_local_audio( | ||
| 210 | Path(args.input_dir), | ||
| 211 | root, | ||
| 212 | eval_ratio=args.eval_ratio, | ||
| 213 | query_duration=args.query_duration, | ||
| 214 | seed=args.seed, | ||
| 215 | ) | ||
| 216 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | ||
| 168 | 217 | ||
| 169 | 218 | ||
| 170 | if __name__ == "__main__": | 219 | if __name__ == "__main__": | ... | ... |
| ... | @@ -97,6 +97,29 @@ | ... | @@ -97,6 +97,29 @@ |
| 97 | - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力 | 97 | - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力 |
| 98 | - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录 | 98 | - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录 |
| 99 | 99 | ||
| 100 | ### Stage: adapter-level 本地开源目录接入 | ||
| 101 | |||
| 102 | 完成项: | ||
| 103 | - 扩展 `src/data/external_adapters.py` | ||
| 104 | - 新增 `prepare-local` 命令 | ||
| 105 | - 支持通过 adapter 入口直接把本地开源音频目录转成: | ||
| 106 | - `catalog.json` | ||
| 107 | - `train.json` | ||
| 108 | - `test.json` | ||
| 109 | - `val.json` | ||
| 110 | |||
| 111 | 验证结果: | ||
| 112 | - `python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py` 成功 | ||
| 113 | - `python src/data/external_adapters.py prepare-local fma tmp/open_music_demo --output-root data/external_ingested/demo_via_adapter --eval-ratio 0.5 --query-duration 5.0` 成功 | ||
| 114 | - 输出结果: | ||
| 115 | - `catalog=2` | ||
| 116 | - `train_queries=1` | ||
| 117 | - `test_queries=1` | ||
| 118 | |||
| 119 | 结论: | ||
| 120 | - 现在接入真实 FMA / MTG-Jamendo 目录时,不需要再手动拼 manifests | ||
| 121 | - adapter 已经能作为统一入口管理开放数据集的训练/评估切分 | ||
| 122 | |||
| 100 | ## 2026-06-02 | 123 | ## 2026-06-02 |
| 101 | 124 | ||
| 102 | ### Stage: 文档补全 + ACR 最小可运行链路 | 125 | ### Stage: 文档补全 + ACR 最小可运行链路 | ... | ... |
| ... | @@ -18,6 +18,12 @@ | ... | @@ -18,6 +18,12 @@ |
| 18 | - CCMusic / ModelScope:优先当补充评估或探索来源 | 18 | - CCMusic / ModelScope:优先当补充评估或探索来源 |
| 19 | - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞 | 19 | - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞 |
| 20 | 20 | ||
| 21 | 建议接入顺序: | ||
| 22 | 1. 下载/准备 FMA 或 MTG-Jamendo 的本地音频目录 | ||
| 23 | 2. 运行 `external_adapters.py prepare-local ...` | ||
| 24 | 3. 生成 `catalog/train/test/val` manifests | ||
| 25 | 4. 将 `train.json` 用于训练,将 `test.json` 用于固定评估 | ||
| 26 | |||
| 21 | --- | 27 | --- |
| 22 | 28 | ||
| 23 | ## 1. 来源分层图 | 29 | ## 1. 来源分层图 | ... | ... |
| ... | @@ -142,6 +142,10 @@ flowchart LR | ... | @@ -142,6 +142,10 @@ flowchart LR |
| 142 | - 至少固定一部分曲目只进 `test.json`,不要同时参与训练 | 142 | - 至少固定一部分曲目只进 `test.json`,不要同时参与训练 |
| 143 | - 小数据集也要保证至少 1 个 train query + 1 个 test query | 143 | - 小数据集也要保证至少 1 个 train query + 1 个 test query |
| 144 | 144 | ||
| 145 | CLI 入口: | ||
| 146 | - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits` | ||
| 147 | - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>` | ||
| 148 | |||
| 145 | ## 5. 文字说明 | 149 | ## 5. 文字说明 |
| 146 | 150 | ||
| 147 | ### 5.1 为什么必须分离 catalog 和 query | 151 | ### 5.1 为什么必须分离 catalog 和 query | ... | ... |
-
Please register or sign in to post a comment