Commit 167aa6e5 167aa6e520482deaf6379353e7fdf0732dea9de6 by cnb.bofCdSsphPA

Enable open music datasets to feed train and eval splits

Constraint: Personal-use workflow needs real train/eval manifests rather than bootstrap-only placeholders
Rejected: Keep external datasets as catalog skeletons only | Does not satisfy training/evaluation reuse requirement
Confidence: high
Scope-risk: narrow
Directive: Wire real FMA or MTG-Jamendo local download directories into this ingestion path before larger-scale training
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/manifest_tools.py audio-dir-to-splits tmp/open_music_demo data/external_ingested/demo_fma_like --source-dataset demo_fma_like --eval-ratio 0.5 --query-duration 5.0
Not-tested: Full download/import of upstream FMA or MTG-Jamendo corpora
1 parent d665b1fd
1 [
2 {
3 "song_id": "demo_fma_like_00000",
4 "audio_path": "open_music_demo/song_0000.wav",
5 "duration": 15.0,
6 "type": "reference",
7 "source_dataset": "demo_fma_like"
8 },
9 {
10 "song_id": "demo_fma_like_00001",
11 "audio_path": "open_music_demo/song_0001.wav",
12 "duration": 15.0,
13 "type": "reference",
14 "source_dataset": "demo_fma_like"
15 }
16 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "demo_fma_like_00000",
4 "audio_path": "open_music_demo/song_0000.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 6.394,
8 "segment_type": "external_query",
9 "source_dataset": "demo_fma_like"
10 },
11 {
12 "song_id": "demo_fma_like_00000",
13 "audio_path": "open_music_demo/song_0000.wav",
14 "duration": 15.0,
15 "type": "reference",
16 "source_dataset": "demo_fma_like"
17 },
18 {
19 "song_id": "demo_fma_like_00001",
20 "audio_path": "open_music_demo/song_0001.wav",
21 "duration": 15.0,
22 "type": "reference",
23 "source_dataset": "demo_fma_like"
24 }
25 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "demo_fma_like_00001",
4 "audio_path": "open_music_demo/song_0001.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 2.75,
8 "segment_type": "external_query",
9 "source_dataset": "demo_fma_like"
10 },
11 {
12 "song_id": "demo_fma_like_00000",
13 "audio_path": "open_music_demo/song_0000.wav",
14 "duration": 15.0,
15 "type": "reference",
16 "source_dataset": "demo_fma_like"
17 },
18 {
19 "song_id": "demo_fma_like_00001",
20 "audio_path": "open_music_demo/song_0001.wav",
21 "duration": 15.0,
22 "type": "reference",
23 "source_dataset": "demo_fma_like"
24 }
25 ]
...\ No newline at end of file ...\ No newline at end of file
1 []
...\ No newline at end of file ...\ No newline at end of file
...@@ -5,8 +5,10 @@ from __future__ import annotations ...@@ -5,8 +5,10 @@ from __future__ import annotations
5 import argparse 5 import argparse
6 import csv 6 import csv
7 import json 7 import json
8 import random
8 from pathlib import Path 9 from pathlib import Path
9 from typing import List, Dict 10 from typing import List, Dict
11 import soundfile as sf
10 12
11 13
12 def write_catalog(records: List[Dict], output_path: Path): 14 def write_catalog(records: List[Dict], output_path: Path):
...@@ -33,6 +35,77 @@ def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_p ...@@ -33,6 +35,77 @@ def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_p
33 return len(records) 35 return len(records)
34 36
35 37
def build_train_eval_from_audio_dir(
    audio_dir: Path,
    output_dir: Path,
    source_dataset: str,
    exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"),
    eval_ratio: float = 0.2,
    query_duration: float = 8.0,
    seed: int = 42,
) -> Dict:
    """Build catalog/train/test/val manifests from a directory of audio files.

    Every audio file found under ``audio_dir`` (recursively) becomes one
    ``reference`` record. Each file that is at least ``query_duration``
    seconds long additionally yields one ``clean`` query record with a
    random offset, assigned to the test split with probability
    ``eval_ratio`` and to the train split otherwise.

    Four files are written through ``write_catalog`` into
    ``output_dir / "manifests"``: ``catalog.json`` (references only),
    ``train.json`` (train queries followed by all references),
    ``test.json`` (test queries followed by all references) and an empty
    ``val.json``.

    Args:
        audio_dir: Directory scanned recursively for audio files.
        output_dir: Directory under which ``manifests/`` is created.
        source_dataset: Dataset tag; used as the ``song_id`` prefix and as
            the ``source_dataset`` field of every record.
        exts: Lower-case file suffixes (including the dot) to accept.
        eval_ratio: Probability that a query lands in the test split.
        query_duration: Length in seconds of each generated query segment.
        seed: Seed for the private random generator, so repeated runs over
            the same files produce the same offsets and split.

    Returns:
        A summary dict with the keys ``catalog``, ``train_queries``,
        ``test_queries`` (record counts) and ``output_dir`` (the manifests
        directory as a string).
    """
    # Imported locally so this block does not rely on a module-level import.
    import logging

    logger = logging.getLogger(__name__)
    rng = random.Random(seed)

    # Sort for a stable song_id numbering; skip directories whose name
    # merely happens to end in an audio suffix.
    files = [
        p
        for p in sorted(audio_dir.rglob("*"))
        if p.is_file() and p.suffix.lower() in exts
    ]

    manifests_dir = output_dir / "manifests"
    manifests_dir.mkdir(parents=True, exist_ok=True)

    refs: List[Dict] = []
    train: List[Dict] = []
    test: List[Dict] = []

    for idx, path in enumerate(files):
        # Store paths relative to the output directory's parent when the
        # audio lives beneath it, otherwise relative to the audio
        # directory's parent (so the audio directory name is kept).
        if output_dir.parent in path.parents:
            base = output_dir.parent
        else:
            base = audio_dir.parent
        rel = str(path.relative_to(base))
        song_id = f"{source_dataset}_{idx:05d}"

        try:
            duration = float(sf.info(str(path)).duration)
        except Exception as exc:
            # Best effort: keep the track as a reference with unknown
            # duration, but make the failure visible instead of hiding it.
            logger.warning("Could not read audio info for %s: %s", path, exc)
            duration = 0.0

        refs.append(
            {
                "song_id": song_id,
                "audio_path": rel,
                "duration": duration,
                "type": "reference",
                "source_dataset": source_dataset,
            }
        )

        if duration < query_duration:
            # Too short (or unreadable) to cut a full query segment from.
            continue

        max_offset = max(0.0, duration - query_duration)
        offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0
        query = {
            "song_id": song_id,
            "audio_path": rel,
            "duration": query_duration,
            "type": "clean",
            "offset": round(offset, 3),
            "segment_type": "external_query",
            "source_dataset": source_dataset,
        }
        if rng.random() < eval_ratio:
            test.append(query)
        else:
            train.append(query)

    # Small-dataset guard: when the random draw left one split empty, move
    # one query across. This is only possible with at least two queries;
    # counting files instead would shuffle a lone query back and forth or
    # strip the only train query.
    if len(train) + len(test) >= 2:
        if not train:
            train.append(test.pop())
        elif not test:
            test.append(train.pop())

    write_catalog(refs, manifests_dir / "catalog.json")
    write_catalog(train + refs, manifests_dir / "train.json")
    write_catalog(test + refs, manifests_dir / "test.json")
    write_catalog([], manifests_dir / "val.json")
    return {
        "catalog": len(refs),
        "train_queries": len(train),
        "test_queries": len(test),
        "output_dir": str(manifests_dir),
    }
107
108
36 def main(): 109 def main():
37 parser = argparse.ArgumentParser() 110 parser = argparse.ArgumentParser()
38 sub = parser.add_subparsers(dest="cmd", required=True) 111 sub = parser.add_subparsers(dest="cmd", required=True)
...@@ -43,10 +116,28 @@ def main(): ...@@ -43,10 +116,28 @@ def main():
43 p.add_argument("--path-field", default="audio_path") 116 p.add_argument("--path-field", default="audio_path")
44 p.add_argument("--id-field", default="song_id") 117 p.add_argument("--id-field", default="song_id")
45 118
119 p = sub.add_parser("audio-dir-to-splits")
120 p.add_argument("audio_dir")
121 p.add_argument("output_dir")
122 p.add_argument("--source-dataset", required=True)
123 p.add_argument("--eval-ratio", type=float, default=0.2)
124 p.add_argument("--query-duration", type=float, default=8.0)
125 p.add_argument("--seed", type=int, default=42)
126
46 args = parser.parse_args() 127 args = parser.parse_args()
47 if args.cmd == "csv-to-catalog": 128 if args.cmd == "csv-to-catalog":
48 count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field) 129 count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field)
49 print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False)) 130 print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False))
131 elif args.cmd == "audio-dir-to-splits":
132 summary = build_train_eval_from_audio_dir(
133 Path(args.audio_dir),
134 Path(args.output_dir),
135 source_dataset=args.source_dataset,
136 eval_ratio=args.eval_ratio,
137 query_duration=args.query_duration,
138 seed=args.seed,
139 )
140 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
50 141
51 142
52 if __name__ == "__main__": 143 if __name__ == "__main__":
......
...@@ -72,6 +72,31 @@ ...@@ -72,6 +72,31 @@
72 - `ecapa` 权重略升、`chroma` 略降能恢复 `humming_like`,同时保持 `confused` 72 - `ecapa` 权重略升、`chroma` 略降能恢复 `humming_like`,同时保持 `confused`
73 - 下一阶段应继续把外部开源数据集真正接成 train/eval manifests,而不是只停在 bootstrap 73 - 下一阶段应继续把外部开源数据集真正接成 train/eval manifests,而不是只停在 bootstrap
74 74
75 ### Stage: 开源数据集 ingestion(train/eval manifests)
76
77 完成项:
78 - 扩展 `src/data/manifest_tools.py`
79 - 新增 `audio-dir-to-splits` CLI
80 - 支持从本地开放音频目录自动生成:
81 - `catalog.json`
82 - `train.json`
83 - `test.json`
84 - `val.json`
85 - 新增小数据集保护,确保个人使用场景下也能同时有 train/test queries
86
87 验证结果:
88 - `python -m py_compile src/data/manifest_tools.py` 成功
89 - 使用本地 demo 音频目录成功生成真实 manifests
90 - 修正后小样本结果:
91 - `catalog=2`
92 - `train_queries=1`
93 - `test_queries=1`
94
95 结论:
96 - 项目现在不再只停留在 external bootstrap
97 - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力
98 - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录
99
75 ## 2026-06-02 100 ## 2026-06-02
76 101
77 ### Stage: 文档补全 + ACR 最小可运行链路 102 ### Stage: 文档补全 + ACR 最小可运行链路
......
...@@ -4,7 +4,8 @@ ...@@ -4,7 +4,8 @@
4 4
5 ## 一页结论 5 ## 一页结论
6 6
7 - 外部数据集接入的第一原则不是“能下载”,而是“**能否合法商用**” 7 - 当前优先目标改为:**个人使用下充分利用开源数据集**
8 - 外部数据集接入现在不仅要能 bootstrap,还要能真实切成 train/eval manifests
8 - 当前建议优先级: 9 - 当前建议优先级:
9 1. FMA 10 1. FMA
10 2. MTG-Jamendo 11 2. MTG-Jamendo
...@@ -12,6 +13,11 @@ ...@@ -12,6 +13,11 @@
12 4. ModelScope music datasets(白名单后) 13 4. ModelScope music datasets(白名单后)
13 - ModelScope 与 CCMusic 当前都不能默认直接进入商用训练 14 - ModelScope 与 CCMusic 当前都不能默认直接进入商用训练
14 15
16 对个人使用的直接建议:
17 - FMA / MTG-Jamendo:优先转成训练与评估资产
18 - CCMusic / ModelScope:优先当补充评估或探索来源
19 - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞
20
15 --- 21 ---
16 22
17 ## 1. 来源分层图 23 ## 1. 来源分层图
......
...@@ -117,6 +117,31 @@ flowchart LR ...@@ -117,6 +117,31 @@ flowchart LR
117 - 对个人使用场景,推荐把一部分开源数据集固定成 **fusion tuning eval set** 117 - 对个人使用场景,推荐把一部分开源数据集固定成 **fusion tuning eval set**
118 - 这样训练、检索、调参可以分离,而不是每次都重训 118 - 这样训练、检索、调参可以分离,而不是每次都重训
119 119
120 ---
121
122 ## 4.3 开源数据集 train/eval 切分图
123
124 ```mermaid
125 flowchart LR
126 A[Open Audio Dir] --> B[audio-dir-to-splits]
127 B --> C[catalog.json]
128 B --> D[train.json]
129 B --> E[test.json]
130 B --> F[val.json]
131 ```
132
133 | 产物 | 用途 | 说明 |
134 |---|---|---|
135 | `catalog.json` | 建索引 | 所有 reference 曲目 |
136 | `train.json` | 训练查询 | query + references |
137 | `test.json` | 评估查询 | query + references |
138 | `val.json` | 预留验证集 | 当前可为空 |
139
140 推荐法则(个人使用):
141 - FMA / MTG-Jamendo 可优先用于真实 train/eval baseline
142 - 至少固定一部分曲目只进 `test.json`,不要同时参与训练
143 - 小数据集也要保证至少 1 个 train query + 1 个 test query
144
120 ## 5. 文字说明 145 ## 5. 文字说明
121 146
122 ### 5.1 为什么必须分离 catalog 和 query 147 ### 5.1 为什么必须分离 catalog 和 query
......