Enable open music datasets to feed train and eval splits
Constraint: Personal-use workflow needs real train/eval manifests rather than bootstrap-only placeholders
Rejected: Keep external datasets as catalog skeletons only | Does not satisfy training/evaluation reuse requirement
Confidence: high
Scope-risk: narrow
Directive: Wire real FMA or MTG-Jamendo local download directories into this ingestion path before larger-scale training
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/manifest_tools.py audio-dir-to-splits tmp/open_music_demo data/external_ingested/demo_fma_like --source-dataset demo_fma_like --eval-ratio 0.5 --query-duration 5.0
Not-tested: Full download/import of upstream FMA or MTG-Jamendo corpora
Showing
8 changed files
with
215 additions
and
1 deletion
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "demo_fma_like_00000", | ||
| 4 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 5 | "duration": 15.0, | ||
| 6 | "type": "reference", | ||
| 7 | "source_dataset": "demo_fma_like" | ||
| 8 | }, | ||
| 9 | { | ||
| 10 | "song_id": "demo_fma_like_00001", | ||
| 11 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 12 | "duration": 15.0, | ||
| 13 | "type": "reference", | ||
| 14 | "source_dataset": "demo_fma_like" | ||
| 15 | } | ||
| 16 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "demo_fma_like_00000", | ||
| 4 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 6.394, | ||
| 8 | "segment_type": "external_query", | ||
| 9 | "source_dataset": "demo_fma_like" | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "song_id": "demo_fma_like_00000", | ||
| 13 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 14 | "duration": 15.0, | ||
| 15 | "type": "reference", | ||
| 16 | "source_dataset": "demo_fma_like" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "demo_fma_like_00001", | ||
| 20 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 21 | "duration": 15.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "demo_fma_like" | ||
| 24 | } | ||
| 25 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "demo_fma_like_00001", | ||
| 4 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 2.75, | ||
| 8 | "segment_type": "external_query", | ||
| 9 | "source_dataset": "demo_fma_like" | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "song_id": "demo_fma_like_00000", | ||
| 13 | "audio_path": "open_music_demo/song_0000.wav", | ||
| 14 | "duration": 15.0, | ||
| 15 | "type": "reference", | ||
| 16 | "source_dataset": "demo_fma_like" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "demo_fma_like_00001", | ||
| 20 | "audio_path": "open_music_demo/song_0001.wav", | ||
| 21 | "duration": 15.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "demo_fma_like" | ||
| 24 | } | ||
| 25 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -5,8 +5,10 @@ from __future__ import annotations | ... | @@ -5,8 +5,10 @@ from __future__ import annotations |
| 5 | import argparse | 5 | import argparse |
| 6 | import csv | 6 | import csv |
| 7 | import json | 7 | import json |
| 8 | import random | ||
| 8 | from pathlib import Path | 9 | from pathlib import Path |
| 9 | from typing import List, Dict | 10 | from typing import List, Dict |
| 11 | import soundfile as sf | ||
| 10 | 12 | ||
| 11 | 13 | ||
| 12 | def write_catalog(records: List[Dict], output_path: Path): | 14 | def write_catalog(records: List[Dict], output_path: Path): |
| ... | @@ -33,6 +35,77 @@ def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_p | ... | @@ -33,6 +35,77 @@ def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_p |
| 33 | return len(records) | 35 | return len(records) |
| 34 | 36 | ||
| 35 | 37 | ||
| 38 | def build_train_eval_from_audio_dir( | ||
| 39 | audio_dir: Path, | ||
| 40 | output_dir: Path, | ||
| 41 | source_dataset: str, | ||
| 42 | exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"), | ||
| 43 | eval_ratio: float = 0.2, | ||
| 44 | query_duration: float = 8.0, | ||
| 45 | seed: int = 42, | ||
| 46 | ): | ||
| 47 | rng = random.Random(seed) | ||
| 48 | files = [p for p in sorted(audio_dir.rglob("*")) if p.suffix.lower() in exts] | ||
| 49 | output_dir.mkdir(parents=True, exist_ok=True) | ||
| 50 | manifests_dir = output_dir / "manifests" | ||
| 51 | manifests_dir.mkdir(parents=True, exist_ok=True) | ||
| 52 | |||
| 53 | refs = [] | ||
| 54 | train = [] | ||
| 55 | test = [] | ||
| 56 | |||
| 57 | for idx, path in enumerate(files): | ||
| 58 | rel = path.relative_to(output_dir.parent if output_dir.parent in path.parents else audio_dir.parent) | ||
| 59 | song_id = f"{source_dataset}_{idx:05d}" | ||
| 60 | try: | ||
| 61 | info = sf.info(str(path)) | ||
| 62 | duration = float(info.duration) | ||
| 63 | except Exception: | ||
| 64 | duration = 0.0 | ||
| 65 | |||
| 66 | ref = { | ||
| 67 | "song_id": song_id, | ||
| 68 | "audio_path": str(rel), | ||
| 69 | "duration": duration, | ||
| 70 | "type": "reference", | ||
| 71 | "source_dataset": source_dataset, | ||
| 72 | } | ||
| 73 | refs.append(ref) | ||
| 74 | |||
| 75 | if duration >= query_duration: | ||
| 76 | max_offset = max(0.0, duration - query_duration) | ||
| 77 | offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0 | ||
| 78 | query = { | ||
| 79 | "song_id": song_id, | ||
| 80 | "audio_path": str(rel), | ||
| 81 | "duration": query_duration, | ||
| 82 | "type": "clean", | ||
| 83 | "offset": round(offset, 3), | ||
| 84 | "segment_type": "external_query", | ||
| 85 | "source_dataset": source_dataset, | ||
| 86 | } | ||
| 87 | if rng.random() < eval_ratio: | ||
| 88 | test.append(query) | ||
| 89 | else: | ||
| 90 | train.append(query) | ||
| 91 | |||
| 92 | if len(files) >= 2 and not train and test: | ||
| 93 | train.append(test.pop()) | ||
| 94 | if len(files) >= 2 and not test and train: | ||
| 95 | test.append(train.pop()) | ||
| 96 | |||
| 97 | write_catalog(refs, manifests_dir / "catalog.json") | ||
| 98 | write_catalog(train + refs, manifests_dir / "train.json") | ||
| 99 | write_catalog(test + refs, manifests_dir / "test.json") | ||
| 100 | write_catalog([], manifests_dir / "val.json") | ||
| 101 | return { | ||
| 102 | "catalog": len(refs), | ||
| 103 | "train_queries": len(train), | ||
| 104 | "test_queries": len(test), | ||
| 105 | "output_dir": str(manifests_dir), | ||
| 106 | } | ||
| 107 | |||
| 108 | |||
| 36 | def main(): | 109 | def main(): |
| 37 | parser = argparse.ArgumentParser() | 110 | parser = argparse.ArgumentParser() |
| 38 | sub = parser.add_subparsers(dest="cmd", required=True) | 111 | sub = parser.add_subparsers(dest="cmd", required=True) |
| ... | @@ -43,10 +116,28 @@ def main(): | ... | @@ -43,10 +116,28 @@ def main(): |
| 43 | p.add_argument("--path-field", default="audio_path") | 116 | p.add_argument("--path-field", default="audio_path") |
| 44 | p.add_argument("--id-field", default="song_id") | 117 | p.add_argument("--id-field", default="song_id") |
| 45 | 118 | ||
| 119 | p = sub.add_parser("audio-dir-to-splits") | ||
| 120 | p.add_argument("audio_dir") | ||
| 121 | p.add_argument("output_dir") | ||
| 122 | p.add_argument("--source-dataset", required=True) | ||
| 123 | p.add_argument("--eval-ratio", type=float, default=0.2) | ||
| 124 | p.add_argument("--query-duration", type=float, default=8.0) | ||
| 125 | p.add_argument("--seed", type=int, default=42) | ||
| 126 | |||
| 46 | args = parser.parse_args() | 127 | args = parser.parse_args() |
| 47 | if args.cmd == "csv-to-catalog": | 128 | if args.cmd == "csv-to-catalog": |
| 48 | count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field) | 129 | count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field) |
| 49 | print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False)) | 130 | print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False)) |
| 131 | elif args.cmd == "audio-dir-to-splits": | ||
| 132 | summary = build_train_eval_from_audio_dir( | ||
| 133 | Path(args.audio_dir), | ||
| 134 | Path(args.output_dir), | ||
| 135 | source_dataset=args.source_dataset, | ||
| 136 | eval_ratio=args.eval_ratio, | ||
| 137 | query_duration=args.query_duration, | ||
| 138 | seed=args.seed, | ||
| 139 | ) | ||
| 140 | print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) | ||
| 50 | 141 | ||
| 51 | 142 | ||
| 52 | if __name__ == "__main__": | 143 | if __name__ == "__main__": | ... | ... |
| ... | @@ -72,6 +72,31 @@ | ... | @@ -72,6 +72,31 @@ |
| 72 | - `ecapa` 权重略升、`chroma` 略降能恢复 `humming_like`,同时保持 `confused` | 72 | - `ecapa` 权重略升、`chroma` 略降能恢复 `humming_like`,同时保持 `confused` |
| 73 | - 下一阶段应继续把外部开源数据集真正接成 train/eval manifests,而不是只停在 bootstrap | 73 | - 下一阶段应继续把外部开源数据集真正接成 train/eval manifests,而不是只停在 bootstrap |
| 74 | 74 | ||
| 75 | ### Stage: 开源数据集 ingestion(train/eval manifests) | ||
| 76 | |||
| 77 | 完成项: | ||
| 78 | - 扩展 `src/data/manifest_tools.py` | ||
| 79 | - 新增 `audio-dir-to-splits` CLI | ||
| 80 | - 支持从本地开放音频目录自动生成: | ||
| 81 | - `catalog.json` | ||
| 82 | - `train.json` | ||
| 83 | - `test.json` | ||
| 84 | - `val.json` | ||
| 85 | - 新增小数据集保护,确保个人使用场景下也能同时有 train/test queries | ||
| 86 | |||
| 87 | 验证结果: | ||
| 88 | - `python -m py_compile src/data/manifest_tools.py` 成功 | ||
| 89 | - 使用本地 demo 音频目录成功生成真实 manifests | ||
| 90 | - 修正后小样本结果: | ||
| 91 | - `catalog=2` | ||
| 92 | - `train_queries=1` | ||
| 93 | - `test_queries=1` | ||
| 94 | |||
| 95 | 结论: | ||
| 96 | - 项目现在不再只停留在 external bootstrap | ||
| 97 | - 已经具备把开源音乐数据目录直接切成训练/评估输入的能力 | ||
| 98 | - 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录 | ||
| 99 | |||
| 75 | ## 2026-06-02 | 100 | ## 2026-06-02 |
| 76 | 101 | ||
| 77 | ### Stage: 文档补全 + ACR 最小可运行链路 | 102 | ### Stage: 文档补全 + ACR 最小可运行链路 | ... | ... |
| ... | @@ -4,7 +4,8 @@ | ... | @@ -4,7 +4,8 @@ |
| 4 | 4 | ||
| 5 | ## 一页结论 | 5 | ## 一页结论 |
| 6 | 6 | ||
| 7 | - 外部数据集接入的第一原则不是“能下载”,而是“**能否合法商用**” | 7 | - 当前优先目标改为:**个人使用下充分利用开源数据集** |
| 8 | - 外部数据集接入现在不仅要能 bootstrap,还要能真实切成 train/eval manifests | ||
| 8 | - 当前建议优先级: | 9 | - 当前建议优先级: |
| 9 | 1. FMA | 10 | 1. FMA |
| 10 | 2. MTG-Jamendo | 11 | 2. MTG-Jamendo |
| ... | @@ -12,6 +13,11 @@ | ... | @@ -12,6 +13,11 @@ |
| 12 | 4. ModelScope music datasets(白名单后) | 13 | 4. ModelScope music datasets(白名单后) |
| 13 | - ModelScope 与 CCMusic 当前都不能默认直接进入商用训练 | 14 | - ModelScope 与 CCMusic 当前都不能默认直接进入商用训练 |
| 14 | 15 | ||
| 16 | 对个人使用的直接建议: | ||
| 17 | - FMA / MTG-Jamendo:优先转成训练与评估资产 | ||
| 18 | - CCMusic / ModelScope:优先当补充评估或探索来源 | ||
| 19 | - 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞 | ||
| 20 | |||
| 15 | --- | 21 | --- |
| 16 | 22 | ||
| 17 | ## 1. 来源分层图 | 23 | ## 1. 来源分层图 | ... | ... |
| ... | @@ -117,6 +117,31 @@ flowchart LR | ... | @@ -117,6 +117,31 @@ flowchart LR |
| 117 | - 对个人使用场景,推荐把一部分开源数据集固定成 **fusion tuning eval set** | 117 | - 对个人使用场景,推荐把一部分开源数据集固定成 **fusion tuning eval set** |
| 118 | - 这样训练、检索、调参可以分离,而不是每次都重训 | 118 | - 这样训练、检索、调参可以分离,而不是每次都重训 |
| 119 | 119 | ||
| 120 | --- | ||
| 121 | |||
| 122 | ## 4.3 开源数据集 train/eval 切分图 | ||
| 123 | |||
| 124 | ```mermaid | ||
| 125 | flowchart LR | ||
| 126 | A[Open Audio Dir] --> B[audio-dir-to-splits] | ||
| 127 | B --> C[catalog.json] | ||
| 128 | B --> D[train.json] | ||
| 129 | B --> E[test.json] | ||
| 130 | B --> F[val.json] | ||
| 131 | ``` | ||
| 132 | |||
| 133 | | 产物 | 用途 | 说明 | | ||
| 134 | |---|---|---| | ||
| 135 | | `catalog.json` | 建索引 | 所有 reference 曲目 | | ||
| 136 | | `train.json` | 训练查询 | query + references | | ||
| 137 | | `test.json` | 评估查询 | query + references | | ||
| 138 | | `val.json` | 预留验证集 | 当前可为空 | | ||
| 139 | |||
| 140 | 推荐法则(个人使用): | ||
| 141 | - FMA / MTG-Jamendo 可优先用于真实 train/eval baseline | ||
| 142 | - 至少固定一部分曲目的 query 只进 `test.json`,不要同时作为训练 query(reference 条目仍会同时写入 train/test) | ||
| 143 | - 小数据集也要保证至少 1 个 train query + 1 个 test query | ||
| 144 | |||
| 120 | ## 5. 文字说明 | 145 | ## 5. 文字说明 |
| 121 | 146 | ||
| 122 | ### 5.1 为什么必须分离 catalog 和 query | 147 | ### 5.1 为什么必须分离 catalog 和 query | ... | ... |
-
Please register or sign in to post a comment