Commit 167aa6e5 167aa6e520482deaf6379353e7fdf0732dea9de6 by cnb.bofCdSsphPA

Enable open music datasets to feed train and eval splits

Constraint: Personal-use workflow needs real train/eval manifests rather than bootstrap-only placeholders
Rejected: Keep external datasets as catalog skeletons only | Does not satisfy training/evaluation reuse requirement
Confidence: high
Scope-risk: narrow
Directive: Wire real FMA or MTG-Jamendo local download directories into this ingestion path before larger-scale training
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/manifest_tools.py audio-dir-to-splits tmp/open_music_demo data/external_ingested/demo_fma_like --source-dataset demo_fma_like --eval-ratio 0.5 --query-duration 5.0
Not-tested: Full download/import of upstream FMA or MTG-Jamendo corpora
1 parent d665b1fd
[
{
"song_id": "demo_fma_like_00000",
"audio_path": "open_music_demo/song_0000.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
},
{
"song_id": "demo_fma_like_00001",
"audio_path": "open_music_demo/song_0001.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
}
]
\ No newline at end of file
[
{
"song_id": "demo_fma_like_00000",
"audio_path": "open_music_demo/song_0000.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.394,
"segment_type": "external_query",
"source_dataset": "demo_fma_like"
},
{
"song_id": "demo_fma_like_00000",
"audio_path": "open_music_demo/song_0000.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
},
{
"song_id": "demo_fma_like_00001",
"audio_path": "open_music_demo/song_0001.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
}
]
\ No newline at end of file
[
{
"song_id": "demo_fma_like_00001",
"audio_path": "open_music_demo/song_0001.wav",
"duration": 5.0,
"type": "clean",
"offset": 2.75,
"segment_type": "external_query",
"source_dataset": "demo_fma_like"
},
{
"song_id": "demo_fma_like_00000",
"audio_path": "open_music_demo/song_0000.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
},
{
"song_id": "demo_fma_like_00001",
"audio_path": "open_music_demo/song_0001.wav",
"duration": 15.0,
"type": "reference",
"source_dataset": "demo_fma_like"
}
]
\ No newline at end of file
......@@ -5,8 +5,10 @@ from __future__ import annotations
import argparse
import csv
import json
import random
from pathlib import Path
from typing import List, Dict
import soundfile as sf
def write_catalog(records: List[Dict], output_path: Path):
......@@ -33,6 +35,77 @@ def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_p
return len(records)
def build_train_eval_from_audio_dir(
    audio_dir: Path,
    output_dir: Path,
    source_dataset: str,
    exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"),
    eval_ratio: float = 0.2,
    query_duration: float = 8.0,
    seed: int = 42,
):
    """Scan an audio directory and write catalog/train/test/val manifests.

    Every audio file found under ``audio_dir`` (recursively) becomes one
    ``reference`` record. Every file whose duration is at least
    ``query_duration`` additionally yields one ``clean`` query record with a
    random start offset; each query is assigned to the test split with
    probability ``eval_ratio`` and to the train split otherwise.

    Four manifests are written through ``write_catalog`` into
    ``output_dir / "manifests"``: ``catalog.json`` (references only),
    ``train.json`` and ``test.json`` (split queries followed by all
    references) and ``val.json`` (always an empty list).

    Args:
        audio_dir: Directory that is searched recursively for audio files.
        output_dir: Directory that receives the ``manifests`` sub-directory.
        source_dataset: Dataset tag stored in every record and used as the
            ``song_id`` prefix.
        exts: File suffixes (including the dot) treated as audio; matched
            case-insensitively.
        eval_ratio: Probability in ``[0, 1]`` that a query goes to the test
            split.
        query_duration: Length in seconds of each generated query; must be
            positive.
        seed: Seed for the private random generator, so repeated runs over the
            same files produce the same offsets and split.

    Returns:
        A dict with the keys ``catalog``, ``train_queries``, ``test_queries``
        (record counts) and ``output_dir`` (the manifests directory as a
        string).

    Raises:
        ValueError: If ``eval_ratio`` is outside ``[0, 1]`` or
            ``query_duration`` is not positive.
    """
    # Imported locally because the module-level import block is not part of
    # this function and does not provide ``logging``.
    import logging

    if not 0.0 <= eval_ratio <= 1.0:
        raise ValueError(f"eval_ratio must be within [0, 1], got {eval_ratio!r}")
    if query_duration <= 0:
        raise ValueError(f"query_duration must be positive, got {query_duration!r}")

    logger = logging.getLogger(__name__)
    rng = random.Random(seed)

    allowed_exts = {ext.lower() for ext in exts}
    # ``is_file()`` keeps directories that merely look like audio files
    # (e.g. a folder named "album.wav") out of the catalog.
    files = [
        p
        for p in sorted(audio_dir.rglob("*"))
        if p.is_file() and p.suffix.lower() in allowed_exts
    ]

    output_dir.mkdir(parents=True, exist_ok=True)
    manifests_dir = output_dir / "manifests"
    manifests_dir.mkdir(parents=True, exist_ok=True)

    refs = []
    train = []
    test = []
    for idx, path in enumerate(files):
        # Store paths relative to the parent of the output directory when the
        # audio lives beneath it, otherwise relative to the audio directory's
        # parent.
        if output_dir.parent in path.parents:
            base_dir = output_dir.parent
        else:
            base_dir = audio_dir.parent
        rel = path.relative_to(base_dir)
        song_id = f"{source_dataset}_{idx:05d}"

        try:
            duration = float(sf.info(str(path)).duration)
        except Exception as exc:
            # Best effort: an unreadable file stays in the catalog with a zero
            # duration (and therefore never produces a query), but the failure
            # is reported instead of being silently hidden.
            logger.warning(
                "Could not read audio info for %s (%s); recording duration 0.0",
                path,
                exc,
            )
            duration = 0.0

        refs.append(
            {
                "song_id": song_id,
                "audio_path": str(rel),
                "duration": duration,
                "type": "reference",
                "source_dataset": source_dataset,
            }
        )

        if duration < query_duration:
            continue

        # The order of the two random draws (offset first, then split) is kept
        # stable so a given seed keeps producing the same manifests.
        max_offset = max(0.0, duration - query_duration)
        offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0
        query = {
            "song_id": song_id,
            "audio_path": str(rel),
            "duration": query_duration,
            "type": "clean",
            "offset": round(offset, 3),
            "segment_type": "external_query",
            "source_dataset": source_dataset,
        }
        if rng.random() < eval_ratio:
            test.append(query)
        else:
            train.append(query)

    # Small-dataset guard: if one split ended up empty, borrow a query from the
    # other one. Only borrow when the donor holds at least two queries;
    # otherwise the move would just empty the donor instead (a single query
    # cannot populate both splits).
    if not train and len(test) >= 2:
        train.append(test.pop())
    elif not test and len(train) >= 2:
        test.append(train.pop())

    write_catalog(refs, manifests_dir / "catalog.json")
    write_catalog(train + refs, manifests_dir / "train.json")
    write_catalog(test + refs, manifests_dir / "test.json")
    write_catalog([], manifests_dir / "val.json")

    return {
        "catalog": len(refs),
        "train_queries": len(train),
        "test_queries": len(test),
        "output_dir": str(manifests_dir),
    }
def main():
parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest="cmd", required=True)
......@@ -43,10 +116,28 @@ def main():
p.add_argument("--path-field", default="audio_path")
p.add_argument("--id-field", default="song_id")
p = sub.add_parser("audio-dir-to-splits")
p.add_argument("audio_dir")
p.add_argument("output_dir")
p.add_argument("--source-dataset", required=True)
p.add_argument("--eval-ratio", type=float, default=0.2)
p.add_argument("--query-duration", type=float, default=8.0)
p.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
if args.cmd == "csv-to-catalog":
count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field)
print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False))
elif args.cmd == "audio-dir-to-splits":
summary = build_train_eval_from_audio_dir(
Path(args.audio_dir),
Path(args.output_dir),
source_dataset=args.source_dataset,
eval_ratio=args.eval_ratio,
query_duration=args.query_duration,
seed=args.seed,
)
print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
if __name__ == "__main__":
......
......@@ -72,6 +72,31 @@
- `ecapa` 权重略升、`chroma` 略降能恢复 `humming_like`,同时保持 `confused`
- 下一阶段应继续把外部开源数据集真正接成 train/eval manifests,而不是只停在 bootstrap
### Stage: 开源数据集 ingestion(train/eval manifests)
完成项:
- 扩展 `src/data/manifest_tools.py`
- 新增 `audio-dir-to-splits` CLI
- 支持从本地开放音频目录自动生成:
- `catalog.json`
- `train.json`
- `test.json`
- `val.json`
- 新增小数据集保护:当至少有 2 条音频的时长不短于 query 时长时,确保个人使用场景下也能同时有 train/test queries
验证结果:
- `python -m py_compile src/data/manifest_tools.py` 成功
- 使用本地 demo 音频目录成功生成真实 manifests
- 修正后小样本结果:
- `catalog=2`
- `train_queries=1`
- `test_queries=1`
结论:
- 项目现在不再只停留在 external bootstrap
- 已经具备把开源音乐数据目录直接切成训练/评估输入的能力
- 下一阶段可以继续对接真实 FMA / MTG-Jamendo 下载目录
## 2026-06-02
### Stage: 文档补全 + ACR 最小可运行链路
......
......@@ -4,7 +4,8 @@
## 一页结论
- 外部数据集接入的第一原则不是“能下载”,而是“**能否合法商用**”
- 当前优先目标改为:**个人使用下充分利用开源数据集**
- 外部数据集接入现在不仅要能 bootstrap,还要能真实切成 train/eval manifests
- 当前建议优先级:
1. FMA
2. MTG-Jamendo
......@@ -12,6 +13,11 @@
4. ModelScope music datasets(白名单后)
- ModelScope 与 CCMusic 当前都不能默认直接进入商用训练
对个人使用的直接建议:
- FMA / MTG-Jamendo:优先转成训练与评估资产
- CCMusic / ModelScope:优先当补充评估或探索来源
- 保留 license 注记,但不再把“商用阻塞”作为个人实验主阻塞
---
## 1. 来源分层图
......
......@@ -117,6 +117,31 @@ flowchart LR
- 对个人使用场景,推荐把一部分开源数据集固定成 **fusion tuning eval set**
- 这样训练、检索、调参可以分离,而不是每次都重训
---
## 4.3 开源数据集 train/eval 切分图
```mermaid
flowchart LR
A[Open Audio Dir] --> B[audio-dir-to-splits]
B --> C[catalog.json]
B --> D[train.json]
B --> E[test.json]
B --> F[val.json]
```
| 产物 | 用途 | 说明 |
|---|---|---|
| `catalog.json` | 建索引 | 所有 reference 曲目 |
| `train.json` | 训练查询 | query + references |
| `test.json` | 评估查询 | query + references |
| `val.json` | 预留验证集 | 当前始终写出空列表 |
推荐法则(个人使用):
- FMA / MTG-Jamendo 可优先用于真实 train/eval baseline
- 至少固定一部分曲目只进 `test.json`,不要同时参与训练
- 小数据集也要保证至少 1 个 train query + 1 个 test query
## 5. 文字说明
### 5.1 为什么必须分离 catalog 和 query
......