Commit ad350314 ad35031460899f747fb0c9e6b1c5bc6977dca139 by cnb.bofCdSsphPA

Add external dataset bootstrap and record hard-case oversampling regression

Extend the data ingress path with bootstrap manifests for real datasets and capture an unsuccessful hard-case oversampling experiment so future iterations can avoid repeating the same weak strategy.

Constraint: Continuous optimization requires preserving negative results, not just successful ones
Rejected: Drop the oversampling attempt without record | would lose evidence and encourage redoing the same low-yield change
Confidence: high
Scope-risk: moderate
Directive: Next hard-case work should focus on melody-aware supervision and harder negatives instead of naive sample repetition
Tested: bootstrap manifest generation for FMA and CCMusic; 2-epoch CPU training for models_v4; index_v4 build; fast eval JSON generation for smoke-v4
Not-tested: whitelisted real audio ingestion beyond placeholder manifests; full melody-aware slow-eval on models_v4
1 parent 1b812bea
# ccmusic bootstrap
- Fill raw audio files under `raw/`
- Review license before training
- Convert to final catalog/query manifests
[
{
"song_id": "ccmusic_track_0000",
"audio_path": "raw/ccmusic_track_0000.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "ccmusic",
"license_status": "review_required"
},
{
"song_id": "ccmusic_track_0001",
"audio_path": "raw/ccmusic_track_0001.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "ccmusic",
"license_status": "review_required"
},
{
"song_id": "ccmusic_track_0002",
"audio_path": "raw/ccmusic_track_0002.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "ccmusic",
"license_status": "review_required"
}
]
\ No newline at end of file
# fma bootstrap
- Fill raw audio files under `raw/`
- Review license before training
- Convert to final catalog/query manifests
[
{
"song_id": "fma_track_0000",
"audio_path": "raw/fma_track_0000.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "fma",
"license_status": "review_required"
},
{
"song_id": "fma_track_0001",
"audio_path": "raw/fma_track_0001.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "fma",
"license_status": "review_required"
},
{
"song_id": "fma_track_0002",
"audio_path": "raw/fma_track_0002.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "fma",
"license_status": "review_required"
}
]
\ No newline at end of file
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
{
"song_0000": 0,
"song_0001": 1,
"song_0002": 2,
"song_0003": 3,
"song_0004": 4,
"song_0005": 5,
"song_0006": 6,
"song_0007": 7,
"song_0008": 8,
"song_0009": 9,
"song_0010": 10,
"song_0011": 11,
"song_0012": 12,
"song_0013": 13,
"song_0014": 14,
"song_0015": 15
}
\ No newline at end of file
{
"split": "test",
"num_queries": 20,
"top1": 0.4,
"topk": 0.8,
"by_type": {
"clean": {
"n": 8,
"top1": 0.75,
"topk": 1.0
},
"augmented": {
"n": 4,
"top1": 0.5,
"topk": 1.0
},
"humming_like": {
"n": 4,
"top1": 0.0,
"topk": 0.75
},
"confused": {
"n": 4,
"top1": 0.0,
"topk": 0.25
}
},
"hard_case_summary": {
"humming_like": {
"n": 4,
"top1": 0.0,
"topk": 0.75
},
"confused": {
"n": 4,
"top1": 0.0,
"topk": 0.25
}
},
"sample_failures": [
{
"truth": "song_0020",
"query": "segments/song_0020_seg_04_confused.wav",
"type": "confused",
"preds": [
"song_0010",
"song_0014",
"song_0012",
"song_0009",
"song_0005"
]
},
{
"truth": "song_0021",
"query": "segments/song_0021_seg_04_confused.wav",
"type": "confused",
"preds": [
"song_0001",
"song_0022",
"song_0003",
"song_0023",
"song_0002"
]
},
{
"truth": "song_0022",
"query": "segments/song_0022_seg_03_humming_like.wav",
"type": "humming_like",
"preds": [
"song_0007",
"song_0009",
"song_0021",
"song_0000",
"song_0002"
]
},
{
"truth": "song_0023",
"query": "segments/song_0023_seg_04_confused.wav",
"type": "confused",
"preds": [
"song_0012",
"song_0002",
"song_0022",
"song_0000",
"song_0006"
]
}
]
}
\ No newline at end of file
#!/usr/bin/env python3
"""Bootstrap manifest skeletons for whitelisted external datasets."""
import argparse
import json
from pathlib import Path
# (dataset name, license status) pairs for every dataset this script can
# bootstrap. Order matters: it is the insertion order of TEMPLATES.
_DATASET_LICENSE_STATUS = (
    ('fma', 'review_required'),
    ('mtg_jamendo', 'review_required'),
    ('ccmusic', 'review_required'),
    ('modelscope_music', 'deny_until_whitelisted'),
)

# Per-dataset fields merged into every placeholder manifest row.
TEMPLATES = {
    name: {'source_dataset': name, 'license_status': status}
    for name, status in _DATASET_LICENSE_STATUS
}
def bootstrap(dataset: str, output_dir: str, num_placeholders: int = 3) -> Path:
    """Write a placeholder catalog manifest and README for one external dataset.

    Creates ``<output_dir>/raw/`` and ``<output_dir>/manifests/``, writes
    ``manifests/catalog.bootstrap.json`` containing ``num_placeholders``
    placeholder rows, and writes ``README.bootstrap.md`` with the follow-up
    steps. Existing files at those paths are overwritten.

    Args:
        dataset: Key into ``TEMPLATES``; also used as the prefix of the
            generated ``song_id`` and ``audio_path`` values.
        output_dir: Directory to create (including parents) and populate.
        num_placeholders: Number of placeholder rows to emit. Zero or a
            negative value produces an empty manifest.

    Returns:
        Path of the generated ``catalog.bootstrap.json``.

    Raises:
        KeyError: If ``dataset`` is not a key of ``TEMPLATES``.
    """
    # Resolve the template first so an unknown dataset fails before any
    # directory is created on disk.
    base = TEMPLATES[dataset]

    out = Path(output_dir)
    manifest_dir = out / 'manifests'
    (out / 'raw').mkdir(parents=True, exist_ok=True)
    manifest_dir.mkdir(parents=True, exist_ok=True)

    rows = [
        {
            'song_id': f'{dataset}_track_{i:04d}',
            'audio_path': f'raw/{dataset}_track_{i:04d}.wav',
            'duration': 0.0,
            'type': 'reference',
            **base,
        }
        for i in range(num_placeholders)
    ]

    manifest_path = manifest_dir / 'catalog.bootstrap.json'
    # ensure_ascii=False can emit non-ASCII characters, so pin the encoding
    # instead of relying on the platform's locale default.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    with open(out / 'README.bootstrap.md', 'w', encoding='utf-8') as f:
        f.write(f'# {dataset} bootstrap\n\n- Fill raw audio files under `raw/`\n- Review license before training\n- Convert to final catalog/query manifests\n')
    return manifest_path
def main():
    """Command-line entry point.

    Parses the dataset name (restricted to the keys of ``TEMPLATES``), the
    required ``--output-dir`` and the optional ``--num-placeholders``
    (default 3), runs ``bootstrap`` and prints the path it returns.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('dataset', choices=sorted(TEMPLATES))
    cli.add_argument('--output-dir', required=True)
    cli.add_argument('--num-placeholders', type=int, default=3)
    opts = cli.parse_args()

    manifest_path = bootstrap(
        opts.dataset,
        opts.output_dir,
        opts.num_placeholders,
    )
    print(manifest_path)


# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
......@@ -191,10 +191,14 @@ class SongPairDataset(Dataset):
self.by_song.setdefault(item["song_id"], []).append(item)
self.song_ids = sorted(self.by_song)
self.sample_song_ids = []
for sid, items in self.by_song.items():
weight = 3 if any(x.get("type") in {"confused", "humming_like"} for x in items) else 1
self.sample_song_ids.extend([sid] * weight)
self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
def __len__(self):
return len(self.song_ids)
return len(self.sample_song_ids)
def _load_clip(self, sample: Dict) -> np.ndarray:
path = self.data_dir / sample["audio_path"]
......@@ -217,7 +221,7 @@ class SongPairDataset(Dataset):
return torch.FloatTensor(mel)
def __getitem__(self, idx):
song_id = self.song_ids[idx]
song_id = self.sample_song_ids[idx]
choices = self.by_song[song_id]
if len(choices) == 1:
a = b = choices[0]
......
......@@ -113,3 +113,26 @@
- artifact generator 成功输出 4 类发布产物
- `reports/smoke-v2/synthetic_v2/` 目录产物存在性检查通过
- 当前 fast-eval 指标:top1=0.60, top5=0.75,hard-case 仍需继续优化
## 2026-06-02
### Stage: 外部数据集 bootstrap + hard-case 过采样试验
完成项:
- 新增 `src/data/bootstrap_external.py`
- 可自动为 `fma` / `ccmusic` 生成 bootstrap catalog manifest
- 在 `SongPairDataset` 中加入困难样本过采样试验(`confused` / `humming_like`)
- 重新训练 `models_v4`、重建 `index_v4`、重跑 `smoke-v4` 评测
验证结果:
- `data/external_bootstrap/fma/manifests/catalog.bootstrap.json` 成功生成
- `data/external_bootstrap/ccmusic/manifests/catalog.bootstrap.json` 成功生成
- `reports/smoke-v4/synthetic_v2/eval.json` 成功生成
- 当前试验结果:top1=0.40, top5=0.80
- hard-case 结果未改善:
- humming_like top1=0.00
- confused top1=0.00
结论:
- 该轮简单过采样策略无效:hard-case top1 仍为 0.00,且整体 top1 相比上一轮记录下降(0.60 → 0.40),top5 仅小幅变化(0.75 → 0.80)
- 下一轮应改用更细粒度 hard-negative / melody-aware 正则,而不是继续放大样本重复权重
......