Commit ad350314 ad35031460899f747fb0c9e6b1c5bc6977dca139 by cnb.bofCdSsphPA

Add external dataset bootstrap and record hard-case oversampling regression

Extend the data ingress path with bootstrap manifests for real datasets and capture an unsuccessful hard-case oversampling experiment so future iterations can avoid repeating the same weak strategy.

Constraint: Continuous optimization requires preserving negative results, not just successful ones
Rejected: Drop the oversampling attempt without record | would lose evidence and encourage redoing the same low-yield change
Confidence: high
Scope-risk: moderate
Directive: Next hard-case work should focus on melody-aware supervision and harder negatives instead of naive sample repetition
Tested: bootstrap manifest generation for FMA and CCMusic; 2-epoch CPU training for models_v4; index_v4 build; fast eval JSON generation for smoke-v4
Not-tested: whitelisted real audio ingestion beyond placeholder manifests; full melody-aware slow-eval on models_v4
1 parent 1b812bea
1 # ccmusic bootstrap
2
3 - Fill raw audio files under `raw/`
4 - Review license before training
5 - Convert to final catalog/query manifests
1 [
2 {
3 "song_id": "ccmusic_track_0000",
4 "audio_path": "raw/ccmusic_track_0000.wav",
5 "duration": 0.0,
6 "type": "reference",
7 "source_dataset": "ccmusic",
8 "license_status": "review_required"
9 },
10 {
11 "song_id": "ccmusic_track_0001",
12 "audio_path": "raw/ccmusic_track_0001.wav",
13 "duration": 0.0,
14 "type": "reference",
15 "source_dataset": "ccmusic",
16 "license_status": "review_required"
17 },
18 {
19 "song_id": "ccmusic_track_0002",
20 "audio_path": "raw/ccmusic_track_0002.wav",
21 "duration": 0.0,
22 "type": "reference",
23 "source_dataset": "ccmusic",
24 "license_status": "review_required"
25 }
26 ]
...\ No newline at end of file ...\ No newline at end of file
1 # fma bootstrap
2
3 - Fill raw audio files under `raw/`
4 - Review license before training
5 - Convert to final catalog/query manifests
1 [
2 {
3 "song_id": "fma_track_0000",
4 "audio_path": "raw/fma_track_0000.wav",
5 "duration": 0.0,
6 "type": "reference",
7 "source_dataset": "fma",
8 "license_status": "review_required"
9 },
10 {
11 "song_id": "fma_track_0001",
12 "audio_path": "raw/fma_track_0001.wav",
13 "duration": 0.0,
14 "type": "reference",
15 "source_dataset": "fma",
16 "license_status": "review_required"
17 },
18 {
19 "song_id": "fma_track_0002",
20 "audio_path": "raw/fma_track_0002.wav",
21 "duration": 0.0,
22 "type": "reference",
23 "source_dataset": "fma",
24 "license_status": "review_required"
25 }
26 ]
...\ No newline at end of file ...\ No newline at end of file
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
1 {
2 "song_0000": 0,
3 "song_0001": 1,
4 "song_0002": 2,
5 "song_0003": 3,
6 "song_0004": 4,
7 "song_0005": 5,
8 "song_0006": 6,
9 "song_0007": 7,
10 "song_0008": 8,
11 "song_0009": 9,
12 "song_0010": 10,
13 "song_0011": 11,
14 "song_0012": 12,
15 "song_0013": 13,
16 "song_0014": 14,
17 "song_0015": 15
18 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "split": "test",
3 "num_queries": 20,
4 "top1": 0.4,
5 "topk": 0.8,
6 "by_type": {
7 "clean": {
8 "n": 8,
9 "top1": 0.75,
10 "topk": 1.0
11 },
12 "augmented": {
13 "n": 4,
14 "top1": 0.5,
15 "topk": 1.0
16 },
17 "humming_like": {
18 "n": 4,
19 "top1": 0.0,
20 "topk": 0.75
21 },
22 "confused": {
23 "n": 4,
24 "top1": 0.0,
25 "topk": 0.25
26 }
27 },
28 "hard_case_summary": {
29 "humming_like": {
30 "n": 4,
31 "top1": 0.0,
32 "topk": 0.75
33 },
34 "confused": {
35 "n": 4,
36 "top1": 0.0,
37 "topk": 0.25
38 }
39 },
40 "sample_failures": [
41 {
42 "truth": "song_0020",
43 "query": "segments/song_0020_seg_04_confused.wav",
44 "type": "confused",
45 "preds": [
46 "song_0010",
47 "song_0014",
48 "song_0012",
49 "song_0009",
50 "song_0005"
51 ]
52 },
53 {
54 "truth": "song_0021",
55 "query": "segments/song_0021_seg_04_confused.wav",
56 "type": "confused",
57 "preds": [
58 "song_0001",
59 "song_0022",
60 "song_0003",
61 "song_0023",
62 "song_0002"
63 ]
64 },
65 {
66 "truth": "song_0022",
67 "query": "segments/song_0022_seg_03_humming_like.wav",
68 "type": "humming_like",
69 "preds": [
70 "song_0007",
71 "song_0009",
72 "song_0021",
73 "song_0000",
74 "song_0002"
75 ]
76 },
77 {
78 "truth": "song_0023",
79 "query": "segments/song_0023_seg_04_confused.wav",
80 "type": "confused",
81 "preds": [
82 "song_0012",
83 "song_0002",
84 "song_0022",
85 "song_0000",
86 "song_0006"
87 ]
88 }
89 ]
90 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env python3
2 """Bootstrap manifest skeletons for whitelisted external datasets."""
3
4 import argparse
5 import json
6 from pathlib import Path
7
# Per-dataset fields merged into every placeholder row of the bootstrap
# catalog. `license_status` records the review state each dataset starts in.
TEMPLATES = {
    'fma': {'source_dataset': 'fma', 'license_status': 'review_required'},
    'mtg_jamendo': {'source_dataset': 'mtg_jamendo', 'license_status': 'review_required'},
    'ccmusic': {'source_dataset': 'ccmusic', 'license_status': 'review_required'},
    'modelscope_music': {'source_dataset': 'modelscope_music', 'license_status': 'deny_until_whitelisted'},
}


def bootstrap(dataset: str, output_dir: str, num_placeholders: int = 3):
    """Create a skeleton directory layout and placeholder catalog for a dataset.

    Under ``output_dir`` this creates ``raw/`` and ``manifests/``, writes
    ``manifests/catalog.bootstrap.json`` containing ``num_placeholders``
    placeholder rows, and writes ``README.bootstrap.md`` with follow-up steps.
    Existing files with those names are overwritten.

    Args:
        dataset: Key of ``TEMPLATES`` selecting the per-dataset fields.
        output_dir: Directory to populate; created (with parents) if missing.
        num_placeholders: Number of placeholder rows to generate. Zero or a
            negative value yields an empty catalog.

    Returns:
        Path of the written ``catalog.bootstrap.json``.

    Raises:
        KeyError: If ``dataset`` is not a key of ``TEMPLATES``. Nothing is
            created on disk in that case.
    """
    # Resolve the template before touching the filesystem so that an unknown
    # dataset name does not leave a stray output directory behind.
    try:
        base = TEMPLATES[dataset]
    except KeyError:
        raise KeyError(
            f'unknown dataset {dataset!r}; expected one of {sorted(TEMPLATES)}'
        ) from None

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    (out / 'raw').mkdir(exist_ok=True)
    manifest_dir = out / 'manifests'
    manifest_dir.mkdir(exist_ok=True)

    rows = [
        {
            'song_id': f'{dataset}_track_{i:04d}',
            'audio_path': f'raw/{dataset}_track_{i:04d}.wav',
            'duration': 0.0,
            'type': 'reference',
            **base,
        }
        for i in range(num_placeholders)
    ]

    # ensure_ascii=False may emit non-ASCII characters, so the encoding is
    # pinned to UTF-8 instead of relying on the platform default.
    catalog_path = manifest_dir / 'catalog.bootstrap.json'
    catalog_path.write_text(
        json.dumps(rows, indent=2, ensure_ascii=False), encoding='utf-8'
    )
    (out / 'README.bootstrap.md').write_text(
        f'# {dataset} bootstrap\n\n- Fill raw audio files under `raw/`\n- Review license before training\n- Convert to final catalog/query manifests\n',
        encoding='utf-8',
    )
    return catalog_path
36
37
def main(argv=None):
    """Command-line entry point: bootstrap one dataset and print the catalog path.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv[1:]``.
    """

    def _non_negative_int(text):
        # A negative count would silently produce an empty manifest, so it is
        # rejected at the CLI boundary with a normal argparse usage error.
        value = int(text)
        if value < 0:
            raise argparse.ArgumentTypeError(f'must be >= 0, got {value}')
        return value

    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', choices=sorted(TEMPLATES))
    parser.add_argument('--output-dir', required=True)
    parser.add_argument('--num-placeholders', type=_non_negative_int, default=3)
    args = parser.parse_args(argv)

    path = bootstrap(args.dataset, args.output_dir, args.num_placeholders)
    print(path)
46
47
48 if __name__ == '__main__':
49 main()
...@@ -191,10 +191,14 @@ class SongPairDataset(Dataset): ...@@ -191,10 +191,14 @@ class SongPairDataset(Dataset):
191 self.by_song.setdefault(item["song_id"], []).append(item) 191 self.by_song.setdefault(item["song_id"], []).append(item)
192 192
193 self.song_ids = sorted(self.by_song) 193 self.song_ids = sorted(self.by_song)
194 self.sample_song_ids = []
195 for sid, items in self.by_song.items():
196 weight = 3 if any(x.get("type") in {"confused", "humming_like"} for x in items) else 1
197 self.sample_song_ids.extend([sid] * weight)
194 self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)} 198 self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
195 199
196 def __len__(self): 200 def __len__(self):
197 return len(self.song_ids) 201 return len(self.sample_song_ids)
198 202
199 def _load_clip(self, sample: Dict) -> np.ndarray: 203 def _load_clip(self, sample: Dict) -> np.ndarray:
200 path = self.data_dir / sample["audio_path"] 204 path = self.data_dir / sample["audio_path"]
...@@ -217,7 +221,7 @@ class SongPairDataset(Dataset): ...@@ -217,7 +221,7 @@ class SongPairDataset(Dataset):
217 return torch.FloatTensor(mel) 221 return torch.FloatTensor(mel)
218 222
219 def __getitem__(self, idx): 223 def __getitem__(self, idx):
220 song_id = self.song_ids[idx] 224 song_id = self.sample_song_ids[idx]
221 choices = self.by_song[song_id] 225 choices = self.by_song[song_id]
222 if len(choices) == 1: 226 if len(choices) == 1:
223 a = b = choices[0] 227 a = b = choices[0]
......
...@@ -113,3 +113,26 @@ ...@@ -113,3 +113,26 @@
113 - artifact generator 成功输出 4 类发布产物 113 - artifact generator 成功输出 4 类发布产物
114 - `reports/smoke-v2/synthetic_v2/` 目录产物存在性检查通过 114 - `reports/smoke-v2/synthetic_v2/` 目录产物存在性检查通过
115 - 当前 fast-eval 指标:top1=0.60, top5=0.75,hard-case 仍需继续优化 115 - 当前 fast-eval 指标:top1=0.60, top5=0.75,hard-case 仍需继续优化
116
117 ## 2026-06-02
118
119 ### Stage: 外部数据集 bootstrap + hard-case 过采样试验
120
121 完成项:
122 - 新增 `src/data/bootstrap_external.py`
123 - 可自动为 `fma` / `ccmusic` 生成 bootstrap catalog manifest
124 - 在 `SongPairDataset` 中加入困难样本过采样试验(`confused` / `humming_like`)
125 - 重新训练 `models_v4`、重建 `index_v4`、重跑 `smoke-v4` 评测
126
127 验证结果:
128 - `data/external_bootstrap/fma/manifests/catalog.bootstrap.json` 成功生成
129 - `data/external_bootstrap/ccmusic/manifests/catalog.bootstrap.json` 成功生成
130 - `reports/smoke-v4/synthetic_v2/eval.json` 成功生成
131 - 当前试验结果:top1=0.40, top5=0.80
132 - hard-case 结果未改善:
133 - humming_like top1=0.00
134 - confused top1=0.00
135
136 结论:
137 - 该轮简单过采样策略无效,且整体精度下降
138 - 下一轮应改用更细粒度 hard-negative / melody-aware 正则,而不是继续放大样本重复权重
......