Add external dataset bootstrap and record hard-case oversampling regression
Extend the data ingress path with bootstrap manifests for real datasets and capture an unsuccessful hard-case oversampling experiment so future iterations can avoid repeating the same weak strategy.

Constraint: Continuous optimization requires preserving negative results, not just successful ones
Rejected: Drop the oversampling attempt without record | would lose evidence and encourage redoing the same low-yield change
Confidence: high
Scope-risk: moderate
Directive: Next hard-case work should focus on melody-aware supervision and harder negatives instead of naive sample repetition
Tested: bootstrap manifest generation for FMA and CCMusic; 2-epoch CPU training for models_v4; index_v4 build; fast eval JSON generation for smoke-v4
Not-tested: whitelisted real audio ingestion beyond placeholder manifests; full melody-aware slow-eval on models_v4
Showing
22 changed files
with
248 additions
and
2 deletions
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "ccmusic_track_0000", | ||
| 4 | "audio_path": "raw/ccmusic_track_0000.wav", | ||
| 5 | "duration": 0.0, | ||
| 6 | "type": "reference", | ||
| 7 | "source_dataset": "ccmusic", | ||
| 8 | "license_status": "review_required" | ||
| 9 | }, | ||
| 10 | { | ||
| 11 | "song_id": "ccmusic_track_0001", | ||
| 12 | "audio_path": "raw/ccmusic_track_0001.wav", | ||
| 13 | "duration": 0.0, | ||
| 14 | "type": "reference", | ||
| 15 | "source_dataset": "ccmusic", | ||
| 16 | "license_status": "review_required" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "ccmusic_track_0002", | ||
| 20 | "audio_path": "raw/ccmusic_track_0002.wav", | ||
| 21 | "duration": 0.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "ccmusic", | ||
| 24 | "license_status": "review_required" | ||
| 25 | } | ||
| 26 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "fma_track_0000", | ||
| 4 | "audio_path": "raw/fma_track_0000.wav", | ||
| 5 | "duration": 0.0, | ||
| 6 | "type": "reference", | ||
| 7 | "source_dataset": "fma", | ||
| 8 | "license_status": "review_required" | ||
| 9 | }, | ||
| 10 | { | ||
| 11 | "song_id": "fma_track_0001", | ||
| 12 | "audio_path": "raw/fma_track_0001.wav", | ||
| 13 | "duration": 0.0, | ||
| 14 | "type": "reference", | ||
| 15 | "source_dataset": "fma", | ||
| 16 | "license_status": "review_required" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "fma_track_0002", | ||
| 20 | "audio_path": "raw/fma_track_0002.wav", | ||
| 21 | "duration": 0.0, | ||
| 22 | "type": "reference", | ||
| 23 | "source_dataset": "fma", | ||
| 24 | "license_status": "review_required" | ||
| 25 | } | ||
| 26 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/index_v4/chromaprint.pkl
0 → 100644
No preview for this file type
acr-engine/data/index_v4/reference_embs.npy
0 → 100644
No preview for this file type
acr-engine/data/index_v4/reference_ids.npy
0 → 100644
No preview for this file type
acr-engine/data/models_v4/best_model.pt
0 → 100644
This file is too large to display.
acr-engine/data/models_v4/song_to_idx.json
0 → 100644
| 1 | { | ||
| 2 | "song_0000": 0, | ||
| 3 | "song_0001": 1, | ||
| 4 | "song_0002": 2, | ||
| 5 | "song_0003": 3, | ||
| 6 | "song_0004": 4, | ||
| 7 | "song_0005": 5, | ||
| 8 | "song_0006": 6, | ||
| 9 | "song_0007": 7, | ||
| 10 | "song_0008": 8, | ||
| 11 | "song_0009": 9, | ||
| 12 | "song_0010": 10, | ||
| 13 | "song_0011": 11, | ||
| 14 | "song_0012": 12, | ||
| 15 | "song_0013": 13, | ||
| 16 | "song_0014": 14, | ||
| 17 | "song_0015": 15 | ||
| 18 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "split": "test", | ||
| 3 | "num_queries": 20, | ||
| 4 | "top1": 0.4, | ||
| 5 | "topk": 0.8, | ||
| 6 | "by_type": { | ||
| 7 | "clean": { | ||
| 8 | "n": 8, | ||
| 9 | "top1": 0.75, | ||
| 10 | "topk": 1.0 | ||
| 11 | }, | ||
| 12 | "augmented": { | ||
| 13 | "n": 4, | ||
| 14 | "top1": 0.5, | ||
| 15 | "topk": 1.0 | ||
| 16 | }, | ||
| 17 | "humming_like": { | ||
| 18 | "n": 4, | ||
| 19 | "top1": 0.0, | ||
| 20 | "topk": 0.75 | ||
| 21 | }, | ||
| 22 | "confused": { | ||
| 23 | "n": 4, | ||
| 24 | "top1": 0.0, | ||
| 25 | "topk": 0.25 | ||
| 26 | } | ||
| 27 | }, | ||
| 28 | "hard_case_summary": { | ||
| 29 | "humming_like": { | ||
| 30 | "n": 4, | ||
| 31 | "top1": 0.0, | ||
| 32 | "topk": 0.75 | ||
| 33 | }, | ||
| 34 | "confused": { | ||
| 35 | "n": 4, | ||
| 36 | "top1": 0.0, | ||
| 37 | "topk": 0.25 | ||
| 38 | } | ||
| 39 | }, | ||
| 40 | "sample_failures": [ | ||
| 41 | { | ||
| 42 | "truth": "song_0020", | ||
| 43 | "query": "segments/song_0020_seg_04_confused.wav", | ||
| 44 | "type": "confused", | ||
| 45 | "preds": [ | ||
| 46 | "song_0010", | ||
| 47 | "song_0014", | ||
| 48 | "song_0012", | ||
| 49 | "song_0009", | ||
| 50 | "song_0005" | ||
| 51 | ] | ||
| 52 | }, | ||
| 53 | { | ||
| 54 | "truth": "song_0021", | ||
| 55 | "query": "segments/song_0021_seg_04_confused.wav", | ||
| 56 | "type": "confused", | ||
| 57 | "preds": [ | ||
| 58 | "song_0001", | ||
| 59 | "song_0022", | ||
| 60 | "song_0003", | ||
| 61 | "song_0023", | ||
| 62 | "song_0002" | ||
| 63 | ] | ||
| 64 | }, | ||
| 65 | { | ||
| 66 | "truth": "song_0022", | ||
| 67 | "query": "segments/song_0022_seg_03_humming_like.wav", | ||
| 68 | "type": "humming_like", | ||
| 69 | "preds": [ | ||
| 70 | "song_0007", | ||
| 71 | "song_0009", | ||
| 72 | "song_0021", | ||
| 73 | "song_0000", | ||
| 74 | "song_0002" | ||
| 75 | ] | ||
| 76 | }, | ||
| 77 | { | ||
| 78 | "truth": "song_0023", | ||
| 79 | "query": "segments/song_0023_seg_04_confused.wav", | ||
| 80 | "type": "confused", | ||
| 81 | "preds": [ | ||
| 82 | "song_0012", | ||
| 83 | "song_0002", | ||
| 84 | "song_0022", | ||
| 85 | "song_0000", | ||
| 86 | "song_0006" | ||
| 87 | ] | ||
| 88 | } | ||
| 89 | ] | ||
| 90 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
No preview for this file type
No preview for this file type
acr-engine/src/data/bootstrap_external.py
0 → 100755
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Bootstrap manifest skeletons for whitelisted external datasets.""" | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | |||
# Per-dataset fields merged into every placeholder row. `license_status`
# records the review state that must be resolved before the audio is used.
TEMPLATES = {
    'fma': {'source_dataset': 'fma', 'license_status': 'review_required'},
    'mtg_jamendo': {'source_dataset': 'mtg_jamendo', 'license_status': 'review_required'},
    'ccmusic': {'source_dataset': 'ccmusic', 'license_status': 'review_required'},
    'modelscope_music': {'source_dataset': 'modelscope_music', 'license_status': 'deny_until_whitelisted'},
}


def bootstrap(dataset: str, output_dir: str, num_placeholders: int = 3) -> Path:
    """Write a placeholder catalog manifest and README for an external dataset.

    Creates ``raw/`` and ``manifests/`` under ``output_dir``, writes
    ``manifests/catalog.bootstrap.json`` containing ``num_placeholders``
    reference rows built from ``TEMPLATES[dataset]``, and writes
    ``README.bootstrap.md`` with the follow-up steps. Existing files with
    those names are overwritten.

    Args:
        dataset: Key of ``TEMPLATES`` naming the dataset to bootstrap.
        output_dir: Directory to populate; created (with parents) if missing.
        num_placeholders: Number of placeholder rows to emit; must be >= 0.

    Returns:
        Path of the written catalog manifest.

    Raises:
        KeyError: If ``dataset`` is not a key of ``TEMPLATES``.
        ValueError: If ``num_placeholders`` is negative.
    """
    # Validate every argument before touching the filesystem so that a bad
    # call does not leave a half-created output directory behind.
    if dataset not in TEMPLATES:
        raise KeyError(
            f'unknown dataset {dataset!r}; expected one of {sorted(TEMPLATES)}'
        )
    if num_placeholders < 0:
        raise ValueError(
            f'num_placeholders must be >= 0, got {num_placeholders}'
        )

    base = TEMPLATES[dataset]
    out = Path(output_dir)
    manifest_path = out / 'manifests' / 'catalog.bootstrap.json'

    rows = [
        {
            'song_id': f'{dataset}_track_{i:04d}',
            'audio_path': f'raw/{dataset}_track_{i:04d}.wav',
            'duration': 0.0,  # placeholder value written as-is into the manifest
            'type': 'reference',
            **base,
        }
        for i in range(num_placeholders)
    ]

    (out / 'raw').mkdir(parents=True, exist_ok=True)
    manifest_path.parent.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False can emit non-ASCII characters, so pin the encoding
    # instead of relying on the platform's locale default.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    with open(out / 'README.bootstrap.md', 'w', encoding='utf-8') as f:
        f.write(f'# {dataset} bootstrap\n\n- Fill raw audio files under `raw/`\n- Review license before training\n- Convert to final catalog/query manifests\n')
    return manifest_path
| 36 | |||
| 37 | |||
def _non_negative_int(value: str) -> int:
    """Parse a command-line value as an integer that is zero or greater."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f'invalid int value: {value!r}') from None
    if number < 0:
        raise argparse.ArgumentTypeError(f'must be >= 0, got {number}')
    return number


def main(argv=None):
    """Parse command-line arguments, run ``bootstrap`` and print the manifest path.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior of a no-argument call.
    """
    parser = argparse.ArgumentParser(
        description='Bootstrap manifest skeletons for whitelisted external datasets.',
    )
    parser.add_argument(
        'dataset',
        choices=sorted(TEMPLATES),
        help='dataset template to bootstrap',
    )
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory that receives raw/, manifests/ and README.bootstrap.md',
    )
    # Reject negative counts at the CLI boundary instead of silently writing
    # an empty catalog.
    parser.add_argument(
        '--num-placeholders',
        type=_non_negative_int,
        default=3,
        help='number of placeholder rows to write (default: 3)',
    )
    args = parser.parse_args(argv)
    path = bootstrap(args.dataset, args.output_dir, args.num_placeholders)
    print(path)


if __name__ == '__main__':
    main()
| ... | @@ -191,10 +191,14 @@ class SongPairDataset(Dataset): | ... | @@ -191,10 +191,14 @@ class SongPairDataset(Dataset): |
| 191 | self.by_song.setdefault(item["song_id"], []).append(item) | 191 | self.by_song.setdefault(item["song_id"], []).append(item) |
| 192 | 192 | ||
| 193 | self.song_ids = sorted(self.by_song) | 193 | self.song_ids = sorted(self.by_song) |
| 194 | self.sample_song_ids = [] | ||
| 195 | for sid, items in self.by_song.items(): | ||
| 196 | weight = 3 if any(x.get("type") in {"confused", "humming_like"} for x in items) else 1 | ||
| 197 | self.sample_song_ids.extend([sid] * weight) | ||
| 194 | self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)} | 198 | self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)} |
| 195 | 199 | ||
| 196 | def __len__(self): | 200 | def __len__(self): |
| 197 | return len(self.song_ids) | 201 | return len(self.sample_song_ids) |
| 198 | 202 | ||
| 199 | def _load_clip(self, sample: Dict) -> np.ndarray: | 203 | def _load_clip(self, sample: Dict) -> np.ndarray: |
| 200 | path = self.data_dir / sample["audio_path"] | 204 | path = self.data_dir / sample["audio_path"] |
| ... | @@ -217,7 +221,7 @@ class SongPairDataset(Dataset): | ... | @@ -217,7 +221,7 @@ class SongPairDataset(Dataset): |
| 217 | return torch.FloatTensor(mel) | 221 | return torch.FloatTensor(mel) |
| 218 | 222 | ||
| 219 | def __getitem__(self, idx): | 223 | def __getitem__(self, idx): |
| 220 | song_id = self.song_ids[idx] | 224 | song_id = self.sample_song_ids[idx] |
| 221 | choices = self.by_song[song_id] | 225 | choices = self.by_song[song_id] |
| 222 | if len(choices) == 1: | 226 | if len(choices) == 1: |
| 223 | a = b = choices[0] | 227 | a = b = choices[0] | ... | ... |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
| ... | @@ -113,3 +113,26 @@ | ... | @@ -113,3 +113,26 @@ |
| 113 | - artifact generator 成功输出 4 类发布产物 | 113 | - artifact generator 成功输出 4 类发布产物 |
| 114 | - `reports/smoke-v2/synthetic_v2/` 目录产物存在性检查通过 | 114 | - `reports/smoke-v2/synthetic_v2/` 目录产物存在性检查通过 |
| 115 | - 当前 fast-eval 指标:top1=0.60, top5=0.75,hard-case 仍需继续优化 | 115 | - 当前 fast-eval 指标:top1=0.60, top5=0.75,hard-case 仍需继续优化 |
| 116 | |||
| 117 | ## 2026-06-02 | ||
| 118 | |||
| 119 | ### Stage: 外部数据集 bootstrap + hard-case 过采样试验 | ||
| 120 | |||
| 121 | 完成项: | ||
| 122 | - 新增 `src/data/bootstrap_external.py` | ||
| 123 | - 可自动为 `fma` / `ccmusic` 生成 bootstrap catalog manifest | ||
| 124 | - 在 `SongPairDataset` 中加入困难样本过采样试验(`confused` / `humming_like`) | ||
| 125 | - 重新训练 `models_v4`、重建 `index_v4`、重跑 `smoke-v4` 评测 | ||
| 126 | |||
| 127 | 验证结果: | ||
| 128 | - `data/external_bootstrap/fma/manifests/catalog.bootstrap.json` 成功生成 | ||
| 129 | - `data/external_bootstrap/ccmusic/manifests/catalog.bootstrap.json` 成功生成 | ||
| 130 | - `reports/smoke-v4/synthetic_v2/eval.json` 成功生成 | ||
| 131 | - 当前试验结果:top1=0.40, top5=0.80 | ||
| 132 | - hard-case 结果未改善: | ||
| 133 | - humming_like top1=0.00 | ||
| 134 | - confused top1=0.00 | ||
| 135 | |||
| 136 | 结论: | ||
| 137 | - 该轮简单过采样策略无效:hard-case top1 仍为 0.00,整体 top1 相较上文 smoke-v2 记录的 0.60 降至 0.40(top5 由 0.75 升至 0.80) | ||
| 138 | - 下一轮应改用更细粒度 hard-negative / melody-aware 正则,而不是继续放大样本重复权重 | ... | ... |
-
Please register or sign in to post a comment