Commit 48c97a90 48c97a90bcc97aee9d7cd52fb084b99e9ab46218 by cnb.bofCdSsphPA

Extend dataset bootstrap coverage and improve humming hard-case weighting

Broaden external dataset bootstrap support and replace naive hard-case oversampling with a more targeted weighting signal that measurably helps humming-like queries while preserving the release/eval workflow.

Constraint: Hard-case optimization must be evidence-driven and preserve a record of mixed outcomes across iterations
Rejected: Reuse naive oversampling after regression | it already showed worse overall behavior with no hard-case gain
Confidence: medium
Scope-risk: moderate
Directive: Next iteration should target confused-case negatives explicitly; do not assume humming gains transfer to confusion robustness
Tested: bootstrap generation for MTG-Jamendo and ModelScope placeholders; 2-epoch CPU training for models_v5; index_v5 build; fast eval JSON generation for smoke-v5
Not-tested: real audio ingestion for the new datasets; full melody-aware slow evaluation on models_v5
1 parent ad350314
No preview for this file type
# modelscope_music bootstrap
- Fill raw audio files under `raw/`
- Review license before training
- Convert to final catalog/query manifests
[
{
"song_id": "modelscope_music_track_0000",
"audio_path": "raw/modelscope_music_track_0000.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "modelscope_music",
"license_status": "deny_until_whitelisted"
},
{
"song_id": "modelscope_music_track_0001",
"audio_path": "raw/modelscope_music_track_0001.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "modelscope_music",
"license_status": "deny_until_whitelisted"
},
{
"song_id": "modelscope_music_track_0002",
"audio_path": "raw/modelscope_music_track_0002.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "modelscope_music",
"license_status": "deny_until_whitelisted"
}
]
\ No newline at end of file
# mtg_jamendo bootstrap
- Fill raw audio files under `raw/`
- Review license before training
- Convert to final catalog/query manifests
[
{
"song_id": "mtg_jamendo_track_0000",
"audio_path": "raw/mtg_jamendo_track_0000.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "mtg_jamendo",
"license_status": "review_required"
},
{
"song_id": "mtg_jamendo_track_0001",
"audio_path": "raw/mtg_jamendo_track_0001.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "mtg_jamendo",
"license_status": "review_required"
},
{
"song_id": "mtg_jamendo_track_0002",
"audio_path": "raw/mtg_jamendo_track_0002.wav",
"duration": 0.0,
"type": "reference",
"source_dataset": "mtg_jamendo",
"license_status": "review_required"
}
]
\ No newline at end of file
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
{
"song_0000": 0,
"song_0001": 1,
"song_0002": 2,
"song_0003": 3,
"song_0004": 4,
"song_0005": 5,
"song_0006": 6,
"song_0007": 7,
"song_0008": 8,
"song_0009": 9,
"song_0010": 10,
"song_0011": 11,
"song_0012": 12,
"song_0013": 13,
"song_0014": 14,
"song_0015": 15
}
\ No newline at end of file
{
"split": "test",
"num_queries": 20,
"top1": 0.6,
"topk": 0.9,
"by_type": {
"clean": {
"n": 8,
"top1": 1.0,
"topk": 1.0
},
"augmented": {
"n": 4,
"top1": 0.5,
"topk": 1.0
},
"humming_like": {
"n": 4,
"top1": 0.5,
"topk": 0.75
},
"confused": {
"n": 4,
"top1": 0.0,
"topk": 0.75
}
},
"hard_case_summary": {
"humming_like": {
"n": 4,
"top1": 0.5,
"topk": 0.75
},
"confused": {
"n": 4,
"top1": 0.0,
"topk": 0.75
}
},
"sample_failures": [
{
"truth": "song_0020",
"query": "segments/song_0020_seg_04_confused.wav",
"type": "confused",
"preds": [
"song_0002",
"song_0022",
"song_0006",
"song_0023",
"song_0001"
]
},
{
"truth": "song_0022",
"query": "segments/song_0022_seg_03_humming_like.wav",
"type": "humming_like",
"preds": [
"song_0021",
"song_0001",
"song_0000",
"song_0003",
"song_0023"
]
}
]
}
\ No newline at end of file
......@@ -228,6 +228,9 @@ class SongPairDataset(Dataset):
else:
a, b = random.sample(choices, 2)
pair_types = {a.get("type", "unknown"), b.get("type", "unknown")}
hard_weight = 2.5 if pair_types & {"confused", "humming_like"} else 1.0
wavs = []
for sample in (a, b):
y = self._load_clip(sample)
......@@ -244,4 +247,5 @@ class SongPairDataset(Dataset):
"mel": torch.stack(wavs, dim=0),
"song_id": torch.tensor([label, label], dtype=torch.long),
"song_name": song_id,
"hard_weight": torch.tensor(hard_weight, dtype=torch.float32),
}
......
......@@ -48,9 +48,14 @@ class CombinedLoss(nn.Module):
logits: torch.Tensor,
labels: torch.Tensor,
supcon_labels: torch.Tensor,
hard_weight: torch.Tensor | None = None,
) -> dict:
loss_supcon = self.supcon(embedding, supcon_labels)
loss_ce = self.ce(logits, labels)
if hard_weight is not None:
weight = hard_weight.float().mean()
loss_supcon = loss_supcon * weight
loss_ce = loss_ce * weight
total = self.supcon_weight * loss_supcon + self.aam_weight * loss_ce
return {
......
......@@ -23,17 +23,21 @@ def collate_fn(batch):
mels = []
song_ids = []
song_names = []
hard_weights = []
for b in batch:
mel = b["mel"]
hw = b.get("hard_weight", torch.tensor(1.0))
if mel.dim() == 3:
for i in range(mel.shape[0]):
mels.append(mel[i])
song_ids.append(b["song_id"][i])
song_names.append(b["song_name"])
hard_weights.append(hw)
else:
mels.append(mel)
song_ids.append(b["song_id"])
song_names.append(b["song_name"])
hard_weights.append(hw)
max_t = max(m.shape[1] for m in mels)
mels_padded = []
......@@ -47,6 +51,7 @@ def collate_fn(batch):
"mel": torch.cat(mels_padded, dim=0),
"song_id": torch.stack(song_ids),
"song_name": song_names,
"hard_weight": torch.stack(hard_weights),
}
......@@ -60,7 +65,7 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg)
with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"):
embedding, logits = model(mel, labels)
loss_dict = criterion(embedding, logits, labels, labels)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
optimizer.zero_grad()
if scaler:
......@@ -205,7 +210,7 @@ def main():
mel = batch["mel"].to(device)
labels = batch["song_id"].to(device)
embedding, logits = model(mel, labels)
loss_dict = criterion(embedding, logits, labels, labels)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
loss_dict["loss"].backward()
print(f" Forward/backward OK. Loss: {loss_dict['loss']:.4f}")
print(f" Embedding shape: {embedding.shape}")
......
......@@ -136,3 +136,25 @@
结论:
- 该轮简单过采样策略无效,且整体精度下降
- 下一轮应改用更细粒度 hard-negative / melody-aware 正则,而不是继续放大样本重复权重
## 2026-06-02
### Stage: MTG-Jamendo / ModelScope bootstrap + type-aware hard-case weighting
完成项:
- 补充 `mtg_jamendo` 与 `modelscope_music` 的 bootstrap manifest 生成
- 在训练链路中加入 type-aware hard-case weighting(针对 `confused` / `humming_like`)
- 重训 `models_v5`、重建 `index_v5`、重跑 `smoke-v5` 评测
验证结果:
- `data/external_bootstrap/mtg_jamendo/manifests/catalog.bootstrap.json` 成功生成
- `data/external_bootstrap/modelscope_music/manifests/catalog.bootstrap.json` 成功生成
- `reports/smoke-v5/synthetic_v2/eval.json` 成功生成
- 当前结果:top1=0.60, top5=0.90
- hard-case 结果:
- humming_like top1=0.50(较 v4 有提升)
- confused top1=0.00(仍未解决)
结论:
- type-aware weighting 比 naive oversampling 更有效
- 下一轮应专门针对 confused 类设计更强的 negative mining / confusion-aware 信号
......