Commit 4b16286e 4b16286e20856132abbe8cfeafab1af1ee23c0ce by cnb.bofCdSsphPA

Raise ACR robustness with retrieval-first structure and music-aware inputs

Shift the prototype toward music-retrieval behavior by documenting dataset contracts, upgrading the frontend to 128-bin Mel plus band splitting, and adding retrieval evaluation plus harder confusion-oriented augmentation.

Constraint: The previous pipeline mixed train splits with the searchable catalog and hid real retrieval quality
Rejected: Keep classification-centric validation and whole-song averaged references | it masked structural accuracy failures
Confidence: medium
Scope-risk: moderate
Directive: Next iterations should target humming/confused top1 with specialized melody-aware retrieval and stronger real-data calibration
Tested: synthetic_v2 generation; 3-epoch CPU training; index build; evaluate.py top1=0.65 top5=0.95 on test split
Not-tested: external open-dataset ingestion; foundation-model baselines; production latency
1 parent 62688d3b
...@@ -5,9 +5,11 @@ model: ...@@ -5,9 +5,11 @@ model:
5 se_channels: 128 5 se_channels: 128
6 res2net_scale: 8 6 res2net_scale: 8
7 num_blocks: 3 7 num_blocks: 3
8 n_mels: 80 8 n_mels: 128
9 aam_m: 0.3 9 aam_m: 0.3
10 aam_s: 30.0 10 aam_s: 30.0
11 use_band_split: true
12 band_split_channels: 128
11 13
12 data: 14 data:
13 sample_rate: 16000 15 sample_rate: 16000
...@@ -39,3 +41,8 @@ engine: ...@@ -39,3 +41,8 @@ engine:
39 chroma_weight: 0.3 41 chroma_weight: 0.3
40 ecapa_weight: 0.7 42 ecapa_weight: 0.7
41 reject_threshold: 0.4 43 reject_threshold: 0.4
44
45 augmentation:
46 pro_wgan_balance: true
47 minority_noise_scale: 0.35
48 minority_pitch_shift: 8
......
...@@ -229,9 +229,12 @@ class SongPairDataset(Dataset): ...@@ -229,9 +229,12 @@ class SongPairDataset(Dataset):
229 y = self._load_clip(sample) 229 y = self._load_clip(sample)
230 if self.augment: 230 if self.augment:
231 from src.utils.augment import AugmentPipeline 231 from src.utils.augment import AugmentPipeline
232 y = AugmentPipeline(self.sr)(y) 232 y = AugmentPipeline(self.sr, aggressive=sample.get("type") in {"confused", "humming_like"})(y)
233 wavs.append(self._to_mel(y)) 233 wavs.append(self._to_mel(y))
234 234
235 max_t = max(w.shape[1] for w in wavs)
236 wavs = [torch.nn.functional.pad(w, (0, max_t - w.shape[1])) if w.shape[1] < max_t else w for w in wavs]
237
235 label = self.song_to_idx[song_id] 238 label = self.song_to_idx[song_id]
236 return { 239 return {
237 "mel": torch.stack(wavs, dim=0), 240 "mel": torch.stack(wavs, dim=0),
......
...@@ -28,14 +28,20 @@ class ECAPAEmbedder: ...@@ -28,14 +28,20 @@ class ECAPAEmbedder:
28 state = torch.load(model_path, map_location="cpu", weights_only=True) 28 state = torch.load(model_path, map_location="cpu", weights_only=True)
29 cfg = state.get("config", {}) 29 cfg = state.get("config", {})
30 model_cfg = cfg.get("model", {}) 30 model_cfg = cfg.get("model", {})
31 data_cfg = cfg.get("data", {})
32 self.n_mels = model_cfg.get("n_mels", n_mels)
33 self.n_fft = data_cfg.get("n_fft", n_fft)
34 self.hop_length = data_cfg.get("hop_length", hop_length)
31 self.model = ECAPA_ACR( 35 self.model = ECAPA_ACR(
32 n_mels=model_cfg.get("n_mels", n_mels), 36 n_mels=self.n_mels,
33 embed_dim=model_cfg.get("embed_dim", 192), 37 embed_dim=model_cfg.get("embed_dim", 192),
34 channels=model_cfg.get("channels", 512), 38 channels=model_cfg.get("channels", 512),
35 se_channels=model_cfg.get("se_channels", 128), 39 se_channels=model_cfg.get("se_channels", 128),
36 res2net_scale=model_cfg.get("res2net_scale", 8), 40 res2net_scale=model_cfg.get("res2net_scale", 8),
37 num_blocks=model_cfg.get("num_blocks", 3), 41 num_blocks=model_cfg.get("num_blocks", 3),
38 num_classes=None, 42 num_classes=None,
43 use_band_split=model_cfg.get("use_band_split", True),
44 band_split_channels=model_cfg.get("band_split_channels", 128),
39 ) 45 )
40 missing = self.model.load_state_dict(state["model_state_dict"], strict=False) 46 missing = self.model.load_state_dict(state["model_state_dict"], strict=False)
41 if missing.unexpected_keys: 47 if missing.unexpected_keys:
......
1 import torch 1 import torch
2 import torch.nn as nn 2 import torch.nn as nn
3 import torch.nn.functional as F 3 import torch.nn.functional as F
4 from typing import Optional, Tuple 4 from typing import Optional, Tuple, List
5 5
6 6
7 class SEModule(nn.Module): 7 class SEModule(nn.Module):
...@@ -19,13 +19,43 @@ class SEModule(nn.Module): ...@@ -19,13 +19,43 @@ class SEModule(nn.Module):
19 return x * self.se(x) 19 return x * self.se(x)
20 20
21 21
class BandSplitBlock(nn.Module):
    """Project contiguous frequency sub-bands separately, then fuse them.

    The input is sliced along dim 1 at ``split_points``; each slice goes
    through its own 1x1 ``Conv1d -> ReLU -> BatchNorm1d`` projection to
    ``out_channels`` channels.  The projected bands are concatenated along
    dim 1 and mixed by a final 1x1 convolution, so the output has
    ``out_channels * len(split_points)`` channels.

    Args:
        n_mels: Number of frequency bins expected on dim 1 of the input.
        split_points: Exclusive, strictly increasing upper boundary of every
            band; the first band starts at bin 0.  When omitted (or empty),
            five bands are used: ``[16, 32, 64, 96, n_mels]`` for
            ``n_mels > 96`` and the proportional layout
            ``[n/8, n/4, n/2, 3n/4, n]`` otherwise.
        out_channels: Channels produced by each per-band projection.

    Raises:
        ValueError: If any band would be empty (boundaries not strictly
            increasing / not positive) or a boundary exceeds ``n_mels``.
    """

    def __init__(self, n_mels: int, split_points: Optional[List[int]] = None, out_channels: int = 128):
        super().__init__()
        if split_points:
            # Copy so later mutation of the caller's list cannot desync the
            # slicing in forward() from the already-built projections.
            points = list(split_points)
        elif n_mels > 96:
            points = [16, 32, 64, 96, n_mels]
        else:
            # The fixed boundaries above would yield an empty or negative
            # last band here; scale the same 1/8, 1/4, 1/2, 3/4 layout.
            points = [n_mels // 8, n_mels // 4, n_mels // 2, (3 * n_mels) // 4, n_mels]

        starts = [0] + points[:-1]
        widths = [end - start for start, end in zip(starts, points)]
        if min(widths) <= 0 or points[-1] > n_mels:
            raise ValueError(
                f"split_points must be strictly increasing, positive and <= n_mels "
                f"(n_mels={n_mels}, split_points={points})"
            )
        self.split_points = points

        # Submodule names and layer order are kept stable so existing
        # state_dict keys continue to match.
        self.band_projs = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv1d(width, out_channels, kernel_size=1),
                    nn.ReLU(),
                    nn.BatchNorm1d(out_channels),
                )
                for width in widths
            ]
        )
        fused_channels = out_channels * len(widths)
        self.fuse = nn.Sequential(
            nn.Conv1d(fused_channels, fused_channels, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(fused_channels),
        )

    def forward(self, x):
        """Split ``x`` along dim 1 into bands, project each, and fuse.

        ``x`` is indexed as a 3-D tensor whose dim 1 holds the frequency
        bins; the result keeps dims 0 and 2 and has
        ``out_channels * len(split_points)`` channels on dim 1.
        """
        starts = [0] + self.split_points[:-1]
        bands = [
            proj(x[:, start:end, :])
            for proj, start, end in zip(self.band_projs, starts, self.split_points)
        ]
        return self.fuse(torch.cat(bands, dim=1))
50
51
22 class Res2Block(nn.Module): 52 class Res2Block(nn.Module):
23 def __init__(self, channels, kernel_size=3, dilation=1, scale=8, se_channels=128): 53 def __init__(self, channels, kernel_size=3, dilation=1, scale=8, se_channels=128):
24 super().__init__() 54 super().__init__()
25 self.width = channels // scale 55 self.width = channels // scale
26 self.num_split = scale 56 self.num_split = scale
27 self.convs = nn.ModuleList() 57 self.convs = nn.ModuleList()
28 for i in range(self.num_split): 58 for _ in range(self.num_split):
29 self.convs.append( 59 self.convs.append(
30 nn.Sequential( 60 nn.Sequential(
31 nn.Conv1d( 61 nn.Conv1d(
...@@ -54,7 +84,7 @@ class Res2Block(nn.Module): ...@@ -54,7 +84,7 @@ class Res2Block(nn.Module):
54 if i == 0: 84 if i == 0:
55 out.append(conv(part)) 85 out.append(conv(part))
56 else: 86 else:
57 out.append(conv(out[-1] if len(out) else part + part)) 87 out.append(conv(part + out[-1]))
58 x = torch.cat(out, dim=1) 88 x = torch.cat(out, dim=1)
59 x = self.conv1x1(x) 89 x = self.conv1x1(x)
60 x = self.se(x) 90 x = self.se(x)
...@@ -96,7 +126,7 @@ class AAMSoftmax(nn.Module): ...@@ -96,7 +126,7 @@ class AAMSoftmax(nn.Module):
96 class ECAPA_ACR(nn.Module): 126 class ECAPA_ACR(nn.Module):
97 def __init__( 127 def __init__(
98 self, 128 self,
99 n_mels: int = 80, 129 n_mels: int = 128,
100 embed_dim: int = 192, 130 embed_dim: int = 192,
101 channels: int = 512, 131 channels: int = 512,
102 se_channels: int = 128, 132 se_channels: int = 128,
...@@ -105,20 +135,23 @@ class ECAPA_ACR(nn.Module): ...@@ -105,20 +135,23 @@ class ECAPA_ACR(nn.Module):
105 num_classes: Optional[int] = None, 135 num_classes: Optional[int] = None,
106 aam_m: float = 0.3, 136 aam_m: float = 0.3,
107 aam_s: float = 30.0, 137 aam_s: float = 30.0,
138 use_band_split: bool = True,
139 band_split_channels: int = 128,
108 ): 140 ):
109 super().__init__() 141 super().__init__()
110 self.embed_dim = embed_dim 142 self.embed_dim = embed_dim
143 front_channels = band_split_channels * 5 if use_band_split else channels
144 self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
111 145
112 self.conv1 = nn.Sequential( 146 self.conv1 = nn.Sequential(
113 nn.Conv1d(n_mels, channels, kernel_size=5, stride=1, padding=2), 147 nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
114 nn.ReLU(), 148 nn.ReLU(),
115 nn.BatchNorm1d(channels), 149 nn.BatchNorm1d(channels),
116 ) 150 )
117 151
118 dilations = [1, 2, 3] if num_blocks == 3 else [d * 1 for d in range(1, num_blocks + 1)] 152 dilations = [1, 2, 3] if num_blocks == 3 else [d for d in range(1, num_blocks + 1)]
119 self.blocks = nn.ModuleList() 153 self.blocks = nn.ModuleList(
120 for d in dilations[:num_blocks]: 154 [
121 self.blocks.append(
122 Res2Block( 155 Res2Block(
123 channels=channels, 156 channels=channels,
124 kernel_size=3, 157 kernel_size=3,
...@@ -126,7 +159,9 @@ class ECAPA_ACR(nn.Module): ...@@ -126,7 +159,9 @@ class ECAPA_ACR(nn.Module):
126 scale=res2net_scale, 159 scale=res2net_scale,
127 se_channels=se_channels, 160 se_channels=se_channels,
128 ) 161 )
129 ) 162 for d in dilations[:num_blocks]
163 ]
164 )
130 165
131 in_channels = channels * num_blocks 166 in_channels = channels * num_blocks
132 self.mfa = nn.Sequential( 167 self.mfa = nn.Sequential(
...@@ -134,34 +169,25 @@ class ECAPA_ACR(nn.Module): ...@@ -134,34 +169,25 @@ class ECAPA_ACR(nn.Module):
134 nn.ReLU(), 169 nn.ReLU(),
135 nn.BatchNorm1d(channels * 3), 170 nn.BatchNorm1d(channels * 3),
136 ) 171 )
137
138 self.pooling = StatisticsPooling() 172 self.pooling = StatisticsPooling()
139 self.fc = nn.Linear(channels * 3 * 2, embed_dim) 173 self.fc = nn.Linear(channels * 3 * 2, embed_dim)
140 self.bn = nn.BatchNorm1d(embed_dim, affine=False) 174 self.bn = nn.BatchNorm1d(embed_dim, affine=False)
175 self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None
141 176
142 if num_classes is not None: 177 def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
143 self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) 178 x = self.band_split(mel) if self.band_split is not None else mel
144 else: 179 x = self.conv1(x)
145 self.aam = None
146
147 def forward(
148 self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None
149 ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
150 x = self.conv1(mel)
151 block_outputs = [] 180 block_outputs = []
152 for block in self.blocks: 181 for block in self.blocks:
153 x = block(x) 182 x = block(x)
154 block_outputs.append(x) 183 block_outputs.append(x)
155
156 x = torch.cat(block_outputs, dim=1) 184 x = torch.cat(block_outputs, dim=1)
157 x = self.mfa(x) 185 x = self.mfa(x)
158 x = self.pooling(x) 186 x = self.pooling(x)
159 x = self.fc(x) 187 x = self.fc(x)
160 x = self.bn(x) 188 x = self.bn(x)
161 embedding = F.normalize(x, p=2, dim=1) 189 embedding = F.normalize(x, p=2, dim=1)
162
163 if labels is not None and self.aam is not None: 190 if labels is not None and self.aam is not None:
164 logits = self.aam(embedding, labels) 191 logits = self.aam(embedding, labels)
165 return embedding, logits 192 return embedding, logits
166
167 return embedding, None 193 return embedding, None
......
...@@ -4,12 +4,13 @@ from typing import Optional, Tuple ...@@ -4,12 +4,13 @@ from typing import Optional, Tuple
4 4
5 5
6 class AugmentPipeline: 6 class AugmentPipeline:
7 def __init__(self, sr: int = 16000): 7 def __init__(self, sr: int = 16000, aggressive: bool = False):
8 self.sr = sr 8 self.sr = sr
9 self.noise_snr_range = (5, 30) 9 self.noise_snr_range = (5, 30)
10 self.pitch_shift_range = (-6, 6) 10 self.pitch_shift_range = (-6, 6)
11 self.time_stretch_range = (0.85, 1.15) 11 self.time_stretch_range = (0.85, 1.15)
12 self.mp3_bitrate_range = (32, 128) 12 self.mp3_bitrate_range = (32, 128)
13 self.aggressive = aggressive
13 14
14 def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray: 15 def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray:
15 if snr_db is None: 16 if snr_db is None:
...@@ -57,14 +58,18 @@ class AugmentPipeline: ...@@ -57,14 +58,18 @@ class AugmentPipeline:
57 return mel 58 return mel
58 59
59 def __call__(self, y: np.ndarray) -> np.ndarray: 60 def __call__(self, y: np.ndarray) -> np.ndarray:
60 if random.random() < 0.5: 61 noise_p = 0.75 if self.aggressive else 0.5
61 y = self.add_noise(y) 62 stretch_p = 0.55 if self.aggressive else 0.3
62 if random.random() < 0.3: 63 pitch_p = 0.55 if self.aggressive else 0.3
63 y = self.time_stretch(y) 64 reverb_p = 0.35 if self.aggressive else 0.2
64 if random.random() < 0.3: 65 if random.random() < noise_p:
65 y = self.pitch_shift(y) 66 y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None)
66 if random.random() < 0.2: 67 if random.random() < stretch_p:
67 y = self.add_reverb(y) 68 y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None)
69 if random.random() < pitch_p:
70 y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None)
71 if random.random() < reverb_p:
72 y = self.add_reverb(y, decay=random.uniform(0.2, 0.6))
68 return y 73 return y
69 74
70 75
......
...@@ -187,6 +187,8 @@ def main(): ...@@ -187,6 +187,8 @@ def main():
187 num_classes=num_classes, 187 num_classes=num_classes,
188 aam_m=cfg["model"]["aam_m"], 188 aam_m=cfg["model"]["aam_m"],
189 aam_s=cfg["model"]["aam_s"], 189 aam_s=cfg["model"]["aam_s"],
190 use_band_split=cfg["model"].get("use_band_split", True),
191 band_split_channels=cfg["model"].get("band_split_channels", 128),
190 ).to(device) 192 ).to(device)
191 193
192 criterion = CombinedLoss( 194 criterion = CombinedLoss(
......
...@@ -21,3 +21,35 @@ ...@@ -21,3 +21,35 @@
21 - 已完成 1 epoch CPU 训练并生成 `best_model.pt` 21 - 已完成 1 epoch CPU 训练并生成 `best_model.pt`
22 - 已完成指纹索引与 embedding 索引构建 22 - 已完成指纹索引与 embedding 索引构建
23 - 已完成识别命令并输出 JSON 候选结果 23 - 已完成识别命令并输出 JSON 候选结果
24
25 ## 2026-06-02
26
27 ### Stage: 准确率优化 v2(128 Mel / band-split / retrieval 评测 / dataset 规范 / SOTA 调研)
28
29 完成项:
30 - 补充 dataset / 输入输出规范:`docs/dataset-spec.md`
31 - 补充开源数据集接入计划:`docs/open-dataset-plan.md`
32 - 补充 2026 SOTA 研究说明:`docs/sota-research-2026.md`
33 - 输入特征从低维说话人风格配置改为 `128 Mel`
34 - 新增频带分割模块 `BandSplitBlock`
35 - 引入 pro-WGAN 风格工程近似平衡策略(针对困难样本的更强增广)
36 - 合成数据新增 `confused` / `humming_like` 样本类型
37 - 引入 `catalog.json` 作为可搜索 reference 清单
38 - 索引从整曲单向量改为 window-level embedding index
39 - 新增 `evaluate.py` 做 retrieval 评测
40 - 训练逻辑改为更 retrieval-oriented 的 song-pair 训练输入
41
42 验证结果:
43 - synthetic_v2 端到端重新跑通
44 - build-index 成功
45 - evaluate 成功
46 - test split 指标:top1=0.65, top5=0.95
47 - 分类型指标:
48 - clean top1=1.00
49 - augmented top1=0.75
50 - humming_like top1=0.25
51 - confused top1=0.25
52
53 结论:
54 - 结构性错误(catalog/index/fusion/评测缺失)已明显改善
55 - 当前主要剩余短板是 humming_like / confused 的鲁棒识别
......
1 # ACR Dataset / 输入输出规范
2
3 > 更新:2026-06-02
4
5 ## 1. 目标
6
7 定义本项目数据集规范、输入输出处理流程、catalog/query 划分方式,以及训练/评测所需的 manifest 结构。
8
9 ## 2. 数据层对象
10
11 ### 2.1 Reference / Catalog
12 可检索曲库中的标准参考音频。
13
14 字段:
15
16 ```json
17 {
18 "song_id": "song_0001",
19 "audio_path": "songs/song_0001.wav",
20 "duration": 20.0,
21 "base_freq": 261.63,
22 "type": "reference"
23 }
24 ```
25
26 用途:
27 - 建立 chromaprint 索引
28 - 建立 embedding window 索引
29 - 作为检索目标集合
30
31 ### 2.2 Query Segment
32 待识别片段。
33
34 字段:
35
36 ```json
37 {
38 "song_id": "song_0001",
39 "audio_path": "segments/song_0001_seg_02_confused.wav",
40 "duration": 5.0,
41 "type": "confused",
42 "offset": 8.3,
43 "segment_type": "mid"
44 }
45 ```
46
47 用途:
48 - 训练片段对
49 - top-k 检索评测
50 - 鲁棒性测试
51
52 ## 3. Manifest 文件
53
54 | 文件 | 用途 |
55 |---|---|
56 | `train.json` | 训练查询片段 + 训练 reference |
57 | `val.json` | 验证查询片段 + 验证 reference |
58 | `test.json` | 测试查询片段 + 测试 reference |
59 | `catalog.json` | 可搜索 reference 总表 |
60
61 注意:
62 - `catalog.json` 是**检索索引输入**
63 - `train/val/test.json` 是**实验 split**
64 - 不再把 “模型训练 split” 和 “可搜索曲库” 混为一谈
65
66 ## 4. 输入特征规范
67
68 ### 4.1 输入音频
69 - 默认采样率:`16 kHz`
70 - 通道:`mono`
71 - 训练/query 窗长:`5s`
72 - 滑窗步长:`2.5s`
73
74 ### 4.2 声学特征
75 当前改为:
76 - **128维 Mel 频谱**
77
78 不再采用传统说话人任务常见的 40 维 MFCC 作为主输入,因为:
79 - 音乐任务更依赖频带结构与谐波信息
80 - Mel 频谱对音乐 timbre / harmony / texture 表达更自然
81 - 便于 band-split 模块对频带进行分块建模
82
83 ## 5. 输出规范
84
85 ### 5.1 训练输出
86 模型输出:
87 - `embedding: [B, D]`
88 - `logits: [B, num_classes]`(辅助分类头)
89
90 主要目标:
91 - retrieval embedding 学得稳定
92 - 同 song 片段彼此接近
93 - 不同 song 分离
94
95 ### 5.2 推理输出
96 识别输出:
97
98 ```json
99 {
100 "candidates": [
101 {
102 "song_id": "song_0001",
103 "confidence": 0.93,
104 "chromaprint_score": 0.88,
105 "ecapa_score": 0.96,
106 "accepted": true,
107 "metadata": {}
108 }
109 ],
110 "processing_time_ms": 120.4,
111 "num_candidates": 5
112 }
113 ```
114
115 ## 6. Query 类型定义
116
117 | type | 含义 |
118 |---|---|
119 | `clean` | 原始干净片段 |
120 | `augmented` | 常规增强片段 |
121 | `confused` | 强混淆/干扰片段 |
122 | `humming_like` | 哼唱风格近似片段 |
123 | `reference` | 标准参考整曲 |
124
125 ## 7. pro-WGAN 平衡策略(工程近似版)
126
127 当前仓库先实现的是**pro-WGAN 风格的数据平衡近似策略**,不是完整生成式 GAN 训练:
128
129 - 对难样本类型(`confused`, `humming_like`)增加更强增广概率
130 - 通过 harder augmentation 近似 minority/hard-case oversampling
131 - 保持 manifest 结构兼容,后续可替换成真正的生成式平衡器
132
133 后续若接入完整 GAN 平衡器,可把它作为:
134 - 离线样本扩增器
135 - 困难类别样本生成器
136 - catalog/query domain adaptation 工具
137
138 ## 8. 频带分割模块
139
140 输入层新增 `BandSplitBlock`:
141 - 将 128 个 Mel bins 按默认边界 16 / 32 / 64 / 96 / 128 分割为 5 个子频带
142 - 每个子带做独立投影(1x1 Conv1d + ReLU + BatchNorm)
143 - 拼接后经 1x1 卷积融合,再进入主干网络
144
145 目的:
146 - 强化低频节奏 / 中频和声 / 高频音色的分带建模
147 - 更符合音乐频谱结构
148 - 为后续更复杂 band-aware retrieval 打基础
1 # ACR / Music Retrieval SOTA Research (截至 2026-06-02)
2
3 ## 结论摘要
4
5 到 2025-2026,这个方向相比传统“从零训练一个小型 ECAPA embedding”已经明显前进了。
6
7 当前更强的方向主要有三类:
8
9 1. **Neural Audio Fingerprinting 的鲁棒训练增强**
10 2. **Music Foundation Model 作为 backbone / teacher**
11 3. **Band-split / band-aware 结构用于音乐频谱建模**
12
13 ## 1. Neural AFP 的更强实践
14
15 ### Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification (2025)
16 - arXiv: https://arxiv.org/abs/2506.22661
17
18 关键信息:
19 - 指出很多 neural AFP 工作对真实退化模拟不够真实
20 - 系统比较 metric learning 方法
21 - 发现自监督 triplet loss 变体在该任务中更优
22 - 强调多个 positive samples 对不同 loss 的影响不同
23
24 对本项目的启发:
25 - 不应只依赖当前简单 SupCon + CE
26 - 应增加更真实的退化增强
27 - 应明确做 retrieval 指标选择,而非只看分类头
28
29 ## 2. Music Foundation Model Backbones
30
31 ### Robust Neural Audio Fingerprinting using Music Foundation Models (2025)
32 - arXiv: https://arxiv.org/abs/2511.05399
33
34 关键信息:
35 - 使用预训练 music foundation model(例如 MuQ、MERT)作为 neural fingerprinting backbone
36 - 在 distorted / compressed / manipulated 音频条件下优于从零训练模型
37 - 还能更好做 segment-level localization
38
39 ### MERT (2023)
40 - arXiv: https://arxiv.org/abs/2306.00107
41
42 关键信息:
43 - 大规模自监督 music understanding 模型
44 - 在多个 music understanding 任务上达到强表现
45
46 ### MuQ (2025)
47 - arXiv: https://arxiv.org/abs/2501.01108
48
49 关键信息:
50 - 面向音乐的自监督表征学习模型
51 - 使用 Mel-RVQ 目标
52 - 在多种下游任务上优于更早工作
53
54 对本项目的启发:
55 - 2026 继续只用小模型从零训,不太可能是最佳路线
56 - 更合理路线:
57 - 当前仓库保留轻量自训 baseline
58 - 下一阶段增加 MERT / MuQ frozen encoder 或 adapter fine-tune 版本
59
60 ## 3. Band-split / band-aware 结构
61
62 ### Music Source Separation with Band-split RNN (2022)
63 - arXiv: https://arxiv.org/abs/2209.15174
64
65 关键信息:
66 - 显式把频谱切成多个频带再建模
67 - 对音乐任务优于直接照搬通用音频结构
68
69 虽然该文主要做 source separation,不是 ACR,但它对“音乐频带先验”很有启发。
70
71 对本项目的启发:
72 - 输入层加入 band-split 是合理工程方向
73 - 未来可继续发展成:
74 - band-aware attention
75 - multi-band retrieval heads
76 - harmonic/rhythm 双塔结构
77
78 ## 4. 数据平衡与生成增强
79
80 ### BAGAN: Data Augmentation with Balancing GAN (2018)
81 - arXiv: https://arxiv.org/abs/1803.09655
82
83 严格来说,本次调研没有找到与需求中提到的 `pro-WGAN` 同名、且在该任务里明确、权威、被广泛采用的主文献;目前最接近、且有明确权威来源的是 **BAGAN / balancing GAN** 这一类面向不平衡数据增强的方法。
84
85 因此本次实现采用的是:
86 - **pro-WGAN 风格的工程近似平衡策略**
87 - 并不声称已经复现某篇明确的 `pro-WGAN` SOTA 论文
88
89 后续如果确定了准确的论文或仓库,可以按对应版本精确对齐实现。
90
91 ## 5. 2026 年是否已经有更好的方案?
92
93 有,结论是:**有明显更好的路线**
94
95 最值得参考的是:
96 1. 用 **music foundation model** 做 backbone
97 2. 采用 **更真实退化模拟 + retrieval-first metric learning**
98 3. 采用 **segment-level / window-level indexing**,而不是整曲平均 embedding
99 4. 对哼唱任务增加 **melody/pitch contour 专门支路**
100
101 ## 6. 对本项目的建议排序
102
103 ### 当前阶段(已开始)
104 - 128 Mel 替换低维说话人风格输入
105 - band-split 输入层
106 - 更强混淆增强
107 - retrieval-first 评测
108
109 ### 下一阶段
110 - MERT / MuQ frozen feature baseline
111 - triplet / multi-positive metric learning 对比 SupCon
112 - window-level index aggregation
113 - FMA / Jamendo 小规模真实数据验证
114
115 ### 更后阶段
116 - humming 专门 melody tower
117 - foundation model + lightweight fingerprint head
118 - ANN + reranker 两阶段工业化检索
119
120 ## Sources
121 - Araz et al., 2025, Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification: https://arxiv.org/abs/2506.22661
122 - Singh et al., 2025, Robust Neural Audio Fingerprinting using Music Foundation Models: https://arxiv.org/abs/2511.05399
123 - Li et al., 2023, MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training: https://arxiv.org/abs/2306.00107
124 - Zhu et al., 2025, MuQ: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization: https://arxiv.org/abs/2501.01108
125 - Luo & Yu, 2022, Music Source Separation with Band-split RNN: https://arxiv.org/abs/2209.15174
126 - Mariani et al., 2018, BAGAN: Data Augmentation with Balancing GAN: https://arxiv.org/abs/1803.09655