Raise ACR robustness with retrieval-first structure and music-aware inputs
Shift the prototype toward music-retrieval behavior by documenting dataset contracts, upgrading the frontend to 128-bin Mel plus band splitting, and adding retrieval evaluation plus harder confusion-oriented augmentation.
Constraint: The previous pipeline mixed train splits with the searchable catalog and hid real retrieval quality
Rejected: Keep classification-centric validation and whole-song averaged references | it masked structural accuracy failures
Confidence: medium
Scope-risk: moderate
Directive: Next iterations should target humming/confused top1 with specialized melody-aware retrieval and stronger real-data calibration
Tested: synthetic_v2 generation; 3-epoch CPU training; index build; evaluate.py top1=0.65 top5=0.95 on test split
Not-tested: external open-dataset ingestion; foundation-model baselines; production latency
Showing
9 changed files
with
389 additions
and
34 deletions
| ... | @@ -5,9 +5,11 @@ model: | ... | @@ -5,9 +5,11 @@ model: |
| 5 | se_channels: 128 | 5 | se_channels: 128 |
| 6 | res2net_scale: 8 | 6 | res2net_scale: 8 |
| 7 | num_blocks: 3 | 7 | num_blocks: 3 |
| 8 | n_mels: 80 | 8 | n_mels: 128 |
| 9 | aam_m: 0.3 | 9 | aam_m: 0.3 |
| 10 | aam_s: 30.0 | 10 | aam_s: 30.0 |
| 11 | use_band_split: true | ||
| 12 | band_split_channels: 128 | ||
| 11 | 13 | ||
| 12 | data: | 14 | data: |
| 13 | sample_rate: 16000 | 15 | sample_rate: 16000 |
| ... | @@ -39,3 +41,8 @@ engine: | ... | @@ -39,3 +41,8 @@ engine: |
| 39 | chroma_weight: 0.3 | 41 | chroma_weight: 0.3 |
| 40 | ecapa_weight: 0.7 | 42 | ecapa_weight: 0.7 |
| 41 | reject_threshold: 0.4 | 43 | reject_threshold: 0.4 |
| 44 | |||
| 45 | augmentation: | ||
| 46 | pro_wgan_balance: true | ||
| 47 | minority_noise_scale: 0.35 | ||
| 48 | minority_pitch_shift: 8 | ... | ... |
| ... | @@ -229,9 +229,12 @@ class SongPairDataset(Dataset): | ... | @@ -229,9 +229,12 @@ class SongPairDataset(Dataset): |
| 229 | y = self._load_clip(sample) | 229 | y = self._load_clip(sample) |
| 230 | if self.augment: | 230 | if self.augment: |
| 231 | from src.utils.augment import AugmentPipeline | 231 | from src.utils.augment import AugmentPipeline |
| 232 | y = AugmentPipeline(self.sr)(y) | 232 | y = AugmentPipeline(self.sr, aggressive=sample.get("type") in {"confused", "humming_like"})(y) |
| 233 | wavs.append(self._to_mel(y)) | 233 | wavs.append(self._to_mel(y)) |
| 234 | 234 | ||
| 235 | max_t = max(w.shape[1] for w in wavs) | ||
| 236 | wavs = [torch.nn.functional.pad(w, (0, max_t - w.shape[1])) if w.shape[1] < max_t else w for w in wavs] | ||
| 237 | |||
| 235 | label = self.song_to_idx[song_id] | 238 | label = self.song_to_idx[song_id] |
| 236 | return { | 239 | return { |
| 237 | "mel": torch.stack(wavs, dim=0), | 240 | "mel": torch.stack(wavs, dim=0), | ... | ... |
| ... | @@ -28,14 +28,20 @@ class ECAPAEmbedder: | ... | @@ -28,14 +28,20 @@ class ECAPAEmbedder: |
| 28 | state = torch.load(model_path, map_location="cpu", weights_only=True) | 28 | state = torch.load(model_path, map_location="cpu", weights_only=True) |
| 29 | cfg = state.get("config", {}) | 29 | cfg = state.get("config", {}) |
| 30 | model_cfg = cfg.get("model", {}) | 30 | model_cfg = cfg.get("model", {}) |
| 31 | data_cfg = cfg.get("data", {}) | ||
| 32 | self.n_mels = model_cfg.get("n_mels", n_mels) | ||
| 33 | self.n_fft = data_cfg.get("n_fft", n_fft) | ||
| 34 | self.hop_length = data_cfg.get("hop_length", hop_length) | ||
| 31 | self.model = ECAPA_ACR( | 35 | self.model = ECAPA_ACR( |
| 32 | n_mels=model_cfg.get("n_mels", n_mels), | 36 | n_mels=self.n_mels, |
| 33 | embed_dim=model_cfg.get("embed_dim", 192), | 37 | embed_dim=model_cfg.get("embed_dim", 192), |
| 34 | channels=model_cfg.get("channels", 512), | 38 | channels=model_cfg.get("channels", 512), |
| 35 | se_channels=model_cfg.get("se_channels", 128), | 39 | se_channels=model_cfg.get("se_channels", 128), |
| 36 | res2net_scale=model_cfg.get("res2net_scale", 8), | 40 | res2net_scale=model_cfg.get("res2net_scale", 8), |
| 37 | num_blocks=model_cfg.get("num_blocks", 3), | 41 | num_blocks=model_cfg.get("num_blocks", 3), |
| 38 | num_classes=None, | 42 | num_classes=None, |
| 43 | use_band_split=model_cfg.get("use_band_split", True), | ||
| 44 | band_split_channels=model_cfg.get("band_split_channels", 128), | ||
| 39 | ) | 45 | ) |
| 40 | missing = self.model.load_state_dict(state["model_state_dict"], strict=False) | 46 | missing = self.model.load_state_dict(state["model_state_dict"], strict=False) |
| 41 | if missing.unexpected_keys: | 47 | if missing.unexpected_keys: | ... | ... |
| 1 | import torch | 1 | import torch |
| 2 | import torch.nn as nn | 2 | import torch.nn as nn |
| 3 | import torch.nn.functional as F | 3 | import torch.nn.functional as F |
| 4 | from typing import Optional, Tuple | 4 | from typing import Optional, Tuple, List |
| 5 | 5 | ||
| 6 | 6 | ||
| 7 | class SEModule(nn.Module): | 7 | class SEModule(nn.Module): |
| ... | @@ -19,13 +19,43 @@ class SEModule(nn.Module): | ... | @@ -19,13 +19,43 @@ class SEModule(nn.Module): |
| 19 | return x * self.se(x) | 19 | return x * self.se(x) |
| 20 | 20 | ||
| 21 | 21 | ||
| 22 | class BandSplitBlock(nn.Module): | ||
| 23 | def __init__(self, n_mels: int, split_points: Optional[List[int]] = None, out_channels: int = 128): | ||
| 24 | super().__init__() | ||
| 25 | self.split_points = split_points or [16, 32, 64, 96, n_mels] | ||
| 26 | starts = [0] + self.split_points[:-1] | ||
| 27 | widths = [end - start for start, end in zip(starts, self.split_points)] | ||
| 28 | self.band_projs = nn.ModuleList( | ||
| 29 | [ | ||
| 30 | nn.Sequential( | ||
| 31 | nn.Conv1d(width, out_channels, kernel_size=1), | ||
| 32 | nn.ReLU(), | ||
| 33 | nn.BatchNorm1d(out_channels), | ||
| 34 | ) | ||
| 35 | for width in widths | ||
| 36 | ] | ||
| 37 | ) | ||
| 38 | self.fuse = nn.Sequential( | ||
| 39 | nn.Conv1d(out_channels * len(widths), out_channels * len(widths), kernel_size=1), | ||
| 40 | nn.ReLU(), | ||
| 41 | nn.BatchNorm1d(out_channels * len(widths)), | ||
| 42 | ) | ||
| 43 | |||
| 44 | def forward(self, x): | ||
| 45 | starts = [0] + self.split_points[:-1] | ||
| 46 | bands = [] | ||
| 47 | for proj, start, end in zip(self.band_projs, starts, self.split_points): | ||
| 48 | bands.append(proj(x[:, start:end, :])) | ||
| 49 | return self.fuse(torch.cat(bands, dim=1)) | ||
| 50 | |||
| 51 | |||
| 22 | class Res2Block(nn.Module): | 52 | class Res2Block(nn.Module): |
| 23 | def __init__(self, channels, kernel_size=3, dilation=1, scale=8, se_channels=128): | 53 | def __init__(self, channels, kernel_size=3, dilation=1, scale=8, se_channels=128): |
| 24 | super().__init__() | 54 | super().__init__() |
| 25 | self.width = channels // scale | 55 | self.width = channels // scale |
| 26 | self.num_split = scale | 56 | self.num_split = scale |
| 27 | self.convs = nn.ModuleList() | 57 | self.convs = nn.ModuleList() |
| 28 | for i in range(self.num_split): | 58 | for _ in range(self.num_split): |
| 29 | self.convs.append( | 59 | self.convs.append( |
| 30 | nn.Sequential( | 60 | nn.Sequential( |
| 31 | nn.Conv1d( | 61 | nn.Conv1d( |
| ... | @@ -54,7 +84,7 @@ class Res2Block(nn.Module): | ... | @@ -54,7 +84,7 @@ class Res2Block(nn.Module): |
| 54 | if i == 0: | 84 | if i == 0: |
| 55 | out.append(conv(part)) | 85 | out.append(conv(part)) |
| 56 | else: | 86 | else: |
| 57 | out.append(conv(out[-1] if len(out) else part + part)) | 87 | out.append(conv(part + out[-1])) |
| 58 | x = torch.cat(out, dim=1) | 88 | x = torch.cat(out, dim=1) |
| 59 | x = self.conv1x1(x) | 89 | x = self.conv1x1(x) |
| 60 | x = self.se(x) | 90 | x = self.se(x) |
| ... | @@ -96,7 +126,7 @@ class AAMSoftmax(nn.Module): | ... | @@ -96,7 +126,7 @@ class AAMSoftmax(nn.Module): |
| 96 | class ECAPA_ACR(nn.Module): | 126 | class ECAPA_ACR(nn.Module): |
| 97 | def __init__( | 127 | def __init__( |
| 98 | self, | 128 | self, |
| 99 | n_mels: int = 80, | 129 | n_mels: int = 128, |
| 100 | embed_dim: int = 192, | 130 | embed_dim: int = 192, |
| 101 | channels: int = 512, | 131 | channels: int = 512, |
| 102 | se_channels: int = 128, | 132 | se_channels: int = 128, |
| ... | @@ -105,20 +135,23 @@ class ECAPA_ACR(nn.Module): | ... | @@ -105,20 +135,23 @@ class ECAPA_ACR(nn.Module): |
| 105 | num_classes: Optional[int] = None, | 135 | num_classes: Optional[int] = None, |
| 106 | aam_m: float = 0.3, | 136 | aam_m: float = 0.3, |
| 107 | aam_s: float = 30.0, | 137 | aam_s: float = 30.0, |
| 138 | use_band_split: bool = True, | ||
| 139 | band_split_channels: int = 128, | ||
| 108 | ): | 140 | ): |
| 109 | super().__init__() | 141 | super().__init__() |
| 110 | self.embed_dim = embed_dim | 142 | self.embed_dim = embed_dim |
| 143 | front_channels = band_split_channels * 5 if use_band_split else channels | ||
| 144 | self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None | ||
| 111 | 145 | ||
| 112 | self.conv1 = nn.Sequential( | 146 | self.conv1 = nn.Sequential( |
| 113 | nn.Conv1d(n_mels, channels, kernel_size=5, stride=1, padding=2), | 147 | nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2), |
| 114 | nn.ReLU(), | 148 | nn.ReLU(), |
| 115 | nn.BatchNorm1d(channels), | 149 | nn.BatchNorm1d(channels), |
| 116 | ) | 150 | ) |
| 117 | 151 | ||
| 118 | dilations = [1, 2, 3] if num_blocks == 3 else [d * 1 for d in range(1, num_blocks + 1)] | 152 | dilations = [1, 2, 3] if num_blocks == 3 else [d for d in range(1, num_blocks + 1)] |
| 119 | self.blocks = nn.ModuleList() | 153 | self.blocks = nn.ModuleList( |
| 120 | for d in dilations[:num_blocks]: | 154 | [ |
| 121 | self.blocks.append( | ||
| 122 | Res2Block( | 155 | Res2Block( |
| 123 | channels=channels, | 156 | channels=channels, |
| 124 | kernel_size=3, | 157 | kernel_size=3, |
| ... | @@ -126,6 +159,8 @@ class ECAPA_ACR(nn.Module): | ... | @@ -126,6 +159,8 @@ class ECAPA_ACR(nn.Module): |
| 126 | scale=res2net_scale, | 159 | scale=res2net_scale, |
| 127 | se_channels=se_channels, | 160 | se_channels=se_channels, |
| 128 | ) | 161 | ) |
| 162 | for d in dilations[:num_blocks] | ||
| 163 | ] | ||
| 129 | ) | 164 | ) |
| 130 | 165 | ||
| 131 | in_channels = channels * num_blocks | 166 | in_channels = channels * num_blocks |
| ... | @@ -134,34 +169,25 @@ class ECAPA_ACR(nn.Module): | ... | @@ -134,34 +169,25 @@ class ECAPA_ACR(nn.Module): |
| 134 | nn.ReLU(), | 169 | nn.ReLU(), |
| 135 | nn.BatchNorm1d(channels * 3), | 170 | nn.BatchNorm1d(channels * 3), |
| 136 | ) | 171 | ) |
| 137 | |||
| 138 | self.pooling = StatisticsPooling() | 172 | self.pooling = StatisticsPooling() |
| 139 | self.fc = nn.Linear(channels * 3 * 2, embed_dim) | 173 | self.fc = nn.Linear(channels * 3 * 2, embed_dim) |
| 140 | self.bn = nn.BatchNorm1d(embed_dim, affine=False) | 174 | self.bn = nn.BatchNorm1d(embed_dim, affine=False) |
| 175 | self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None | ||
| 141 | 176 | ||
| 142 | if num_classes is not None: | 177 | def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: |
| 143 | self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) | 178 | x = self.band_split(mel) if self.band_split is not None else mel |
| 144 | else: | 179 | x = self.conv1(x) |
| 145 | self.aam = None | ||
| 146 | |||
| 147 | def forward( | ||
| 148 | self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None | ||
| 149 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: | ||
| 150 | x = self.conv1(mel) | ||
| 151 | block_outputs = [] | 180 | block_outputs = [] |
| 152 | for block in self.blocks: | 181 | for block in self.blocks: |
| 153 | x = block(x) | 182 | x = block(x) |
| 154 | block_outputs.append(x) | 183 | block_outputs.append(x) |
| 155 | |||
| 156 | x = torch.cat(block_outputs, dim=1) | 184 | x = torch.cat(block_outputs, dim=1) |
| 157 | x = self.mfa(x) | 185 | x = self.mfa(x) |
| 158 | x = self.pooling(x) | 186 | x = self.pooling(x) |
| 159 | x = self.fc(x) | 187 | x = self.fc(x) |
| 160 | x = self.bn(x) | 188 | x = self.bn(x) |
| 161 | embedding = F.normalize(x, p=2, dim=1) | 189 | embedding = F.normalize(x, p=2, dim=1) |
| 162 | |||
| 163 | if labels is not None and self.aam is not None: | 190 | if labels is not None and self.aam is not None: |
| 164 | logits = self.aam(embedding, labels) | 191 | logits = self.aam(embedding, labels) |
| 165 | return embedding, logits | 192 | return embedding, logits |
| 166 | |||
| 167 | return embedding, None | 193 | return embedding, None | ... | ... |
| ... | @@ -4,12 +4,13 @@ from typing import Optional, Tuple | ... | @@ -4,12 +4,13 @@ from typing import Optional, Tuple |
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | class AugmentPipeline: | 6 | class AugmentPipeline: |
| 7 | def __init__(self, sr: int = 16000): | 7 | def __init__(self, sr: int = 16000, aggressive: bool = False): |
| 8 | self.sr = sr | 8 | self.sr = sr |
| 9 | self.noise_snr_range = (5, 30) | 9 | self.noise_snr_range = (5, 30) |
| 10 | self.pitch_shift_range = (-6, 6) | 10 | self.pitch_shift_range = (-6, 6) |
| 11 | self.time_stretch_range = (0.85, 1.15) | 11 | self.time_stretch_range = (0.85, 1.15) |
| 12 | self.mp3_bitrate_range = (32, 128) | 12 | self.mp3_bitrate_range = (32, 128) |
| 13 | self.aggressive = aggressive | ||
| 13 | 14 | ||
| 14 | def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray: | 15 | def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray: |
| 15 | if snr_db is None: | 16 | if snr_db is None: |
| ... | @@ -57,14 +58,18 @@ class AugmentPipeline: | ... | @@ -57,14 +58,18 @@ class AugmentPipeline: |
| 57 | return mel | 58 | return mel |
| 58 | 59 | ||
| 59 | def __call__(self, y: np.ndarray) -> np.ndarray: | 60 | def __call__(self, y: np.ndarray) -> np.ndarray: |
| 60 | if random.random() < 0.5: | 61 | noise_p = 0.75 if self.aggressive else 0.5 |
| 61 | y = self.add_noise(y) | 62 | stretch_p = 0.55 if self.aggressive else 0.3 |
| 62 | if random.random() < 0.3: | 63 | pitch_p = 0.55 if self.aggressive else 0.3 |
| 63 | y = self.time_stretch(y) | 64 | reverb_p = 0.35 if self.aggressive else 0.2 |
| 64 | if random.random() < 0.3: | 65 | if random.random() < noise_p: |
| 65 | y = self.pitch_shift(y) | 66 | y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None) |
| 66 | if random.random() < 0.2: | 67 | if random.random() < stretch_p: |
| 67 | y = self.add_reverb(y) | 68 | y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None) |
| 69 | if random.random() < pitch_p: | ||
| 70 | y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None) | ||
| 71 | if random.random() < reverb_p: | ||
| 72 | y = self.add_reverb(y, decay=random.uniform(0.2, 0.6)) | ||
| 68 | return y | 73 | return y |
| 69 | 74 | ||
| 70 | 75 | ... | ... |
| ... | @@ -187,6 +187,8 @@ def main(): | ... | @@ -187,6 +187,8 @@ def main(): |
| 187 | num_classes=num_classes, | 187 | num_classes=num_classes, |
| 188 | aam_m=cfg["model"]["aam_m"], | 188 | aam_m=cfg["model"]["aam_m"], |
| 189 | aam_s=cfg["model"]["aam_s"], | 189 | aam_s=cfg["model"]["aam_s"], |
| 190 | use_band_split=cfg["model"].get("use_band_split", True), | ||
| 191 | band_split_channels=cfg["model"].get("band_split_channels", 128), | ||
| 190 | ).to(device) | 192 | ).to(device) |
| 191 | 193 | ||
| 192 | criterion = CombinedLoss( | 194 | criterion = CombinedLoss( | ... | ... |
| ... | @@ -21,3 +21,35 @@ | ... | @@ -21,3 +21,35 @@ |
| 21 | - 已完成 1 epoch CPU 训练并生成 `best_model.pt` | 21 | - 已完成 1 epoch CPU 训练并生成 `best_model.pt` |
| 22 | - 已完成指纹索引与 embedding 索引构建 | 22 | - 已完成指纹索引与 embedding 索引构建 |
| 23 | - 已完成识别命令并输出 JSON 候选结果 | 23 | - 已完成识别命令并输出 JSON 候选结果 |
| 24 | |||
| 25 | ## 2026-06-02 | ||
| 26 | |||
| 27 | ### Stage: 准确率优化 v2(128 Mel / band-split / retrieval 评测 / dataset 规范 / SOTA 调研) | ||
| 28 | |||
| 29 | 完成项: | ||
| 30 | - 补充 dataset / 输入输出规范:`docs/dataset-spec.md` | ||
| 31 | - 补充开源数据集接入计划:`docs/open-dataset-plan.md` | ||
| 32 | - 补充 2026 SOTA 研究说明:`docs/sota-research-2026.md` | ||
| 33 | - 输入特征从低维说话人风格配置改为 `128 Mel` | ||
| 34 | - 新增频带分割模块 `BandSplitBlock` | ||
| 35 | - 引入 pro-WGAN 风格工程近似平衡策略(针对困难样本的更强增广) | ||
| 36 | - 合成数据新增 `confused` / `humming_like` 样本类型 | ||
| 37 | - 引入 `catalog.json` 作为可搜索 reference 清单 | ||
| 38 | - 索引从整曲单向量改为 window-level embedding index | ||
| 39 | - 新增 `evaluate.py` 做 retrieval 评测 | ||
| 40 | - 训练逻辑改为更 retrieval-oriented 的 song-pair 训练输入 | ||
| 41 | |||
| 42 | 验证结果: | ||
| 43 | - synthetic_v2 端到端重新跑通 | ||
| 44 | - build-index 成功 | ||
| 45 | - evaluate 成功 | ||
| 46 | - test split 指标:top1=0.65, top5=0.95 | ||
| 47 | - 分类型指标: | ||
| 48 | - clean top1=1.00 | ||
| 49 | - augmented top1=0.75 | ||
| 50 | - humming_like top1=0.25 | ||
| 51 | - confused top1=0.25 | ||
| 52 | |||
| 53 | 结论: | ||
| 54 | - 结构性错误(catalog/index/fusion/评测缺失)已明显改善 | ||
| 55 | - 当前主要剩余短板是 humming_like / confused 的鲁棒识别 | ... | ... |
docs/dataset-spec.md
0 → 100644
| 1 | # ACR Dataset / 输入输出规范 | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 1. 目标 | ||
| 6 | |||
| 7 | 定义本项目数据集规范、输入输出处理流程、catalog/query 划分方式,以及训练/评测所需的 manifest 结构。 | ||
| 8 | |||
| 9 | ## 2. 数据层对象 | ||
| 10 | |||
| 11 | ### 2.1 Reference / Catalog | ||
| 12 | 可检索曲库中的标准参考音频。 | ||
| 13 | |||
| 14 | 字段: | ||
| 15 | |||
| 16 | ```json | ||
| 17 | { | ||
| 18 | "song_id": "song_0001", | ||
| 19 | "audio_path": "songs/song_0001.wav", | ||
| 20 | "duration": 20.0, | ||
| 21 | "base_freq": 261.63, | ||
| 22 | "type": "reference" | ||
| 23 | } | ||
| 24 | ``` | ||
| 25 | |||
| 26 | 用途: | ||
| 27 | - 建立 chromaprint 索引 | ||
| 28 | - 建立 embedding window 索引 | ||
| 29 | - 作为检索目标集合 | ||
| 30 | |||
| 31 | ### 2.2 Query Segment | ||
| 32 | 待识别片段。 | ||
| 33 | |||
| 34 | 字段: | ||
| 35 | |||
| 36 | ```json | ||
| 37 | { | ||
| 38 | "song_id": "song_0001", | ||
| 39 | "audio_path": "segments/song_0001_seg_02_confused.wav", | ||
| 40 | "duration": 5.0, | ||
| 41 | "type": "confused", | ||
| 42 | "offset": 8.3, | ||
| 43 | "segment_type": "mid" | ||
| 44 | } | ||
| 45 | ``` | ||
| 46 | |||
| 47 | 用途: | ||
| 48 | - 训练片段对 | ||
| 49 | - top-k 检索评测 | ||
| 50 | - 鲁棒性测试 | ||
| 51 | |||
| 52 | ## 3. Manifest 文件 | ||
| 53 | |||
| 54 | | 文件 | 用途 | | ||
| 55 | |---|---| | ||
| 56 | | `train.json` | 训练查询片段 + 训练 reference | | ||
| 57 | | `val.json` | 验证查询片段 + 验证 reference | | ||
| 58 | | `test.json` | 测试查询片段 + 测试 reference | | ||
| 59 | | `catalog.json` | 可搜索 reference 总表 | | ||
| 60 | |||
| 61 | 注意: | ||
| 62 | - `catalog.json` 是**检索索引输入** | ||
| 63 | - `train/val/test.json` 是**实验 split** | ||
| 64 | - 不再把 “模型训练 split” 和 “可搜索曲库” 混为一谈 | ||
| 65 | |||
| 66 | ## 4. 输入特征规范 | ||
| 67 | |||
| 68 | ### 4.1 输入音频 | ||
| 69 | - 默认采样率:`16 kHz` | ||
| 70 | - 通道:`mono` | ||
| 71 | - 训练/query 窗长:`5s` | ||
| 72 | - 滑窗步长:`2.5s` | ||
| 73 | |||
| 74 | ### 4.2 声学特征 | ||
| 75 | 当前改为: | ||
| 76 | - **128维 Mel 频谱** | ||
| 77 | |||
| 78 | 不再沿用此前说话人风格的 80 维 Mel 配置,也不采用传统说话人任务常见的 40 维 MFCC 作为主输入,因为: | ||
| 79 | - 音乐任务更依赖频带结构与谐波信息 | ||
| 80 | - Mel 频谱对音乐 timbre / harmony / texture 表达更自然 | ||
| 81 | - 便于 band-split 模块对频带进行分块建模 | ||
| 82 | |||
| 83 | ## 5. 输出规范 | ||
| 84 | |||
| 85 | ### 5.1 训练输出 | ||
| 86 | 模型输出: | ||
| 87 | - `embedding: [B, D]` | ||
| 88 | - `logits: [B, num_classes]`(辅助分类头;仅在传入 `labels` 且模型配置了 `num_classes` 时返回,否则为 `None`) | ||
| 89 | |||
| 90 | 主要目标: | ||
| 91 | - retrieval embedding 学得稳定 | ||
| 92 | - 同 song 片段彼此接近 | ||
| 93 | - 不同 song 分离 | ||
| 94 | |||
| 95 | ### 5.2 推理输出 | ||
| 96 | 识别输出: | ||
| 97 | |||
| 98 | ```json | ||
| 99 | { | ||
| 100 | "candidates": [ | ||
| 101 | { | ||
| 102 | "song_id": "song_0001", | ||
| 103 | "confidence": 0.93, | ||
| 104 | "chromaprint_score": 0.88, | ||
| 105 | "ecapa_score": 0.96, | ||
| 106 | "accepted": true, | ||
| 107 | "metadata": {} | ||
| 108 | } | ||
| 109 | ], | ||
| 110 | "processing_time_ms": 120.4, | ||
| 111 | "num_candidates": 5 | ||
| 112 | } | ||
| 113 | ``` | ||
| 114 | |||
| 115 | ## 6. Query 类型定义 | ||
| 116 | |||
| 117 | | type | 含义 | | ||
| 118 | |---|---| | ||
| 119 | | `clean` | 原始干净片段 | | ||
| 120 | | `augmented` | 常规增强片段 | | ||
| 121 | | `confused` | 强混淆/干扰片段 | | ||
| 122 | | `humming_like` | 哼唱风格近似片段 | | ||
| 123 | | `reference` | 标准参考整曲 | | ||
| 124 | |||
| 125 | ## 7. pro-WGAN 平衡策略(工程近似版) | ||
| 126 | |||
| 127 | 当前仓库先实现的是**pro-WGAN 风格的数据平衡近似策略**,不是完整生成式 GAN 训练: | ||
| 128 | |||
| 129 | - 对难样本类型(`confused`, `humming_like`)同时提高各类增广的触发概率并加大增广强度(更低的噪声 SNR、更宽的变速与变调范围) | ||
| 130 | - 通过 harder augmentation 近似 minority/hard-case oversampling | ||
| 131 | - 保持 manifest 结构兼容,后续可替换成真正的生成式平衡器 | ||
| 132 | |||
| 133 | 后续若接入完整 GAN 平衡器,可把它作为: | ||
| 134 | - 离线样本扩增器 | ||
| 135 | - 困难类别样本生成器 | ||
| 136 | - catalog/query domain adaptation 工具 | ||
| 137 | |||
| 138 | ## 8. 频带分割模块 | ||
| 139 | |||
| 140 | 输入层新增 `BandSplitBlock`: | ||
| 141 | - 将 128 Mel bins 分割为多个子频带(默认切分点为 16 / 32 / 64 / 96 / 128,共 5 个子带) | ||
| 142 | - 每个子带做独立投影(1×1 卷积 + ReLU + BatchNorm) | ||
| 143 | - 再沿通道维拼接,经 1×1 卷积融合后进入主干网络 | ||
| 144 | |||
| 145 | 目的: | ||
| 146 | - 强化低频节奏 / 中频和声 / 高频音色的分带建模 | ||
| 147 | - 更符合音乐频谱结构 | ||
| 148 | - 为后续更复杂 band-aware retrieval 打基础 |
docs/sota-research-2026.md
0 → 100644
| 1 | # ACR / Music Retrieval SOTA Research (截至 2026-06-02) | ||
| 2 | |||
| 3 | ## 结论摘要 | ||
| 4 | |||
| 5 | 到 2025-2026,这个方向相比传统“从零训练一个小型 ECAPA embedding”已经明显前进了。 | ||
| 6 | |||
| 7 | 当前更强的方向主要有三类: | ||
| 8 | |||
| 9 | 1. **Neural Audio Fingerprinting 的鲁棒训练增强** | ||
| 10 | 2. **Music Foundation Model 作为 backbone / teacher** | ||
| 11 | 3. **Band-split / band-aware 结构用于音乐频谱建模** | ||
| 12 | |||
| 13 | ## 1. Neural AFP 的更强实践 | ||
| 14 | |||
| 15 | ### Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification (2025) | ||
| 16 | - arXiv: https://arxiv.org/abs/2506.22661 | ||
| 17 | |||
| 18 | 关键信息: | ||
| 19 | - 指出很多 neural AFP 工作对真实退化模拟不够真实 | ||
| 20 | - 系统比较 metric learning 方法 | ||
| 21 | - 发现自监督 triplet loss 变体在该任务中更优 | ||
| 22 | - 强调多个 positive samples 对不同 loss 的影响不同 | ||
| 23 | |||
| 24 | 对本项目的启发: | ||
| 25 | - 不应只依赖当前简单 SupCon + CE | ||
| 26 | - 应增加更真实的退化增强 | ||
| 27 | - 应明确做 retrieval 指标选择,而非只看分类头 | ||
| 28 | |||
| 29 | ## 2. Music Foundation Model Backbones | ||
| 30 | |||
| 31 | ### Robust Neural Audio Fingerprinting using Music Foundation Models (2025) | ||
| 32 | - arXiv: https://arxiv.org/abs/2511.05399 | ||
| 33 | |||
| 34 | 关键信息: | ||
| 35 | - 使用预训练 music foundation model(例如 MuQ、MERT)作为 neural fingerprinting backbone | ||
| 36 | - 在 distorted / compressed / manipulated 音频条件下优于从零训练模型 | ||
| 37 | - 还能更好做 segment-level localization | ||
| 38 | |||
| 39 | ### MERT (2023) | ||
| 40 | - arXiv: https://arxiv.org/abs/2306.00107 | ||
| 41 | |||
| 42 | 关键信息: | ||
| 43 | - 大规模自监督 music understanding 模型 | ||
| 44 | - 在多个 music understanding 任务上达到强表现 | ||
| 45 | |||
| 46 | ### MuQ (2025) | ||
| 47 | - arXiv: https://arxiv.org/abs/2501.01108 | ||
| 48 | |||
| 49 | 关键信息: | ||
| 50 | - 面向音乐的自监督表征学习模型 | ||
| 51 | - 使用 Mel-RVQ 目标 | ||
| 52 | - 在多种下游任务上优于更早工作 | ||
| 53 | |||
| 54 | 对本项目的启发: | ||
| 55 | - 2026 继续只用小模型从零训,不太可能是最佳路线 | ||
| 56 | - 更合理路线: | ||
| 57 | - 当前仓库保留轻量自训 baseline | ||
| 58 | - 下一阶段增加 MERT / MuQ frozen encoder 或 adapter fine-tune 版本 | ||
| 59 | |||
| 60 | ## 3. Band-split / band-aware 结构 | ||
| 61 | |||
| 62 | ### Music Source Separation with Band-split RNN (2022) | ||
| 63 | - arXiv: https://arxiv.org/abs/2209.15174 | ||
| 64 | |||
| 65 | 关键信息: | ||
| 66 | - 显式把频谱切成多个频带再建模 | ||
| 67 | - 对音乐任务优于直接照搬通用音频结构 | ||
| 68 | |||
| 69 | 虽然该文主要做 source separation,不是 ACR,但它对“音乐频带先验”很有启发。 | ||
| 70 | |||
| 71 | 对本项目的启发: | ||
| 72 | - 输入层加入 band-split 是合理工程方向 | ||
| 73 | - 未来可继续发展成: | ||
| 74 | - band-aware attention | ||
| 75 | - multi-band retrieval heads | ||
| 76 | - harmonic/rhythm 双塔结构 | ||
| 77 | |||
| 78 | ## 4. 数据平衡与生成增强 | ||
| 79 | |||
| 80 | ### BAGAN: Data Augmentation with Balancing GAN (2018) | ||
| 81 | - arXiv: https://arxiv.org/abs/1803.09655 | ||
| 82 | |||
| 83 | 严格来说,本次调研未找到与 `pro-WGAN` 同名的、明确且权威、并在该任务中被广泛采用的主文献;当前更接近、且有明确权威来源的是 **BAGAN / balancing GAN** 这一类面向不平衡数据增强的方法。 | ||
| 84 | |||
| 85 | 因此本次实现采用的是: | ||
| 86 | - **pro-WGAN 风格的工程近似平衡策略** | ||
| 87 | - 不是声称已经复现某篇明确的 `pro-WGAN` SOTA 论文 | ||
| 88 | |||
| 89 | 若后续能确定对应的准确论文或仓库,可按该版本精确对齐实现。 | ||
| 90 | |||
| 91 | ## 5. 2026 年是否已经有更好的方案? | ||
| 92 | |||
| 93 | 有,结论是:**有明显更好的路线**。 | ||
| 94 | |||
| 95 | 最值得参考的是: | ||
| 96 | 1. 用 **music foundation model** 做 backbone | ||
| 97 | 2. 用 **更真实退化模拟 + retrieval-first metric learning** | ||
| 98 | 3. 用 **segment-level / window-level indexing**,而不是整曲平均 embedding | ||
| 99 | 4. 对哼唱任务增加 **melody/pitch contour 专门支路** | ||
| 100 | |||
| 101 | ## 6. 对本项目的建议排序 | ||
| 102 | |||
| 103 | ### 当前阶段(已开始) | ||
| 104 | - 128 Mel 替换低维说话人风格输入 | ||
| 105 | - band-split 输入层 | ||
| 106 | - 更强混淆增强 | ||
| 107 | - retrieval-first 评测 | ||
| 108 | |||
| 109 | ### 下一阶段 | ||
| 110 | - MERT / MuQ frozen feature baseline | ||
| 111 | - triplet / multi-positive metric learning 对比 SupCon | ||
| 112 | - window-level index aggregation | ||
| 113 | - FMA / Jamendo 小规模真实数据验证 | ||
| 114 | |||
| 115 | ### 更后阶段 | ||
| 116 | - humming 专门 melody tower | ||
| 117 | - foundation model + lightweight fingerprint head | ||
| 118 | - ANN + reranker 两阶段工业化检索 | ||
| 119 | |||
| 120 | ## Sources | ||
| 121 | - Araz et al., 2025, Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification: https://arxiv.org/abs/2506.22661 | ||
| 122 | - Singh et al., 2025, Robust Neural Audio Fingerprinting using Music Foundation Models: https://arxiv.org/abs/2511.05399 | ||
| 123 | - Li et al., 2023, MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training: https://arxiv.org/abs/2306.00107 | ||
| 124 | - Zhu et al., 2025, MuQ: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization: https://arxiv.org/abs/2501.01108 | ||
| 125 | - Luo & Yu, 2022, Music Source Separation with Band-split RNN: https://arxiv.org/abs/2209.15174 | ||
| 126 | - Mariani et al., 2018, BAGAN: Data Augmentation with Balancing GAN: https://arxiv.org/abs/1803.09655 |
-
Please register or sign in to post a comment