Commit 7da76864 7da76864361f72a1428d2b36aeea2f283d8945e6 by 章晓祥

-

1 parent 3ff5efd2
Showing 34 changed files with 1580 additions and 108 deletions
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-GlEnjnf09lXwiJuwDS5Q0nOzGd1ck8YBDERVXv84t9hvtS0U",
"ANTHROPIC_BASE_URL": "https://aiapis.help",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "gpt-5.4",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "claude-opus-4.6",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4.6",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4.6",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4.5",
"CLAUDE_CODE_SUBAGENT_MODEL": "claude-sonnet-4.6",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "qwen3.7-max",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "qwen3.7-max",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "qwen3.6-plus",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "qwen3.6-plus",
"CLAUDE_CODE_SUBAGENT_MODEL": "qwen3.6-plus",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
---
# CoverHunter fine-tuning configuration (model name: coverhunter_finetune).

# Model hyper-parameters; train.py reads keys from this section
# (for example aam_s, use_band_split, coverhunter_heads, mert_model_name)
# when it constructs the model.
model:
  name: coverhunter_finetune
  embed_dim: 256
  channels: 512
  se_channels: 128
  res2net_scale: 8
  num_blocks: 3
  n_mels: 128
  aam_m: 0.2
  aam_s: 30.0
  use_band_split: false
  band_split_channels: 128
  use_dual_stream: true
  mert_melody_branch: true
  ecapa_branch: true
  coverhunter_heads: 8
  coverhunter_layers: 4
  fusion_hidden_dim: 256
  mert_model_name: m-a-p/MERT-v1-95M

# Audio front-end settings.
data:
  sample_rate: 16000
  n_fft: 512
  hop_length: 160
  segment_dur: 8.0
  crop_per_song: 6

# Optimisation settings. hard_negative_k, sample_type_weights and
# pair_type_weights are looked up by train.py under this section.
training:
  batch_size: 16
  epochs: 30
  lr: 0.0002
  weight_decay: 0.0001
  warmup_epochs: 3
  temperature: 0.05
  supcon_weight: 1.0
  aam_weight: 0.2
  mixed_precision: true
  gradient_clip: 1.0
  save_every: 5
  log_every: 10
  hard_negative_k: 4
  sample_type_weights:
    default: 1
    compressed: 2
    recording: 3
    environment: 4
  pair_type_weights:
    default: 1.0
    compressed: 1.5
    recording: 2.0
    environment: 3.0
---
# CoverHunter fine-tuning configuration, reduced-size variant
# (model name: coverhunter_finetune_lowmem).

# Model hyper-parameters; train.py reads keys from this section
# (for example aam_s, use_band_split, coverhunter_heads, mert_model_name)
# when it constructs the model.
model:
  name: coverhunter_finetune_lowmem
  embed_dim: 192
  channels: 256
  se_channels: 64
  res2net_scale: 4
  num_blocks: 2
  n_mels: 96
  aam_m: 0.2
  aam_s: 24.0
  use_band_split: false
  band_split_channels: 64
  use_dual_stream: true
  mert_melody_branch: true
  ecapa_branch: true
  coverhunter_heads: 4
  coverhunter_layers: 2
  fusion_hidden_dim: 128
  mert_model_name: m-a-p/MERT-v1-95M

# Audio front-end settings.
data:
  sample_rate: 16000
  n_fft: 512
  hop_length: 160
  segment_dur: 5.0
  crop_per_song: 4

# Optimisation settings. hard_negative_k, sample_type_weights and
# pair_type_weights are looked up by train.py under this section.
training:
  batch_size: 2
  epochs: 20
  lr: 0.00015
  weight_decay: 0.0001
  warmup_epochs: 2
  temperature: 0.05
  supcon_weight: 1.0
  aam_weight: 0.2
  mixed_precision: true
  gradient_clip: 1.0
  save_every: 5
  log_every: 10
  hard_negative_k: 2
  sample_type_weights:
    default: 1
    compressed: 2
    recording: 3
    environment: 4
  pair_type_weights:
    default: 1.0
    compressed: 1.4
    recording: 1.8
    environment: 2.2
......@@ -10,6 +10,13 @@ model:
aam_s: 30.0
use_band_split: true
band_split_channels: 128
use_dual_stream: true
mert_melody_branch: true
ecapa_branch: true
coverhunter_heads: 4
coverhunter_layers: 2
fusion_hidden_dim: 256
mert_model_name: m-a-p/MERT-v1-95M
data:
sample_rate: 16000
......@@ -31,15 +38,17 @@ training:
gradient_clip: 1.0
save_every: 10
log_every: 10
hard_negative_k: 2
sample_type_weights:
default: 1
humming_like: 3
confused: 5
compressed: 2
recording: 3
environment: 4
pair_type_weights:
default: 1.0
augmented: 1.4
humming_like: 2.5
confused: 4.0
compressed: 1.5
recording: 2.0
environment: 2.5
engine:
chromaprint:
......
......@@ -2,6 +2,10 @@ numpy>=1.26
PyYAML>=6.0
soundfile>=0.12
librosa>=0.10
audiomentations>=0.37
transformers>=4.46
huggingface_hub>=0.26
torchaudio>=2.3
tqdm>=4.66
torch>=2.3
fastapi>=0.115
......
#!/usr/bin/env python3
import argparse
import json
import subprocess
from datetime import datetime
from pathlib import Path
# Interpreter used to launch train.py when --python is not given on the command line.
DEFAULT_PYTHON = "/usr/local/miniconda3/bin/python"
def main():
    """Launch ``train.py`` in a subprocess and record the run on disk.

    A run directory ``<output-root>/<run-name>`` is created (the run name
    defaults to ``coverhunter_finetune_<UTC timestamp>``).  The following files
    are written into it:

    * ``run_request.json`` - the command and arguments, written before launch
    * ``stdout.log`` / ``stderr.log`` - the child's output
    * ``run_summary.json`` - request metadata plus return code, completion
      time and a sorted listing of the run directory

    Raises:
        SystemExit: with the child's return code when it is non-zero.
    """
    # Function-scope import: the file header only imports ``datetime`` itself.
    from datetime import timezone

    def utc_iso() -> str:
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # tzinfo is dropped again so the text keeps the original "...Z" shape.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    parser = argparse.ArgumentParser()
    parser.add_argument("--python", default=DEFAULT_PYTHON)
    parser.add_argument("--config", default="configs/coverhunter_finetune_4gb.yaml")
    parser.add_argument("--data", required=True)
    parser.add_argument("--output-root", default="data/training_runs")
    parser.add_argument("--run-name", default=None)
    parser.add_argument("--noise-root", action="append", default=[])
    parser.add_argument("--device", default="auto")
    parser.add_argument("--segment-strategy", default="hybrid")
    parser.add_argument("--resume", default=None)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    run_name = args.run_name or f"coverhunter_finetune_{timestamp}"
    run_dir = Path(args.output_root) / run_name
    run_dir.mkdir(parents=True, exist_ok=True)

    command = [
        args.python,
        "train.py",
        "--config",
        args.config,
        "--data",
        args.data,
        "--output",
        str(run_dir),
        "--device",
        args.device,
        "--segment-strategy",
        args.segment_strategy,
    ]
    if args.resume:
        command.extend(["--resume", args.resume])
    if args.dry_run:
        command.append("--dry-run")
    for noise_root in args.noise_root:
        command.extend(["--noise-root", noise_root])

    metadata = {
        "run_name": run_name,
        "created_at": utc_iso(),
        "python": args.python,
        "command": command,
        "config": args.config,
        "data": args.data,
        "noise_roots": args.noise_root,
        "run_dir": str(run_dir),
    }
    with open(run_dir / "run_request.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Stream the child's output straight into the log files instead of
    # capturing it: a long training run is then never buffered in memory, and
    # the logs exist on disk even if this launcher is interrupted.
    with open(run_dir / "stdout.log", "w") as stdout_log, open(run_dir / "stderr.log", "w") as stderr_log:
        result = subprocess.run(
            command,
            # train.py is resolved relative to the parent of this script's directory.
            cwd=Path(__file__).resolve().parents[1],
            stdout=stdout_log,
            stderr=stderr_log,
        )

    summary = {
        **metadata,
        "returncode": result.returncode,
        "completed_at": utc_iso(),
        "artifacts": sorted(path.name for path in run_dir.iterdir()),
    }
    with open(run_dir / "run_summary.json", "w") as f:
        json.dump(summary, f, indent=2)

    # Propagate the child's failure to the caller's shell.
    if result.returncode != 0:
        raise SystemExit(result.returncode)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import json
import subprocess
from pathlib import Path
# Interpreter used for pip and the import check unless --python overrides it.
PYTHON_DEFAULT = "/usr/local/miniconda3/bin/python"
# Arguments appended to "pip install" for the first install step.
PACKAGES = [
    "-r", "requirements.txt",
]
# Packages passed to a second, separate "pip install" step (no version pins here).
EXTRA_PACKAGES = [
    "torch",
    "torchaudio",
    "transformers",
    "huggingface_hub",
    "librosa",
    "soundfile",
    "audiomentations",
]
def run(command, cwd):
    """Execute ``command`` in ``cwd`` and return the completed process.

    Output is captured as text, so the result exposes ``stdout``, ``stderr``
    and ``returncode``; a non-zero exit status does not raise.
    """
    completed = subprocess.run(
        command,
        cwd=cwd,
        capture_output=True,
        text=True,
    )
    return completed
def main():
    """Install dependencies (unless skipped), verify imports, and write a report.

    Every executed step is recorded with its command, return code and the last
    4000 characters of stdout/stderr.  The report is written to
    ``reports/coverhunter_env_setup_report.json`` under the parent of this
    script's directory, and its path is printed.

    Raises:
        SystemExit: with status 1 when any recorded step returned non-zero.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--python", default=PYTHON_DEFAULT)
    cli.add_argument("--skip-install", action="store_true")
    args = cli.parse_args()

    root = Path(__file__).resolve().parents[1]
    steps = []
    report = {
        "python": args.python,
        "cwd": str(root),
        "steps": steps,
    }

    # Plan the steps first, then execute them in order with shared bookkeeping.
    pip_install = [args.python, "-m", "pip", "install"]
    plan = []
    if not args.skip_install:
        plan.append(("install_requirements", [*pip_install, *PACKAGES]))
        plan.append(("install_extra_packages", [*pip_install, *EXTRA_PACKAGES]))
    verify_snippet = (
        "import torch, transformers, librosa, soundfile, audiomentations; "
        "print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
    )
    plan.append(("verify_environment", [args.python, "-c", verify_snippet]))

    for step_name, step_command in plan:
        outcome = run(step_command, root)
        steps.append({
            "name": step_name,
            "command": step_command,
            "returncode": outcome.returncode,
            # Keep only the tail of each stream so the report stays small.
            "stdout": outcome.stdout[-4000:],
            "stderr": outcome.stderr[-4000:],
        })

    report_path = root / "reports" / "coverhunter_env_setup_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2))
    print(report_path)

    for entry in steps:
        if entry["returncode"] != 0:
            raise SystemExit(1)


if __name__ == "__main__":
    main()
......@@ -3,6 +3,55 @@ import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, List
try:
from transformers import AutoModel
except ImportError:
AutoModel = None
class FrozenMERTFeatureExtractor(nn.Module):
    """Feature front end built on a frozen pretrained backbone, with a fallback.

    When ``model_name`` is given, ``transformers`` is importable and the
    checkpoint loads, the backbone is frozen and followed by a 1x1 convolution
    projecting its hidden size to ``hidden_dim``.  Otherwise a frozen two-layer
    convolution stack is applied directly to the mel input.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``;
            ``None`` or an empty string selects the fallback.
        n_mels: Channel count of the mel input, used by the fallback stack.
        hidden_dim: Channel count of the returned features.
    """

    def __init__(self, model_name: Optional[str], n_mels: int, hidden_dim: int):
        super().__init__()
        self.model_name = model_name
        self.hidden_dim = hidden_dim
        self.backbone = None
        # Fallback projection used when no backbone is available.  It is frozen,
        # so it stays at its random initialisation.
        self.proj = nn.Sequential(
            nn.Conv1d(n_mels, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
        )
        for parameter in self.proj.parameters():
            parameter.requires_grad = False

        if model_name and AutoModel is not None:
            try:
                # NOTE(review): some hub checkpoints (MERT among them, per its
                # model card) need trust_remote_code=True to load - confirm.
                self.backbone = AutoModel.from_pretrained(model_name)
            except Exception as exc:
                # Still best-effort, but no longer silent: without this warning a
                # failed download/load is indistinguishable from a working backbone.
                import warnings

                warnings.warn(
                    f"Could not load backbone {model_name!r} ({exc!r}); "
                    "falling back to the frozen convolutional projection.",
                    RuntimeWarning,
                )
                self.backbone = None
        elif model_name:
            import warnings

            warnings.warn(
                f"transformers is not installed; backbone {model_name!r} is not loaded "
                "and the frozen convolutional projection is used instead.",
                RuntimeWarning,
            )

        if self.backbone is not None:
            for parameter in self.backbone.parameters():
                parameter.requires_grad = False
            backbone_dim = getattr(self.backbone.config, "hidden_size", hidden_dim)
            # Replaces the fallback stack; this projection is created after the
            # freeze loop above and therefore remains trainable.
            self.proj = nn.Sequential(
                nn.Conv1d(backbone_dim, hidden_dim, kernel_size=1),
                nn.GELU(),
                nn.BatchNorm1d(hidden_dim),
            )

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Return ``hidden_dim``-channel features for ``mel`` (batch, n_mels, time)."""
        if self.backbone is None:
            # Fallback path runs entirely without gradient tracking.
            with torch.no_grad():
                return self.proj(mel)
        # The backbone is fed (batch, time, n_mels).
        # NOTE(review): speech/music backbones of the HuBERT family usually take
        # raw-waveform ``input_values``; confirm this backbone accepts
        # ``inputs_embeds`` with n_mels-sized vectors.
        waveform_like = mel.transpose(1, 2)
        with torch.no_grad():
            outputs = self.backbone(inputs_embeds=waveform_like)
        hidden = outputs.last_hidden_state.transpose(1, 2)
        return self.proj(hidden)
class SEModule(nn.Module):
def __init__(self, channels, se_channels=128):
......@@ -123,6 +172,89 @@ class AAMSoftmax(nn.Module):
return output
class CoverHunterHead(nn.Module):
    """Transformer encoder with attention pooling producing unit-norm embeddings.

    Args:
        input_dim: Feature size of each sequence element (the encoder's d_model).
        embed_dim: Size of the returned embedding.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        ff_mult: Feed-forward width as a multiple of ``input_dim``.
    """

    def __init__(self, input_dim: int, embed_dim: int, num_heads: int = 4, num_layers: int = 2, ff_mult: int = 4):
        super().__init__()
        layer_kwargs = dict(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=input_dim * ff_mult,
            batch_first=True,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_layers,
        )
        # Scores one scalar per sequence position for the pooling weights.
        scorer = [
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer)
        self.proj = nn.Linear(input_dim, embed_dim)
        self.norm = nn.BatchNorm1d(embed_dim, affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` (batch, seq, input_dim) and return (batch, embed_dim) embeddings."""
        encoded = self.encoder(x)
        scores = self.attention(encoded).squeeze(-1)
        # Softmax runs over the sequence axis, so the weights of a sample sum to one.
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        pooled = torch.sum(encoded * weights, dim=1)
        normed = self.norm(self.proj(pooled))
        return F.normalize(normed, p=2, dim=1)
class MERTMelodyBranch(nn.Module):
    """Fuses frozen-extractor features with projected melody and chroma channels.

    Args:
        n_mels: Mel channel count handed to the feature extractor.
        chroma_bins: Channel count of the chroma input.
        melody_bins: Channel count of the melody input.
        hidden_dim: Channel count of each sub-stream and of the fused output.
        mert_model_name: Optional checkpoint name forwarded to the extractor.
    """

    def __init__(
        self,
        n_mels: int,
        chroma_bins: int = 12,
        melody_bins: int = 1,
        hidden_dim: int = 256,
        mert_model_name: Optional[str] = None,
    ):
        super().__init__()
        self.mert = FrozenMERTFeatureExtractor(
            model_name=mert_model_name,
            n_mels=n_mels,
            hidden_dim=hidden_dim,
        )
        melodic_channels = chroma_bins + melody_bins
        self.melody_proj = nn.Conv1d(melodic_channels, hidden_dim, kernel_size=1)
        # The 1x1 convolution halves the concatenated channel count back to hidden_dim.
        self.fuse = nn.Sequential(
            nn.Conv1d(2 * hidden_dim, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        )

    def forward(self, mert: torch.Tensor, melody: torch.Tensor, chroma: torch.Tensor) -> torch.Tensor:
        """Return the fused stream; inputs are concatenated along dim 1 (channels)."""
        semantic = self.mert(mert)
        stacked = torch.cat([melody, chroma], dim=1)
        melodic = self.melody_proj(stacked)
        combined = torch.cat([semantic, melodic], dim=1)
        return self.fuse(combined)
class ECAPABranch(nn.Module):
    """Convolutional front end over the mel input, optionally preceded by a band split.

    Args:
        n_mels: Mel channel count of the input.
        channels: Output channel count of the projection.
        use_band_split: Whether to run ``BandSplitBlock`` before the projection.
        band_split_channels: ``out_channels`` given to ``BandSplitBlock``; the
            projection then expects five times this many input channels.
    """

    def __init__(self, n_mels: int, channels: int, use_band_split: bool, band_split_channels: int):
        super().__init__()
        if use_band_split:
            front_channels = band_split_channels * 5
            self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels)
        else:
            front_channels = n_mels
            self.band_split = None
        stem = [
            nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.BatchNorm1d(channels),
        ]
        self.proj = nn.Sequential(*stem)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Project ``mel`` to ``channels`` channels, band-splitting first when configured."""
        if self.band_split is None:
            return self.proj(mel)
        return self.proj(self.band_split(mel))
class DualStreamFusion(nn.Module):
    """Projects two streams to a shared width and fuses them with a 1x1 convolution.

    Args:
        mert_dim: Channel count of the first stream.
        ecapa_dim: Channel count of the second stream.
        hidden_dim: Channel count of each projected stream and of the output.
    """

    def __init__(self, mert_dim: int, ecapa_dim: int, hidden_dim: int):
        super().__init__()
        self.mert_gate = nn.Conv1d(mert_dim, hidden_dim, kernel_size=1)
        self.ecapa_gate = nn.Conv1d(ecapa_dim, hidden_dim, kernel_size=1)
        mixer = [
            nn.Conv1d(2 * hidden_dim, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        ]
        self.fuse = nn.Sequential(*mixer)

    def forward(self, mert_stream: torch.Tensor, ecapa_stream: torch.Tensor) -> torch.Tensor:
        """Return the fused stream; projections are concatenated along dim 1 (channels)."""
        projected_mert = self.mert_gate(mert_stream)
        projected_ecapa = self.ecapa_gate(ecapa_stream)
        stacked = torch.cat([projected_mert, projected_ecapa], dim=1)
        return self.fuse(stacked)
class ECAPA_ACR(nn.Module):
def __init__(
self,
......@@ -137,11 +269,38 @@ class ECAPA_ACR(nn.Module):
aam_s: float = 30.0,
use_band_split: bool = True,
band_split_channels: int = 128,
use_dual_stream: bool = True,
coverhunter_heads: int = 4,
coverhunter_layers: int = 2,
fusion_hidden_dim: int = 256,
mert_model_name: Optional[str] = None,
):
super().__init__()
self.embed_dim = embed_dim
front_channels = band_split_channels * 5 if use_band_split else channels
self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
self.use_dual_stream = use_dual_stream
if use_dual_stream:
self.mert_melody_branch = MERTMelodyBranch(
n_mels=n_mels,
chroma_bins=12,
melody_bins=1,
hidden_dim=fusion_hidden_dim,
mert_model_name=mert_model_name,
)
self.ecapa_branch = ECAPABranch(
n_mels=n_mels,
channels=channels,
use_band_split=use_band_split,
band_split_channels=band_split_channels,
)
self.stream_fusion = DualStreamFusion(
mert_dim=fusion_hidden_dim,
ecapa_dim=channels,
hidden_dim=channels,
)
front_channels = channels
else:
front_channels = band_split_channels * 5 if use_band_split else channels
self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
self.conv1 = nn.Sequential(
nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
......@@ -169,24 +328,39 @@ class ECAPA_ACR(nn.Module):
nn.ReLU(),
nn.BatchNorm1d(channels * 3),
)
self.pooling = StatisticsPooling()
self.fc = nn.Linear(channels * 3 * 2, embed_dim)
self.bn = nn.BatchNorm1d(embed_dim, affine=False)
self.coverhunter = CoverHunterHead(
input_dim=channels * 3,
embed_dim=embed_dim,
num_heads=coverhunter_heads,
num_layers=coverhunter_layers,
)
self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None
def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
x = self.band_split(mel) if self.band_split is not None else mel
x = self.conv1(x)
def forward(
self,
mel: torch.Tensor,
labels: Optional[torch.Tensor] = None,
melody: Optional[torch.Tensor] = None,
chroma: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
if self.use_dual_stream:
if melody is None or chroma is None:
raise ValueError("melody and chroma are required when dual-stream fusion is enabled")
mert_stream = self.mert_melody_branch(mel, melody, chroma)
ecapa_stream = self.ecapa_branch(mel)
x = self.stream_fusion(mert_stream, ecapa_stream)
else:
x = self.band_split(mel) if self.band_split is not None else mel
x = self.conv1(x)
if self.use_dual_stream:
x = self.conv1(x)
block_outputs = []
for block in self.blocks:
x = block(x)
block_outputs.append(x)
x = torch.cat(block_outputs, dim=1)
x = self.mfa(x)
x = self.pooling(x)
x = self.fc(x)
x = self.bn(x)
embedding = F.normalize(x, p=2, dim=1)
embedding = self.coverhunter(x.transpose(1, 2))
if labels is not None and self.aam is not None:
logits = self.aam(embedding, labels)
return embedding, logits
......
......@@ -3,30 +3,22 @@ import torch.nn as nn
import torch.nn.functional as F
class SupConLoss(nn.Module):
class InfoNCELoss(nn.Module):
def __init__(self, temperature: float = 0.07):
super().__init__()
self.temperature = temperature
def forward(self, features: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
batch_size = features.shape[0]
labels = labels.contiguous().view(-1, 1)
mask = torch.eq(labels, labels.T).float().to(features.device)
mask = mask - torch.eye(batch_size, device=features.device)
features = F.normalize(features, dim=1)
sim = torch.matmul(features, features.T) / self.temperature
sim_max, _ = torch.max(sim, dim=1, keepdim=True)
sim = sim - sim_max.detach()
exp_sim = torch.exp(sim) * (1 - torch.eye(batch_size, device=features.device))
log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))
pos_mask = mask
pos_count = pos_mask.sum(dim=1)
loss = -(log_prob * pos_mask).sum(dim=1)
loss = loss / pos_count.clamp(min=1)
return loss
logits = torch.matmul(features, features.T) / self.temperature
labels = labels.contiguous().view(-1, 1)
positive_mask = torch.eq(labels, labels.T).float().to(features.device)
positive_mask = positive_mask - torch.eye(features.size(0), device=features.device)
logits = logits - logits.max(dim=1, keepdim=True).values.detach()
exp_logits = torch.exp(logits) * (1 - torch.eye(features.size(0), device=features.device))
log_prob = logits - torch.log(exp_logits.sum(dim=1, keepdim=True) + 1e-12)
positives = positive_mask.sum(dim=1).clamp(min=1)
return -((positive_mask * log_prob).sum(dim=1) / positives)
class CombinedLoss(nn.Module):
......@@ -37,8 +29,7 @@ class CombinedLoss(nn.Module):
aam_weight: float = 0.3,
):
super().__init__()
self.supcon = SupConLoss(temperature)
self.ce = nn.CrossEntropyLoss()
self.infonce = InfoNCELoss(temperature)
self.supcon_weight = supcon_weight
self.aam_weight = aam_weight
......@@ -50,21 +41,20 @@ class CombinedLoss(nn.Module):
supcon_labels: torch.Tensor,
hard_weight: torch.Tensor | None = None,
) -> dict:
loss_supcon = self.supcon(embedding, supcon_labels)
loss_infonce = self.infonce(embedding, supcon_labels)
loss_ce = F.cross_entropy(logits, labels, reduction="none")
if hard_weight is not None:
weight = hard_weight.float()
if weight.dim() == 0:
weight = weight.unsqueeze(0)
loss_supcon = loss_supcon * weight
loss_infonce = loss_infonce * weight
loss_ce = loss_ce * weight
loss_supcon = loss_supcon.mean()
loss_infonce = loss_infonce.mean()
loss_ce = loss_ce.mean()
total = self.supcon_weight * loss_supcon + self.aam_weight * loss_ce
total = self.supcon_weight * loss_infonce + self.aam_weight * loss_ce
return {
"loss": total,
"supcon_loss": loss_supcon.item(),
"supcon_loss": loss_infonce.item(),
"ce_loss": loss_ce.item(),
}
......
import numpy as np
import random
from typing import Optional, Tuple
from pathlib import Path
from typing import Iterable, Optional, Tuple
import librosa
import soundfile as sf
try:
from audiomentations import AddBackgroundNoise, AddGaussianNoise, BandPassFilter, Compose, Mp3Compression, PitchShift, TimeStretch
HAS_AUDIO_AUG = True
except Exception:
AddBackgroundNoise = AddGaussianNoise = BandPassFilter = Compose = Mp3Compression = PitchShift = TimeStretch = None
HAS_AUDIO_AUG = False
class AugmentPipeline:
def __init__(self, sr: int = 16000, aggressive: bool = False):
self.sr = sr
self.noise_snr_range = (5, 30)
self.pitch_shift_range = (-6, 6)
self.time_stretch_range = (0.85, 1.15)
self.mp3_bitrate_range = (32, 128)
self.aggressive = aggressive
def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray:
if snr_db is None:
snr_db = random.uniform(*self.noise_snr_range)
signal_power = np.mean(y ** 2)
noise_power = signal_power / (10 ** (snr_db / 10))
noise = np.random.randn(len(y)) * np.sqrt(noise_power)
return y + noise
class NoiseLibrary:
def __init__(self, roots: Optional[Iterable[str]] = None):
self.paths = []
for root in roots or []:
base = Path(root)
if not base.exists():
continue
for pattern in ("*.wav", "*.mp3", "*.flac", "*.ogg", "*.m4a"):
self.paths.extend(base.rglob(pattern))
def pitch_shift(self, y: np.ndarray, semitones: Optional[float] = None) -> np.ndarray:
if semitones is None:
semitones = random.uniform(*self.pitch_shift_range)
return librosa_shift(y, sr=self.sr, n_steps=semitones)
def directories(self) -> list[str]:
if not self.paths:
return []
return sorted({str(path.parent) for path in self.paths})
def time_stretch(self, y: np.ndarray, rate: Optional[float] = None) -> np.ndarray:
if rate is None:
rate = random.uniform(*self.time_stretch_range)
return librosa_ts(y, sr=self.sr, rate=rate)
def add_reverb(self, y: np.ndarray, decay: float = 0.3) -> np.ndarray:
ir_len = int(0.1 * self.sr)
ir = np.exp(-np.arange(ir_len) * decay / ir_len) * np.random.randn(ir_len)
ir /= np.sqrt(np.sum(ir ** 2))
return np.convolve(y, ir, mode='same')[:len(y)]
class AugmentPipeline:
def __init__(
self,
sr: int = 16000,
aggressive: bool = False,
noise_roots: Optional[Iterable[str]] = None,
freq_mask_prob: float = 0.3,
):
self.sr = sr
self.aggressive = aggressive
self.freq_mask_prob = freq_mask_prob
self.noise_library = NoiseLibrary(noise_roots)
self.wave_augment = self._build_wave_augmenter()
def _build_wave_augmenter(self):
if not HAS_AUDIO_AUG:
return None
transforms = [
AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5 if not self.aggressive else 0.8),
BandPassFilter(
min_center_freq=300.0,
max_center_freq=3200.0,
min_bandwidth_fraction=0.3,
max_bandwidth_fraction=0.8,
p=0.35 if not self.aggressive else 0.55,
),
Mp3Compression(min_bitrate=24, max_bitrate=96, p=0.35 if not self.aggressive else 0.55),
PitchShift(min_semitones=-5, max_semitones=5, p=0.35 if not self.aggressive else 0.55),
TimeStretch(min_rate=0.8, max_rate=1.2, p=0.35 if not self.aggressive else 0.55),
]
noise_dirs = self.noise_library.directories()
if noise_dirs:
transforms.append(
AddBackgroundNoise(
sounds_path=noise_dirs,
min_snr_db=3.0 if self.aggressive else 8.0,
max_snr_db=20.0 if self.aggressive else 30.0,
noise_transform=Compose([
BandPassFilter(
min_center_freq=250.0,
max_center_freq=4000.0,
min_bandwidth_fraction=0.2,
max_bandwidth_fraction=0.9,
p=0.5,
)
]),
p=0.35 if not self.aggressive else 0.6,
)
)
return Compose(transforms)
def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 8) -> np.ndarray:
def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 12) -> np.ndarray:
mel = mel.copy()
t = mel.shape[1]
f = mel.shape[0]
......@@ -46,43 +91,21 @@ class AugmentPipeline:
if t_start < t:
mel[:, t_start:t_start + t_mask] = 0
for _ in range(2):
f_mask = random.randint(0, max_freq_mask)
f_mask = random.randint(max(1, max_freq_mask // 3), max_freq_mask)
f_start = random.randint(0, max(0, f - f_mask))
if f_start < f:
mel[f_start:f_start + f_mask, :] = 0
return mel
def apply_to_mel(self, mel: np.ndarray) -> np.ndarray:
if random.random() < 0.3:
if random.random() < self.freq_mask_prob:
mel = self.apply_spec_augment(mel)
return mel
def __call__(self, y: np.ndarray) -> np.ndarray:
noise_p = 0.75 if self.aggressive else 0.5
stretch_p = 0.55 if self.aggressive else 0.3
pitch_p = 0.55 if self.aggressive else 0.3
reverb_p = 0.35 if self.aggressive else 0.2
if random.random() < noise_p:
y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None)
if random.random() < stretch_p:
y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None)
if random.random() < pitch_p:
y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None)
if random.random() < reverb_p:
y = self.add_reverb(y, decay=random.uniform(0.2, 0.6))
return y
def librosa_shift(y, sr=16000, n_steps=0):
return librosa_impl(y, lambda: __import__('librosa').effects.pitch_shift(y, sr=sr, n_steps=n_steps))
def librosa_ts(y, sr=16000, rate=1.0):
return librosa_impl(y, lambda: __import__('librosa').effects.time_stretch(y, rate=rate))
def librosa_impl(y, fn):
try:
return fn()
except Exception:
return y
if self.wave_augment is None:
return y
try:
return self.wave_augment(samples=y.astype(np.float32), sample_rate=self.sr)
except Exception:
return y
......
......@@ -4,6 +4,7 @@
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
import torch
......@@ -21,15 +22,23 @@ from src.models.losses import CombinedLoss
def collate_fn(batch):
mels = []
melodies = []
chromas = []
song_ids = []
song_names = []
hard_weights = []
for b in batch:
mel = b["mel"]
melody = b.get("melody")
chroma = b.get("chroma")
hw = b.get("hard_weight", torch.tensor(1.0))
if mel.dim() == 3:
for i in range(mel.shape[0]):
mels.append(mel[i])
if melody is not None:
melodies.append(melody[i])
if chroma is not None:
chromas.append(chroma[i])
song_ids.append(b["song_id"][i])
song_names.append(b["song_name"])
if torch.is_tensor(hw) and hw.dim() > 0:
......@@ -38,24 +47,45 @@ def collate_fn(batch):
hard_weights.append(hw)
else:
mels.append(mel)
if melody is not None:
melodies.append(melody)
if chroma is not None:
chromas.append(chroma)
song_ids.append(b["song_id"])
song_names.append(b["song_name"])
hard_weights.append(hw)
max_t = max(m.shape[1] for m in mels)
mels_padded = []
for m in mels:
melodies_padded = []
chromas_padded = []
for idx, m in enumerate(mels):
pad = max_t - m.shape[1]
if pad > 0:
m = torch.nn.functional.pad(m, (0, pad))
mels_padded.append(m.unsqueeze(0))
return {
if melodies:
melody = melodies[idx]
if melody.shape[1] < max_t:
melody = torch.nn.functional.pad(melody, (0, max_t - melody.shape[1]))
melodies_padded.append(melody.unsqueeze(0))
if chromas:
chroma = chromas[idx]
if chroma.shape[1] < max_t:
chroma = torch.nn.functional.pad(chroma, (0, max_t - chroma.shape[1]))
chromas_padded.append(chroma.unsqueeze(0))
payload = {
"mel": torch.cat(mels_padded, dim=0),
"song_id": torch.stack(song_ids),
"song_name": song_names,
"hard_weight": torch.stack(hard_weights),
}
if melodies_padded:
payload["melody"] = torch.cat(melodies_padded, dim=0)
if chromas_padded:
payload["chroma"] = torch.cat(chromas_padded, dim=0)
return payload
def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg):
......@@ -64,10 +94,14 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg)
pbar = tqdm(loader, desc=f"Epoch {epoch}")
for batch in pbar:
mel = batch["mel"].to(device)
melody = batch.get("melody")
chroma = batch.get("chroma")
melody = melody.to(device) if melody is not None else None
chroma = chroma.to(device) if chroma is not None else None
labels = batch["song_id"].to(device)
with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"):
embedding, logits = model(mel, labels)
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
optimizer.zero_grad()
......@@ -115,6 +149,28 @@ def save_checkpoint(output_dir, epoch, model, optimizer, best_metric, cfg, name)
print(f" Saved: {path}")
def write_training_artifacts(output_dir: Path, cfg: dict, train_metrics: dict, train_dataset, args):
    """Write ``training_metrics.json`` and ``training_manifest.json`` to ``output_dir``.

    Args:
        output_dir: Existing directory that receives both files.
        cfg: Configuration mapping, embedded verbatim in the manifest.
        train_metrics: JSON-serialisable metrics; written on their own and also
            embedded in the manifest as ``final_metrics``.
        train_dataset: Object providing ``song_ids`` and ``len()``.
        args: Parsed CLI arguments providing ``segment_strategy`` and ``noise_root``.
    """
    # Function-scope import: the file header only imports ``datetime`` itself.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated datetime.utcnow(); tzinfo is
    # dropped again so the stored text keeps the original "...Z" shape.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    manifest = {
        "timestamp": timestamp,
        "config": cfg,
        "output_dir": str(output_dir),
        "train_song_count": len(train_dataset.song_ids),
        "sample_count": len(train_dataset),
        "segment_strategy": args.segment_strategy,
        "noise_roots": args.noise_root,
        # Paths are recorded as expected locations; this function does not
        # check that best_model.pt or song_to_idx.json exist.
        "artifacts": {
            "best_model": str(output_dir / "best_model.pt"),
            "song_to_idx": str(output_dir / "song_to_idx.json"),
            "metrics": str(output_dir / "training_metrics.json"),
        },
        "final_metrics": train_metrics,
    }
    # json.dump escapes non-ASCII by default, so pinning UTF-8 only removes the
    # dependency on the platform's default encoding.
    with open(output_dir / "training_metrics.json", "w", encoding="utf-8") as f:
        json.dump(train_metrics, f, indent=2)
    with open(output_dir / "training_manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/default.yaml")
......@@ -125,6 +181,7 @@ def main():
parser.add_argument("--epochs", type=int, default=None)
parser.add_argument("--batch-size", type=int, default=None)
parser.add_argument("--lr", type=float, default=None)
parser.add_argument("--noise-root", action="append", default=[])
parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
parser.add_argument("--silence-top-db", type=int, default=30)
parser.add_argument("--dry-run", action="store_true")
......@@ -159,6 +216,8 @@ def main():
silence_top_db=args.silence_top_db,
sample_type_weights=cfg["training"].get("sample_type_weights"),
pair_type_weights=cfg["training"].get("pair_type_weights"),
hard_negative_k=cfg["training"].get("hard_negative_k", 2),
noise_roots=args.noise_root,
)
catalog_dataset = ACRDataset(
......@@ -174,6 +233,7 @@ def main():
song_to_idx=train_dataset.song_to_idx,
segment_strategy=args.segment_strategy,
silence_top_db=args.silence_top_db,
noise_roots=args.noise_root,
)
train_loader = DataLoader(
......@@ -205,6 +265,11 @@ def main():
aam_s=cfg["model"]["aam_s"],
use_band_split=cfg["model"].get("use_band_split", True),
band_split_channels=cfg["model"].get("band_split_channels", 128),
use_dual_stream=cfg["model"].get("use_dual_stream", True),
coverhunter_heads=cfg["model"].get("coverhunter_heads", 4),
coverhunter_layers=cfg["model"].get("coverhunter_layers", 2),
fusion_hidden_dim=cfg["model"].get("fusion_hidden_dim", 256),
mert_model_name=cfg["model"].get("mert_model_name"),
).to(device)
criterion = CombinedLoss(
......@@ -219,8 +284,12 @@ def main():
print("Dry run: running one batch through forward/backward...")
batch = next(iter(train_loader))
mel = batch["mel"].to(device)
melody = batch.get("melody")
chroma = batch.get("chroma")
melody = melody.to(device) if melody is not None else None
chroma = chroma.to(device) if chroma is not None else None
labels = batch["song_id"].to(device)
embedding, logits = model(mel, labels)
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
loss_dict["loss"].backward()
print(f" Forward/backward OK. Loss: {loss_dict['loss']:.4f}")
......@@ -242,6 +311,7 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True)
print("Starting training...")
train_metrics = None
for epoch in range(start_epoch, cfg["training"]["epochs"] + 1):
train_metrics = train_epoch(model, train_loader, optimizer, criterion, scaler, device, epoch, cfg)
scheduler.step()
......@@ -254,6 +324,7 @@ def main():
with open(output_dir / "song_to_idx.json", "w") as f:
json.dump(train_dataset.song_to_idx, f, indent=2)
write_training_artifacts(output_dir, cfg, train_metrics or {}, train_dataset, args)
print(f"\nTraining complete. Best training loss: {best_loss:.4f}")
print(f"Model saved to: {output_dir / 'best_model.pt'}")
print(f"Catalog references available: {len(catalog_dataset.samples)}")
......
{
"run_name": "coverhunter_finetune_20260608T130103Z",
"created_at": "2026-06-08T13:01:03.023371Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130103Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130103Z",
"created_at": "2026-06-08T13:01:03.023371Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130103Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z",
"returncode": 1,
"completed_at": "2026-06-08T13:01:32.762576Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 249, in main
batch = next(iter(train_loader))
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 718, in __next__
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 778, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
~~~~~~~~~~~~^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 370, in __getitem__
positive_features = [self._load_features(sample) for sample in positive_items]
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 344, in _load_features
features = self.feature_extractor.extract(y)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 138, in extract
melody = librosa.hz_to_midi(melody, bins_per_octave=12)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: hz_to_midi() got an unexpected keyword argument 'bins_per_octave'
{
"run_name": "coverhunter_finetune_20260608T130306Z",
"created_at": "2026-06-08T13:03:06.790814Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130306Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130306Z",
"created_at": "2026-06-08T13:03:06.790814Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130306Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z",
"returncode": 1,
"completed_at": "2026-06-08T13:04:34.035140Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 256, in main
model = ECAPA_ACR(
^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 280, in __init__
self.mert_melody_branch = MERTMelodyBranch(
^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 211, in __init__
self.mert = FrozenMERTFeatureExtractor(model_name=mert_model_name, n_mels=n_mels, hidden_dim=hidden_dim)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 21, in __init__
self.backbone = AutoModel.from_pretrained(model_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 289, in from_pretrained
resolved_config_file = cached_file(
^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 293, in cached_file
file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 527, in cached_files
raise e
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 437, in cached_files
hf_hub_download(
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1019, in hf_hub_download
return _hf_hub_download_to_cache_dir(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1152, in _hf_hub_download_to_cache_dir
_get_metadata_or_catch_error(
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1694, in _get_metadata_or_catch_error
metadata = get_hf_file_metadata(
^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1616, in get_hf_file_metadata
response = _httpx_follow_relative_redirects_with_backoff(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 685, in _httpx_follow_relative_redirects_with_backoff
response = http_backoff(
^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 559, in http_backoff
return next(
^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 467, in _http_backoff_base
response = client.request(method=method, url=url, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 825, in request
return self.send(request, auth=auth, follow_redirects=follow_redirects)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 901, in send
raise RuntimeError("Cannot send a request, as the client has been closed.")
RuntimeError: Cannot send a request, as the client has been closed.
Device: cpu
Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
Classes: 16
Train songs: 64
{
"run_name": "coverhunter_finetune_20260608T130514Z",
"created_at": "2026-06-08T13:05:14.591209Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130514Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130514Z",
"created_at": "2026-06-08T13:05:14.591209Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130514Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z",
"returncode": 1,
"completed_at": "2026-06-08T13:06:50.272162Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 292, in main
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 351, in forward
mert_stream = self.mert_melody_branch(mel, melody, chroma)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 224, in forward
semantic = self.mert(mert)
^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 49, in forward
return self.proj(mel)
^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not callable
Device: cpu
Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
Classes: 16
Train songs: 64
Dry run: running one batch through forward/backward...
{
"run_name": "coverhunter_finetune_20260608T130731Z",
"created_at": "2026-06-08T13:07:31.311447Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130731Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130731Z"
}
\ No newline at end of file
# CoverHunter 环境安装与验证
## 1. 目标解释器
本专题统一使用:
```bash
/usr/local/miniconda3/bin/python
```
## 2. 自动化脚本
已新增环境安装与验证脚本:
```text
acr-engine/scripts/setup_coverhunter_env.py
```
执行方式:
```bash
/usr/local/miniconda3/bin/python acr-engine/scripts/setup_coverhunter_env.py
```
它会自动:
1. 安装 `requirements.txt`
2. 补充训练依赖:
- `torch`
- `torchaudio`
- `transformers`
- `huggingface_hub`
- `librosa`
- `soundfile`
- `audiomentations`
3. 进行环境验证
4. 生成报告:
```text
acr-engine/reports/coverhunter_env_setup_report.json
```
## 3. 当前自动化执行结果
本次已经自动执行完成。
报告文件:
```text
acr-engine/reports/coverhunter_env_setup_report.json
```
当前结论:
- Python 包安装:**成功**
- `torch` / `transformers` / `librosa` / `soundfile` / `audiomentations`:**已安装**
- `torch.cuda.is_available()` 当前返回:**False**
## 4. 当前 GPU 阻塞点
虽然系统存在 NVIDIA GPU,且 `nvidia-smi` 可见设备,但当前 PyTorch CUDA 初始化失败。
报告中的核心告警是:
- **The NVIDIA driver on your system is too old**
这说明:
- 当前安装到环境里的 `torch 2.12.0+cu130`
- 与当前系统驱动版本不兼容
也就是说:
- **环境依赖已经安装好了**
- **但当前 GPU 训练还不能真正启用**
- 原因不是代码问题,而是 **PyTorch CUDA 版本与驱动版本不匹配**
## 5. 当前状态怎么理解
现在的环境状态可以分成两部分:
### 已经完成的
- 训练依赖已安装
- 训练脚本可执行
- MERT / ECAPA 双流代码可 import
- 文档和配置已准备好
### 仍未完成的
- CUDA 版 torch 与当前 NVIDIA driver 的匹配
## 6. 下一步建议
要让 GPU 真正可用,需要二选一:
### 方案 A:升级 NVIDIA 驱动
优点:
- 可以保留当前较新的 torch/cu130 组合
- 后续兼容性更好
### 方案 B:安装与当前驱动兼容的更低 CUDA 版本 torch
优点:
- 不改系统驱动
- 更适合当前机器直接落地
对当前项目而言,我更建议:
- **优先采用方案 B**
- 安装与当前驱动兼容的 torch 版本
## 7. 当前专题与环境文档关系
配套文件如下:
- 训练专题:`docs/coverhunter_finetune_topic.md`
- 训练流程:`docs/coverhunter_training_process.md`
- 环境文档:`docs/coverhunter_env_setup.md`
- 环境报告:`acr-engine/reports/coverhunter_env_setup_report.json`
## 8. 当前结论
当前已经自动完成:
- 环境依赖安装
- 环境验证
- 结果记录
目前唯一阻塞 GPU 训练的点是:
- **CUDA / 驱动 / torch 版本不匹配**
# CoverHunter 双流微调标准流程
## 1. 当前架构
当前训练架构已经调整为双流:
- **流 A:MERT + Melody 分支**
- 代码位置:`acr-engine/src/models/ecapa_tdnn.py`
- 逻辑:冻结的 `FrozenMERTFeatureExtractor` + `melody/chroma` 融合
- 默认模型:`m-a-p/MERT-v1-95M`
- 说明:当前代码已经支持真实 HuggingFace MERT 权重接入;若环境里缺少 `transformers` 或首次拉取失败,则无法启用真实 MERT
- **流 B:ECAPA 分支**
- 逻辑:保留 ECAPA 特征建模路径
- **双流融合**
- `DualStreamFusion`
- **检索头**
- `CoverHunterHead`
- **训练目标**
- `InfoNCE + AAMSoftmax`
## 2. 当前资源检查结论
### Python 解释器
训练入口已固定支持:
```bash
/usr/local/miniconda3/bin/python
```
`acr-engine/scripts/run_coverhunter_finetune.py` 已支持 `--python` 参数,默认就是这个解释器。
### GPU
当前检测到 GPU:
- **Quadro P1000**
- 总显存:**4096 MiB**
- 空闲显存:约 **3817 MiB**
结论:
- **可以跑训练**
- 但显存较小,建议:
- `batch_size=2~4`
- `segment_dur=5.0` 起步
- 优先做 dry-run、小批量试跑、再正式训练
- 启用真实 MERT 后不要直接上大 batch
### 数据
当前仓库中可直接用于冒烟训练的数据:
- `acr-engine/data/synthetic_v2/train.json`
- 音频切片位于 `acr-engine/data/synthetic_v2/segments/`
这些数据已经包含:
- 普通切片
- augmented
- humming_like
- confused
适合先做流程验证。
### 当前环境缺口
`/usr/local/miniconda3/bin/python` 下当前缺少这些核心包:
- `torch`
- `transformers`
- `huggingface_hub`
- `torchaudio`
- `librosa`
- `soundfile`
- `audiomentations`
所以:
- **GPU 与解释器可用**
- **但当前训练环境还不能直接跑**
- 需要先补齐依赖
## 3. 标准处理流程
### Step 1:准备 Python 环境
进入项目后,先确保用的是目标解释器:
```bash
/usr/local/miniconda3/bin/python --version
```
安装依赖:
```bash
/usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
```
如需单独补装:
```bash
/usr/local/miniconda3/bin/python -m pip install torch torchaudio transformers huggingface_hub librosa soundfile audiomentations
```
### Step 2:准备 MERT 权重缓存
首次启用真实 MERT 时,会从 HuggingFace 拉取:
- `m-a-p/MERT-v1-95M`
建议先确认网络可访问 HuggingFace,或提前缓存模型。
如果不希望改默认配置,可以在 `configs/default.yaml` 或 `configs/coverhunter_finetune.yaml` 中调整:
```yaml
model:
mert_model_name: m-a-p/MERT-v1-95M
```
### Step 3:准备噪声数据
为了支持伪造录音增强,建议准备目录,例如:
```text
acr-engine/data/noise/restaurant/
acr-engine/data/noise/street/
```
里面放公开可用环境音频:
- 餐厅底噪
- 街道底噪
- 室内人声背景
训练时通过:
```bash
--noise-root acr-engine/data/noise/restaurant \
--noise-root acr-engine/data/noise/street
```
传入。
### Step 4:先做 dry-run
先验证数据、模型、GPU、增强链路是否都通:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--dry-run
```
### Step 5:小规模试训
建议先缩小 batch/config,确认显存稳定:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python train.py \
--config configs/coverhunter_finetune.yaml \
--data data/synthetic_v2 \
--output data/training_runs/coverhunter_trial \
--device cuda \
--segment-strategy hybrid \
--batch-size 2 \
--epochs 2 \
--noise-root data/noise/restaurant \
--noise-root data/noise/street
```
如果显存稳定,再逐步提高到:
- `batch_size=4`
- 必要时再尝试 `batch_size=6`
### Step 6:正式专题训练
标准命令:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--noise-root data/noise/restaurant \
--noise-root data/noise/street
```
### Step 7:检查训练产物
每次训练会记录到:
```text
acr-engine/data/training_runs/<run_name>/
```
标准产物包括:
- `best_model.pt`
- `checkpoint_epoch_*.pt`
- `song_to_idx.json`
- `training_metrics.json`
- `training_manifest.json`
- `run_request.json`
- `run_summary.json`
- `stdout.log`
- `stderr.log`
## 4. 增强策略说明
当前代码已经覆盖两类伪造策略:
### 伪造录音
位置:`acr-engine/src/utils/augment.py`
- `AddGaussianNoise`
- `AddBackgroundNoise`
- `BandPassFilter`
- `Mp3Compression`
### 伪造翻唱
位置:`acr-engine/src/utils/augment.py`
- `PitchShift`
- `TimeStretch`
- `Frequency Masking`(作用于 mel)
## 5. 资源适配建议
由于当前 GPU 是 Quadro P1000 4GB,建议按以下梯度推进:
### 推荐起步配置
- `segment_dur=5.0`
- `batch_size=2`
- `mixed_precision=true`
- `num_workers=0`
### 稳定后可尝试
- `batch_size=4`
- 如 OOM 则回退
### 当前不建议
- 直接上 8 秒片段 + batch 16
- 真实 MERT + 大 batch 同时启用
## 6. 当前结论
当前状态可以概括为:
- **架构方向已经调整正确**:双流
- **真实 MERT 接口已接入**:是
- **GPU 可以用于训练**:是
- **当前 Python 解释器可用**:是,`/usr/local/miniconda3/bin/python`
- **当前环境能否立刻开训**:**还不能**,因为依赖未装全
- **现有数据能否支撑一波流程训练**:**可以**,先从 `synthetic_v2` 开始
# 音乐翻唱检测与音频片段检索系统 (CSI) 核心能力结构清单
## 1. 核心架构逻辑
* **底座 (Backbone)**:MERT (冻结预训练权重) - 负责音频语义理解。
* **头部 (Head)**:CoverHunter (可训练 Conformer+Attention) - 负责旋律与结构的对比学习。
* **对齐方式**:双流融合 (MERT 语义特征 + Melody/Chroma 旋律特征)。
## 2. 数据与特征工程 (Data Pipeline)
* **数据集结构**:以 `Song_ID` 为唯一键,物理隔离原曲、压缩版、录音与环境音。
* **动态增强 (Data Augmentation)**:
* 物理扰动:音高平移 (Pitch Shifting)、变速 (Time Stretching)。
* 环境注入:背景噪声混入 (Environment Injection)。
* 频率掩码:频段擦除 (Frequency Masking) - 逼迫模型脱离音色依赖,转向旋律核心。
* **数据对齐**:使用插值 (Interpolation) 将 MERT 序列长度与 Melody 序列长度对齐至一致的 `Time_Steps`。
## 3. 训练与优化策略 (Training Strategy)
* **样本采样 (Sampler)**:PairSampler - 确保 Batch 中包含强配对的“原曲-翻唱”与精心挑选的“原曲-难负样本”。
* **难负样本挖掘 (Hard Negative Mining)**:
* 使用冻结 MERT + Faiss 构建初始索引。
* 挖掘曲风相似但旋律不同的“假孪生兄弟”歌曲作为 Negative 样本。
* **损失函数 (Loss Function)**:InfoNCE Contrastive Loss - 拉近正样本余弦距离,推远负样本余弦距离。
## 4. 推理与检索引擎 (Inference & Retrieval)
* **离线建库**:全量原曲切片 -> 特征提取 -> 存入向量数据库 (Faiss/Milvus)。
* **在线查询**:录音片段 -> 滑动窗口切片 -> 提取 Embedding -> 近似最近邻检索 (ANN)。
* **鲁棒性机制**:切片投票机制 (Slice Voting) - 对查询录音切片所得的 Top-K 结果进行统计,按票数加权归一化排序。
## 5. 工程化关键节点 (Engineering Checklist)
* **计算优化**:离线特征缓存 (预先存储 .npy 减少 GPU 实时计算压力)。
* **部署优化**:ONNX/TensorRT 模型编译 + 动态批处理 (Dynamic Batching)。
* **数据飞轮**:在线难例挖掘 (基于用户反馈的 False Positives 循环重训)。