Commit 7da76864 7da76864361f72a1428d2b36aeea2f283d8945e6 by 章晓祥

-

1 parent 3ff5efd2
Showing 34 changed files with 2540 additions and 286 deletions
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-GlEnjnf09lXwiJuwDS5Q0nOzGd1ck8YBDERVXv84t9hvtS0U",
"ANTHROPIC_BASE_URL": "https://aiapis.help",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "gpt-5.4",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "claude-opus-4.6",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4.6",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4.6",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4.5",
"CLAUDE_CODE_SUBAGENT_MODEL": "claude-sonnet-4.6",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
"CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
{
"env": {
"ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
"ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
"ANTHROPIC_MODEL": "qwen3.7-max",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "qwen3.7-max",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "qwen3.6-plus",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "qwen3.6-plus",
"CLAUDE_CODE_SUBAGENT_MODEL": "qwen3.6-plus",
"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
"CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
"CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
"CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
},
"permissions": {
"allow": [],
"deny": []
},
"model": "sonnet",
"enabledPlugins": {
"claude-code-setup@claude-plugins-official": true,
"typescript-lsp@claude-plugins-official": true,
"rust-analyzer-lsp@claude-plugins-official": true,
"pr-review-toolkit@claude-plugins-official": true,
"ralph-loop@claude-plugins-official": true,
"superpowers@claude-plugins-official": true
},
"alwaysThinkingEnabled": false,
"skipDangerousModePermissionPrompt": true,
"theme": "dark-ansi",
"modelType": "anthropic"
}
# Fine-tuning configuration for the model variant named "coverhunter_finetune".
# Model section: network sizes and branch toggles.
model:
name: coverhunter_finetune
embed_dim: 256
channels: 512
se_channels: 128
res2net_scale: 8
num_blocks: 3
n_mels: 128
# aam_m / aam_s: presumably the margin and scale of an additive-angular-margin
# loss head - TODO confirm against the model code.
aam_m: 0.2
aam_s: 30.0
use_band_split: false
band_split_channels: 128
# Branch toggles. NOTE(review): the dataset code emits "mel", "melody" and
# "chroma" streams per clip; confirm which branch consumes which stream.
use_dual_stream: true
mert_melody_branch: true
ecapa_branch: true
coverhunter_heads: 8
coverhunter_layers: 4
fusion_hidden_dim: 256
# Hugging Face model identifier of the MERT checkpoint.
mert_model_name: m-a-p/MERT-v1-95M
# Data section: audio front-end parameters.
data:
sample_rate: 16000
n_fft: 512
hop_length: 160
# Clip length in seconds; PairSamplerDataset converts it to samples as
# int(segment_dur * sample_rate).
segment_dur: 8.0
crop_per_song: 6
# Training section: optimisation schedule and sampling weights.
training:
batch_size: 16
epochs: 30
lr: 0.0002
weight_decay: 0.0001
warmup_epochs: 3
temperature: 0.05
# Relative weights of the two loss terms (presumably supervised-contrastive
# and AAM - TODO confirm in the training loop).
supcon_weight: 1.0
aam_weight: 0.2
mixed_precision: true
gradient_clip: 1.0
save_every: 5
log_every: 10
# Number of most-similar songs PairSamplerDataset draws hard negatives from.
hard_negative_k: 4
# How many times a song is repeated in the dataset's sampling list; a song
# takes the largest weight among the (normalized) types of its items.
sample_type_weights:
default: 1
compressed: 2
recording: 3
environment: 4
# Per-item weight returned by PairSamplerDataset as "hard_weight"; the
# negative item always receives the "environment" value.
pair_type_weights:
default: 1.0
compressed: 1.5
recording: 2.0
environment: 3.0
# Fine-tuning configuration for the variant named "coverhunter_finetune_lowmem".
# Model section: network sizes and branch toggles.
model:
name: coverhunter_finetune_lowmem
embed_dim: 192
channels: 256
se_channels: 64
res2net_scale: 4
num_blocks: 2
n_mels: 96
# aam_m / aam_s: presumably the margin and scale of an additive-angular-margin
# loss head - TODO confirm against the model code.
aam_m: 0.2
aam_s: 24.0
use_band_split: false
band_split_channels: 64
# Branch toggles. NOTE(review): the dataset code emits "mel", "melody" and
# "chroma" streams per clip; confirm which branch consumes which stream.
use_dual_stream: true
mert_melody_branch: true
ecapa_branch: true
coverhunter_heads: 4
coverhunter_layers: 2
fusion_hidden_dim: 128
# Hugging Face model identifier of the MERT checkpoint.
mert_model_name: m-a-p/MERT-v1-95M
# Data section: audio front-end parameters.
data:
sample_rate: 16000
n_fft: 512
hop_length: 160
# Clip length in seconds; PairSamplerDataset converts it to samples as
# int(segment_dur * sample_rate).
segment_dur: 5.0
crop_per_song: 4
# Training section: optimisation schedule and sampling weights.
training:
# Each dataset item already stacks three clips (two positives, one negative),
# so one batch holds 3 * batch_size clips.
batch_size: 2
epochs: 20
lr: 0.00015
weight_decay: 0.0001
warmup_epochs: 2
temperature: 0.05
# Relative weights of the two loss terms (presumably supervised-contrastive
# and AAM - TODO confirm in the training loop).
supcon_weight: 1.0
aam_weight: 0.2
mixed_precision: true
gradient_clip: 1.0
save_every: 5
log_every: 10
# Number of most-similar songs PairSamplerDataset draws hard negatives from.
hard_negative_k: 2
# How many times a song is repeated in the dataset's sampling list; a song
# takes the largest weight among the (normalized) types of its items.
sample_type_weights:
default: 1
compressed: 2
recording: 3
environment: 4
# Per-item weight returned by PairSamplerDataset as "hard_weight"; the
# negative item always receives the "environment" value.
pair_type_weights:
default: 1.0
compressed: 1.4
recording: 1.8
environment: 2.2
......@@ -10,6 +10,13 @@ model:
aam_s: 30.0
use_band_split: true
band_split_channels: 128
use_dual_stream: true
mert_melody_branch: true
ecapa_branch: true
coverhunter_heads: 4
coverhunter_layers: 2
fusion_hidden_dim: 256
mert_model_name: m-a-p/MERT-v1-95M
data:
sample_rate: 16000
......@@ -31,15 +38,17 @@ training:
gradient_clip: 1.0
save_every: 10
log_every: 10
hard_negative_k: 2
sample_type_weights:
default: 1
humming_like: 3
confused: 5
compressed: 2
recording: 3
environment: 4
pair_type_weights:
default: 1.0
augmented: 1.4
humming_like: 2.5
confused: 4.0
compressed: 1.5
recording: 2.0
environment: 2.5
engine:
chromaprint:
......
{
"python": "/usr/local/miniconda3/bin/python",
"cwd": "/mnt/e/hikoon-ACR/acr-engine",
"steps": [
{
"name": "install_requirements",
"command": [
"/usr/local/miniconda3/bin/python",
"-m",
"pip",
"install",
"-r",
"requirements.txt"
],
"returncode": 0,
"stdout": "\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 40.7/40.7 MB 10.9 MB/s 0:00:03\nDownloading nvidia_nvtx-13.0.85-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (148 kB)\nDownloading setuptools-81.0.0-py3-none-any.whl (1.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 1.1/1.1 MB 8.6 MB/s 0:00:00\nDownloading audioread-3.1.0-py3-none-any.whl (23 kB)\nDownloading click-8.4.1-py3-none-any.whl (116 kB)\nDownloading cuda_pathfinder-1.5.5-py3-none-any.whl (51 kB)\nDownloading decorator-5.3.1-py3-none-any.whl (10 kB)\nDownloading filelock-3.29.1-py3-none-any.whl (40 kB)\nDownloading fsspec-2026.4.0-py3-none-any.whl (203 kB)\nDownloading joblib-1.5.3-py3-none-any.whl (309 kB)\nDownloading lazy_loader-0.5-py3-none-any.whl (8.0 kB)\nDownloading networkx-3.6.1-py3-none-any.whl (2.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 2.1/2.1 MB 10.3 MB/s 0:00:00\nDownloading numba-0.65.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 3.8/3.8 MB 10.3 MB/s 0:00:00\nDownloading llvmlite-0.47.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (56.3 MB)\n 
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 56.3/56.3 MB 10.8 MB/s 0:00:05\nDownloading pooch-1.9.0-py3-none-any.whl (67 kB)\nDownloading regex-2026.5.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (801 kB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 801.2/801.2 kB 8.5 MB/s 0:00:00\nDownloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (507 kB)\nDownloading scikit_learn-1.9.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (9.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 9.1/9.1 MB 10.0 MB/s 0:00:00\nDownloading narwhals-2.22.1-py3-none-any.whl (454 kB)\nDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 6.3/6.3 MB 10.7 MB/s 0:00:00\nDownloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 536.2/536.2 kB 7.1 MB/s 0:00:00\nDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\nDownloading jinja2-3.1.6-py3-none-any.whl (134 
kB)\nDownloading markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)\nInstalling collected packages: torchaudio, nvidia-cusparselt-cu13, mpmath, cuda-toolkit, triton, threadpoolctl, sympy, setuptools, safetensors, regex, python-stretch, nvidia-nvtx, nvidia-nvshmem-cu13, nvidia-nvjitlink, nvidia-nccl-cu13, nvidia-curand, nvidia-cufile, nvidia-cuda-runtime, nvidia-cuda-nvrtc, nvidia-cuda-cupti, numpy, networkx, narwhals, MarkupSafe, llvmlite, lazy_loader, joblib, hf-xet, fsspec, filelock, decorator, cuda-pathfinder, click, audioread, soxr, soundfile, scipy, pooch, nvidia-cusparse, nvidia-cufft, nvidia-cublas, numpy-rms, numpy-minmax, numba, jinja2, cuda-bindings, scikit-learn, nvidia-cusolver, nvidia-cudnn-cu13, librosa, huggingface_hub, torch, tokenizers, audiomentations, transformers\n\nSuccessfully installed MarkupSafe-3.0.3 audiomentations-0.43.1 audioread-3.1.0 click-8.4.1 cuda-bindings-13.3.1 cuda-pathfinder-1.5.5 cuda-toolkit-13.0.2 decorator-5.3.1 filelock-3.29.1 fsspec-2026.4.0 hf-xet-1.5.0 huggingface_hub-1.18.0 jinja2-3.1.6 joblib-1.5.3 lazy_loader-0.5 librosa-0.11.0 llvmlite-0.47.0 mpmath-1.3.0 narwhals-2.22.1 networkx-3.6.1 numba-0.65.1 numpy-2.4.6 numpy-minmax-0.5.0 numpy-rms-0.6.0 nvidia-cublas-13.1.1.3 nvidia-cuda-cupti-13.0.85 nvidia-cuda-nvrtc-13.0.88 nvidia-cuda-runtime-13.0.96 nvidia-cudnn-cu13-9.20.0.48 nvidia-cufft-12.0.0.61 nvidia-cufile-1.15.1.6 nvidia-curand-10.4.0.35 nvidia-cusolver-12.0.4.66 nvidia-cusparse-12.6.3.3 nvidia-cusparselt-cu13-0.8.1 nvidia-nccl-cu13-2.29.7 nvidia-nvjitlink-13.0.88 nvidia-nvshmem-cu13-3.4.5 nvidia-nvtx-13.0.85 pooch-1.9.0 python-stretch-0.3.1 regex-2026.5.9 safetensors-0.7.0 scikit-learn-1.9.0 scipy-1.17.1 setuptools-81.0.0 soundfile-0.14.0 soxr-0.5.0.post1 sympy-1.14.0 threadpoolctl-3.6.0 tokenizers-0.22.2 torch-2.12.0 torchaudio-2.11.0 transformers-5.10.2 triton-3.7.0\n",
"stderr": " WARNING: The scripts proton and proton-viewer are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The script isympy is installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts f2py and numpy-config are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts hf, huggingface-cli and tiny-agents are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts torchfrtrace and torchrun are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The script transformers is installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
},
{
"name": "install_extra_packages",
"command": [
"/usr/local/miniconda3/bin/python",
"-m",
"pip",
"install",
"torch",
"torchaudio",
"transformers",
"huggingface_hub",
"librosa",
"soundfile",
"audiomentations"
],
"returncode": 0,
"stdout": "a3/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface_hub) (0.16.0)\nRequirement already satisfied: shellingham>=1.3.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (1.5.4)\nRequirement already satisfied: rich>=10.11.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (14.2.0)\nRequirement already satisfied: audioread>=2.1.9 in /home/user/.local/lib/python3.12/site-packages (from librosa) (3.1.0)\nRequirement already satisfied: numba>=0.51.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.65.1)\nRequirement already satisfied: scipy>=1.6.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.17.1)\nRequirement already satisfied: scikit-learn>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: joblib>=1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.5.3)\nRequirement already satisfied: decorator>=4.3.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (5.3.1)\nRequirement already satisfied: pooch>=1.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: soxr>=0.3.2 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5.0.post1)\nRequirement already satisfied: lazy_loader>=0.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5)\nRequirement already satisfied: msgpack>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from librosa) (1.1.1)\nRequirement already satisfied: cffi>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from soundfile) (1.17.1)\nRequirement already satisfied: numpy-minmax<1,>=0.3.0 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.5.0)\nRequirement already satisfied: numpy-rms<1,>=0.4.2 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.6.0)\nRequirement 
already satisfied: python-stretch<1,>=0.3.1 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.3.1)\nRequirement already satisfied: pycparser in /usr/local/miniconda3/lib/python3.12/site-packages (from cffi>=1.0->soundfile) (3.0)\nRequirement already satisfied: llvmlite<0.48,>=0.47.0dev0 in /home/user/.local/lib/python3.12/site-packages (from numba>=0.51.0->librosa) (0.47.0)\nRequirement already satisfied: platformdirs>=2.5.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (4.9.4)\nRequirement already satisfied: requests>=2.19.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (2.33.1)\nRequirement already satisfied: charset_normalizer<4,>=2 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (3.4.4)\nRequirement already satisfied: urllib3<3,>=1.26 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (2.6.3)\nRequirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (4.0.0)\nRequirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (2.20.0)\nRequirement already satisfied: mdurl~=0.1 in /usr/local/miniconda3/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer->transformers) (0.1.2)\nRequirement already satisfied: narwhals>=2.0.1 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (2.22.1)\nRequirement already satisfied: threadpoolctl>=3.5.0 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (3.6.0)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from sympy>=1.13.3->torch) (1.3.0)\nRequirement already satisfied: MarkupSafe>=2.0 in 
/home/user/.local/lib/python3.12/site-packages (from jinja2->torch) (3.0.3)\n",
"stderr": ""
},
{
"name": "verify_environment",
"command": [
"/usr/local/miniconda3/bin/python",
"-c",
"import torch, transformers, librosa, soundfile, audiomentations; print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
],
"returncode": 0,
"stdout": "{'torch': '2.12.0+cu130', 'cuda': False, 'transformers': '5.10.2'}\n",
"stderr": "/home/user/.local/lib/python3.12/site-packages/torch/cuda/__init__.py:187: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 12080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:119.)\n return torch._C._cuda_getDeviceCount() > 0\n"
}
]
}
\ No newline at end of file
......@@ -2,6 +2,10 @@ numpy>=1.26
PyYAML>=6.0
soundfile>=0.12
librosa>=0.10
audiomentations>=0.37
transformers>=4.46
huggingface_hub>=0.26
torchaudio>=2.3
tqdm>=4.66
torch>=2.3
fastapi>=0.115
......
#!/usr/bin/env python3
import argparse
import json
import subprocess
from datetime import datetime
from pathlib import Path
DEFAULT_PYTHON = "/usr/local/miniconda3/bin/python"
def main():
    """Launch ``train.py`` for one fine-tuning run and archive its inputs and outputs.

    The run directory is ``<output-root>/<run-name>``. A relative output root is
    anchored at the repository root (the parent of this script's directory),
    which is also the working directory of the training subprocess, so the
    wrapper and ``train.py`` always write to the same directory regardless of
    where the wrapper is invoked from.

    Files written into the run directory:
        run_request.json  - the command and arguments, written before launch.
        stdout.log / stderr.log - captured output of the training process.
        run_summary.json  - the request plus return code, completion time and
                            the names of the files present in the run directory.

    Raises:
        SystemExit: with the training process's return code when it is non-zero.
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    parser = argparse.ArgumentParser()
    parser.add_argument("--python", default=DEFAULT_PYTHON)
    parser.add_argument("--config", default="configs/coverhunter_finetune_4gb.yaml")
    parser.add_argument("--data", required=True)
    parser.add_argument("--output-root", default="data/training_runs")
    parser.add_argument("--run-name", default=None)
    parser.add_argument("--noise-root", action="append", default=[])
    parser.add_argument("--device", default="auto")
    parser.add_argument("--segment-strategy", default="hybrid")
    parser.add_argument("--resume", default=None)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    def utc_now():
        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # timestamp and drop tzinfo so the formatted output stays unchanged.
        return datetime.now(timezone.utc).replace(tzinfo=None)

    repo_root = Path(__file__).resolve().parents[1]

    timestamp = utc_now().strftime("%Y%m%dT%H%M%SZ")
    run_name = args.run_name or f"coverhunter_finetune_{timestamp}"
    run_dir = Path(args.output_root) / run_name
    if not run_dir.is_absolute():
        # train.py runs with cwd=repo_root, so resolve relative paths there too;
        # otherwise logs and training artifacts end up in different directories
        # when this script is started from outside the repository root.
        run_dir = repo_root / run_dir
    run_dir.mkdir(parents=True, exist_ok=True)

    command = [
        args.python,
        "train.py",
        "--config",
        args.config,
        "--data",
        args.data,
        "--output",
        str(run_dir),
        "--device",
        args.device,
        "--segment-strategy",
        args.segment_strategy,
    ]
    if args.resume:
        command.extend(["--resume", args.resume])
    if args.dry_run:
        command.append("--dry-run")
    for noise_root in args.noise_root:
        command.extend(["--noise-root", noise_root])

    metadata = {
        "run_name": run_name,
        "created_at": utc_now().isoformat() + "Z",
        "python": args.python,
        "command": command,
        "config": args.config,
        "data": args.data,
        "noise_roots": args.noise_root,
        "run_dir": str(run_dir),
    }
    # Persist the request before launching so a crashed run still leaves a trace.
    with open(run_dir / "run_request.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Output is buffered in memory and only written once the process exits.
    result = subprocess.run(command, cwd=repo_root, text=True, capture_output=True)
    (run_dir / "stdout.log").write_text(result.stdout)
    (run_dir / "stderr.log").write_text(result.stderr)

    summary = {
        **metadata,
        "returncode": result.returncode,
        "completed_at": utc_now().isoformat() + "Z",
        "artifacts": sorted(path.name for path in run_dir.iterdir()),
    }
    with open(run_dir / "run_summary.json", "w") as f:
        json.dump(summary, f, indent=2)

    # Propagate the training failure to the caller's shell.
    if result.returncode != 0:
        raise SystemExit(result.returncode)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import json
import subprocess
from pathlib import Path
PYTHON_DEFAULT = "/usr/local/miniconda3/bin/python"
PACKAGES = [
"-r", "requirements.txt",
]
EXTRA_PACKAGES = [
"torch",
"torchaudio",
"transformers",
"huggingface_hub",
"librosa",
"soundfile",
"audiomentations",
]
def run(command, cwd):
    """Execute ``command`` inside ``cwd`` and return the finished process.

    Both output streams are captured and decoded to ``str``; a non-zero exit
    status does not raise, it is reported through ``returncode``.
    """
    completed = subprocess.run(
        command,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    return completed
def main():
    """Install the Python dependencies, verify the imports, and write a JSON report.

    Unless ``--skip-install`` is given, two pip installs run first (the
    requirements file, then the explicit extra packages). A verification
    snippet then imports the key libraries and prints their versions. Every
    step is recorded with its command, return code and the last 4000
    characters of each output stream.

    The report is written to ``reports/coverhunter_env_setup_report.json``
    under the repository root and its path is printed.

    Raises:
        SystemExit: with status 1 when any recorded step returned non-zero.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--python", default=PYTHON_DEFAULT)
    cli.add_argument("--skip-install", action="store_true")
    args = cli.parse_args()

    root = Path(__file__).resolve().parents[1]
    steps = []
    report = {
        "python": args.python,
        "cwd": str(root),
        "steps": steps,
    }

    def record(step_name, step_command):
        # Run one step from the repository root and keep only the tail of its output.
        outcome = run(step_command, root)
        steps.append({
            "name": step_name,
            "command": step_command,
            "returncode": outcome.returncode,
            "stdout": outcome.stdout[-4000:],
            "stderr": outcome.stderr[-4000:],
        })

    pip_install = [args.python, "-m", "pip", "install"]
    if not args.skip_install:
        record("install_requirements", [*pip_install, *PACKAGES])
        record("install_extra_packages", [*pip_install, *EXTRA_PACKAGES])

    verify_snippet = (
        "import torch, transformers, librosa, soundfile, audiomentations; "
        "print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
    )
    record("verify_environment", [args.python, "-c", verify_snippet])

    report_path = root / "reports" / "coverhunter_env_setup_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2))
    print(report_path)

    failed_steps = [entry for entry in steps if entry["returncode"] != 0]
    if failed_steps:
        raise SystemExit(1)


if __name__ == "__main__":
    main()
......@@ -8,6 +8,9 @@ import numpy as np
import torch
from torch.utils.data import Dataset
from src.utils.audio import AudioProcessor
from src.utils.augment import AugmentPipeline
def compute_candidate_offsets(
y: np.ndarray,
......@@ -124,6 +127,267 @@ def compute_candidate_offsets(
return []
class DualStreamFeatureExtractor:
    """Turns a waveform into time-aligned mel, melody and chroma tensors."""

    def __init__(self, sr: int, n_mels: int, n_fft: int, hop_length: int):
        """Create the underlying ``AudioProcessor`` with the given analysis settings."""
        self.n_mels = n_mels
        self.audio = AudioProcessor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)

    def extract(self, y: np.ndarray) -> Dict[str, torch.Tensor]:
        """Compute the three feature streams for waveform ``y``.

        The melody (f0 converted to MIDI numbers, with non-finite values set to
        0) and each chroma row are linearly interpolated to the number of mel
        frames so all streams share one time axis.

        Returns:
            Dict with float tensors ``"mel"``, ``"melody"`` (a leading axis of
            size 1 is added) and ``"chroma"``.
        """
        mel = self.audio.to_mel(y)
        f0_hz = self.audio.extract_f0(y)
        midi = np.nan_to_num(
            librosa.hz_to_midi(f0_hz), nan=0.0, posinf=0.0, neginf=0.0
        ).astype(np.float32)
        chroma = self.audio.extract_chroma(y).astype(np.float32)
        n_frames = mel.shape[1]

        def stretch(series):
            # Linear resampling of a 1-D series onto n_frames evenly spaced points.
            src_len = series.size
            return np.interp(
                np.linspace(0, src_len - 1, n_frames),
                np.arange(src_len),
                series,
            )

        if midi.size == 0:
            # No pitch frames at all: use an all-zero melody track.
            melody = np.zeros(n_frames, dtype=np.float32)
        else:
            melody = stretch(midi).astype(np.float32)

        chroma_resized = np.stack([stretch(row) for row in chroma], axis=0).astype(np.float32)

        features = {}
        features["mel"] = torch.FloatTensor(mel)
        features["melody"] = torch.FloatTensor(melody).unsqueeze(0)
        features["chroma"] = torch.FloatTensor(chroma_resized)
        return features
class PairSamplerDataset(Dataset):
    """Dataset yielding, per index, two clips of one song plus one clip of another song.

    Items come from ``<data_dir>/<split>.json``; entries of type ``"reference"``
    and entries whose audio file is missing are ignored. Songs are repeated in
    the index list according to ``sample_type_weights`` and negatives are
    preferably drawn from a pre-computed list of acoustically similar songs.
    """

    def __init__(
        self,
        data_dir: str,
        split: str = "train",
        sr: int = 16000,
        n_mels: int = 80,
        n_fft: int = 512,
        hop_length: int = 160,
        segment_dur: float = 5.0,
        augment: bool = True,
        segment_strategy: str = "random",
        silence_top_db: int = 30,
        sample_type_weights: Optional[Dict[str, int]] = None,
        pair_type_weights: Optional[Dict[str, float]] = None,
        hard_negative_k: int = 1,
        noise_roots: Optional[List[str]] = None,
    ):
        """Load the manifest, group items by song and build the sampling structures.

        Args:
            data_dir: Directory holding ``<split>.json``. When it is named
                ``manifests`` audio paths are resolved against its parent,
                otherwise against ``data_dir`` itself.
            split: Manifest name without the ``.json`` suffix.
            sr, n_mels, n_fft, hop_length: Audio analysis settings.
            segment_dur: Clip length in seconds.
            augment: Whether waveform augmentation is applied.
            segment_strategy: ``"random"`` or a strategy name understood by
                ``compute_candidate_offsets`` (``"hybrid"`` has a fallback pool).
            silence_top_db: Forwarded to ``compute_candidate_offsets``.
            sample_type_weights: Overrides for the per-type song repetition counts.
            pair_type_weights: Overrides for the per-type ``hard_weight`` values.
            hard_negative_k: Number of top similar songs used as hard negatives.
            noise_roots: Forwarded to both ``AugmentPipeline`` instances.

        Note:
            Weight lookups use the *normalized* type (see
            ``_normalize_sample_type``), so keys named ``humming_like`` or
            ``confused`` in either weight dict are never read.
        """
        self.sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.segment_len = int(segment_dur * sr)
        self.augment = augment
        self.segment_strategy = segment_strategy
        self.silence_top_db = silence_top_db
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
        # Caller-supplied entries override the built-in defaults key by key.
        self.sample_type_weights = {
            "default": 1,
            "compressed": 2,
            "recording": 3,
            "environment": 4,
            **(sample_type_weights or {}),
        }
        self.pair_type_weights = {
            "default": 1.0,
            "compressed": 1.5,
            "recording": 2.0,
            "environment": 2.5,
            **(pair_type_weights or {}),
        }
        self.hard_negative_k = hard_negative_k
        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
        self.aggressive_augmenter = AugmentPipeline(sr, aggressive=True, noise_roots=noise_roots)

        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(self.data_dir / f"{split}.json", encoding="utf-8") as f:
            metadata = json.load(f)

        self.by_song: Dict[str, List[Dict]] = {}
        for item in metadata:
            if not self._is_training_candidate(item):
                continue
            p = self.asset_root / item["audio_path"]
            if p.exists():
                self.by_song.setdefault(item["song_id"], []).append(item)

        self.song_ids = sorted(self.by_song)
        self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
        self.sample_song_ids = []
        self.hard_negative_map: Dict[str, List[str]] = self._build_hard_negative_map()
        for sid, items in self.by_song.items():
            item_types = {self._normalize_sample_type(x.get("type")) for x in items}
            # A song is repeated as often as its highest-weighted item type demands.
            weight = self.sample_type_weights.get("default", 1)
            for item_type in item_types:
                weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
            self.sample_song_ids.extend([sid] * weight)

    @staticmethod
    def _normalize_sample_type(sample_type: Optional[str]) -> str:
        """Map a manifest ``type`` onto the vocabulary used for weight lookups.

        ``humming_like`` folds into ``recording`` and ``confused`` into
        ``environment``; ``None``/empty becomes ``"default"``; unknown values
        pass through unchanged.
        """
        mapping = {
            "reference": "reference",
            "compressed": "compressed",
            "recording": "recording",
            "environment": "environment",
            "humming_like": "recording",
            "confused": "environment",
            None: "default",
        }
        return mapping.get(sample_type, sample_type or "default")

    def _is_training_candidate(self, item: Dict) -> bool:
        """Return True for every manifest item except those of type ``reference``."""
        sample_type = self._normalize_sample_type(item.get("type"))
        return sample_type != "reference"

    def _build_hard_negative_map(self) -> Dict[str, List[str]]:
        """Rank, for each song, the other songs by similarity of mean mel spectra.

        Up to two items per song (first 8 seconds each) are averaged into one
        vector; songs are compared by cosine similarity. Each song keeps its
        ``max(hard_negative_k, 1) * 4`` most similar songs, most similar first.
        Items that fail to load are skipped (best effort); songs with no
        loadable item get no entry.
        """
        song_features: Dict[str, np.ndarray] = {}
        for song_id, items in self.by_song.items():
            feats = []
            for item in items[:2]:
                path = self.asset_root / item["audio_path"]
                try:
                    y, _ = librosa.load(str(path), sr=self.sr, mono=True, duration=8.0)
                    mel = self.feature_extractor.audio.to_mel(y)
                    feats.append(np.mean(mel, axis=1))
                except Exception:
                    # Deliberate best effort: an unreadable file must not abort dataset setup.
                    continue
            if feats:
                song_features[song_id] = np.mean(feats, axis=0)

        hard_negative_map: Dict[str, List[str]] = {}
        song_ids = list(song_features)
        for song_id in song_ids:
            anchor = song_features[song_id]
            anchor_norm = np.linalg.norm(anchor) + 1e-12
            scored = []
            for other_song_id in song_ids:
                if other_song_id == song_id:
                    continue
                other = song_features[other_song_id]
                score = float(np.dot(anchor, other) / (anchor_norm * (np.linalg.norm(other) + 1e-12)))
                scored.append((score, other_song_id))
            scored.sort(reverse=True)
            hard_negative_map[song_id] = [other_song_id for _, other_song_id in scored[: max(self.hard_negative_k, 1) * 4]]
        return hard_negative_map

    def __len__(self):
        """Number of (weighted) song entries in the sampling list."""
        return len(self.sample_song_ids)

    def _load_clip(self, sample: Dict) -> np.ndarray:
        """Load one item and cut a ``segment_len``-sample clip, zero-padded if short.

        The start offset is uniform for the ``"random"`` strategy; otherwise it
        is drawn from ``compute_candidate_offsets`` with a random fallback.
        """
        path = self.asset_root / sample["audio_path"]
        full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)

        # Never trust a manifest duration beyond the decoded audio: an offset
        # past the end would yield an all-zero clip. A missing/None duration
        # falls back to the decoded length.
        actual_duration = len(full_y) / self.sr
        declared_duration = sample.get("duration")
        if declared_duration is None:
            duration = actual_duration
        else:
            duration = min(float(declared_duration), actual_duration)

        max_offset = max(0.0, duration - (self.segment_len / self.sr))
        offset = 0.0
        if max_offset > 0:
            if self.segment_strategy == "random":
                offset = random.uniform(0, max_offset)
            else:
                direct_candidates = compute_candidate_offsets(
                    y=full_y,
                    sr=self.sr,
                    segment_len=self.segment_len,
                    strategy=self.segment_strategy,
                    silence_top_db=self.silence_top_db,
                )
                if direct_candidates:
                    # Candidates are sample indices; convert to seconds and clamp.
                    offset = min(random.choice(direct_candidates) / self.sr, max_offset)
                elif self.segment_strategy == "hybrid":
                    # Pool the candidates of every individual strategy.
                    candidate_pool: List[int] = []
                    for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
                        candidate_pool.extend(
                            compute_candidate_offsets(
                                y=full_y,
                                sr=self.sr,
                                segment_len=self.segment_len,
                                strategy=strategy,
                                silence_top_db=self.silence_top_db,
                            )
                        )
                    # 75% of the time use a pooled candidate, otherwise a random offset.
                    if candidate_pool and random.random() < 0.75:
                        offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
                    else:
                        offset = random.uniform(0, max_offset)
                else:
                    offset = random.uniform(0, max_offset)

        start = int(offset * self.sr)
        y = full_y[start : start + self.segment_len]
        if len(y) < self.segment_len:
            y = np.pad(y, (0, self.segment_len - len(y)))
        return y

    def _augment_wave(self, sample: Dict, y: np.ndarray) -> np.ndarray:
        """Apply waveform augmentation; recording/environment items get the aggressive pipeline."""
        if not self.augment:
            return y
        sample_type = self._normalize_sample_type(sample.get("type"))
        if sample_type in {"recording", "environment"}:
            return self.aggressive_augmenter(y)
        return self.augmenter(y)

    def _load_features(self, sample: Dict) -> Dict[str, torch.Tensor]:
        """Load, augment and featurize one item into mel/melody/chroma tensors."""
        y = self._load_clip(sample)
        y = self._augment_wave(sample, y)
        features = self.feature_extractor.extract(y)
        # NOTE(review): apply_to_mel runs even when self.augment is False -
        # confirm whether mel-level augmentation should also be gated.
        features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
        return features

    def _pick_positive_pair(self, song_id: str) -> tuple[Dict, Dict]:
        """Return two distinct items of the song, or the single item twice."""
        choices = self.by_song[song_id]
        if len(choices) == 1:
            return choices[0], choices[0]
        return tuple(random.sample(choices, 2))

    def _pick_negative(self, song_id: str) -> Dict:
        """Pick an item from a different song, favouring hard negatives.

        With probability 0.8 the song comes from the first ``hard_negative_k``
        entries of the hard-negative list, otherwise from all other songs.

        Raises:
            IndexError: when no other song exists to draw a negative from.
        """
        hard_songs = self.hard_negative_map.get(song_id, [])
        candidate_song_ids = hard_songs[: self.hard_negative_k] if hard_songs else []
        if candidate_song_ids and random.random() < 0.8:
            negative_song_id = random.choice(candidate_song_ids)
        else:
            pool = [sid for sid in self.song_ids if sid != song_id]
            if not pool:
                # Same exception type random.choice([]) raised, with a usable message.
                raise IndexError(
                    f"Cannot sample a negative for song {song_id!r}: "
                    "the dataset needs at least two songs"
                )
            negative_song_id = random.choice(pool)
        return random.choice(self.by_song[negative_song_id])

    def __getitem__(self, idx):
        """Build one training triplet.

        Returns:
            Dict with ``mel``/``melody``/``chroma`` stacked in the order
            [positive A, positive B, negative], ``song_id`` (integer labels in
            the same order), ``song_name`` (the anchor song id) and
            ``hard_weight`` (one weight per stacked clip).
        """
        song_id = self.sample_song_ids[idx]
        pos_a, pos_b = self._pick_positive_pair(song_id)
        negative = self._pick_negative(song_id)
        positive_items = [pos_a, pos_b]
        positive_features = [self._load_features(sample) for sample in positive_items]
        negative_features = self._load_features(negative)

        hard_weights = [
            self.pair_type_weights.get(self._normalize_sample_type(sample.get("type")), self.pair_type_weights["default"])
            for sample in positive_items
        ]
        # The negative always receives the "environment" weight, whatever its type.
        hard_weights.append(self.pair_type_weights.get("environment", 2.5))

        label = self.song_to_idx[song_id]
        negative_label = self.song_to_idx[negative["song_id"]]
        return {
            "mel": torch.stack([feat["mel"] for feat in positive_features] + [negative_features["mel"]], dim=0),
            "melody": torch.stack([feat["melody"] for feat in positive_features] + [negative_features["melody"]], dim=0),
            "chroma": torch.stack([feat["chroma"] for feat in positive_features] + [negative_features["chroma"]], dim=0),
            "song_id": torch.tensor([label, label, negative_label], dtype=torch.long),
            "song_name": song_id,
            "hard_weight": torch.tensor(hard_weights, dtype=torch.float32),
        }
class ACRDataset(Dataset):
def __init__(
self,
......@@ -140,6 +404,7 @@ class ACRDataset(Dataset):
references_only: bool = False,
segment_strategy: str = "random",
silence_top_db: int = 30,
noise_roots: Optional[List[str]] = None,
):
self.sr = sr
self.n_mels = n_mels
......@@ -152,6 +417,8 @@ class ACRDataset(Dataset):
self.silence_top_db = silence_top_db
self.data_dir = Path(data_dir)
self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
meta_path = self.data_dir / f"{split}.json"
with open(meta_path) as f:
......@@ -179,16 +446,6 @@ class ACRDataset(Dataset):
y = y[: self.segment_len]
return y
def _to_mel(self, y: np.ndarray) -> np.ndarray:
mel = librosa.feature.melspectrogram(
y=y,
sr=self.sr,
n_mels=self.n_mels,
n_fft=self.n_fft,
hop_length=self.hop_length,
)
return librosa.power_to_db(mel, ref=np.max)
def _choose_offset(self, sample: Dict, audio_path: Path) -> float:
duration = float(sample["duration"])
max_offset = max(0.0, duration - (self.segment_len / self.sr))
......@@ -231,24 +488,22 @@ class ACRDataset(Dataset):
def __getitem__(self, idx):
sample = self.samples[idx // self.n_crops]
audio_path = self.asset_root / sample["audio_path"]
offset = self._choose_offset(sample, audio_path)
y = self._load_segment(str(audio_path), offset, 5.0)
if self.augment and sample.get("type") != "reference":
from src.utils.augment import AugmentPipeline
aug = AugmentPipeline(self.sr)
y = aug(y)
y = self.augmenter(y)
mel = self._to_mel(y)
mel_tensor = torch.FloatTensor(mel)
features = self.feature_extractor.extract(y)
features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
song_id = sample["song_id"]
class_id = self.song_to_idx[song_id]
return {
"mel": mel_tensor,
"mel": features["mel"],
"melody": features["melody"],
"chroma": features["chroma"],
"song_id": torch.tensor(class_id, dtype=torch.long),
"song_name": song_id,
"type": sample.get("type", "unknown"),
......@@ -272,6 +527,7 @@ class ACRTestDataset(Dataset):
self.hop_length = hop_length
self.data_dir = Path(data_dir)
self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
meta_path = self.data_dir / f"{split}.json"
with open(meta_path) as f:
......@@ -299,171 +555,17 @@ class ACRTestDataset(Dataset):
else:
y = y[:seg_len]
mel = librosa.power_to_db(
librosa.feature.melspectrogram(
y=y,
sr=self.sr,
n_mels=self.n_mels,
n_fft=self.n_fft,
hop_length=self.hop_length,
),
ref=np.max,
)
features = self.feature_extractor.extract(y)
class_id = self.song_to_idx[sample["song_id"]]
return {
"mel": torch.FloatTensor(mel),
"mel": features["mel"],
"melody": features["melody"],
"chroma": features["chroma"],
"song_id": torch.tensor(class_id, dtype=torch.long),
"song_name": sample["song_id"],
"type": sample.get("type", "unknown"),
}
class SongPairDataset(Dataset):
def __init__(
self,
data_dir: str,
split: str = "train",
sr: int = 16000,
n_mels: int = 80,
n_fft: int = 512,
hop_length: int = 160,
segment_dur: float = 5.0,
augment: bool = True,
segment_strategy: str = "random",
silence_top_db: int = 30,
sample_type_weights: Optional[Dict[str, int]] = None,
pair_type_weights: Optional[Dict[str, float]] = None,
):
self.sr = sr
self.n_mels = n_mels
self.n_fft = n_fft
self.hop_length = hop_length
self.segment_len = int(segment_dur * sr)
self.augment = augment
self.segment_strategy = segment_strategy
self.silence_top_db = silence_top_db
self.data_dir = Path(data_dir)
self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
self.sample_type_weights = {
"default": 1,
"humming_like": 3,
"confused": 5,
**(sample_type_weights or {}),
}
self.pair_type_weights = {
"default": 1.0,
"augmented": 1.4,
"humming_like": 2.5,
"confused": 4.0,
**(pair_type_weights or {}),
}
with open(self.data_dir / f"{split}.json") as f:
metadata = json.load(f)
self.by_song: Dict[str, List[Dict]] = {}
for item in metadata:
if item.get("type") == "reference":
continue
p = self.asset_root / item["audio_path"]
if p.exists():
self.by_song.setdefault(item["song_id"], []).append(item)
self.song_ids = sorted(self.by_song)
self.sample_song_ids = []
for sid, items in self.by_song.items():
item_types = {x.get("type") for x in items}
weight = self.sample_type_weights.get("default", 1)
for item_type in item_types:
weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
self.sample_song_ids.extend([sid] * weight)
self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
def __len__(self):
return len(self.sample_song_ids)
def _load_clip(self, sample: Dict) -> np.ndarray:
path = self.asset_root / sample["audio_path"]
full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
duration = float(sample.get("duration", len(full_y) / self.sr))
max_offset = max(0.0, duration - (self.segment_len / self.sr))
offset = 0.0
if max_offset > 0:
if self.segment_strategy == "random":
offset = random.uniform(0, max_offset)
else:
direct_candidates = compute_candidate_offsets(
y=full_y,
sr=self.sr,
segment_len=self.segment_len,
strategy=self.segment_strategy,
silence_top_db=self.silence_top_db,
)
if direct_candidates:
offset = min(random.choice(direct_candidates) / self.sr, max_offset)
elif self.segment_strategy == "hybrid":
candidate_pool: List[int] = []
for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
candidate_pool.extend(
compute_candidate_offsets(
y=full_y,
sr=self.sr,
segment_len=self.segment_len,
strategy=strategy,
silence_top_db=self.silence_top_db,
)
)
if candidate_pool and random.random() < 0.75:
offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
else:
offset = random.uniform(0, max_offset)
else:
offset = random.uniform(0, max_offset)
start = int(offset * self.sr)
y = full_y[start : start + self.segment_len]
if len(y) < self.segment_len:
y = np.pad(y, (0, self.segment_len - len(y)))
return y
def _to_mel(self, y: np.ndarray) -> torch.Tensor:
mel = librosa.feature.melspectrogram(
y=y,
sr=self.sr,
n_mels=self.n_mels,
n_fft=self.n_fft,
hop_length=self.hop_length,
)
mel = librosa.power_to_db(mel, ref=np.max)
return torch.FloatTensor(mel)
def __getitem__(self, idx):
song_id = self.sample_song_ids[idx]
choices = self.by_song[song_id]
if len(choices) == 1:
a = b = choices[0]
else:
a, b = random.sample(choices, 2)
pair_weights = [
self.pair_type_weights.get(a.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
self.pair_type_weights.get(b.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
]
wavs = []
for sample in (a, b):
y = self._load_clip(sample)
if self.augment:
from src.utils.augment import AugmentPipeline
y = AugmentPipeline(self.sr, aggressive=sample.get("type") in {"confused", "humming_like"})(y)
wavs.append(self._to_mel(y))
max_t = max(w.shape[1] for w in wavs)
wavs = [torch.nn.functional.pad(w, (0, max_t - w.shape[1])) if w.shape[1] < max_t else w for w in wavs]
label = self.song_to_idx[song_id]
return {
"mel": torch.stack(wavs, dim=0),
"song_id": torch.tensor([label, label], dtype=torch.long),
"song_name": song_id,
"hard_weight": torch.tensor(pair_weights, dtype=torch.float32),
}
class SongPairDataset(PairSamplerDataset):
    """Alias of ``PairSamplerDataset`` kept under the ``SongPairDataset`` name.

    Adds no behaviour of its own; everything is inherited.
    """
......
......@@ -3,6 +3,55 @@ import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, List
try:
from transformers import AutoModel
except ImportError:
AutoModel = None
class FrozenMERTFeatureExtractor(nn.Module):
    """Front end that maps mel frames to ``hidden_dim``-channel features.

    Two operating modes, selected at construction time:

    * **Fallback** (no ``model_name``, ``transformers`` missing, or the
      checkpoint failed to load): a two-layer Conv1d/GELU/BatchNorm stack
      whose parameters have ``requires_grad=False``; ``forward`` runs it
      under ``torch.no_grad()``.
    * **Backbone**: a pretrained ``AutoModel`` with all parameters frozen,
      followed by a trainable 1x1 Conv1d/GELU/BatchNorm projection from the
      backbone's ``config.hidden_size`` (or ``hidden_dim`` when the config
      has no such attribute) to ``hidden_dim``.

    Args:
        model_name: Hugging Face checkpoint id, or ``None`` for the fallback.
        n_mels: Channel count of the mel input used by the fallback stack.
        hidden_dim: Output channel count.
    """

    def __init__(self, model_name: Optional[str], n_mels: int, hidden_dim: int):
        super().__init__()
        self.model_name = model_name
        self.hidden_dim = hidden_dim
        self.backbone = None
        self.proj = nn.Sequential(
            nn.Conv1d(n_mels, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
        )
        for parameter in self.proj.parameters():
            parameter.requires_grad = False

        if model_name:
            # Local import: this block cannot see the file's full import list.
            import warnings

            if AutoModel is None:
                warnings.warn(
                    f"transformers is not installed; ignoring backbone {model_name!r} "
                    "and using the frozen convolutional fallback",
                    RuntimeWarning,
                )
            else:
                try:
                    self.backbone = AutoModel.from_pretrained(model_name)
                except Exception as exc:
                    # Still best-effort, but a silent fallback here would
                    # train a very different model without anyone noticing.
                    warnings.warn(
                        f"could not load backbone {model_name!r} ({exc}); "
                        "using the frozen convolutional fallback",
                        RuntimeWarning,
                    )
                    self.backbone = None

        if self.backbone is not None:
            for parameter in self.backbone.parameters():
                parameter.requires_grad = False
            # requires_grad=False does not disable dropout; eval mode does.
            self.backbone.eval()
            backbone_dim = getattr(self.backbone.config, "hidden_size", hidden_dim)
            self.proj = nn.Sequential(
                nn.Conv1d(backbone_dim, hidden_dim, kernel_size=1),
                nn.GELU(),
                nn.BatchNorm1d(hidden_dim),
            )

    def train(self, mode: bool = True):
        """Switch train/eval mode while keeping a loaded backbone in eval mode.

        ``nn.Module.train`` recurses into every child, which would re-enable
        dropout inside the frozen backbone each time the parent model calls
        ``.train()``.
        """
        super().train(mode)
        if self.backbone is not None:
            self.backbone.eval()
        return self

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Return ``hidden_dim``-channel features for ``mel``.

        ``mel`` is fed to Conv1d layers built with ``n_mels`` input channels,
        so it is presumably (batch, n_mels, frames) - TODO confirm with caller.
        """
        if self.backbone is None:
            with torch.no_grad():
                return self.proj(mel)
        # NOTE(review): the frames are passed as ``inputs_embeds``. This only
        # works for backbones that accept that keyword and whose embedding
        # width equals n_mels - confirm for the configured checkpoint.
        waveform_like = mel.transpose(1, 2)
        with torch.no_grad():
            outputs = self.backbone(inputs_embeds=waveform_like)
        hidden = outputs.last_hidden_state.transpose(1, 2)
        return self.proj(hidden)
class SEModule(nn.Module):
def __init__(self, channels, se_channels=128):
......@@ -123,6 +172,89 @@ class AAMSoftmax(nn.Module):
return output
class CoverHunterHead(nn.Module):
    """Transformer encoder + attention pooling head producing unit-norm embeddings.

    The input sequence is encoded by a ``batch_first`` Transformer encoder,
    collapsed along dim 1 with learned softmax attention weights, projected
    to ``embed_dim``, batch-normalised without affine parameters and finally
    L2-normalised.

    Args:
        input_dim: Feature width of each sequence element (``d_model``).
        embed_dim: Size of the returned embedding.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        ff_mult: Feed-forward width as a multiple of ``input_dim``.
    """

    def __init__(self, input_dim: int, embed_dim: int, num_heads: int = 4, num_layers: int = 2, ff_mult: int = 4):
        super().__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=input_dim,
                nhead=num_heads,
                dim_feedforward=input_dim * ff_mult,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=num_layers,
        )
        # One scalar relevance score per sequence element.
        scorer_layers = [
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)
        self.proj = nn.Linear(input_dim, embed_dim)
        self.norm = nn.BatchNorm1d(embed_dim, affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pool ``x`` over dim 1 and return L2-normalised embeddings."""
        frames = self.encoder(x)
        scores = self.attention(frames).squeeze(-1)
        attn = torch.softmax(scores, dim=1).unsqueeze(-1)
        pooled = (frames * attn).sum(dim=1)
        embedded = self.norm(self.proj(pooled))
        return F.normalize(embedded, p=2, dim=1)
class MERTMelodyBranch(nn.Module):
    """Fuses ``FrozenMERTFeatureExtractor`` features with melody/chroma features.

    The melody and chroma inputs are concatenated along dim 1, projected to
    ``hidden_dim`` channels by a 1x1 convolution, concatenated with the
    extractor's output and fused back down to ``hidden_dim`` channels.

    Args:
        n_mels: Mel channel count forwarded to the feature extractor.
        chroma_bins: Channel count of the chroma input.
        melody_bins: Channel count of the melody input.
        hidden_dim: Channel count of both intermediate streams and the output.
        mert_model_name: Optional checkpoint name forwarded to the extractor.
    """

    def __init__(
        self,
        n_mels: int,
        chroma_bins: int = 12,
        melody_bins: int = 1,
        hidden_dim: int = 256,
        mert_model_name: Optional[str] = None,
    ):
        super().__init__()
        self.mert = FrozenMERTFeatureExtractor(
            model_name=mert_model_name,
            n_mels=n_mels,
            hidden_dim=hidden_dim,
        )
        melodic_channels = chroma_bins + melody_bins
        self.melody_proj = nn.Conv1d(melodic_channels, hidden_dim, kernel_size=1)
        fuse_layers = (
            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        )
        self.fuse = nn.Sequential(*fuse_layers)

    def forward(self, mert: torch.Tensor, melody: torch.Tensor, chroma: torch.Tensor) -> torch.Tensor:
        """Return the fused ``hidden_dim``-channel stream.

        ``mert`` is handed to the feature extractor unchanged. All three
        inputs must agree on every dimension except dim 1 for the
        concatenations to succeed.
        """
        melodic_input = torch.cat([melody, chroma], dim=1)
        streams = [self.mert(mert), self.melody_proj(melodic_input)]
        return self.fuse(torch.cat(streams, dim=1))
class ECAPABranch(nn.Module):
    """Mel front end: optional band split followed by a Conv1d/ReLU/BatchNorm stem.

    Args:
        n_mels: Channel count of the mel input.
        channels: Output channel count of the stem.
        use_band_split: When true, a ``BandSplitBlock`` precedes the stem and
            the stem expects ``band_split_channels * 5`` input channels.
        band_split_channels: ``out_channels`` passed to ``BandSplitBlock``.
    """

    def __init__(self, n_mels: int, channels: int, use_band_split: bool, band_split_channels: int):
        super().__init__()
        if use_band_split:
            stem_in = band_split_channels * 5
            self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels)
        else:
            stem_in = n_mels
            self.band_split = None
        self.proj = nn.Sequential(
            nn.Conv1d(stem_in, channels, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.BatchNorm1d(channels),
        )

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Apply the optional band split, then the convolutional stem."""
        if self.band_split is None:
            return self.proj(mel)
        return self.proj(self.band_split(mel))
class DualStreamFusion(nn.Module):
    """Projects two feature streams to a common width and fuses them.

    Each stream passes through its own 1x1 convolution to ``hidden_dim``
    channels; the results are concatenated along dim 1 and reduced back to
    ``hidden_dim`` channels by a 1x1 Conv1d/ReLU/BatchNorm block.

    Args:
        mert_dim: Channel count of the first stream.
        ecapa_dim: Channel count of the second stream.
        hidden_dim: Channel count of the fused output.
    """

    def __init__(self, mert_dim: int, ecapa_dim: int, hidden_dim: int):
        super().__init__()
        self.mert_gate = nn.Conv1d(mert_dim, hidden_dim, kernel_size=1)
        self.ecapa_gate = nn.Conv1d(ecapa_dim, hidden_dim, kernel_size=1)
        fuse_layers = (
            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        )
        self.fuse = nn.Sequential(*fuse_layers)

    def forward(self, mert_stream: torch.Tensor, ecapa_stream: torch.Tensor) -> torch.Tensor:
        """Return the fused stream; inputs must match on all dims except dim 1."""
        projected_mert = self.mert_gate(mert_stream)
        projected_ecapa = self.ecapa_gate(ecapa_stream)
        stacked = torch.cat([projected_mert, projected_ecapa], dim=1)
        return self.fuse(stacked)
class ECAPA_ACR(nn.Module):
def __init__(
self,
......@@ -137,11 +269,38 @@ class ECAPA_ACR(nn.Module):
aam_s: float = 30.0,
use_band_split: bool = True,
band_split_channels: int = 128,
use_dual_stream: bool = True,
coverhunter_heads: int = 4,
coverhunter_layers: int = 2,
fusion_hidden_dim: int = 256,
mert_model_name: Optional[str] = None,
):
super().__init__()
self.embed_dim = embed_dim
front_channels = band_split_channels * 5 if use_band_split else channels
self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
self.use_dual_stream = use_dual_stream
if use_dual_stream:
self.mert_melody_branch = MERTMelodyBranch(
n_mels=n_mels,
chroma_bins=12,
melody_bins=1,
hidden_dim=fusion_hidden_dim,
mert_model_name=mert_model_name,
)
self.ecapa_branch = ECAPABranch(
n_mels=n_mels,
channels=channels,
use_band_split=use_band_split,
band_split_channels=band_split_channels,
)
self.stream_fusion = DualStreamFusion(
mert_dim=fusion_hidden_dim,
ecapa_dim=channels,
hidden_dim=channels,
)
front_channels = channels
else:
front_channels = band_split_channels * 5 if use_band_split else channels
self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
self.conv1 = nn.Sequential(
nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
......@@ -169,24 +328,39 @@ class ECAPA_ACR(nn.Module):
nn.ReLU(),
nn.BatchNorm1d(channels * 3),
)
self.pooling = StatisticsPooling()
self.fc = nn.Linear(channels * 3 * 2, embed_dim)
self.bn = nn.BatchNorm1d(embed_dim, affine=False)
self.coverhunter = CoverHunterHead(
input_dim=channels * 3,
embed_dim=embed_dim,
num_heads=coverhunter_heads,
num_layers=coverhunter_layers,
)
self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None
def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
x = self.band_split(mel) if self.band_split is not None else mel
x = self.conv1(x)
def forward(
self,
mel: torch.Tensor,
labels: Optional[torch.Tensor] = None,
melody: Optional[torch.Tensor] = None,
chroma: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
if self.use_dual_stream:
if melody is None or chroma is None:
raise ValueError("melody and chroma are required when dual-stream fusion is enabled")
mert_stream = self.mert_melody_branch(mel, melody, chroma)
ecapa_stream = self.ecapa_branch(mel)
x = self.stream_fusion(mert_stream, ecapa_stream)
else:
x = self.band_split(mel) if self.band_split is not None else mel
x = self.conv1(x)
if self.use_dual_stream:
x = self.conv1(x)
block_outputs = []
for block in self.blocks:
x = block(x)
block_outputs.append(x)
x = torch.cat(block_outputs, dim=1)
x = self.mfa(x)
x = self.pooling(x)
x = self.fc(x)
x = self.bn(x)
embedding = F.normalize(x, p=2, dim=1)
embedding = self.coverhunter(x.transpose(1, 2))
if labels is not None and self.aam is not None:
logits = self.aam(embedding, labels)
return embedding, logits
......
......@@ -3,30 +3,22 @@ import torch.nn as nn
import torch.nn.functional as F
class SupConLoss(nn.Module):
class InfoNCELoss(nn.Module):
def __init__(self, temperature: float = 0.07):
super().__init__()
self.temperature = temperature
def forward(self, features: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
batch_size = features.shape[0]
labels = labels.contiguous().view(-1, 1)
mask = torch.eq(labels, labels.T).float().to(features.device)
mask = mask - torch.eye(batch_size, device=features.device)
features = F.normalize(features, dim=1)
sim = torch.matmul(features, features.T) / self.temperature
sim_max, _ = torch.max(sim, dim=1, keepdim=True)
sim = sim - sim_max.detach()
exp_sim = torch.exp(sim) * (1 - torch.eye(batch_size, device=features.device))
log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))
pos_mask = mask
pos_count = pos_mask.sum(dim=1)
loss = -(log_prob * pos_mask).sum(dim=1)
loss = loss / pos_count.clamp(min=1)
return loss
logits = torch.matmul(features, features.T) / self.temperature
labels = labels.contiguous().view(-1, 1)
positive_mask = torch.eq(labels, labels.T).float().to(features.device)
positive_mask = positive_mask - torch.eye(features.size(0), device=features.device)
logits = logits - logits.max(dim=1, keepdim=True).values.detach()
exp_logits = torch.exp(logits) * (1 - torch.eye(features.size(0), device=features.device))
log_prob = logits - torch.log(exp_logits.sum(dim=1, keepdim=True) + 1e-12)
positives = positive_mask.sum(dim=1).clamp(min=1)
return -((positive_mask * log_prob).sum(dim=1) / positives)
class CombinedLoss(nn.Module):
......@@ -37,8 +29,7 @@ class CombinedLoss(nn.Module):
aam_weight: float = 0.3,
):
super().__init__()
self.supcon = SupConLoss(temperature)
self.ce = nn.CrossEntropyLoss()
self.infonce = InfoNCELoss(temperature)
self.supcon_weight = supcon_weight
self.aam_weight = aam_weight
......@@ -50,21 +41,20 @@ class CombinedLoss(nn.Module):
supcon_labels: torch.Tensor,
hard_weight: torch.Tensor | None = None,
) -> dict:
loss_supcon = self.supcon(embedding, supcon_labels)
loss_infonce = self.infonce(embedding, supcon_labels)
loss_ce = F.cross_entropy(logits, labels, reduction="none")
if hard_weight is not None:
weight = hard_weight.float()
if weight.dim() == 0:
weight = weight.unsqueeze(0)
loss_supcon = loss_supcon * weight
loss_infonce = loss_infonce * weight
loss_ce = loss_ce * weight
loss_supcon = loss_supcon.mean()
loss_infonce = loss_infonce.mean()
loss_ce = loss_ce.mean()
total = self.supcon_weight * loss_supcon + self.aam_weight * loss_ce
total = self.supcon_weight * loss_infonce + self.aam_weight * loss_ce
return {
"loss": total,
"supcon_loss": loss_supcon.item(),
"supcon_loss": loss_infonce.item(),
"ce_loss": loss_ce.item(),
}
......
import numpy as np
import random
from typing import Optional, Tuple
from pathlib import Path
from typing import Iterable, Optional, Tuple
import librosa
import soundfile as sf
try:
from audiomentations import AddBackgroundNoise, AddGaussianNoise, BandPassFilter, Compose, Mp3Compression, PitchShift, TimeStretch
HAS_AUDIO_AUG = True
except Exception:
AddBackgroundNoise = AddGaussianNoise = BandPassFilter = Compose = Mp3Compression = PitchShift = TimeStretch = None
HAS_AUDIO_AUG = False
class AugmentPipeline:
def __init__(self, sr: int = 16000, aggressive: bool = False):
self.sr = sr
self.noise_snr_range = (5, 30)
self.pitch_shift_range = (-6, 6)
self.time_stretch_range = (0.85, 1.15)
self.mp3_bitrate_range = (32, 128)
self.aggressive = aggressive
def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray:
if snr_db is None:
snr_db = random.uniform(*self.noise_snr_range)
signal_power = np.mean(y ** 2)
noise_power = signal_power / (10 ** (snr_db / 10))
noise = np.random.randn(len(y)) * np.sqrt(noise_power)
return y + noise
class NoiseLibrary:
def __init__(self, roots: Optional[Iterable[str]] = None):
self.paths = []
for root in roots or []:
base = Path(root)
if not base.exists():
continue
for pattern in ("*.wav", "*.mp3", "*.flac", "*.ogg", "*.m4a"):
self.paths.extend(base.rglob(pattern))
def pitch_shift(self, y: np.ndarray, semitones: Optional[float] = None) -> np.ndarray:
if semitones is None:
semitones = random.uniform(*self.pitch_shift_range)
return librosa_shift(y, sr=self.sr, n_steps=semitones)
def directories(self) -> list[str]:
    """Return the sorted, de-duplicated parent directories of ``self.paths``.

    Returns:
        Directory paths as strings; an empty list when no files are indexed.
    """
    parents = set()
    for audio_file in self.paths or ():
        parents.add(str(audio_file.parent))
    return sorted(parents)
def time_stretch(self, y: np.ndarray, rate: Optional[float] = None) -> np.ndarray:
if rate is None:
rate = random.uniform(*self.time_stretch_range)
return librosa_ts(y, sr=self.sr, rate=rate)
def add_reverb(self, y: np.ndarray, decay: float = 0.3) -> np.ndarray:
ir_len = int(0.1 * self.sr)
ir = np.exp(-np.arange(ir_len) * decay / ir_len) * np.random.randn(ir_len)
ir /= np.sqrt(np.sum(ir ** 2))
return np.convolve(y, ir, mode='same')[:len(y)]
class AugmentPipeline:
def __init__(
    self,
    sr: int = 16000,
    aggressive: bool = False,
    noise_roots: Optional[Iterable[str]] = None,
    freq_mask_prob: float = 0.3,
):
    """Configure the pipeline and build its waveform augmenter.

    Args:
        sr: Sample rate stored on the instance.
        aggressive: Selects the stronger probabilities and SNR ranges used
            when the waveform augmenter is built.
        noise_roots: Directories handed to ``NoiseLibrary``.
        freq_mask_prob: Stored as ``self.freq_mask_prob``.
    """
    self.sr, self.aggressive, self.freq_mask_prob = sr, aggressive, freq_mask_prob
    self.noise_library = NoiseLibrary(noise_roots)
    # Built last: the builder reads both `aggressive` and `noise_library`.
    self.wave_augment = self._build_wave_augmenter()
def _build_wave_augmenter(self):
    """Assemble the waveform augmentation chain.

    Returns ``None`` when ``HAS_AUDIO_AUG`` is false. Otherwise returns a
    ``Compose`` of, in order: Gaussian noise, band-pass filter, MP3
    compression, pitch shift and time stretch. When the noise library
    reports at least one directory, a band-pass-filtered background-noise
    transform is appended. ``self.aggressive`` raises every probability and
    lowers the background-noise SNR range.
    """
    if not HAS_AUDIO_AUG:
        return None

    strong = self.aggressive
    gaussian_p = 0.8 if strong else 0.5
    effect_p = 0.55 if strong else 0.35

    chain = [
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=gaussian_p),
        BandPassFilter(
            min_center_freq=300.0,
            max_center_freq=3200.0,
            min_bandwidth_fraction=0.3,
            max_bandwidth_fraction=0.8,
            p=effect_p,
        ),
        Mp3Compression(min_bitrate=24, max_bitrate=96, p=effect_p),
        PitchShift(min_semitones=-5, max_semitones=5, p=effect_p),
        TimeStretch(min_rate=0.8, max_rate=1.2, p=effect_p),
    ]

    background_dirs = self.noise_library.directories()
    if background_dirs:
        # The noise clip itself is band-limited half of the time.
        noise_shaper = Compose([
            BandPassFilter(
                min_center_freq=250.0,
                max_center_freq=4000.0,
                min_bandwidth_fraction=0.2,
                max_bandwidth_fraction=0.9,
                p=0.5,
            )
        ])
        background = AddBackgroundNoise(
            sounds_path=background_dirs,
            min_snr_db=3.0 if strong else 8.0,
            max_snr_db=20.0 if strong else 30.0,
            noise_transform=noise_shaper,
            p=0.6 if strong else 0.35,
        )
        chain.append(background)
    return Compose(chain)
def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 8) -> np.ndarray:
def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 12) -> np.ndarray:
mel = mel.copy()
t = mel.shape[1]
f = mel.shape[0]
......@@ -46,43 +91,21 @@ class AugmentPipeline:
if t_start < t:
mel[:, t_start:t_start + t_mask] = 0
for _ in range(2):
f_mask = random.randint(0, max_freq_mask)
f_mask = random.randint(max(1, max_freq_mask // 3), max_freq_mask)
f_start = random.randint(0, max(0, f - f_mask))
if f_start < f:
mel[f_start:f_start + f_mask, :] = 0
return mel
def apply_to_mel(self, mel: np.ndarray) -> np.ndarray:
if random.random() < 0.3:
if random.random() < self.freq_mask_prob:
mel = self.apply_spec_augment(mel)
return mel
def __call__(self, y: np.ndarray) -> np.ndarray:
noise_p = 0.75 if self.aggressive else 0.5
stretch_p = 0.55 if self.aggressive else 0.3
pitch_p = 0.55 if self.aggressive else 0.3
reverb_p = 0.35 if self.aggressive else 0.2
if random.random() < noise_p:
y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None)
if random.random() < stretch_p:
y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None)
if random.random() < pitch_p:
y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None)
if random.random() < reverb_p:
y = self.add_reverb(y, decay=random.uniform(0.2, 0.6))
return y
def librosa_shift(y, sr=16000, n_steps=0):
return librosa_impl(y, lambda: __import__('librosa').effects.pitch_shift(y, sr=sr, n_steps=n_steps))
def librosa_ts(y, sr=16000, rate=1.0):
return librosa_impl(y, lambda: __import__('librosa').effects.time_stretch(y, rate=rate))
def librosa_impl(y, fn):
try:
return fn()
except Exception:
return y
if self.wave_augment is None:
return y
try:
return self.wave_augment(samples=y.astype(np.float32), sample_rate=self.sr)
except Exception:
return y
......
......@@ -4,6 +4,7 @@
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
import torch
......@@ -21,15 +22,23 @@ from src.models.losses import CombinedLoss
def collate_fn(batch):
mels = []
melodies = []
chromas = []
song_ids = []
song_names = []
hard_weights = []
for b in batch:
mel = b["mel"]
melody = b.get("melody")
chroma = b.get("chroma")
hw = b.get("hard_weight", torch.tensor(1.0))
if mel.dim() == 3:
for i in range(mel.shape[0]):
mels.append(mel[i])
if melody is not None:
melodies.append(melody[i])
if chroma is not None:
chromas.append(chroma[i])
song_ids.append(b["song_id"][i])
song_names.append(b["song_name"])
if torch.is_tensor(hw) and hw.dim() > 0:
......@@ -38,24 +47,45 @@ def collate_fn(batch):
hard_weights.append(hw)
else:
mels.append(mel)
if melody is not None:
melodies.append(melody)
if chroma is not None:
chromas.append(chroma)
song_ids.append(b["song_id"])
song_names.append(b["song_name"])
hard_weights.append(hw)
max_t = max(m.shape[1] for m in mels)
mels_padded = []
for m in mels:
melodies_padded = []
chromas_padded = []
for idx, m in enumerate(mels):
pad = max_t - m.shape[1]
if pad > 0:
m = torch.nn.functional.pad(m, (0, pad))
mels_padded.append(m.unsqueeze(0))
return {
if melodies:
melody = melodies[idx]
if melody.shape[1] < max_t:
melody = torch.nn.functional.pad(melody, (0, max_t - melody.shape[1]))
melodies_padded.append(melody.unsqueeze(0))
if chromas:
chroma = chromas[idx]
if chroma.shape[1] < max_t:
chroma = torch.nn.functional.pad(chroma, (0, max_t - chroma.shape[1]))
chromas_padded.append(chroma.unsqueeze(0))
payload = {
"mel": torch.cat(mels_padded, dim=0),
"song_id": torch.stack(song_ids),
"song_name": song_names,
"hard_weight": torch.stack(hard_weights),
}
if melodies_padded:
payload["melody"] = torch.cat(melodies_padded, dim=0)
if chromas_padded:
payload["chroma"] = torch.cat(chromas_padded, dim=0)
return payload
def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg):
......@@ -64,10 +94,14 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg)
pbar = tqdm(loader, desc=f"Epoch {epoch}")
for batch in pbar:
mel = batch["mel"].to(device)
melody = batch.get("melody")
chroma = batch.get("chroma")
melody = melody.to(device) if melody is not None else None
chroma = chroma.to(device) if chroma is not None else None
labels = batch["song_id"].to(device)
with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"):
embedding, logits = model(mel, labels)
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
optimizer.zero_grad()
......@@ -115,6 +149,28 @@ def save_checkpoint(output_dir, epoch, model, optimizer, best_metric, cfg, name)
print(f" Saved: {path}")
def write_training_artifacts(output_dir: Path, cfg: dict, train_metrics: dict, train_dataset, args):
    """Write ``training_metrics.json`` and ``training_manifest.json``.

    The metrics file holds ``train_metrics`` verbatim. The manifest records a
    UTC timestamp, the config, dataset sizes, the CLI segment strategy and
    noise roots, the expected artifact paths and the same metrics.

    Args:
        output_dir: Directory receiving both files; created if missing.
        cfg: Training configuration; must be JSON-serialisable.
        train_metrics: Final metrics; must be JSON-serialisable.
        train_dataset: Object exposing ``song_ids`` and ``__len__``.
        args: Parsed CLI namespace; ``segment_strategy`` and ``noise_root``
            are read.
    """
    # Local import: the module only imports the ``datetime`` class.
    from datetime import timezone

    output_dir = Path(output_dir)
    metrics_path = output_dir / "training_metrics.json"
    manifest_path = output_dir / "training_manifest.json"

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the on-disk format stays "<naive ISO>Z".
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    manifest = {
        "timestamp": timestamp,
        "config": cfg,
        "output_dir": str(output_dir),
        "train_song_count": len(train_dataset.song_ids),
        "sample_count": len(train_dataset),
        "segment_strategy": args.segment_strategy,
        "noise_roots": args.noise_root,
        "artifacts": {
            "best_model": str(output_dir / "best_model.pt"),
            "song_to_idx": str(output_dir / "song_to_idx.json"),
            "metrics": str(metrics_path),
        },
        "final_metrics": train_metrics,
    }

    output_dir.mkdir(parents=True, exist_ok=True)
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(train_metrics, f, indent=2)
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/default.yaml")
......@@ -125,6 +181,7 @@ def main():
parser.add_argument("--epochs", type=int, default=None)
parser.add_argument("--batch-size", type=int, default=None)
parser.add_argument("--lr", type=float, default=None)
parser.add_argument("--noise-root", action="append", default=[])
parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
parser.add_argument("--silence-top-db", type=int, default=30)
parser.add_argument("--dry-run", action="store_true")
......@@ -159,6 +216,8 @@ def main():
silence_top_db=args.silence_top_db,
sample_type_weights=cfg["training"].get("sample_type_weights"),
pair_type_weights=cfg["training"].get("pair_type_weights"),
hard_negative_k=cfg["training"].get("hard_negative_k", 2),
noise_roots=args.noise_root,
)
catalog_dataset = ACRDataset(
......@@ -174,6 +233,7 @@ def main():
song_to_idx=train_dataset.song_to_idx,
segment_strategy=args.segment_strategy,
silence_top_db=args.silence_top_db,
noise_roots=args.noise_root,
)
train_loader = DataLoader(
......@@ -205,6 +265,11 @@ def main():
aam_s=cfg["model"]["aam_s"],
use_band_split=cfg["model"].get("use_band_split", True),
band_split_channels=cfg["model"].get("band_split_channels", 128),
use_dual_stream=cfg["model"].get("use_dual_stream", True),
coverhunter_heads=cfg["model"].get("coverhunter_heads", 4),
coverhunter_layers=cfg["model"].get("coverhunter_layers", 2),
fusion_hidden_dim=cfg["model"].get("fusion_hidden_dim", 256),
mert_model_name=cfg["model"].get("mert_model_name"),
).to(device)
criterion = CombinedLoss(
......@@ -219,8 +284,12 @@ def main():
print("Dry run: running one batch through forward/backward...")
batch = next(iter(train_loader))
mel = batch["mel"].to(device)
melody = batch.get("melody")
chroma = batch.get("chroma")
melody = melody.to(device) if melody is not None else None
chroma = chroma.to(device) if chroma is not None else None
labels = batch["song_id"].to(device)
embedding, logits = model(mel, labels)
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
loss_dict["loss"].backward()
print(f" Forward/backward OK. Loss: {loss_dict['loss']:.4f}")
......@@ -242,6 +311,7 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True)
print("Starting training...")
train_metrics = None
for epoch in range(start_epoch, cfg["training"]["epochs"] + 1):
train_metrics = train_epoch(model, train_loader, optimizer, criterion, scaler, device, epoch, cfg)
scheduler.step()
......@@ -254,6 +324,7 @@ def main():
with open(output_dir / "song_to_idx.json", "w") as f:
json.dump(train_dataset.song_to_idx, f, indent=2)
write_training_artifacts(output_dir, cfg, train_metrics or {}, train_dataset, args)
print(f"\nTraining complete. Best training loss: {best_loss:.4f}")
print(f"Model saved to: {output_dir / 'best_model.pt'}")
print(f"Catalog references available: {len(catalog_dataset.samples)}")
......
{
"run_name": "coverhunter_finetune_20260608T130103Z",
"created_at": "2026-06-08T13:01:03.023371Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130103Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130103Z",
"created_at": "2026-06-08T13:01:03.023371Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130103Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z",
"returncode": 1,
"completed_at": "2026-06-08T13:01:32.762576Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 249, in main
batch = next(iter(train_loader))
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 718, in __next__
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 778, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
~~~~~~~~~~~~^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 370, in __getitem__
positive_features = [self._load_features(sample) for sample in positive_items]
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 344, in _load_features
features = self.feature_extractor.extract(y)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 138, in extract
melody = librosa.hz_to_midi(melody, bins_per_octave=12)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: hz_to_midi() got an unexpected keyword argument 'bins_per_octave'
{
"run_name": "coverhunter_finetune_20260608T130306Z",
"created_at": "2026-06-08T13:03:06.790814Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130306Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130306Z",
"created_at": "2026-06-08T13:03:06.790814Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130306Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z",
"returncode": 1,
"completed_at": "2026-06-08T13:04:34.035140Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 256, in main
model = ECAPA_ACR(
^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 280, in __init__
self.mert_melody_branch = MERTMelodyBranch(
^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 211, in __init__
self.mert = FrozenMERTFeatureExtractor(model_name=mert_model_name, n_mels=n_mels, hidden_dim=hidden_dim)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 21, in __init__
self.backbone = AutoModel.from_pretrained(model_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 289, in from_pretrained
resolved_config_file = cached_file(
^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 293, in cached_file
file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 527, in cached_files
raise e
File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 437, in cached_files
hf_hub_download(
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1019, in hf_hub_download
return _hf_hub_download_to_cache_dir(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1152, in _hf_hub_download_to_cache_dir
_get_metadata_or_catch_error(
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1694, in _get_metadata_or_catch_error
metadata = get_hf_file_metadata(
^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1616, in get_hf_file_metadata
response = _httpx_follow_relative_redirects_with_backoff(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 685, in _httpx_follow_relative_redirects_with_backoff
response = http_backoff(
^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 559, in http_backoff
return next(
^^^^^
File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 467, in _http_backoff_base
response = client.request(method=method, url=url, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 825, in request
return self.send(request, auth=auth, follow_redirects=follow_redirects)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 901, in send
raise RuntimeError("Cannot send a request, as the client has been closed.")
RuntimeError: Cannot send a request, as the client has been closed.
Device: cpu
Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
Classes: 16
Train songs: 64
{
"run_name": "coverhunter_finetune_20260608T130514Z",
"created_at": "2026-06-08T13:05:14.591209Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130514Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z"
}
\ No newline at end of file
{
"run_name": "coverhunter_finetune_20260608T130514Z",
"created_at": "2026-06-08T13:05:14.591209Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130514Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z",
"returncode": 1,
"completed_at": "2026-06-08T13:06:50.272162Z",
"artifacts": [
"run_request.json",
"stderr.log",
"stdout.log"
]
}
\ No newline at end of file
/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
Traceback (most recent call last):
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
main()
File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 292, in main
embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 351, in forward
mert_stream = self.mert_melody_branch(mel, melody, chroma)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 224, in forward
semantic = self.mert(mert)
^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 49, in forward
return self.proj(mel)
^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not callable
Device: cpu
Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
Classes: 16
Train songs: 64
Dry run: running one batch through forward/backward...
{
"run_name": "coverhunter_finetune_20260608T130731Z",
"created_at": "2026-06-08T13:07:31.311447Z",
"python": "/usr/local/miniconda3/bin/python",
"command": [
"/usr/local/miniconda3/bin/python",
"train.py",
"--config",
"configs/coverhunter_finetune_4gb.yaml",
"--data",
"data/synthetic_v2",
"--output",
"data/training_runs/coverhunter_finetune_20260608T130731Z",
"--device",
"cpu",
"--segment-strategy",
"hybrid",
"--dry-run"
],
"config": "configs/coverhunter_finetune_4gb.yaml",
"data": "data/synthetic_v2",
"noise_roots": [],
"run_dir": "data/training_runs/coverhunter_finetune_20260608T130731Z"
}
\ No newline at end of file
# CoverHunter 环境安装与验证
## 1. 目标解释器
本专题统一使用:
```bash
/usr/local/miniconda3/bin/python
```
## 2. 自动化脚本
已新增环境安装与验证脚本:
```text
acr-engine/scripts/setup_coverhunter_env.py
```
执行方式:
```bash
/usr/local/miniconda3/bin/python acr-engine/scripts/setup_coverhunter_env.py
```
它会自动:
1. 安装 `requirements.txt`
2. 补充训练依赖:
- `torch`
- `torchaudio`
- `transformers`
- `huggingface_hub`
- `librosa`
- `soundfile`
- `audiomentations`
3. 进行环境验证
4. 生成报告:
```text
acr-engine/reports/coverhunter_env_setup_report.json
```
## 3. 当前自动化执行结果
本次已经自动执行完成。
报告文件:
```text
acr-engine/reports/coverhunter_env_setup_report.json
```
当前结论:
- Python 包安装:**成功**
- `torch` / `transformers` / `librosa` / `soundfile` / `audiomentations`:**已安装**
- 但 `torch.cuda.is_available()` 当前返回:**False**
## 4. 当前 GPU 阻塞点
虽然系统存在 NVIDIA GPU,且 `nvidia-smi` 可见设备,但当前 PyTorch CUDA 初始化失败。
报告中的核心告警是:
- **The NVIDIA driver on your system is too old**
这说明:
- 当前安装到环境里的 `torch 2.12.0+cu130`
- 与当前系统驱动版本不兼容
也就是说:
- **环境依赖已经安装好了**
- **但当前 GPU 训练还不能真正启用**
- 原因不是代码问题,而是 **PyTorch CUDA 版本与驱动版本不匹配**
## 5. 当前状态怎么理解
现在的环境状态可以分成两部分:
### 已经完成的
- 训练依赖已安装
- 训练脚本可执行
- MERT / ECAPA 双流代码可 import
- 文档和配置已准备好
### 仍未完成的
- CUDA 版 torch 与当前 NVIDIA driver 的匹配
## 6. 下一步建议
要让 GPU 真正可用,需要二选一:
### 方案 A:升级 NVIDIA 驱动
优点:
- 可以保留当前较新的 torch/cu130 组合
- 后续兼容性更好
### 方案 B:安装与当前驱动兼容的更低 CUDA 版本 torch
优点:
- 不改系统驱动
- 更适合当前机器直接落地
对当前项目而言,我更建议:
- **优先采用方案 B**
- 安装与当前驱动兼容的 torch 版本
## 7. 当前专题与环境文档关系
配套文件如下:
- 训练专题:`docs/coverhunter_finetune_topic.md`
- 训练流程:`docs/coverhunter_training_process.md`
- 环境文档:`docs/coverhunter_env_setup.md`
- 环境报告:`acr-engine/reports/coverhunter_env_setup_report.json`
## 8. 当前结论
当前已经自动完成:
- 环境依赖安装
- 环境验证
- 结果记录
目前唯一阻塞 GPU 训练的点是:
- **CUDA / 驱动 / torch 版本不匹配**
# CoverHunter 双流微调专题方案
## 1. 专题目标
本专题目标是围绕当前仓库,建立一套可持续扩展的 **CoverHunter 双流微调方案**,用于音乐翻唱识别、哼唱检索、录音片段检索和抗噪 ACR 检索。
专题的核心方向不是一次性跑通训练,而是建立一条可反复扩展的训练专题链路:
1. 明确现有音源与数据资产
2. 定义双流训练架构
3. 设计分阶段训练计划
4. 形成标准训练流程
5. 规范训练产物与权重使用方式
6. 为后续补充更多 music 语料预留稳定入口
---
## 2. 当前已有音源与数据资产
### 2.1 当前仓库内可直接使用的数据
当前可直接用于训练与冒烟验证的数据位于:
```text
acr-engine/data/synthetic_v2/
```
其中包含:
- `train.json`
- `test.json`
- `segments/*.wav`
### 2.2 当前训练集统计
基于 `acr-engine/data/synthetic_v2/train.json` 的统计结果:
- 样本总数:**96**
- `song_id` 数量:**16**
- 类型分布:
- `reference`: **16**
- `clean`: **32**
- `augmented`: **16**
- `humming_like`: **16**
- `confused`: **16**
### 2.3 当前音源的含义
按现有数据结构,可以理解为每首歌目前至少对应以下几类样本:
1. **reference**
- 作为标准原曲/参考版本
- 用于建立稳定的正样本锚点
2. **clean**
- 较干净的切片
- 代表相对理想的检索输入
3. **augmented**
- 已经经过部分增强的样本
- 用于初步提升泛化能力
4. **humming_like**
- 偏哼唱/偏旋律化表达的近似样本
- 用于强化“忽略音色、聚焦旋律”能力
5. **confused**
- 易混淆样本
- 用于构建难负样本与边界学习能力
### 2.4 当前音源的局限性
当前 `synthetic_v2` 更适合做:
- 训练链路验证
- 双流结构验证
- 小规模参数/损失/显存调优
- 产物定义与使用方式验证
当前它还不适合直接视为最终生产训练集,原因包括:
- 歌曲数较少(16 首)
- 类型覆盖有限
- 录音噪声场景仍偏少
- 真实翻唱的多样性不足
- 真实设备采集差异不足
所以本专题应采用 **分阶段训练策略**。
---
## 3. 当前双流训练架构
### 3.1 架构定义
当前已按双流结构实现:
### 流 A:MERT + Melody 分支
位置:
- `acr-engine/src/models/ecapa_tdnn.py`
职责:
- 建模高层语义与旋律表达
- 提高跨音色、跨设备、跨唱法的语义对齐能力
当前组成:
- `FrozenMERTFeatureExtractor`
- `melody/chroma` 特征投影与融合
默认模型配置:
```yaml
model:
mert_model_name: m-a-p/MERT-v1-95M
```
### 流 B:ECAPA 分支
职责:
- 强化局部声学结构与 timbre/韵律相关判别信息
- 作为与 MERT 分支互补的检索支路
### 双流融合
- `DualStreamFusion`
作用:
- 将语义旋律流与 ECAPA 流融合到统一时序空间
### 检索头
- `CoverHunterHead`
作用:
- 将融合后的时序特征进一步编码
- 输出最终 embedding 用于对比训练和检索
### 训练目标
- `InfoNCE`
- `AAMSoftmax`
---
## 4. 训练专题的总体思路
本专题不建议“一步到位”直接上大规模真实全量训练,而建议按三个阶段推进。
### 阶段 A:链路验证阶段
目标:
- 验证模型结构、数据流、增强链路、权重产物、运行日志是否闭环
训练数据:
- `acr-engine/data/synthetic_v2`
产出:
- 跑通训练
- 确认显存
- 确认增强是否有效
- 确认权重可以导出并复用
### 阶段 B:专题微调阶段
目标:
- 在当前专题下引入更多音乐语料
- 逐步扩充:原曲、翻唱、录音、哼唱、噪声注入样本
- 建立更稳定的双流 CoverHunter embedding
训练数据规划:
- 原曲标准音源
- 真实或半真实翻唱音源
- 设备录音音源
- 环境噪声音源
- 难负样本音源
### 阶段 C:检索权重沉淀阶段
目标:
- 固化最优权重
- 建立 reference embedding 索引流程
- 形成线上/离线检索用权重标准
---
## 5. 训练数据计划
后续你提到会补充更多 music 语料,因此建议数据建设按下面结构统一。
### 5.1 推荐数据结构
建议每首歌围绕 `song_id` 组织为:
- `reference`
- `clean`
- `cover`
- `recording`
- `environment`
- `humming_like`
- `confused`
### 5.2 推荐含义
#### reference
- 标准原曲版本
- 用于构建基准 embedding 和 reference index
#### clean
- 质量较好的切片/相对干净音频
- 用于稳定正样本训练
#### cover
- 真实翻唱版本
- 用于训练旋律一致、音色不同的对齐能力
#### recording
- 手机/麦克风录制版本
- 用于训练设备失真和场景采集鲁棒性
#### environment
- 注入环境噪声或真实环境录音
- 用于训练抗噪能力
#### humming_like
- 哼唱、跟唱、弱伴奏旋律版本
- 用于训练旋律驱动检索能力
#### confused
- 容易相似但不属于同一首歌的样本
- 用于强化难负样本学习
### 5.3 当前专题的样本补充建议
优先补充顺序建议为:
1. **更多 reference / clean 原曲**
2. **更多 recording / environment 样本**
3. **更多真实 cover 样本**
4. **更多 confused 难负样本**
5. **更多 humming_like 样本**
原因:
- 当前抗噪与设备泛化是近期最容易拉开效果差异的方向
- cover / humming 的价值很高,但数据准备成本更高
---
## 6. 数据增强计划
当前代码已实现两大类增强,用于“伪造录音”和“伪造翻唱”。
位置:
- `acr-engine/src/utils/augment.py`
### 6.1 伪造录音增强
包括:
- `AddGaussianNoise`
- `AddBackgroundNoise`
- `BandPassFilter`
- `Mp3Compression`
作用:
- 模拟餐厅底噪、街道底噪
- 模拟廉价设备频响缺失
- 模拟压缩带来的失真
- 提高抗噪与抗设备变化能力
### 6.2 伪造翻唱增强
包括:
- `PitchShift`
- `TimeStretch`
- `Frequency Masking`
作用:
- 模拟升降调翻唱
- 模拟节奏变化
- 逼迫模型降低音色依赖,关注旋律主线
### 6.3 当前专题下的增强原则
- `reference` 不建议过强增强
- `clean` 可做轻增强
- `recording / environment` 可做强增强
- `humming_like / confused` 应提高采样权重
---
## 7. 训练流程
### 7.1 环境准备
解释器:
```bash
/usr/local/miniconda3/bin/python
```
安装依赖:
```bash
/usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
```
当前依赖至少需要:
- `torch`
- `torchaudio`
- `transformers`
- `huggingface_hub`
- `librosa`
- `soundfile`
- `audiomentations`
### 7.2 4GB GPU 专用配置
当前 GPU 为:
- `Quadro P1000`
- 4GB 显存
因此我已经新增专用配置:
- `acr-engine/configs/coverhunter_finetune_4gb.yaml`
特点:
- 更小 `batch_size`
- 更短片段
- 更小通道数
- 更浅层数
- 更适合当前显存资源
### 7.3 首次验证流程
先 dry-run:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--config configs/coverhunter_finetune_4gb.yaml \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--dry-run
```
### 7.4 小规模试训
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python train.py \
--config configs/coverhunter_finetune_4gb.yaml \
--data data/synthetic_v2 \
--output data/training_runs/coverhunter_4gb_trial \
--device cuda \
--segment-strategy hybrid \
--batch-size 2 \
--epochs 2
```
### 7.5 专题正式训练
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--config configs/coverhunter_finetune_4gb.yaml \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--noise-root data/noise/restaurant \
--noise-root data/noise/street
```
### 7.6 后续扩容训练
当你补充新的 music 语料后,建议:
1. 先保持 `song_id + type + audio_path + duration` 元数据结构一致
2. 新语料先做小批量接入
3. 先跑 2 epoch 验证
4. 再逐步扩大训练轮次
---
## 8. 训练过程会产生什么产物
每次训练会生成目录:
```text
acr-engine/data/training_runs/<run_name>/
```
标准产物包括:
- `best_model.pt`
- `checkpoint_epoch_*.pt`
- `song_to_idx.json`
- `training_metrics.json`
- `training_manifest.json`
- `run_request.json`
- `run_summary.json`
- `stdout.log`
- `stderr.log`
### 8.1 各产物的用途
#### best_model.pt
- 当前训练过程中最优权重
- 后续检索、建库、推理优先使用它
#### checkpoint_epoch_*.pt
- 周期性保存点
- 用于中断恢复、回溯比较
#### song_to_idx.json
- 训练类别到 `song_id` 的映射
- 用于解释训练分类头与标签对应关系
#### training_metrics.json
- 记录最后一次训练指标
- 用于专题对比不同配置
#### training_manifest.json
- 记录本次训练的配置、输入、产物路径
- 适合作为专题可追溯记录
#### run_request.json / run_summary.json
- 记录本次运行命令、解释器、配置与运行结果
- 便于回放与专题管理
---
## 9. 预期权重怎么使用
这是专题里非常关键的一部分。
### 9.1 训练权重的核心用途
训练出来的 `best_model.pt` 不是只为了看 loss,而是为了后续两类使用:
1. **离线建库**
2. **在线查询 embedding 提取**
### 9.2 离线建库
目标:
- 使用参考音源(reference)切片提取 embedding
- 建立 reference 向量索引
预期流程:
1. 读取 `reference` 音源
2. 切片
3. 用双流模型提 embedding
4. 存成 embedding matrix
5. 后续接 Faiss / pgvector / Milvus
### 9.3 在线查询
目标:
- 输入录音、翻唱、哼唱片段
- 提取 embedding
- 与 reference index 做相似度检索
预期方式:
1. 加载 `best_model.pt`
2. 对查询音频切片
3. 提取 embedding
4. 与 reference embedding 做 ANN 检索
5. 结合 vote / rerank 输出最终结果
### 9.4 推荐使用策略
#### 最佳权重
生产或专题评估优先使用:
- `best_model.pt`
#### 恢复训练
继续训练优先使用:
- `checkpoint_epoch_*.pt`
#### 对比实验
建议每个专题 run 保留完整目录,不覆盖历史 run。
---
## 10. 预计怎么推进专题训练
### 第 1 步:先跑通当前 synthetic_v2
目标:
- 验证链路
- 验证显存
- 验证双流结构
- 验证 MERT 接口
### 第 2 步:补录音噪声语料
优先补充:
- 餐厅
- 街道
- 室内人声背景
- 手机录制样本
目标:
- 提升抗噪与设备鲁棒性
### 第 3 步:补真实翻唱/旋律相近样本
目标:
- 强化旋律对齐
- 降低音色依赖
### 第 4 步:补难负样本
目标:
- 降低误识别
- 提高边界判别能力
### 第 5 步:固化最优专题权重
目标:
- 形成一个可用于离线建库与线上检索的标准权重版本
---
## 11. 当前专题的资源结论
### 可以做的事
- 继续完善训练链路
- 用 `synthetic_v2` 做小规模训练
- 做双流模型结构验证
- 做 4GB GPU 轻量试训
- 规范化训练产物与权重使用方式
### 当前暂时受限的事
- 由于环境缺依赖,**还不能直接启动真实训练**
- 由于 GPU 只有 4GB,**真实 MERT + ECAPA 双流正式训练需要保守配置**
- 当前真实音乐语料仍不足,**暂时更适合专题验证,不适合最终权重定版**
---
## 12. 本专题当前落地文件
### 配置
- `acr-engine/configs/coverhunter_finetune.yaml`
- `acr-engine/configs/coverhunter_finetune_4gb.yaml`
- `acr-engine/configs/default.yaml`
### 模型与训练
- `acr-engine/src/models/ecapa_tdnn.py`
- `acr-engine/src/models/losses.py`
- `acr-engine/src/data/dataset.py`
- `acr-engine/src/utils/augment.py`
- `acr-engine/train.py`
- `acr-engine/scripts/run_coverhunter_finetune.py`
### 文档
- `docs/coverhunter_training_process.md`
- `docs/coverhunter_finetune_topic.md`
---
## 13. 当前专题结论
当前已经具备:
- 双流 CoverHunter 微调架构
- 4GB GPU 专用轻量配置
- 训练流程脚本
- 训练产物记录机制
- 专题级训练文档
当前下一步最实际的动作是:
1. 在 `/usr/local/miniconda3/bin/python` 下补齐依赖
2. 用 `coverhunter_finetune_4gb.yaml` 跑 dry-run
3. 用 `synthetic_v2` 做 2 epoch 小规模试训
4. 再逐步接入更多 music 语料
# CoverHunter 双流微调标准流程
## 1. 当前架构
当前训练架构已经调整为双流:
- **流 A:MERT + Melody 分支**
- 代码位置:`acr-engine/src/models/ecapa_tdnn.py`
- 逻辑:冻结的 `FrozenMERTFeatureExtractor` + `melody/chroma` 融合
- 默认模型:`m-a-p/MERT-v1-95M`
- 说明:当前代码已经支持真实 HuggingFace MERT 权重接入;若环境里缺少 `transformers` 或首次拉取失败,则无法启用真实 MERT
- **流 B:ECAPA 分支**
- 逻辑:保留 ECAPA 特征建模路径
- **双流融合**
- `DualStreamFusion`
- **检索头**
- `CoverHunterHead`
- **训练目标**
- `InfoNCE + AAMSoftmax`
## 2. 当前资源检查结论
### Python 解释器
训练入口已固定支持:
```bash
/usr/local/miniconda3/bin/python
```
`acr-engine/scripts/run_coverhunter_finetune.py` 已支持 `--python` 参数,默认就是这个解释器。
### GPU
当前检测到 GPU:
- **Quadro P1000**
- 总显存:**4096 MiB**
- 空闲显存:约 **3817 MiB**
结论:
- **可以跑训练**
- 但显存较小,建议:
- `batch_size=2~4`
- `segment_dur=5.0` 起步
- 优先做 dry-run、小批量试跑、再正式训练
- 启用真实 MERT 后不要直接上大 batch
### 数据
当前仓库中可直接用于冒烟训练的数据:
- `acr-engine/data/synthetic_v2/train.json`
- 音频切片位于 `acr-engine/data/synthetic_v2/segments/`
这些数据已经包含:
- 普通切片
- augmented
- humming_like
- confused
适合先做流程验证。
### 当前环境缺口
`/usr/local/miniconda3/bin/python` 下当前缺少这些核心包:
- `torch`
- `transformers`
- `huggingface_hub`
- `torchaudio`
- `librosa`
- `soundfile`
- `audiomentations`
所以:
- **GPU 与解释器可用**
- **但当前训练环境还不能直接跑**
- 需要先补齐依赖
## 3. 标准处理流程
### Step 1:准备 Python 环境
进入项目后,先确保用的是目标解释器:
```bash
/usr/local/miniconda3/bin/python --version
```
安装依赖:
```bash
/usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
```
如需单独补装:
```bash
/usr/local/miniconda3/bin/python -m pip install torch torchaudio transformers huggingface_hub librosa soundfile audiomentations
```
### Step 2:准备 MERT 权重缓存
首次启用真实 MERT 时,会从 HuggingFace 拉取:
- `m-a-p/MERT-v1-95M`
建议先确认网络可访问 HuggingFace,或提前缓存模型。
如果需要更换默认的 MERT 模型,可以在 `configs/default.yaml` 或 `configs/coverhunter_finetune.yaml` 中调整:
```yaml
model:
mert_model_name: m-a-p/MERT-v1-95M
```
### Step 3:准备噪声数据
为了支持伪造录音增强,建议准备目录,例如:
```text
acr-engine/data/noise/restaurant/
acr-engine/data/noise/street/
```
里面放公开可用环境音频:
- 餐厅底噪
- 街道底噪
- 室内人声背景
训练时通过:
```bash
--noise-root acr-engine/data/noise/restaurant \
--noise-root acr-engine/data/noise/street
```
传入。
### Step 4:先做 dry-run
先验证数据、模型、GPU、增强链路是否都通:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--dry-run
```
### Step 5:小规模试训
建议先缩小 batch/config,确认显存稳定:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python train.py \
--config configs/coverhunter_finetune.yaml \
--data data/synthetic_v2 \
--output data/training_runs/coverhunter_trial \
--device cuda \
--segment-strategy hybrid \
--batch-size 2 \
--epochs 2 \
--noise-root data/noise/restaurant \
--noise-root data/noise/street
```
如果显存稳定,再逐步提高到:
- `batch_size=4`
- 必要时再尝试 `batch_size=6`
### Step 6:正式专题训练
标准命令:
```bash
cd /mnt/e/hikoon-ACR/acr-engine && \
/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
--python /usr/local/miniconda3/bin/python \
--data data/synthetic_v2 \
--device cuda \
--segment-strategy hybrid \
--noise-root data/noise/restaurant \
--noise-root data/noise/street
```
### Step 7:检查训练产物
每次训练会记录到:
```text
acr-engine/data/training_runs/<run_name>/
```
标准产物包括:
- `best_model.pt`
- `checkpoint_epoch_*.pt`
- `song_to_idx.json`
- `training_metrics.json`
- `training_manifest.json`
- `run_request.json`
- `run_summary.json`
- `stdout.log`
- `stderr.log`
## 4. 增强策略说明
当前代码已经覆盖两类伪造策略:
### 伪造录音
位置:`acr-engine/src/utils/augment.py`
- `AddGaussianNoise`
- `AddBackgroundNoise`
- `BandPassFilter`
- `Mp3Compression`
### 伪造翻唱
位置:`acr-engine/src/utils/augment.py`
- `PitchShift`
- `TimeStretch`
- `Frequency Masking`(作用于 mel)
## 5. 资源适配建议
由于当前 GPU 是 Quadro P1000 4GB,建议按以下梯度推进:
### 推荐起步配置
- `segment_dur=5.0`
- `batch_size=2`
- `mixed_precision=true`
- `num_workers=0`
### 稳定后可尝试
- `batch_size=4`
- 如 OOM 则回退
### 当前不建议
- 直接上 8 秒片段 + batch 16
- 真实 MERT + 大 batch 同时启用
## 6. 当前结论
当前状态可以概括为:
- **架构方向已经调整正确**:双流
- **真实 MERT 接口已接入**:是
- **GPU 可以用于训练**:是
- **当前 Python 解释器可用**:是,`/usr/local/miniconda3/bin/python`
- **当前环境能否立刻开训**:**还不能**,因为依赖未装全
- **现有数据能否支撑一波流程训练**:**可以**,先从 `synthetic_v2` 开始
# 音乐翻唱检测与音频片段检索系统 (CSI) 核心能力结构清单
## 1. 核心架构逻辑
* **底座 (Backbone)**:MERT (冻结预训练权重) - 负责音频语义理解。
* **头部 (Head)**:CoverHunter (可训练 Conformer+Attention) - 负责旋律与结构的对比学习。
* **对齐方式**:双流融合 (MERT 语义特征 + Melody/Chroma 旋律特征)。
## 2. 数据与特征工程 (Data Pipeline)
* **数据集结构**:以 `Song_ID` 为唯一键,物理隔离原曲、压缩版、录音与环境音。
* **动态增强 (Data Augmentation)**:
* 物理扰动:音高平移 (Pitch Shifting)、变速 (Time Stretching)。
* 环境注入:背景噪声混入 (Environment Injection)。
* 频率掩码:频段擦除 (Frequency Masking) - 逼迫模型脱离音色依赖,转向旋律核心。
* **数据对齐**:使用插值 (Interpolation) 将 MERT 序列长度与 Melody 序列长度对齐至一致的 `Time_Steps`。
## 3. 训练与优化策略 (Training Strategy)
* **样本采样 (Sampler)**:PairSampler - 确保 Batch 中包含强配对的“原曲-翻唱”与精心挑选的“原曲-难负样本”。
* **难负样本挖掘 (Hard Negative Mining)**:
* 使用冻结 MERT + Faiss 构建初始索引。
* 挖掘曲风相似但旋律不同的“假孪生兄弟”歌曲作为 Negative 样本。
* **损失函数 (Loss Function)**:InfoNCE Contrastive Loss - 拉近正样本余弦距离,推远负样本余弦距离。
## 4. 推理与检索引擎 (Inference & Retrieval)
* **离线建库**:全量原曲切片 -> 特征提取 -> 存入向量数据库 (Faiss/Milvus)。
* **在线查询**:录音片段 -> 滑动窗口切片 -> 提取 Embedding -> 近似最近邻检索 (ANN)。
* **鲁棒性机制**:切片投票机制 (Slice Voting) - 对查询录音切片所得的 Top-K 结果进行统计,按票数加权归一化排序。
## 5. 工程化关键节点 (Engineering Checklist)
* **计算优化**:离线特征缓存 (预先存储 .npy 减少 GPU 实时计算压力)。
* **部署优化**:ONNX/TensorRT 模型编译 + 动态批处理 (Dynamic Batching)。
* **数据飞轮**:在线难例挖掘 (基于用户反馈的 False Positives 循环重训)。