Commit 7da76864 7da76864361f72a1428d2b36aeea2f283d8945e6 by 章晓祥

-

1 parent 3ff5efd2
Showing 34 changed files with 2532 additions and 278 deletions
1 {
2 "env": {
3 "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
4 "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
5 "ANTHROPIC_MODEL": "gpt-5.4",
6 "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
7 "ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
8 "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
9 "CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
10 "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
11 "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
12 "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
13 "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": "20"
14 },
15 "permissions": {
16 "allow": [],
17 "deny": []
18 },
19 "model": "sonnet",
20 "enabledPlugins": {
21 "claude-code-setup@claude-plugins-official": true,
22 "typescript-lsp@claude-plugins-official": true,
23 "rust-analyzer-lsp@claude-plugins-official": true,
24 "pr-review-toolkit@claude-plugins-official": true,
25 "ralph-loop@claude-plugins-official": true,
26 "superpowers@claude-plugins-official": true
27 },
28 "alwaysThinkingEnabled": false,
29 "skipDangerousModePermissionPrompt": true,
30 "theme": "dark-ansi",
31 "modelType": "anthropic"
32 }
1 {
2 "env": {
3 "ANTHROPIC_AUTH_TOKEN": "sk-GlEnjnf09lXwiJuwDS5Q0nOzGd1ck8YBDERVXv84t9hvtS0U",
4 "ANTHROPIC_BASE_URL": "https://aiapis.help",
5 "ANTHROPIC_MODEL": "gpt-5.4",
6 "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
7 "ANTHROPIC_DEFAULT_SONNET_MODEL": "gpt-5.4",
8 "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
9 "CLAUDE_CODE_SUBAGENT_MODEL": "gpt-5.4",
10 "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
11 "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
12 "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
13 "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": "20"
14 },
15 "permissions": {
16 "allow": [],
17 "deny": []
18 },
19 "model": "sonnet",
20 "enabledPlugins": {
21 "claude-code-setup@claude-plugins-official": true,
22 "typescript-lsp@claude-plugins-official": true,
23 "rust-analyzer-lsp@claude-plugins-official": true,
24 "pr-review-toolkit@claude-plugins-official": true,
25 "ralph-loop@claude-plugins-official": true,
26 "superpowers@claude-plugins-official": true
27 },
28 "alwaysThinkingEnabled": false,
29 "skipDangerousModePermissionPrompt": true,
30 "theme": "dark-ansi",
31 "modelType": "anthropic"
32 }
1 {
2 "env": {
3 "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
4 "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
5 "ANTHROPIC_MODEL": "claude-opus-4.6",
6 "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4.6",
7 "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4.6",
8 "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4.5",
9 "CLAUDE_CODE_SUBAGENT_MODEL": "claude-sonnet-4.6",
10 "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
11 "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
12 "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
13 "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": "20"
14 },
15 "permissions": {
16 "allow": [],
17 "deny": []
18 },
19 "model": "sonnet",
20 "enabledPlugins": {
21 "claude-code-setup@claude-plugins-official": true,
22 "typescript-lsp@claude-plugins-official": true,
23 "rust-analyzer-lsp@claude-plugins-official": true,
24 "pr-review-toolkit@claude-plugins-official": true,
25 "ralph-loop@claude-plugins-official": true,
26 "superpowers@claude-plugins-official": true
27 },
28 "alwaysThinkingEnabled": false,
29 "skipDangerousModePermissionPrompt": true,
30 "theme": "dark-ansi",
31 "modelType": "anthropic"
32 }
1 {
2 "env": {
3 "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
4 "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
5 "ANTHROPIC_MODEL": "gpt-5.4",
6 "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
7 "ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
8 "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
9 "CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
10 "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
11 "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
12 "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
13 "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": "20"
14 },
15 "permissions": {
16 "allow": [],
17 "deny": []
18 },
19 "model": "sonnet",
20 "enabledPlugins": {
21 "claude-code-setup@claude-plugins-official": true,
22 "typescript-lsp@claude-plugins-official": true,
23 "rust-analyzer-lsp@claude-plugins-official": true,
24 "pr-review-toolkit@claude-plugins-official": true,
25 "ralph-loop@claude-plugins-official": true,
26 "superpowers@claude-plugins-official": true
27 },
28 "alwaysThinkingEnabled": false,
29 "skipDangerousModePermissionPrompt": true,
30 "theme": "dark-ansi",
31 "modelType": "anthropic"
32 }
1 {
2 "env": {
3 "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
4 "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
5 "ANTHROPIC_MODEL": "qwen3.7-max",
6 "ANTHROPIC_DEFAULT_OPUS_MODEL": "qwen3.7-max",
7 "ANTHROPIC_DEFAULT_SONNET_MODEL": "qwen3.6-plus",
8 "ANTHROPIC_DEFAULT_HAIKU_MODEL": "qwen3.6-plus",
9 "CLAUDE_CODE_SUBAGENT_MODEL": "qwen3.6-plus",
10 "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
11 "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1",
12 "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
13 "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": "20"
14 },
15 "permissions": {
16 "allow": [],
17 "deny": []
18 },
19 "model": "sonnet",
20 "enabledPlugins": {
21 "claude-code-setup@claude-plugins-official": true,
22 "typescript-lsp@claude-plugins-official": true,
23 "rust-analyzer-lsp@claude-plugins-official": true,
24 "pr-review-toolkit@claude-plugins-official": true,
25 "ralph-loop@claude-plugins-official": true,
26 "superpowers@claude-plugins-official": true
27 },
28 "alwaysThinkingEnabled": false,
29 "skipDangerousModePermissionPrompt": true,
30 "theme": "dark-ansi",
31 "modelType": "anthropic"
32 }
1 model:
2 name: coverhunter_finetune
3 embed_dim: 256
4 channels: 512
5 se_channels: 128
6 res2net_scale: 8
7 num_blocks: 3
8 n_mels: 128
9 aam_m: 0.2
10 aam_s: 30.0
11 use_band_split: false
12 band_split_channels: 128
13 use_dual_stream: true
14 mert_melody_branch: true
15 ecapa_branch: true
16 coverhunter_heads: 8
17 coverhunter_layers: 4
18 fusion_hidden_dim: 256
19 mert_model_name: m-a-p/MERT-v1-95M
20
21 data:
22 sample_rate: 16000
23 n_fft: 512
24 hop_length: 160
25 segment_dur: 8.0
26 crop_per_song: 6
27
28 training:
29 batch_size: 16
30 epochs: 30
31 lr: 0.0002
32 weight_decay: 0.0001
33 warmup_epochs: 3
34 temperature: 0.05
35 supcon_weight: 1.0
36 aam_weight: 0.2
37 mixed_precision: true
38 gradient_clip: 1.0
39 save_every: 5
40 log_every: 10
41 hard_negative_k: 4
42 sample_type_weights:
43 default: 1
44 compressed: 2
45 recording: 3
46 environment: 4
47 pair_type_weights:
48 default: 1.0
49 compressed: 1.5
50 recording: 2.0
51 environment: 3.0
1 model:
2 name: coverhunter_finetune_lowmem
3 embed_dim: 192
4 channels: 256
5 se_channels: 64
6 res2net_scale: 4
7 num_blocks: 2
8 n_mels: 96
9 aam_m: 0.2
10 aam_s: 24.0
11 use_band_split: false
12 band_split_channels: 64
13 use_dual_stream: true
14 mert_melody_branch: true
15 ecapa_branch: true
16 coverhunter_heads: 4
17 coverhunter_layers: 2
18 fusion_hidden_dim: 128
19 mert_model_name: m-a-p/MERT-v1-95M
20
21 data:
22 sample_rate: 16000
23 n_fft: 512
24 hop_length: 160
25 segment_dur: 5.0
26 crop_per_song: 4
27
28 training:
29 batch_size: 2
30 epochs: 20
31 lr: 0.00015
32 weight_decay: 0.0001
33 warmup_epochs: 2
34 temperature: 0.05
35 supcon_weight: 1.0
36 aam_weight: 0.2
37 mixed_precision: true
38 gradient_clip: 1.0
39 save_every: 5
40 log_every: 10
41 hard_negative_k: 2
42 sample_type_weights:
43 default: 1
44 compressed: 2
45 recording: 3
46 environment: 4
47 pair_type_weights:
48 default: 1.0
49 compressed: 1.4
50 recording: 1.8
51 environment: 2.2
...@@ -10,6 +10,13 @@ model: ...@@ -10,6 +10,13 @@ model:
10 aam_s: 30.0 10 aam_s: 30.0
11 use_band_split: true 11 use_band_split: true
12 band_split_channels: 128 12 band_split_channels: 128
13 use_dual_stream: true
14 mert_melody_branch: true
15 ecapa_branch: true
16 coverhunter_heads: 4
17 coverhunter_layers: 2
18 fusion_hidden_dim: 256
19 mert_model_name: m-a-p/MERT-v1-95M
13 20
14 data: 21 data:
15 sample_rate: 16000 22 sample_rate: 16000
...@@ -31,15 +38,17 @@ training: ...@@ -31,15 +38,17 @@ training:
31 gradient_clip: 1.0 38 gradient_clip: 1.0
32 save_every: 10 39 save_every: 10
33 log_every: 10 40 log_every: 10
41 hard_negative_k: 2
34 sample_type_weights: 42 sample_type_weights:
35 default: 1 43 default: 1
36 humming_like: 3 44 compressed: 2
37 confused: 5 45 recording: 3
46 environment: 4
38 pair_type_weights: 47 pair_type_weights:
39 default: 1.0 48 default: 1.0
40 augmented: 1.4 49 compressed: 1.5
41 humming_like: 2.5 50 recording: 2.0
42 confused: 4.0 51 environment: 2.5
43 52
44 engine: 53 engine:
45 chromaprint: 54 chromaprint:
......
1 {
2 "python": "/usr/local/miniconda3/bin/python",
3 "cwd": "/mnt/e/hikoon-ACR/acr-engine",
4 "steps": [
5 {
6 "name": "install_requirements",
7 "command": [
8 "/usr/local/miniconda3/bin/python",
9 "-m",
10 "pip",
11 "install",
12 "-r",
13 "requirements.txt"
14 ],
15 "returncode": 0,
16 "stdout": "\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 40.7/40.7 MB 10.9 MB/s 0:00:03\nDownloading nvidia_nvtx-13.0.85-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (148 kB)\nDownloading setuptools-81.0.0-py3-none-any.whl (1.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 1.1/1.1 MB 8.6 MB/s 0:00:00\nDownloading audioread-3.1.0-py3-none-any.whl (23 kB)\nDownloading click-8.4.1-py3-none-any.whl (116 kB)\nDownloading cuda_pathfinder-1.5.5-py3-none-any.whl (51 kB)\nDownloading decorator-5.3.1-py3-none-any.whl (10 kB)\nDownloading filelock-3.29.1-py3-none-any.whl (40 kB)\nDownloading fsspec-2026.4.0-py3-none-any.whl (203 kB)\nDownloading joblib-1.5.3-py3-none-any.whl (309 kB)\nDownloading lazy_loader-0.5-py3-none-any.whl (8.0 kB)\nDownloading networkx-3.6.1-py3-none-any.whl (2.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 2.1/2.1 MB 10.3 MB/s 0:00:00\nDownloading numba-0.65.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 3.8/3.8 MB 10.3 MB/s 0:00:00\nDownloading llvmlite-0.47.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (56.3 MB)\n 
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 56.3/56.3 MB 10.8 MB/s 0:00:05\nDownloading pooch-1.9.0-py3-none-any.whl (67 kB)\nDownloading regex-2026.5.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (801 kB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 801.2/801.2 kB 8.5 MB/s 0:00:00\nDownloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (507 kB)\nDownloading scikit_learn-1.9.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (9.1 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 9.1/9.1 MB 10.0 MB/s 0:00:00\nDownloading narwhals-2.22.1-py3-none-any.whl (454 kB)\nDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 6.3/6.3 MB 10.7 MB/s 0:00:00\nDownloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 536.2/536.2 kB 7.1 MB/s 0:00:00\nDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\nDownloading jinja2-3.1.6-py3-none-any.whl (134 
kB)\nDownloading markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)\nInstalling collected packages: torchaudio, nvidia-cusparselt-cu13, mpmath, cuda-toolkit, triton, threadpoolctl, sympy, setuptools, safetensors, regex, python-stretch, nvidia-nvtx, nvidia-nvshmem-cu13, nvidia-nvjitlink, nvidia-nccl-cu13, nvidia-curand, nvidia-cufile, nvidia-cuda-runtime, nvidia-cuda-nvrtc, nvidia-cuda-cupti, numpy, networkx, narwhals, MarkupSafe, llvmlite, lazy_loader, joblib, hf-xet, fsspec, filelock, decorator, cuda-pathfinder, click, audioread, soxr, soundfile, scipy, pooch, nvidia-cusparse, nvidia-cufft, nvidia-cublas, numpy-rms, numpy-minmax, numba, jinja2, cuda-bindings, scikit-learn, nvidia-cusolver, nvidia-cudnn-cu13, librosa, huggingface_hub, torch, tokenizers, audiomentations, transformers\n\nSuccessfully installed MarkupSafe-3.0.3 audiomentations-0.43.1 audioread-3.1.0 click-8.4.1 cuda-bindings-13.3.1 cuda-pathfinder-1.5.5 cuda-toolkit-13.0.2 decorator-5.3.1 filelock-3.29.1 fsspec-2026.4.0 hf-xet-1.5.0 huggingface_hub-1.18.0 jinja2-3.1.6 joblib-1.5.3 lazy_loader-0.5 librosa-0.11.0 llvmlite-0.47.0 mpmath-1.3.0 narwhals-2.22.1 networkx-3.6.1 numba-0.65.1 numpy-2.4.6 numpy-minmax-0.5.0 numpy-rms-0.6.0 nvidia-cublas-13.1.1.3 nvidia-cuda-cupti-13.0.85 nvidia-cuda-nvrtc-13.0.88 nvidia-cuda-runtime-13.0.96 nvidia-cudnn-cu13-9.20.0.48 nvidia-cufft-12.0.0.61 nvidia-cufile-1.15.1.6 nvidia-curand-10.4.0.35 nvidia-cusolver-12.0.4.66 nvidia-cusparse-12.6.3.3 nvidia-cusparselt-cu13-0.8.1 nvidia-nccl-cu13-2.29.7 nvidia-nvjitlink-13.0.88 nvidia-nvshmem-cu13-3.4.5 nvidia-nvtx-13.0.85 pooch-1.9.0 python-stretch-0.3.1 regex-2026.5.9 safetensors-0.7.0 scikit-learn-1.9.0 scipy-1.17.1 setuptools-81.0.0 soundfile-0.14.0 soxr-0.5.0.post1 sympy-1.14.0 threadpoolctl-3.6.0 tokenizers-0.22.2 torch-2.12.0 torchaudio-2.11.0 transformers-5.10.2 triton-3.7.0\n",
17 "stderr": " WARNING: The scripts proton and proton-viewer are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The script isympy is installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts f2py and numpy-config are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts hf, huggingface-cli and tiny-agents are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The scripts torchfrtrace and torchrun are installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n WARNING: The script transformers is installed in '/home/user/.local/bin' which is not on PATH.\n Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
18 },
19 {
20 "name": "install_extra_packages",
21 "command": [
22 "/usr/local/miniconda3/bin/python",
23 "-m",
24 "pip",
25 "install",
26 "torch",
27 "torchaudio",
28 "transformers",
29 "huggingface_hub",
30 "librosa",
31 "soundfile",
32 "audiomentations"
33 ],
34 "returncode": 0,
35 "stdout": "a3/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface_hub) (0.16.0)\nRequirement already satisfied: shellingham>=1.3.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (1.5.4)\nRequirement already satisfied: rich>=10.11.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (14.2.0)\nRequirement already satisfied: audioread>=2.1.9 in /home/user/.local/lib/python3.12/site-packages (from librosa) (3.1.0)\nRequirement already satisfied: numba>=0.51.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.65.1)\nRequirement already satisfied: scipy>=1.6.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.17.1)\nRequirement already satisfied: scikit-learn>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: joblib>=1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.5.3)\nRequirement already satisfied: decorator>=4.3.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (5.3.1)\nRequirement already satisfied: pooch>=1.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: soxr>=0.3.2 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5.0.post1)\nRequirement already satisfied: lazy_loader>=0.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5)\nRequirement already satisfied: msgpack>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from librosa) (1.1.1)\nRequirement already satisfied: cffi>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from soundfile) (1.17.1)\nRequirement already satisfied: numpy-minmax<1,>=0.3.0 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.5.0)\nRequirement already satisfied: numpy-rms<1,>=0.4.2 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.6.0)\nRequirement 
already satisfied: python-stretch<1,>=0.3.1 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.3.1)\nRequirement already satisfied: pycparser in /usr/local/miniconda3/lib/python3.12/site-packages (from cffi>=1.0->soundfile) (3.0)\nRequirement already satisfied: llvmlite<0.48,>=0.47.0dev0 in /home/user/.local/lib/python3.12/site-packages (from numba>=0.51.0->librosa) (0.47.0)\nRequirement already satisfied: platformdirs>=2.5.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (4.9.4)\nRequirement already satisfied: requests>=2.19.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (2.33.1)\nRequirement already satisfied: charset_normalizer<4,>=2 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (3.4.4)\nRequirement already satisfied: urllib3<3,>=1.26 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (2.6.3)\nRequirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (4.0.0)\nRequirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (2.20.0)\nRequirement already satisfied: mdurl~=0.1 in /usr/local/miniconda3/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer->transformers) (0.1.2)\nRequirement already satisfied: narwhals>=2.0.1 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (2.22.1)\nRequirement already satisfied: threadpoolctl>=3.5.0 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (3.6.0)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from sympy>=1.13.3->torch) (1.3.0)\nRequirement already satisfied: MarkupSafe>=2.0 in 
/home/user/.local/lib/python3.12/site-packages (from jinja2->torch) (3.0.3)\n",
36 "stderr": ""
37 },
38 {
39 "name": "verify_environment",
40 "command": [
41 "/usr/local/miniconda3/bin/python",
42 "-c",
43 "import torch, transformers, librosa, soundfile, audiomentations; print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
44 ],
45 "returncode": 0,
46 "stdout": "{'torch': '2.12.0+cu130', 'cuda': False, 'transformers': '5.10.2'}\n",
47 "stderr": "/home/user/.local/lib/python3.12/site-packages/torch/cuda/__init__.py:187: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 12080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:119.)\n return torch._C._cuda_getDeviceCount() > 0\n"
48 }
49 ]
50 }
...\ No newline at end of file ...\ No newline at end of file
...@@ -2,6 +2,10 @@ numpy>=1.26 ...@@ -2,6 +2,10 @@ numpy>=1.26
2 PyYAML>=6.0 2 PyYAML>=6.0
3 soundfile>=0.12 3 soundfile>=0.12
4 librosa>=0.10 4 librosa>=0.10
5 audiomentations>=0.37
6 transformers>=4.46
7 huggingface_hub>=0.26
8 torchaudio>=2.3
5 tqdm>=4.66 9 tqdm>=4.66
6 torch>=2.3 10 torch>=2.3
7 fastapi>=0.115 11 fastapi>=0.115
......
1 #!/usr/bin/env python3
2 import argparse
3 import json
4 import subprocess
5 from datetime import datetime
6 from pathlib import Path
7
8
# Interpreter used to launch train.py when --python is not supplied.
DEFAULT_PYTHON = "/usr/local/miniconda3/bin/python"


def _utc_now():
    """Return the current UTC time as a naive ``datetime``.

    ``datetime.utcnow()`` is deprecated since Python 3.12. Reading an aware
    clock and dropping the tzinfo yields the same naive value, so the
    formatted timestamps keep their existing shape.
    """
    # Local import: only ``datetime`` itself is imported at file level.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


def main():
    """Launch ``train.py`` for one fine-tuning run and record what happened.

    Steps:
      1. Parse the command line and derive the run directory
         ``<output-root>/<run-name>`` (run name defaults to a UTC timestamp).
      2. Write ``run_request.json`` describing the request before training.
      3. Run ``train.py`` with the repository root (parent of this script's
         directory) as working directory, streaming its stdout/stderr into
         ``stdout.log`` / ``stderr.log`` inside the run directory.
      4. Write ``run_summary.json`` with the return code and artifact names.

    Raises:
        SystemExit: with the child's return code when training fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--python", default=DEFAULT_PYTHON)
    parser.add_argument("--config", default="configs/coverhunter_finetune_4gb.yaml")
    parser.add_argument("--data", required=True)
    parser.add_argument("--output-root", default="data/training_runs")
    parser.add_argument("--run-name", default=None)
    parser.add_argument("--noise-root", action="append", default=[])
    parser.add_argument("--device", default="auto")
    parser.add_argument("--segment-strategy", default="hybrid")
    parser.add_argument("--resume", default=None)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    started_at = _utc_now()
    timestamp = started_at.strftime("%Y%m%dT%H%M%SZ")
    run_name = args.run_name or f"coverhunter_finetune_{timestamp}"

    # The child process runs with a different working directory (the repo
    # root), so the output directory must be absolute: otherwise train.py
    # would resolve a relative --output against the repo root while the logs
    # and summaries below are written relative to the caller's directory.
    run_dir = (Path(args.output_root) / run_name).resolve()
    run_dir.mkdir(parents=True, exist_ok=True)

    command = [
        args.python,
        "train.py",
        "--config",
        args.config,
        "--data",
        args.data,
        "--output",
        str(run_dir),
        "--device",
        args.device,
        "--segment-strategy",
        args.segment_strategy,
    ]
    if args.resume:
        command.extend(["--resume", args.resume])
    if args.dry_run:
        command.append("--dry-run")
    for noise_root in args.noise_root:
        command.extend(["--noise-root", noise_root])

    metadata = {
        "run_name": run_name,
        "created_at": started_at.isoformat() + "Z",
        "python": args.python,
        "command": command,
        "config": args.config,
        "data": args.data,
        "noise_roots": args.noise_root,
        "run_dir": str(run_dir),
    }
    with open(run_dir / "run_request.json", "w") as f:
        json.dump(metadata, f, indent=2)

    repo_root = Path(__file__).resolve().parents[1]
    # Stream the child's output directly into the log files instead of
    # buffering it in memory: training runs are long, and the logs stay
    # available even if this launcher is interrupted.
    with open(run_dir / "stdout.log", "wb") as out_log, open(run_dir / "stderr.log", "wb") as err_log:
        result = subprocess.run(command, cwd=repo_root, stdout=out_log, stderr=err_log)

    summary = {
        **metadata,
        "returncode": result.returncode,
        "completed_at": _utc_now().isoformat() + "Z",
        # Listed before run_summary.json itself is written.
        "artifacts": sorted(path.name for path in run_dir.iterdir()),
    }
    with open(run_dir / "run_summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    if result.returncode != 0:
        raise SystemExit(result.returncode)
78
79
80 if __name__ == "__main__":
81 main()
1 #!/usr/bin/env python3
2 import argparse
3 import json
4 import subprocess
5 from pathlib import Path
6
# Interpreter used for every pip / verification command unless --python overrides it.
PYTHON_DEFAULT = "/usr/local/miniconda3/bin/python"

# Arguments appended to `pip install` to install the project requirements file.
PACKAGES = ["-r", "requirements.txt"]

# Packages passed to a second, explicit `pip install` call.
EXTRA_PACKAGES = [
    "torch", "torchaudio", "transformers", "huggingface_hub",
    "librosa", "soundfile", "audiomentations",
]
20
21
def run(command, cwd):
    """Execute ``command`` with ``cwd`` as working directory.

    Returns the ``subprocess.CompletedProcess``; stdout and stderr are
    captured and decoded as text. A non-zero exit status does not raise.
    """
    options = {"cwd": cwd, "text": True, "capture_output": True}
    return subprocess.run(command, **options)
24
25
def main():
    """Install the Python dependencies, verify the imports, and write a JSON report.

    Unless ``--skip-install`` is given, two ``pip install`` commands run first
    (the requirements file, then the explicit extra packages). A verification
    command that imports the key libraries and prints their versions always
    runs. Each step's command, return code and the last 4000 characters of its
    stdout/stderr are stored in ``reports/coverhunter_env_setup_report.json``
    under the repository root (the parent of this script's directory).

    Raises:
        SystemExit: with code 1 when any executed step returned non-zero.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--python", default=PYTHON_DEFAULT)
    parser.add_argument("--skip-install", action="store_true")
    args = parser.parse_args()

    root = Path(__file__).resolve().parents[1]
    report = {"python": args.python, "cwd": str(root), "steps": []}

    # Ordered (step name, command) pairs; every step is executed even when an
    # earlier one fails, and failures are reported together at the end.
    planned_steps = []
    if not args.skip_install:
        pip_install = [args.python, "-m", "pip", "install"]
        planned_steps.append(("install_requirements", [*pip_install, *PACKAGES]))
        planned_steps.append(("install_extra_packages", [*pip_install, *EXTRA_PACKAGES]))

    verify_snippet = (
        "import torch, transformers, librosa, soundfile, audiomentations; "
        "print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
    )
    planned_steps.append(("verify_environment", [args.python, "-c", verify_snippet]))

    for step_name, step_command in planned_steps:
        outcome = run(step_command, root)
        report["steps"].append({
            "name": step_name,
            "command": step_command,
            "returncode": outcome.returncode,
            # Keep only the tail of each stream so the report stays small.
            "stdout": outcome.stdout[-4000:],
            "stderr": outcome.stderr[-4000:],
        })

    report_path = root / "reports" / "coverhunter_env_setup_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2))
    print(report_path)

    if not all(step["returncode"] == 0 for step in report["steps"]):
        raise SystemExit(1)
84
85
86 if __name__ == "__main__":
87 main()
...@@ -8,6 +8,9 @@ import numpy as np ...@@ -8,6 +8,9 @@ import numpy as np
8 import torch 8 import torch
9 from torch.utils.data import Dataset 9 from torch.utils.data import Dataset
10 10
11 from src.utils.audio import AudioProcessor
12 from src.utils.augment import AugmentPipeline
13
11 14
12 def compute_candidate_offsets( 15 def compute_candidate_offsets(
13 y: np.ndarray, 16 y: np.ndarray,
...@@ -124,6 +127,267 @@ def compute_candidate_offsets( ...@@ -124,6 +127,267 @@ def compute_candidate_offsets(
124 return [] 127 return []
125 128
126 129
class DualStreamFeatureExtractor:
    """Build mel, melody and chroma features for one audio clip.

    All three features are produced by an ``AudioProcessor``; the melody and
    chroma tracks are linearly resampled so their last axis has the same
    length as ``mel.shape[1]``.
    """

    def __init__(self, sr: int, n_mels: int, n_fft: int, hop_length: int):
        """Create the underlying ``AudioProcessor`` with the given analysis settings."""
        self.audio = AudioProcessor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.n_mels = n_mels

    @staticmethod
    def _resample_track(track: np.ndarray, time_steps: int) -> np.ndarray:
        """Linearly resample a 1-D track to ``time_steps`` samples (float32).

        An empty track has nothing to interpolate from (``np.interp`` would
        raise), so it becomes all zeros.
        """
        if track.size == 0:
            return np.zeros(time_steps, dtype=np.float32)
        positions = np.linspace(0, track.size - 1, time_steps)
        return np.interp(positions, np.arange(track.size), track).astype(np.float32)

    def extract(self, y: np.ndarray) -> Dict[str, torch.Tensor]:
        """Extract the feature dictionary for waveform ``y``.

        Returns:
            A dict with
              * ``"mel"``: the output of ``AudioProcessor.to_mel`` as a float tensor,
              * ``"melody"``: the f0 track converted from Hz to MIDI numbers, with
                non-finite values replaced by 0, resampled to ``mel.shape[1]``
                samples and given a leading axis of size 1,
              * ``"chroma"``: every chroma row resampled to ``mel.shape[1]`` samples.
        """
        mel = self.audio.to_mel(y)
        melody = librosa.hz_to_midi(self.audio.extract_f0(y))
        # NaN / +-inf entries (e.g. from NaN or zero f0 values) are zeroed
        # before interpolation.
        melody = np.nan_to_num(melody, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        chroma = self.audio.extract_chroma(y).astype(np.float32)

        # mel.shape[1] is used as the common target length for both tracks.
        time_steps = mel.shape[1]
        melody = self._resample_track(melody, time_steps)

        if chroma.size == 0:
            # Same guard as the melody track: an empty chroma matrix becomes
            # zeros instead of raising inside np.interp / np.stack.
            chroma_resized = np.zeros((chroma.shape[0], time_steps), dtype=np.float32)
        else:
            chroma_resized = np.stack(
                [self._resample_track(chroma_row, time_steps) for chroma_row in chroma],
                axis=0,
            ).astype(np.float32)

        return {
            "mel": torch.FloatTensor(mel),
            "melody": torch.FloatTensor(melody).unsqueeze(0),
            "chroma": torch.FloatTensor(chroma_resized),
        }
168
169
170 class PairSamplerDataset(Dataset):
171 def __init__(
172 self,
173 data_dir: str,
174 split: str = "train",
175 sr: int = 16000,
176 n_mels: int = 80,
177 n_fft: int = 512,
178 hop_length: int = 160,
179 segment_dur: float = 5.0,
180 augment: bool = True,
181 segment_strategy: str = "random",
182 silence_top_db: int = 30,
183 sample_type_weights: Optional[Dict[str, int]] = None,
184 pair_type_weights: Optional[Dict[str, float]] = None,
185 hard_negative_k: int = 1,
186 noise_roots: Optional[List[str]] = None,
187 ):
188 self.sr = sr
189 self.n_mels = n_mels
190 self.n_fft = n_fft
191 self.hop_length = hop_length
192 self.segment_len = int(segment_dur * sr)
193 self.augment = augment
194 self.segment_strategy = segment_strategy
195 self.silence_top_db = silence_top_db
196 self.data_dir = Path(data_dir)
197 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
198 self.sample_type_weights = {
199 "default": 1,
200 "compressed": 2,
201 "recording": 3,
202 "environment": 4,
203 **(sample_type_weights or {}),
204 }
205 self.pair_type_weights = {
206 "default": 1.0,
207 "compressed": 1.5,
208 "recording": 2.0,
209 "environment": 2.5,
210 **(pair_type_weights or {}),
211 }
212 self.hard_negative_k = hard_negative_k
213 self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
214 self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
215 self.aggressive_augmenter = AugmentPipeline(sr, aggressive=True, noise_roots=noise_roots)
216
217 with open(self.data_dir / f"{split}.json") as f:
218 metadata = json.load(f)
219
220 self.by_song: Dict[str, List[Dict]] = {}
221 for item in metadata:
222 if not self._is_training_candidate(item):
223 continue
224 p = self.asset_root / item["audio_path"]
225 if p.exists():
226 self.by_song.setdefault(item["song_id"], []).append(item)
227
228 self.song_ids = sorted(self.by_song)
229 self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
230 self.sample_song_ids = []
231 self.hard_negative_map: Dict[str, List[str]] = self._build_hard_negative_map()
232 for sid, items in self.by_song.items():
233 item_types = {self._normalize_sample_type(x.get("type")) for x in items}
234 weight = self.sample_type_weights.get("default", 1)
235 for item_type in item_types:
236 weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
237 self.sample_song_ids.extend([sid] * weight)
238
239 @staticmethod
240 def _normalize_sample_type(sample_type: Optional[str]) -> str:
241 mapping = {
242 "reference": "reference",
243 "compressed": "compressed",
244 "recording": "recording",
245 "environment": "environment",
246 "humming_like": "recording",
247 "confused": "environment",
248 None: "default",
249 }
250 return mapping.get(sample_type, sample_type or "default")
251
252 def _is_training_candidate(self, item: Dict) -> bool:
253 sample_type = self._normalize_sample_type(item.get("type"))
254 return sample_type != "reference"
255
256 def _build_hard_negative_map(self) -> Dict[str, List[str]]:
257 song_features: Dict[str, np.ndarray] = {}
258 for song_id, items in self.by_song.items():
259 feats = []
260 for item in items[:2]:
261 path = self.asset_root / item["audio_path"]
262 try:
263 y, _ = librosa.load(str(path), sr=self.sr, mono=True, duration=8.0)
264 mel = self.feature_extractor.audio.to_mel(y)
265 feats.append(np.mean(mel, axis=1))
266 except Exception:
267 continue
268 if feats:
269 song_features[song_id] = np.mean(feats, axis=0)
270
271 hard_negative_map: Dict[str, List[str]] = {}
272 song_ids = list(song_features)
273 for song_id in song_ids:
274 anchor = song_features[song_id]
275 anchor_norm = np.linalg.norm(anchor) + 1e-12
276 scored = []
277 for other_song_id in song_ids:
278 if other_song_id == song_id:
279 continue
280 other = song_features[other_song_id]
281 score = float(np.dot(anchor, other) / (anchor_norm * (np.linalg.norm(other) + 1e-12)))
282 scored.append((score, other_song_id))
283 scored.sort(reverse=True)
284 hard_negative_map[song_id] = [other_song_id for _, other_song_id in scored[: max(self.hard_negative_k, 1) * 4]]
285 return hard_negative_map
286
287 def __len__(self):
288 return len(self.sample_song_ids)
289
290 def _load_clip(self, sample: Dict) -> np.ndarray:
291 path = self.asset_root / sample["audio_path"]
292 full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
293 duration = float(sample.get("duration", len(full_y) / self.sr))
294 max_offset = max(0.0, duration - (self.segment_len / self.sr))
295 offset = 0.0
296 if max_offset > 0:
297 if self.segment_strategy == "random":
298 offset = random.uniform(0, max_offset)
299 else:
300 direct_candidates = compute_candidate_offsets(
301 y=full_y,
302 sr=self.sr,
303 segment_len=self.segment_len,
304 strategy=self.segment_strategy,
305 silence_top_db=self.silence_top_db,
306 )
307 if direct_candidates:
308 offset = min(random.choice(direct_candidates) / self.sr, max_offset)
309 elif self.segment_strategy == "hybrid":
310 candidate_pool: List[int] = []
311 for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
312 candidate_pool.extend(
313 compute_candidate_offsets(
314 y=full_y,
315 sr=self.sr,
316 segment_len=self.segment_len,
317 strategy=strategy,
318 silence_top_db=self.silence_top_db,
319 )
320 )
321 if candidate_pool and random.random() < 0.75:
322 offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
323 else:
324 offset = random.uniform(0, max_offset)
325 else:
326 offset = random.uniform(0, max_offset)
327 start = int(offset * self.sr)
328 y = full_y[start : start + self.segment_len]
329 if len(y) < self.segment_len:
330 y = np.pad(y, (0, self.segment_len - len(y)))
331 return y
332
333 def _augment_wave(self, sample: Dict, y: np.ndarray) -> np.ndarray:
334 if not self.augment:
335 return y
336 sample_type = self._normalize_sample_type(sample.get("type"))
337 if sample_type in {"recording", "environment"}:
338 return self.aggressive_augmenter(y)
339 return self.augmenter(y)
340
341 def _load_features(self, sample: Dict) -> Dict[str, torch.Tensor]:
342 y = self._load_clip(sample)
343 y = self._augment_wave(sample, y)
344 features = self.feature_extractor.extract(y)
345 features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
346 return features
347
348 def _pick_positive_pair(self, song_id: str) -> tuple[Dict, Dict]:
349 choices = self.by_song[song_id]
350 if len(choices) == 1:
351 return choices[0], choices[0]
352 return tuple(random.sample(choices, 2))
353
354 def _pick_negative(self, song_id: str) -> Dict:
355 hard_songs = self.hard_negative_map.get(song_id, [])
356 candidate_song_ids = hard_songs[: self.hard_negative_k] if hard_songs else []
357 if candidate_song_ids and random.random() < 0.8:
358 negative_song_id = random.choice(candidate_song_ids)
359 else:
360 pool = [sid for sid in self.song_ids if sid != song_id]
361 negative_song_id = random.choice(pool)
362 return random.choice(self.by_song[negative_song_id])
363
364 def __getitem__(self, idx):
365 song_id = self.sample_song_ids[idx]
366 pos_a, pos_b = self._pick_positive_pair(song_id)
367 negative = self._pick_negative(song_id)
368
369 positive_items = [pos_a, pos_b]
370 positive_features = [self._load_features(sample) for sample in positive_items]
371 negative_features = self._load_features(negative)
372
373 hard_weights = [
374 self.pair_type_weights.get(self._normalize_sample_type(sample.get("type")), self.pair_type_weights["default"])
375 for sample in positive_items
376 ]
377 hard_weights.append(self.pair_type_weights.get("environment", 2.5))
378
379 label = self.song_to_idx[song_id]
380 negative_label = self.song_to_idx[negative["song_id"]]
381 return {
382 "mel": torch.stack([feat["mel"] for feat in positive_features] + [negative_features["mel"]], dim=0),
383 "melody": torch.stack([feat["melody"] for feat in positive_features] + [negative_features["melody"]], dim=0),
384 "chroma": torch.stack([feat["chroma"] for feat in positive_features] + [negative_features["chroma"]], dim=0),
385 "song_id": torch.tensor([label, label, negative_label], dtype=torch.long),
386 "song_name": song_id,
387 "hard_weight": torch.tensor(hard_weights, dtype=torch.float32),
388 }
389
390
127 class ACRDataset(Dataset): 391 class ACRDataset(Dataset):
128 def __init__( 392 def __init__(
129 self, 393 self,
...@@ -140,6 +404,7 @@ class ACRDataset(Dataset): ...@@ -140,6 +404,7 @@ class ACRDataset(Dataset):
140 references_only: bool = False, 404 references_only: bool = False,
141 segment_strategy: str = "random", 405 segment_strategy: str = "random",
142 silence_top_db: int = 30, 406 silence_top_db: int = 30,
407 noise_roots: Optional[List[str]] = None,
143 ): 408 ):
144 self.sr = sr 409 self.sr = sr
145 self.n_mels = n_mels 410 self.n_mels = n_mels
...@@ -152,6 +417,8 @@ class ACRDataset(Dataset): ...@@ -152,6 +417,8 @@ class ACRDataset(Dataset):
152 self.silence_top_db = silence_top_db 417 self.silence_top_db = silence_top_db
153 self.data_dir = Path(data_dir) 418 self.data_dir = Path(data_dir)
154 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir 419 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
420 self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
421 self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
155 422
156 meta_path = self.data_dir / f"{split}.json" 423 meta_path = self.data_dir / f"{split}.json"
157 with open(meta_path) as f: 424 with open(meta_path) as f:
...@@ -179,16 +446,6 @@ class ACRDataset(Dataset): ...@@ -179,16 +446,6 @@ class ACRDataset(Dataset):
179 y = y[: self.segment_len] 446 y = y[: self.segment_len]
180 return y 447 return y
181 448
182 def _to_mel(self, y: np.ndarray) -> np.ndarray:
183 mel = librosa.feature.melspectrogram(
184 y=y,
185 sr=self.sr,
186 n_mels=self.n_mels,
187 n_fft=self.n_fft,
188 hop_length=self.hop_length,
189 )
190 return librosa.power_to_db(mel, ref=np.max)
191
192 def _choose_offset(self, sample: Dict, audio_path: Path) -> float: 449 def _choose_offset(self, sample: Dict, audio_path: Path) -> float:
193 duration = float(sample["duration"]) 450 duration = float(sample["duration"])
194 max_offset = max(0.0, duration - (self.segment_len / self.sr)) 451 max_offset = max(0.0, duration - (self.segment_len / self.sr))
...@@ -231,24 +488,22 @@ class ACRDataset(Dataset): ...@@ -231,24 +488,22 @@ class ACRDataset(Dataset):
231 488
232 def __getitem__(self, idx): 489 def __getitem__(self, idx):
233 sample = self.samples[idx // self.n_crops] 490 sample = self.samples[idx // self.n_crops]
234
235 audio_path = self.asset_root / sample["audio_path"] 491 audio_path = self.asset_root / sample["audio_path"]
236 offset = self._choose_offset(sample, audio_path) 492 offset = self._choose_offset(sample, audio_path)
237 y = self._load_segment(str(audio_path), offset, 5.0) 493 y = self._load_segment(str(audio_path), offset, 5.0)
238 494
239 if self.augment and sample.get("type") != "reference": 495 if self.augment and sample.get("type") != "reference":
240 from src.utils.augment import AugmentPipeline 496 y = self.augmenter(y)
241 aug = AugmentPipeline(self.sr)
242 y = aug(y)
243 497
244 mel = self._to_mel(y) 498 features = self.feature_extractor.extract(y)
245 mel_tensor = torch.FloatTensor(mel) 499 features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
246 500
247 song_id = sample["song_id"] 501 song_id = sample["song_id"]
248 class_id = self.song_to_idx[song_id] 502 class_id = self.song_to_idx[song_id]
249
250 return { 503 return {
251 "mel": mel_tensor, 504 "mel": features["mel"],
505 "melody": features["melody"],
506 "chroma": features["chroma"],
252 "song_id": torch.tensor(class_id, dtype=torch.long), 507 "song_id": torch.tensor(class_id, dtype=torch.long),
253 "song_name": song_id, 508 "song_name": song_id,
254 "type": sample.get("type", "unknown"), 509 "type": sample.get("type", "unknown"),
...@@ -272,6 +527,7 @@ class ACRTestDataset(Dataset): ...@@ -272,6 +527,7 @@ class ACRTestDataset(Dataset):
272 self.hop_length = hop_length 527 self.hop_length = hop_length
273 self.data_dir = Path(data_dir) 528 self.data_dir = Path(data_dir)
274 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir 529 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
530 self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
275 531
276 meta_path = self.data_dir / f"{split}.json" 532 meta_path = self.data_dir / f"{split}.json"
277 with open(meta_path) as f: 533 with open(meta_path) as f:
...@@ -299,171 +555,17 @@ class ACRTestDataset(Dataset): ...@@ -299,171 +555,17 @@ class ACRTestDataset(Dataset):
299 else: 555 else:
300 y = y[:seg_len] 556 y = y[:seg_len]
301 557
302 mel = librosa.power_to_db( 558 features = self.feature_extractor.extract(y)
303 librosa.feature.melspectrogram(
304 y=y,
305 sr=self.sr,
306 n_mels=self.n_mels,
307 n_fft=self.n_fft,
308 hop_length=self.hop_length,
309 ),
310 ref=np.max,
311 )
312 class_id = self.song_to_idx[sample["song_id"]] 559 class_id = self.song_to_idx[sample["song_id"]]
313 return { 560 return {
314 "mel": torch.FloatTensor(mel), 561 "mel": features["mel"],
562 "melody": features["melody"],
563 "chroma": features["chroma"],
315 "song_id": torch.tensor(class_id, dtype=torch.long), 564 "song_id": torch.tensor(class_id, dtype=torch.long),
316 "song_name": sample["song_id"], 565 "song_name": sample["song_id"],
317 "type": sample.get("type", "unknown"), 566 "type": sample.get("type", "unknown"),
318 } 567 }
319 568
320 569
321 class SongPairDataset(Dataset): 570 class SongPairDataset(PairSamplerDataset):
322 def __init__( 571 pass
323 self,
324 data_dir: str,
325 split: str = "train",
326 sr: int = 16000,
327 n_mels: int = 80,
328 n_fft: int = 512,
329 hop_length: int = 160,
330 segment_dur: float = 5.0,
331 augment: bool = True,
332 segment_strategy: str = "random",
333 silence_top_db: int = 30,
334 sample_type_weights: Optional[Dict[str, int]] = None,
335 pair_type_weights: Optional[Dict[str, float]] = None,
336 ):
337 self.sr = sr
338 self.n_mels = n_mels
339 self.n_fft = n_fft
340 self.hop_length = hop_length
341 self.segment_len = int(segment_dur * sr)
342 self.augment = augment
343 self.segment_strategy = segment_strategy
344 self.silence_top_db = silence_top_db
345 self.data_dir = Path(data_dir)
346 self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
347 self.sample_type_weights = {
348 "default": 1,
349 "humming_like": 3,
350 "confused": 5,
351 **(sample_type_weights or {}),
352 }
353 self.pair_type_weights = {
354 "default": 1.0,
355 "augmented": 1.4,
356 "humming_like": 2.5,
357 "confused": 4.0,
358 **(pair_type_weights or {}),
359 }
360
361 with open(self.data_dir / f"{split}.json") as f:
362 metadata = json.load(f)
363
364 self.by_song: Dict[str, List[Dict]] = {}
365 for item in metadata:
366 if item.get("type") == "reference":
367 continue
368 p = self.asset_root / item["audio_path"]
369 if p.exists():
370 self.by_song.setdefault(item["song_id"], []).append(item)
371
372 self.song_ids = sorted(self.by_song)
373 self.sample_song_ids = []
374 for sid, items in self.by_song.items():
375 item_types = {x.get("type") for x in items}
376 weight = self.sample_type_weights.get("default", 1)
377 for item_type in item_types:
378 weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
379 self.sample_song_ids.extend([sid] * weight)
380 self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
381
382 def __len__(self):
383 return len(self.sample_song_ids)
384
385 def _load_clip(self, sample: Dict) -> np.ndarray:
386 path = self.asset_root / sample["audio_path"]
387 full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
388 duration = float(sample.get("duration", len(full_y) / self.sr))
389 max_offset = max(0.0, duration - (self.segment_len / self.sr))
390 offset = 0.0
391 if max_offset > 0:
392 if self.segment_strategy == "random":
393 offset = random.uniform(0, max_offset)
394 else:
395 direct_candidates = compute_candidate_offsets(
396 y=full_y,
397 sr=self.sr,
398 segment_len=self.segment_len,
399 strategy=self.segment_strategy,
400 silence_top_db=self.silence_top_db,
401 )
402 if direct_candidates:
403 offset = min(random.choice(direct_candidates) / self.sr, max_offset)
404 elif self.segment_strategy == "hybrid":
405 candidate_pool: List[int] = []
406 for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
407 candidate_pool.extend(
408 compute_candidate_offsets(
409 y=full_y,
410 sr=self.sr,
411 segment_len=self.segment_len,
412 strategy=strategy,
413 silence_top_db=self.silence_top_db,
414 )
415 )
416 if candidate_pool and random.random() < 0.75:
417 offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
418 else:
419 offset = random.uniform(0, max_offset)
420 else:
421 offset = random.uniform(0, max_offset)
422 start = int(offset * self.sr)
423 y = full_y[start : start + self.segment_len]
424 if len(y) < self.segment_len:
425 y = np.pad(y, (0, self.segment_len - len(y)))
426 return y
427
428 def _to_mel(self, y: np.ndarray) -> torch.Tensor:
429 mel = librosa.feature.melspectrogram(
430 y=y,
431 sr=self.sr,
432 n_mels=self.n_mels,
433 n_fft=self.n_fft,
434 hop_length=self.hop_length,
435 )
436 mel = librosa.power_to_db(mel, ref=np.max)
437 return torch.FloatTensor(mel)
438
439 def __getitem__(self, idx):
440 song_id = self.sample_song_ids[idx]
441 choices = self.by_song[song_id]
442 if len(choices) == 1:
443 a = b = choices[0]
444 else:
445 a, b = random.sample(choices, 2)
446
447 pair_weights = [
448 self.pair_type_weights.get(a.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
449 self.pair_type_weights.get(b.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
450 ]
451
452 wavs = []
453 for sample in (a, b):
454 y = self._load_clip(sample)
455 if self.augment:
456 from src.utils.augment import AugmentPipeline
457 y = AugmentPipeline(self.sr, aggressive=sample.get("type") in {"confused", "humming_like"})(y)
458 wavs.append(self._to_mel(y))
459
460 max_t = max(w.shape[1] for w in wavs)
461 wavs = [torch.nn.functional.pad(w, (0, max_t - w.shape[1])) if w.shape[1] < max_t else w for w in wavs]
462
463 label = self.song_to_idx[song_id]
464 return {
465 "mel": torch.stack(wavs, dim=0),
466 "song_id": torch.tensor([label, label], dtype=torch.long),
467 "song_name": song_id,
468 "hard_weight": torch.tensor(pair_weights, dtype=torch.float32),
469 }
......
...@@ -3,6 +3,55 @@ import torch.nn as nn ...@@ -3,6 +3,55 @@ import torch.nn as nn
3 import torch.nn.functional as F 3 import torch.nn.functional as F
4 from typing import Optional, Tuple, List 4 from typing import Optional, Tuple, List
5 5
6 try:
7 from transformers import AutoModel
8 except ImportError:
9 AutoModel = None
10
11
class FrozenMERTFeatureExtractor(nn.Module):
    """Frozen front-end built on a pretrained backbone or a fixed conv stack.

    When ``model_name`` is given and ``AutoModel`` can load it, the backbone is
    frozen and followed by a 1x1 conv projection to ``hidden_dim``. Otherwise a
    frozen two-layer conv stack maps the ``n_mels`` input channels straight to
    ``hidden_dim``.

    The frozen part (the backbone, or the fallback stack) is pinned to eval
    mode: ``torch.no_grad()`` alone does not stop BatchNorm running statistics
    from updating, nor does it disable dropout, so a train-mode "frozen"
    module would still change between steps.
    """

    def __init__(self, model_name: Optional[str], n_mels: int, hidden_dim: int):
        import warnings

        super().__init__()
        self.model_name = model_name
        self.hidden_dim = hidden_dim
        self.backbone = None
        # Fallback projection, used only when no backbone is available.
        self.proj = nn.Sequential(
            nn.Conv1d(n_mels, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm1d(hidden_dim),
        )
        for parameter in self.proj.parameters():
            parameter.requires_grad = False

        if model_name:
            if AutoModel is None:
                warnings.warn(
                    f"transformers is not installed; ignoring backbone {model_name!r}",
                    RuntimeWarning,
                )
            else:
                try:
                    self.backbone = AutoModel.from_pretrained(model_name)
                except Exception as exc:
                    # Still fall back to the conv stack, but say so: a typo in
                    # the checkpoint name should not silently change the model.
                    warnings.warn(
                        f"could not load backbone {model_name!r} ({exc!r}); "
                        "using the frozen convolutional fallback",
                        RuntimeWarning,
                    )
                    self.backbone = None

        if self.backbone is not None:
            for parameter in self.backbone.parameters():
                parameter.requires_grad = False
            backbone_dim = getattr(self.backbone.config, "hidden_size", hidden_dim)
            # With a backbone present the projection is trainable.
            self.proj = nn.Sequential(
                nn.Conv1d(backbone_dim, hidden_dim, kernel_size=1),
                nn.GELU(),
                nn.BatchNorm1d(hidden_dim),
            )

        # Apply the eval-mode pin immediately, not only after the first .train().
        self.train(self.training)

    def train(self, mode: bool = True):
        """Switch train/eval mode while keeping the frozen sub-module in eval."""
        super().train(mode)
        frozen = self.proj if self.backbone is None else self.backbone
        frozen.eval()
        return self

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Map ``mel`` to ``hidden_dim`` channels.

        ``mel`` is fed to ``nn.Conv1d(n_mels, ...)`` in the fallback path, so
        it is channel-first with ``n_mels`` channels.
        """
        if self.backbone is None:
            with torch.no_grad():
                return self.proj(mel)

        # NOTE(review): the backbone is called with ``inputs_embeds`` built from
        # the transposed mel. Confirm the chosen checkpoint accepts that keyword
        # and an embedding width of ``n_mels``.
        waveform_like = mel.transpose(1, 2)
        with torch.no_grad():
            outputs = self.backbone(inputs_embeds=waveform_like)
        hidden = outputs.last_hidden_state.transpose(1, 2)
        return self.proj(hidden)
54
6 55
7 class SEModule(nn.Module): 56 class SEModule(nn.Module):
8 def __init__(self, channels, se_channels=128): 57 def __init__(self, channels, se_channels=128):
...@@ -123,6 +172,89 @@ class AAMSoftmax(nn.Module): ...@@ -123,6 +172,89 @@ class AAMSoftmax(nn.Module):
123 return output 172 return output
124 173
125 174
class CoverHunterHead(nn.Module):
    """Transformer encoder plus attention pooling that emits unit-norm embeddings.

    The encoder is built with ``batch_first=True`` and ``d_model=input_dim``,
    so the input is ``(batch, time, input_dim)``. Frames are encoded, scored by
    a small MLP, softmax-weighted over the time axis, summed, projected to
    ``embed_dim``, batch-normalized without affine parameters, and
    L2-normalized.
    """

    def __init__(self, input_dim: int, embed_dim: int, num_heads: int = 4, num_layers: int = 2, ff_mult: int = 4):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=input_dim * ff_mult,
            batch_first=True,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        # Per-frame scalar score used as the pooling logit.
        scorer = [
            nn.Linear(input_dim, input_dim),
            nn.Tanh(),
            nn.Linear(input_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer)
        self.proj = nn.Linear(input_dim, embed_dim)
        self.norm = nn.BatchNorm1d(embed_dim, affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        frames = self.encoder(x)
        frame_scores = self.attention(frames).squeeze(-1)
        attn = torch.softmax(frame_scores, dim=1).unsqueeze(-1)
        summary = torch.sum(frames * attn, dim=1)
        embedding = self.norm(self.proj(summary))
        return F.normalize(embedding, p=2, dim=1)
201
202
class MERTMelodyBranch(nn.Module):
    """Fuse frozen semantic features with a projection of melody and chroma.

    The semantic stream comes from ``FrozenMERTFeatureExtractor``. Melody and
    chroma are concatenated on the channel axis and projected by a 1x1 conv.
    Both ``hidden_dim``-channel streams are concatenated and reduced back to
    ``hidden_dim`` channels.
    """

    def __init__(
        self,
        n_mels: int,
        chroma_bins: int = 12,
        melody_bins: int = 1,
        hidden_dim: int = 256,
        mert_model_name: Optional[str] = None,
    ):
        super().__init__()
        self.mert = FrozenMERTFeatureExtractor(
            model_name=mert_model_name,
            n_mels=n_mels,
            hidden_dim=hidden_dim,
        )
        pitch_channels = chroma_bins + melody_bins
        self.melody_proj = nn.Conv1d(pitch_channels, hidden_dim, kernel_size=1)
        self.fuse = nn.Sequential(
            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        )

    def forward(self, mert: torch.Tensor, melody: torch.Tensor, chroma: torch.Tensor) -> torch.Tensor:
        semantic_stream = self.mert(mert)
        pitch_input = torch.cat([melody, chroma], dim=1)
        melodic_stream = self.melody_proj(pitch_input)
        stacked = torch.cat([semantic_stream, melodic_stream], dim=1)
        return self.fuse(stacked)
225
226
class ECAPABranch(nn.Module):
    """Front-end for the ECAPA stream: optional band split, then a width-5 conv.

    With ``use_band_split`` the mel input first goes through ``BandSplitBlock``,
    whose output is taken to have ``band_split_channels * 5`` channels.
    Otherwise the conv reads the ``n_mels`` channels directly.
    """

    def __init__(self, n_mels: int, channels: int, use_band_split: bool, band_split_channels: int):
        super().__init__()
        if use_band_split:
            front_channels = band_split_channels * 5
            self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels)
        else:
            front_channels = n_mels
            self.band_split = None
        self.proj = nn.Sequential(
            nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.BatchNorm1d(channels),
        )

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        if self.band_split is None:
            return self.proj(mel)
        return self.proj(self.band_split(mel))
241
242
class DualStreamFusion(nn.Module):
    """Project two streams to ``hidden_dim`` channels each and fuse them.

    Each stream passes through its own 1x1 conv. The results are concatenated
    on the channel axis and reduced back to ``hidden_dim`` channels by a
    1x1 conv, ReLU and BatchNorm.
    """

    def __init__(self, mert_dim: int, ecapa_dim: int, hidden_dim: int):
        super().__init__()
        self.mert_gate = nn.Conv1d(mert_dim, hidden_dim, kernel_size=1)
        self.ecapa_gate = nn.Conv1d(ecapa_dim, hidden_dim, kernel_size=1)
        mixer = [
            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
        ]
        self.fuse = nn.Sequential(*mixer)

    def forward(self, mert_stream: torch.Tensor, ecapa_stream: torch.Tensor) -> torch.Tensor:
        projected_mert = self.mert_gate(mert_stream)
        projected_ecapa = self.ecapa_gate(ecapa_stream)
        return self.fuse(torch.cat([projected_mert, projected_ecapa], dim=1))
256
257
126 class ECAPA_ACR(nn.Module): 258 class ECAPA_ACR(nn.Module):
127 def __init__( 259 def __init__(
128 self, 260 self,
...@@ -137,9 +269,36 @@ class ECAPA_ACR(nn.Module): ...@@ -137,9 +269,36 @@ class ECAPA_ACR(nn.Module):
137 aam_s: float = 30.0, 269 aam_s: float = 30.0,
138 use_band_split: bool = True, 270 use_band_split: bool = True,
139 band_split_channels: int = 128, 271 band_split_channels: int = 128,
272 use_dual_stream: bool = True,
273 coverhunter_heads: int = 4,
274 coverhunter_layers: int = 2,
275 fusion_hidden_dim: int = 256,
276 mert_model_name: Optional[str] = None,
140 ): 277 ):
141 super().__init__() 278 super().__init__()
142 self.embed_dim = embed_dim 279 self.embed_dim = embed_dim
280 self.use_dual_stream = use_dual_stream
281 if use_dual_stream:
282 self.mert_melody_branch = MERTMelodyBranch(
283 n_mels=n_mels,
284 chroma_bins=12,
285 melody_bins=1,
286 hidden_dim=fusion_hidden_dim,
287 mert_model_name=mert_model_name,
288 )
289 self.ecapa_branch = ECAPABranch(
290 n_mels=n_mels,
291 channels=channels,
292 use_band_split=use_band_split,
293 band_split_channels=band_split_channels,
294 )
295 self.stream_fusion = DualStreamFusion(
296 mert_dim=fusion_hidden_dim,
297 ecapa_dim=channels,
298 hidden_dim=channels,
299 )
300 front_channels = channels
301 else:
143 front_channels = band_split_channels * 5 if use_band_split else channels 302 front_channels = band_split_channels * 5 if use_band_split else channels
144 self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None 303 self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
145 304
...@@ -169,24 +328,39 @@ class ECAPA_ACR(nn.Module): ...@@ -169,24 +328,39 @@ class ECAPA_ACR(nn.Module):
169 nn.ReLU(), 328 nn.ReLU(),
170 nn.BatchNorm1d(channels * 3), 329 nn.BatchNorm1d(channels * 3),
171 ) 330 )
172 self.pooling = StatisticsPooling() 331 self.coverhunter = CoverHunterHead(
173 self.fc = nn.Linear(channels * 3 * 2, embed_dim) 332 input_dim=channels * 3,
174 self.bn = nn.BatchNorm1d(embed_dim, affine=False) 333 embed_dim=embed_dim,
334 num_heads=coverhunter_heads,
335 num_layers=coverhunter_layers,
336 )
175 self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None 337 self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None
176 338
177 def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: 339 def forward(
340 self,
341 mel: torch.Tensor,
342 labels: Optional[torch.Tensor] = None,
343 melody: Optional[torch.Tensor] = None,
344 chroma: Optional[torch.Tensor] = None,
345 ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
346 if self.use_dual_stream:
347 if melody is None or chroma is None:
348 raise ValueError("melody and chroma are required when dual-stream fusion is enabled")
349 mert_stream = self.mert_melody_branch(mel, melody, chroma)
350 ecapa_stream = self.ecapa_branch(mel)
351 x = self.stream_fusion(mert_stream, ecapa_stream)
352 else:
178 x = self.band_split(mel) if self.band_split is not None else mel 353 x = self.band_split(mel) if self.band_split is not None else mel
179 x = self.conv1(x) 354 x = self.conv1(x)
355 if self.use_dual_stream:
356 x = self.conv1(x)
180 block_outputs = [] 357 block_outputs = []
181 for block in self.blocks: 358 for block in self.blocks:
182 x = block(x) 359 x = block(x)
183 block_outputs.append(x) 360 block_outputs.append(x)
184 x = torch.cat(block_outputs, dim=1) 361 x = torch.cat(block_outputs, dim=1)
185 x = self.mfa(x) 362 x = self.mfa(x)
186 x = self.pooling(x) 363 embedding = self.coverhunter(x.transpose(1, 2))
187 x = self.fc(x)
188 x = self.bn(x)
189 embedding = F.normalize(x, p=2, dim=1)
190 if labels is not None and self.aam is not None: 364 if labels is not None and self.aam is not None:
191 logits = self.aam(embedding, labels) 365 logits = self.aam(embedding, labels)
192 return embedding, logits 366 return embedding, logits
......
...@@ -3,30 +3,22 @@ import torch.nn as nn ...@@ -3,30 +3,22 @@ import torch.nn as nn
3 import torch.nn.functional as F 3 import torch.nn.functional as F
4 4
5 5
6 class SupConLoss(nn.Module): 6 class InfoNCELoss(nn.Module):
7 def __init__(self, temperature: float = 0.07): 7 def __init__(self, temperature: float = 0.07):
8 super().__init__() 8 super().__init__()
9 self.temperature = temperature 9 self.temperature = temperature
10 10
11 def forward(self, features: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: 11 def forward(self, features: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
12 batch_size = features.shape[0]
13 labels = labels.contiguous().view(-1, 1)
14 mask = torch.eq(labels, labels.T).float().to(features.device)
15 mask = mask - torch.eye(batch_size, device=features.device)
16
17 features = F.normalize(features, dim=1) 12 features = F.normalize(features, dim=1)
18 sim = torch.matmul(features, features.T) / self.temperature 13 logits = torch.matmul(features, features.T) / self.temperature
19 sim_max, _ = torch.max(sim, dim=1, keepdim=True) 14 labels = labels.contiguous().view(-1, 1)
20 sim = sim - sim_max.detach() 15 positive_mask = torch.eq(labels, labels.T).float().to(features.device)
21 16 positive_mask = positive_mask - torch.eye(features.size(0), device=features.device)
22 exp_sim = torch.exp(sim) * (1 - torch.eye(batch_size, device=features.device)) 17 logits = logits - logits.max(dim=1, keepdim=True).values.detach()
23 log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True)) 18 exp_logits = torch.exp(logits) * (1 - torch.eye(features.size(0), device=features.device))
24 19 log_prob = logits - torch.log(exp_logits.sum(dim=1, keepdim=True) + 1e-12)
25 pos_mask = mask 20 positives = positive_mask.sum(dim=1).clamp(min=1)
26 pos_count = pos_mask.sum(dim=1) 21 return -((positive_mask * log_prob).sum(dim=1) / positives)
27 loss = -(log_prob * pos_mask).sum(dim=1)
28 loss = loss / pos_count.clamp(min=1)
29 return loss
30 22
31 23
32 class CombinedLoss(nn.Module): 24 class CombinedLoss(nn.Module):
...@@ -37,8 +29,7 @@ class CombinedLoss(nn.Module): ...@@ -37,8 +29,7 @@ class CombinedLoss(nn.Module):
37 aam_weight: float = 0.3, 29 aam_weight: float = 0.3,
38 ): 30 ):
39 super().__init__() 31 super().__init__()
40 self.supcon = SupConLoss(temperature) 32 self.infonce = InfoNCELoss(temperature)
41 self.ce = nn.CrossEntropyLoss()
42 self.supcon_weight = supcon_weight 33 self.supcon_weight = supcon_weight
43 self.aam_weight = aam_weight 34 self.aam_weight = aam_weight
44 35
...@@ -50,21 +41,20 @@ class CombinedLoss(nn.Module): ...@@ -50,21 +41,20 @@ class CombinedLoss(nn.Module):
50 supcon_labels: torch.Tensor, 41 supcon_labels: torch.Tensor,
51 hard_weight: torch.Tensor | None = None, 42 hard_weight: torch.Tensor | None = None,
52 ) -> dict: 43 ) -> dict:
53 loss_supcon = self.supcon(embedding, supcon_labels) 44 loss_infonce = self.infonce(embedding, supcon_labels)
54 loss_ce = F.cross_entropy(logits, labels, reduction="none") 45 loss_ce = F.cross_entropy(logits, labels, reduction="none")
55 if hard_weight is not None: 46 if hard_weight is not None:
56 weight = hard_weight.float() 47 weight = hard_weight.float()
57 if weight.dim() == 0: 48 if weight.dim() == 0:
58 weight = weight.unsqueeze(0) 49 weight = weight.unsqueeze(0)
59 loss_supcon = loss_supcon * weight 50 loss_infonce = loss_infonce * weight
60 loss_ce = loss_ce * weight 51 loss_ce = loss_ce * weight
61 52
62 loss_supcon = loss_supcon.mean() 53 loss_infonce = loss_infonce.mean()
63 loss_ce = loss_ce.mean() 54 loss_ce = loss_ce.mean()
64 55 total = self.supcon_weight * loss_infonce + self.aam_weight * loss_ce
65 total = self.supcon_weight * loss_supcon + self.aam_weight * loss_ce
66 return { 56 return {
67 "loss": total, 57 "loss": total,
68 "supcon_loss": loss_supcon.item(), 58 "supcon_loss": loss_infonce.item(),
69 "ce_loss": loss_ce.item(), 59 "ce_loss": loss_ce.item(),
70 } 60 }
......
1 import numpy as np 1 import numpy as np
2 import random 2 import random
3 from typing import Optional, Tuple 3 from pathlib import Path
4 from typing import Iterable, Optional, Tuple
4 5
6 import librosa
7 import soundfile as sf
8
9 try:
10 from audiomentations import AddBackgroundNoise, AddGaussianNoise, BandPassFilter, Compose, Mp3Compression, PitchShift, TimeStretch
11 HAS_AUDIO_AUG = True
12 except Exception:
13 AddBackgroundNoise = AddGaussianNoise = BandPassFilter = Compose = Mp3Compression = PitchShift = TimeStretch = None
14 HAS_AUDIO_AUG = False
5 15
6 class AugmentPipeline:
7 def __init__(self, sr: int = 16000, aggressive: bool = False):
8 self.sr = sr
9 self.noise_snr_range = (5, 30)
10 self.pitch_shift_range = (-6, 6)
11 self.time_stretch_range = (0.85, 1.15)
12 self.mp3_bitrate_range = (32, 128)
13 self.aggressive = aggressive
14 16
15 def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray: 17 class NoiseLibrary:
16 if snr_db is None: 18 def __init__(self, roots: Optional[Iterable[str]] = None):
17 snr_db = random.uniform(*self.noise_snr_range) 19 self.paths = []
18 signal_power = np.mean(y ** 2) 20 for root in roots or []:
19 noise_power = signal_power / (10 ** (snr_db / 10)) 21 base = Path(root)
20 noise = np.random.randn(len(y)) * np.sqrt(noise_power) 22 if not base.exists():
21 return y + noise 23 continue
24 for pattern in ("*.wav", "*.mp3", "*.flac", "*.ogg", "*.m4a"):
25 self.paths.extend(base.rglob(pattern))
22 26
23 def pitch_shift(self, y: np.ndarray, semitones: Optional[float] = None) -> np.ndarray: 27 def directories(self) -> list[str]:
24 if semitones is None: 28 if not self.paths:
25 semitones = random.uniform(*self.pitch_shift_range) 29 return []
26 return librosa_shift(y, sr=self.sr, n_steps=semitones) 30 return sorted({str(path.parent) for path in self.paths})
27 31
28 def time_stretch(self, y: np.ndarray, rate: Optional[float] = None) -> np.ndarray:
29 if rate is None:
30 rate = random.uniform(*self.time_stretch_range)
31 return librosa_ts(y, sr=self.sr, rate=rate)
32 32
33 def add_reverb(self, y: np.ndarray, decay: float = 0.3) -> np.ndarray: 33 class AugmentPipeline:
34 ir_len = int(0.1 * self.sr) 34 def __init__(
35 ir = np.exp(-np.arange(ir_len) * decay / ir_len) * np.random.randn(ir_len) 35 self,
36 ir /= np.sqrt(np.sum(ir ** 2)) 36 sr: int = 16000,
37 return np.convolve(y, ir, mode='same')[:len(y)] 37 aggressive: bool = False,
38 noise_roots: Optional[Iterable[str]] = None,
39 freq_mask_prob: float = 0.3,
40 ):
41 self.sr = sr
42 self.aggressive = aggressive
43 self.freq_mask_prob = freq_mask_prob
44 self.noise_library = NoiseLibrary(noise_roots)
45 self.wave_augment = self._build_wave_augmenter()
46
47 def _build_wave_augmenter(self):
48 if not HAS_AUDIO_AUG:
49 return None
50 transforms = [
51 AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5 if not self.aggressive else 0.8),
52 BandPassFilter(
53 min_center_freq=300.0,
54 max_center_freq=3200.0,
55 min_bandwidth_fraction=0.3,
56 max_bandwidth_fraction=0.8,
57 p=0.35 if not self.aggressive else 0.55,
58 ),
59 Mp3Compression(min_bitrate=24, max_bitrate=96, p=0.35 if not self.aggressive else 0.55),
60 PitchShift(min_semitones=-5, max_semitones=5, p=0.35 if not self.aggressive else 0.55),
61 TimeStretch(min_rate=0.8, max_rate=1.2, p=0.35 if not self.aggressive else 0.55),
62 ]
63 noise_dirs = self.noise_library.directories()
64 if noise_dirs:
65 transforms.append(
66 AddBackgroundNoise(
67 sounds_path=noise_dirs,
68 min_snr_db=3.0 if self.aggressive else 8.0,
69 max_snr_db=20.0 if self.aggressive else 30.0,
70 noise_transform=Compose([
71 BandPassFilter(
72 min_center_freq=250.0,
73 max_center_freq=4000.0,
74 min_bandwidth_fraction=0.2,
75 max_bandwidth_fraction=0.9,
76 p=0.5,
77 )
78 ]),
79 p=0.35 if not self.aggressive else 0.6,
80 )
81 )
82 return Compose(transforms)
38 83
39 def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 8) -> np.ndarray: 84 def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 12) -> np.ndarray:
40 mel = mel.copy() 85 mel = mel.copy()
41 t = mel.shape[1] 86 t = mel.shape[1]
42 f = mel.shape[0] 87 f = mel.shape[0]
...@@ -46,43 +91,21 @@ class AugmentPipeline: ...@@ -46,43 +91,21 @@ class AugmentPipeline:
46 if t_start < t: 91 if t_start < t:
47 mel[:, t_start:t_start + t_mask] = 0 92 mel[:, t_start:t_start + t_mask] = 0
48 for _ in range(2): 93 for _ in range(2):
49 f_mask = random.randint(0, max_freq_mask) 94 f_mask = random.randint(max(1, max_freq_mask // 3), max_freq_mask)
50 f_start = random.randint(0, max(0, f - f_mask)) 95 f_start = random.randint(0, max(0, f - f_mask))
51 if f_start < f: 96 if f_start < f:
52 mel[f_start:f_start + f_mask, :] = 0 97 mel[f_start:f_start + f_mask, :] = 0
53 return mel 98 return mel
54 99
55 def apply_to_mel(self, mel: np.ndarray) -> np.ndarray: 100 def apply_to_mel(self, mel: np.ndarray) -> np.ndarray:
56 if random.random() < 0.3: 101 if random.random() < self.freq_mask_prob:
57 mel = self.apply_spec_augment(mel) 102 mel = self.apply_spec_augment(mel)
58 return mel 103 return mel
59 104
60 def __call__(self, y: np.ndarray) -> np.ndarray: 105 def __call__(self, y: np.ndarray) -> np.ndarray:
61 noise_p = 0.75 if self.aggressive else 0.5 106 if self.wave_augment is None:
62 stretch_p = 0.55 if self.aggressive else 0.3
63 pitch_p = 0.55 if self.aggressive else 0.3
64 reverb_p = 0.35 if self.aggressive else 0.2
65 if random.random() < noise_p:
66 y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None)
67 if random.random() < stretch_p:
68 y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None)
69 if random.random() < pitch_p:
70 y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None)
71 if random.random() < reverb_p:
72 y = self.add_reverb(y, decay=random.uniform(0.2, 0.6))
73 return y 107 return y
74
75
76 def librosa_shift(y, sr=16000, n_steps=0):
77 return librosa_impl(y, lambda: __import__('librosa').effects.pitch_shift(y, sr=sr, n_steps=n_steps))
78
79
80 def librosa_ts(y, sr=16000, rate=1.0):
81 return librosa_impl(y, lambda: __import__('librosa').effects.time_stretch(y, rate=rate))
82
83
84 def librosa_impl(y, fn):
85 try: 108 try:
86 return fn() 109 return self.wave_augment(samples=y.astype(np.float32), sample_rate=self.sr)
87 except Exception: 110 except Exception:
88 return y 111 return y
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
4 import argparse 4 import argparse
5 import json 5 import json
6 import sys 6 import sys
7 from datetime import datetime
7 from pathlib import Path 8 from pathlib import Path
8 9
9 import torch 10 import torch
...@@ -21,15 +22,23 @@ from src.models.losses import CombinedLoss ...@@ -21,15 +22,23 @@ from src.models.losses import CombinedLoss
21 22
22 def collate_fn(batch): 23 def collate_fn(batch):
23 mels = [] 24 mels = []
25 melodies = []
26 chromas = []
24 song_ids = [] 27 song_ids = []
25 song_names = [] 28 song_names = []
26 hard_weights = [] 29 hard_weights = []
27 for b in batch: 30 for b in batch:
28 mel = b["mel"] 31 mel = b["mel"]
32 melody = b.get("melody")
33 chroma = b.get("chroma")
29 hw = b.get("hard_weight", torch.tensor(1.0)) 34 hw = b.get("hard_weight", torch.tensor(1.0))
30 if mel.dim() == 3: 35 if mel.dim() == 3:
31 for i in range(mel.shape[0]): 36 for i in range(mel.shape[0]):
32 mels.append(mel[i]) 37 mels.append(mel[i])
38 if melody is not None:
39 melodies.append(melody[i])
40 if chroma is not None:
41 chromas.append(chroma[i])
33 song_ids.append(b["song_id"][i]) 42 song_ids.append(b["song_id"][i])
34 song_names.append(b["song_name"]) 43 song_names.append(b["song_name"])
35 if torch.is_tensor(hw) and hw.dim() > 0: 44 if torch.is_tensor(hw) and hw.dim() > 0:
...@@ -38,24 +47,45 @@ def collate_fn(batch): ...@@ -38,24 +47,45 @@ def collate_fn(batch):
38 hard_weights.append(hw) 47 hard_weights.append(hw)
39 else: 48 else:
40 mels.append(mel) 49 mels.append(mel)
50 if melody is not None:
51 melodies.append(melody)
52 if chroma is not None:
53 chromas.append(chroma)
41 song_ids.append(b["song_id"]) 54 song_ids.append(b["song_id"])
42 song_names.append(b["song_name"]) 55 song_names.append(b["song_name"])
43 hard_weights.append(hw) 56 hard_weights.append(hw)
44 57
45 max_t = max(m.shape[1] for m in mels) 58 max_t = max(m.shape[1] for m in mels)
46 mels_padded = [] 59 mels_padded = []
47 for m in mels: 60 melodies_padded = []
61 chromas_padded = []
62 for idx, m in enumerate(mels):
48 pad = max_t - m.shape[1] 63 pad = max_t - m.shape[1]
49 if pad > 0: 64 if pad > 0:
50 m = torch.nn.functional.pad(m, (0, pad)) 65 m = torch.nn.functional.pad(m, (0, pad))
51 mels_padded.append(m.unsqueeze(0)) 66 mels_padded.append(m.unsqueeze(0))
52 67 if melodies:
53 return { 68 melody = melodies[idx]
69 if melody.shape[1] < max_t:
70 melody = torch.nn.functional.pad(melody, (0, max_t - melody.shape[1]))
71 melodies_padded.append(melody.unsqueeze(0))
72 if chromas:
73 chroma = chromas[idx]
74 if chroma.shape[1] < max_t:
75 chroma = torch.nn.functional.pad(chroma, (0, max_t - chroma.shape[1]))
76 chromas_padded.append(chroma.unsqueeze(0))
77
78 payload = {
54 "mel": torch.cat(mels_padded, dim=0), 79 "mel": torch.cat(mels_padded, dim=0),
55 "song_id": torch.stack(song_ids), 80 "song_id": torch.stack(song_ids),
56 "song_name": song_names, 81 "song_name": song_names,
57 "hard_weight": torch.stack(hard_weights), 82 "hard_weight": torch.stack(hard_weights),
58 } 83 }
84 if melodies_padded:
85 payload["melody"] = torch.cat(melodies_padded, dim=0)
86 if chromas_padded:
87 payload["chroma"] = torch.cat(chromas_padded, dim=0)
88 return payload
59 89
60 90
61 def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg): 91 def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg):
...@@ -64,10 +94,14 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg) ...@@ -64,10 +94,14 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg)
64 pbar = tqdm(loader, desc=f"Epoch {epoch}") 94 pbar = tqdm(loader, desc=f"Epoch {epoch}")
65 for batch in pbar: 95 for batch in pbar:
66 mel = batch["mel"].to(device) 96 mel = batch["mel"].to(device)
97 melody = batch.get("melody")
98 chroma = batch.get("chroma")
99 melody = melody.to(device) if melody is not None else None
100 chroma = chroma.to(device) if chroma is not None else None
67 labels = batch["song_id"].to(device) 101 labels = batch["song_id"].to(device)
68 102
69 with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"): 103 with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"):
70 embedding, logits = model(mel, labels) 104 embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
71 loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None) 105 loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
72 106
73 optimizer.zero_grad() 107 optimizer.zero_grad()
...@@ -115,6 +149,28 @@ def save_checkpoint(output_dir, epoch, model, optimizer, best_metric, cfg, name) ...@@ -115,6 +149,28 @@ def save_checkpoint(output_dir, epoch, model, optimizer, best_metric, cfg, name)
115 print(f" Saved: {path}") 149 print(f" Saved: {path}")
116 150
117 151
152 def write_training_artifacts(output_dir: Path, cfg: dict, train_metrics: dict, train_dataset, args):
153 manifest = {
154 "timestamp": datetime.utcnow().isoformat() + "Z",
155 "config": cfg,
156 "output_dir": str(output_dir),
157 "train_song_count": len(train_dataset.song_ids),
158 "sample_count": len(train_dataset),
159 "segment_strategy": args.segment_strategy,
160 "noise_roots": args.noise_root,
161 "artifacts": {
162 "best_model": str(output_dir / "best_model.pt"),
163 "song_to_idx": str(output_dir / "song_to_idx.json"),
164 "metrics": str(output_dir / "training_metrics.json"),
165 },
166 "final_metrics": train_metrics,
167 }
168 with open(output_dir / "training_metrics.json", "w") as f:
169 json.dump(train_metrics, f, indent=2)
170 with open(output_dir / "training_manifest.json", "w") as f:
171 json.dump(manifest, f, indent=2)
172
173
118 def main(): 174 def main():
119 parser = argparse.ArgumentParser() 175 parser = argparse.ArgumentParser()
120 parser.add_argument("--config", type=str, default="configs/default.yaml") 176 parser.add_argument("--config", type=str, default="configs/default.yaml")
...@@ -125,6 +181,7 @@ def main(): ...@@ -125,6 +181,7 @@ def main():
125 parser.add_argument("--epochs", type=int, default=None) 181 parser.add_argument("--epochs", type=int, default=None)
126 parser.add_argument("--batch-size", type=int, default=None) 182 parser.add_argument("--batch-size", type=int, default=None)
127 parser.add_argument("--lr", type=float, default=None) 183 parser.add_argument("--lr", type=float, default=None)
184 parser.add_argument("--noise-root", action="append", default=[])
128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") 185 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
129 parser.add_argument("--silence-top-db", type=int, default=30) 186 parser.add_argument("--silence-top-db", type=int, default=30)
130 parser.add_argument("--dry-run", action="store_true") 187 parser.add_argument("--dry-run", action="store_true")
...@@ -159,6 +216,8 @@ def main(): ...@@ -159,6 +216,8 @@ def main():
159 silence_top_db=args.silence_top_db, 216 silence_top_db=args.silence_top_db,
160 sample_type_weights=cfg["training"].get("sample_type_weights"), 217 sample_type_weights=cfg["training"].get("sample_type_weights"),
161 pair_type_weights=cfg["training"].get("pair_type_weights"), 218 pair_type_weights=cfg["training"].get("pair_type_weights"),
219 hard_negative_k=cfg["training"].get("hard_negative_k", 2),
220 noise_roots=args.noise_root,
162 ) 221 )
163 222
164 catalog_dataset = ACRDataset( 223 catalog_dataset = ACRDataset(
...@@ -174,6 +233,7 @@ def main(): ...@@ -174,6 +233,7 @@ def main():
174 song_to_idx=train_dataset.song_to_idx, 233 song_to_idx=train_dataset.song_to_idx,
175 segment_strategy=args.segment_strategy, 234 segment_strategy=args.segment_strategy,
176 silence_top_db=args.silence_top_db, 235 silence_top_db=args.silence_top_db,
236 noise_roots=args.noise_root,
177 ) 237 )
178 238
179 train_loader = DataLoader( 239 train_loader = DataLoader(
...@@ -205,6 +265,11 @@ def main(): ...@@ -205,6 +265,11 @@ def main():
205 aam_s=cfg["model"]["aam_s"], 265 aam_s=cfg["model"]["aam_s"],
206 use_band_split=cfg["model"].get("use_band_split", True), 266 use_band_split=cfg["model"].get("use_band_split", True),
207 band_split_channels=cfg["model"].get("band_split_channels", 128), 267 band_split_channels=cfg["model"].get("band_split_channels", 128),
268 use_dual_stream=cfg["model"].get("use_dual_stream", True),
269 coverhunter_heads=cfg["model"].get("coverhunter_heads", 4),
270 coverhunter_layers=cfg["model"].get("coverhunter_layers", 2),
271 fusion_hidden_dim=cfg["model"].get("fusion_hidden_dim", 256),
272 mert_model_name=cfg["model"].get("mert_model_name"),
208 ).to(device) 273 ).to(device)
209 274
210 criterion = CombinedLoss( 275 criterion = CombinedLoss(
...@@ -219,8 +284,12 @@ def main(): ...@@ -219,8 +284,12 @@ def main():
219 print("Dry run: running one batch through forward/backward...") 284 print("Dry run: running one batch through forward/backward...")
220 batch = next(iter(train_loader)) 285 batch = next(iter(train_loader))
221 mel = batch["mel"].to(device) 286 mel = batch["mel"].to(device)
287 melody = batch.get("melody")
288 chroma = batch.get("chroma")
289 melody = melody.to(device) if melody is not None else None
290 chroma = chroma.to(device) if chroma is not None else None
222 labels = batch["song_id"].to(device) 291 labels = batch["song_id"].to(device)
223 embedding, logits = model(mel, labels) 292 embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
224 loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None) 293 loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
225 loss_dict["loss"].backward() 294 loss_dict["loss"].backward()
226 print(f" Forward/backward OK. Loss: {loss_dict['loss']:.4f}") 295 print(f" Forward/backward OK. Loss: {loss_dict['loss']:.4f}")
...@@ -242,6 +311,7 @@ def main(): ...@@ -242,6 +311,7 @@ def main():
242 output_dir.mkdir(parents=True, exist_ok=True) 311 output_dir.mkdir(parents=True, exist_ok=True)
243 312
244 print("Starting training...") 313 print("Starting training...")
314 train_metrics = None
245 for epoch in range(start_epoch, cfg["training"]["epochs"] + 1): 315 for epoch in range(start_epoch, cfg["training"]["epochs"] + 1):
246 train_metrics = train_epoch(model, train_loader, optimizer, criterion, scaler, device, epoch, cfg) 316 train_metrics = train_epoch(model, train_loader, optimizer, criterion, scaler, device, epoch, cfg)
247 scheduler.step() 317 scheduler.step()
...@@ -254,6 +324,7 @@ def main(): ...@@ -254,6 +324,7 @@ def main():
254 324
255 with open(output_dir / "song_to_idx.json", "w") as f: 325 with open(output_dir / "song_to_idx.json", "w") as f:
256 json.dump(train_dataset.song_to_idx, f, indent=2) 326 json.dump(train_dataset.song_to_idx, f, indent=2)
327 write_training_artifacts(output_dir, cfg, train_metrics or {}, train_dataset, args)
257 print(f"\nTraining complete. Best training loss: {best_loss:.4f}") 328 print(f"\nTraining complete. Best training loss: {best_loss:.4f}")
258 print(f"Model saved to: {output_dir / 'best_model.pt'}") 329 print(f"Model saved to: {output_dir / 'best_model.pt'}")
259 print(f"Catalog references available: {len(catalog_dataset.samples)}") 330 print(f"Catalog references available: {len(catalog_dataset.samples)}")
......
1 {
2 "run_name": "coverhunter_finetune_20260608T130103Z",
3 "created_at": "2026-06-08T13:01:03.023371Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130103Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z"
24 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "run_name": "coverhunter_finetune_20260608T130103Z",
3 "created_at": "2026-06-08T13:01:03.023371Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130103Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z",
24 "returncode": 1,
25 "completed_at": "2026-06-08T13:01:32.762576Z",
26 "artifacts": [
27 "run_request.json",
28 "stderr.log",
29 "stdout.log"
30 ]
31 }
...\ No newline at end of file ...\ No newline at end of file
1 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
2 Traceback (most recent call last):
3 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
4 main()
5 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 249, in main
6 batch = next(iter(train_loader))
7 ^^^^^^^^^^^^^^^^^^^^^^^^
8 File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 718, in __next__
9 data = self._next_data()
10 ^^^^^^^^^^^^^^^^^
11 File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 778, in _next_data
12 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
13 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
14 File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
15 data = [self.dataset[idx] for idx in possibly_batched_index]
16 ~~~~~~~~~~~~^^^^^
17 File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 370, in __getitem__
18 positive_features = [self._load_features(sample) for sample in positive_items]
19 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
20 File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 344, in _load_features
21 features = self.feature_extractor.extract(y)
22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
23 File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 138, in extract
24 melody = librosa.hz_to_midi(melody, bins_per_octave=12)
25 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
26 TypeError: hz_to_midi() got an unexpected keyword argument 'bins_per_octave'
1 {
2 "run_name": "coverhunter_finetune_20260608T130306Z",
3 "created_at": "2026-06-08T13:03:06.790814Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130306Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z"
24 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "run_name": "coverhunter_finetune_20260608T130306Z",
3 "created_at": "2026-06-08T13:03:06.790814Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130306Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z",
24 "returncode": 1,
25 "completed_at": "2026-06-08T13:04:34.035140Z",
26 "artifacts": [
27 "run_request.json",
28 "stderr.log",
29 "stdout.log"
30 ]
31 }
...\ No newline at end of file ...\ No newline at end of file
1 /home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
2 midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
3 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
4 /home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
5 midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
6 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
7 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
8 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
9 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
10 '[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
11 Retrying in 1s [Retry 1/5].
12 Traceback (most recent call last):
13 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
14 main()
15 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 256, in main
16 model = ECAPA_ACR(
17 ^^^^^^^^^^
18 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 280, in __init__
19 self.mert_melody_branch = MERTMelodyBranch(
20 ^^^^^^^^^^^^^^^^^
21 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 211, in __init__
22 self.mert = FrozenMERTFeatureExtractor(model_name=mert_model_name, n_mels=n_mels, hidden_dim=hidden_dim)
23 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 21, in __init__
25 self.backbone = AutoModel.from_pretrained(model_name)
26 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
27 File "/home/user/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 289, in from_pretrained
28 resolved_config_file = cached_file(
29 ^^^^^^^^^^^^
30 File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 293, in cached_file
31 file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
33 File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 527, in cached_files
34 raise e
35 File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 437, in cached_files
36 hf_hub_download(
37 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
38 return fn(*args, **kwargs)
39 ^^^^^^^^^^^^^^^^^^^
40 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1019, in hf_hub_download
41 return _hf_hub_download_to_cache_dir(
42 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1152, in _hf_hub_download_to_cache_dir
44 _get_metadata_or_catch_error(
45 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1694, in _get_metadata_or_catch_error
46 metadata = get_hf_file_metadata(
47 ^^^^^^^^^^^^^^^^^^^^^
48 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
49 return fn(*args, **kwargs)
50 ^^^^^^^^^^^^^^^^^^^
51 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1616, in get_hf_file_metadata
52 response = _httpx_follow_relative_redirects_with_backoff(
53 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
54 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 685, in _httpx_follow_relative_redirects_with_backoff
55 response = http_backoff(
56 ^^^^^^^^^^^^^
57 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 559, in http_backoff
58 return next(
59 ^^^^^
60 File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 467, in _http_backoff_base
61 response = client.request(method=method, url=url, **kwargs)
62 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
63 File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 825, in request
64 return self.send(request, auth=auth, follow_redirects=follow_redirects)
65 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
66 File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 901, in send
67 raise RuntimeError("Cannot send a request, as the client has been closed.")
68 RuntimeError: Cannot send a request, as the client has been closed.
1 Device: cpu
2 Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
3 Classes: 16
4 Train songs: 64
1 {
2 "run_name": "coverhunter_finetune_20260608T130514Z",
3 "created_at": "2026-06-08T13:05:14.591209Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130514Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z"
24 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "run_name": "coverhunter_finetune_20260608T130514Z",
3 "created_at": "2026-06-08T13:05:14.591209Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130514Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z",
24 "returncode": 1,
25 "completed_at": "2026-06-08T13:06:50.272162Z",
26 "artifacts": [
27 "run_request.json",
28 "stderr.log",
29 "stdout.log"
30 ]
31 }
...\ No newline at end of file ...\ No newline at end of file
1 /home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
2 midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
3 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
4 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
5 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
6 '[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
7 Retrying in 1s [Retry 1/5].
8 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
9 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
10 Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
11 Traceback (most recent call last):
12 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
13 main()
14 File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 292, in main
15 embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
17 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
18 return self._call_impl(*args, **kwargs)
19 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
21 return forward_call(*args, **kwargs)
22 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
23 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 351, in forward
24 mert_stream = self.mert_melody_branch(mel, melody, chroma)
25 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
26 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
27 return self._call_impl(*args, **kwargs)
28 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
29 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
30 return forward_call(*args, **kwargs)
31 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
32 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 224, in forward
33 semantic = self.mert(mert)
34 ^^^^^^^^^^^^^^^
35 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
36 return self._call_impl(*args, **kwargs)
37 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
38 File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
39 return forward_call(*args, **kwargs)
40 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41 File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 49, in forward
42 return self.proj(mel)
43 ^^^^^^^^^^^^^^
44 TypeError: 'NoneType' object is not callable
1 Device: cpu
2 Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
3 Classes: 16
4 Train songs: 64
5 Dry run: running one batch through forward/backward...
1 {
2 "run_name": "coverhunter_finetune_20260608T130731Z",
3 "created_at": "2026-06-08T13:07:31.311447Z",
4 "python": "/usr/local/miniconda3/bin/python",
5 "command": [
6 "/usr/local/miniconda3/bin/python",
7 "train.py",
8 "--config",
9 "configs/coverhunter_finetune_4gb.yaml",
10 "--data",
11 "data/synthetic_v2",
12 "--output",
13 "data/training_runs/coverhunter_finetune_20260608T130731Z",
14 "--device",
15 "cpu",
16 "--segment-strategy",
17 "hybrid",
18 "--dry-run"
19 ],
20 "config": "configs/coverhunter_finetune_4gb.yaml",
21 "data": "data/synthetic_v2",
22 "noise_roots": [],
23 "run_dir": "data/training_runs/coverhunter_finetune_20260608T130731Z"
24 }
...\ No newline at end of file ...\ No newline at end of file
1 # CoverHunter 环境安装与验证
2
3 ## 1. 目标解释器
4
5 本专题统一使用:
6
7 ```bash
8 /usr/local/miniconda3/bin/python
9 ```
10
11 ## 2. 自动化脚本
12
13 已新增环境安装与验证脚本:
14
15 ```text
16 acr-engine/scripts/setup_coverhunter_env.py
17 ```
18
19 执行方式:
20
21 ```bash
22 /usr/local/miniconda3/bin/python acr-engine/scripts/setup_coverhunter_env.py
23 ```
24
25 它会自动:
26
27 1. 安装 `requirements.txt`
28 2. 补充训练依赖:
29 - `torch`
30 - `torchaudio`
31 - `transformers`
32 - `huggingface_hub`
33 - `librosa`
34 - `soundfile`
35 - `audiomentations`
36 3. 进行环境验证
37 4. 生成报告:
38
39 ```text
40 acr-engine/reports/coverhunter_env_setup_report.json
41 ```
42
43 ## 3. 当前自动化执行结果
44
45 本次已经自动执行完成。
46
47 报告文件:
48
49 ```text
50 acr-engine/reports/coverhunter_env_setup_report.json
51 ```
52
53 当前结论:
54
55 - Python 包安装:**成功**
56 - `torch` / `transformers` / `librosa` / `soundfile` / `audiomentations`:**已安装**
57 - 但 `torch.cuda.is_available()` 当前返回:**False**
58
59 ## 4. 当前 GPU 阻塞点
60
61 虽然系统存在 NVIDIA GPU,且 `nvidia-smi` 可见设备,但当前 PyTorch CUDA 初始化失败。
62
63 报告中的核心告警是:
64
65 - **The NVIDIA driver on your system is too old**
66
67 这说明:
68
69 - 当前安装到环境里的 `torch 2.12.0+cu130`
70 - 与当前系统驱动版本不兼容
71
72 也就是说:
73
74 - **环境依赖已经安装好了**
75 - **但当前 GPU 训练还不能真正启用**
76 - 原因不是代码问题,而是 **PyTorch CUDA 版本与驱动版本不匹配**
77
78 ## 5. 当前状态怎么理解
79
80 现在的环境状态可以分成两部分:
81
82 ### 已经完成的
83
84 - 训练依赖已安装
85 - 训练脚本可执行
86 - MERT / ECAPA 双流代码可 import
87 - 文档和配置已准备好
88
89 ### 仍未完成的
90
91 - CUDA 版 torch 与当前 NVIDIA driver 的匹配
92
93 ## 6. 下一步建议
94
95 要让 GPU 真正可用,需要二选一:
96
97 ### 方案 A:升级 NVIDIA 驱动
98
99 优点:
100
101 - 可以保留当前较新的 torch/cu130 组合
102 - 后续兼容性更好
103
104 ### 方案 B:安装与当前驱动兼容的更低 CUDA 版本 torch
105
106 优点:
107
108 - 不改系统驱动
109 - 更适合当前机器直接落地
110
111 对当前项目而言,我更建议:
112
113 - **优先采用方案 B**
114 - 安装与当前驱动兼容的 torch 版本
115
116 ## 7. 当前专题与环境文档关系
117
118 配套文件如下:
119
120 - 训练专题:`docs/coverhunter_finetune_topic.md`
121 - 训练流程:`docs/coverhunter_training_process.md`
122 - 环境文档:`docs/coverhunter_env_setup.md`
123 - 环境报告:`acr-engine/reports/coverhunter_env_setup_report.json`
124
125 ## 8. 当前结论
126
127 当前已经自动完成:
128
129 - 环境依赖安装
130 - 环境验证
131 - 结果记录
132
133 目前唯一阻塞 GPU 训练的点是:
134
135 - **CUDA / 驱动 / torch 版本不匹配**
1 # CoverHunter 双流微调专题方案
2
3 ## 1. 专题目标
4
5 本专题目标是围绕当前仓库,建立一套可持续扩展的 **CoverHunter 双流微调方案**,用于音乐翻唱识别、哼唱检索、录音片段检索和抗噪 ACR 检索。
6
7 专题的核心方向不是一次性跑通训练,而是建立一条可反复扩展的训练专题链路:
8
9 1. 明确现有音源与数据资产
10 2. 定义双流训练架构
11 3. 设计分阶段训练计划
12 4. 形成标准训练流程
13 5. 规范训练产物与权重使用方式
14 6. 为后续补充更多 music 语料预留稳定入口
15
16 ---
17
18 ## 2. 当前已有音源与数据资产
19
20 ### 2.1 当前仓库内可直接使用的数据
21
22 当前可直接用于训练与冒烟验证的数据位于:
23
24 ```text
25 acr-engine/data/synthetic_v2/
26 ```
27
28 其中包含:
29
30 - `train.json`
31 - `test.json`
32 - `segments/*.wav`
33
34 ### 2.2 当前训练集统计
35
36 基于 `acr-engine/data/synthetic_v2/train.json` 的统计结果:
37
38 - 样本总数:**96**
39 - `song_id` 数量:**16**
40 - 类型分布:
41 - `reference`: **16**
42 - `clean`: **32**
43 - `augmented`: **16**
44 - `humming_like`: **16**
45 - `confused`: **16**
46
47 ### 2.3 当前音源的含义
48
49 按现有数据结构,可以理解为每首歌目前至少对应以下几类样本:
50
51 1. **reference**
52 - 作为标准原曲/参考版本
53 - 用于建立稳定的正样本锚点
54 2. **clean**
55 - 较干净的切片
56 - 代表相对理想的检索输入
57 3. **augmented**
58 - 已经经过部分增强的样本
59 - 用于初步提升泛化能力
60 4. **humming_like**
61 - 偏哼唱/偏旋律化表达的近似样本
62 - 用于强化“忽略音色、聚焦旋律”能力
63 5. **confused**
64 - 易混淆样本
65 - 用于构建难负样本与边界学习能力
66
67 ### 2.4 当前音源的局限性
68
69 当前 `synthetic_v2` 更适合做:
70
71 - 训练链路验证
72 - 双流结构验证
73 - 小规模参数/损失/显存调优
74 - 产物定义与使用方式验证
75
76 当前它还不适合直接视为最终生产训练集,原因包括:
77
78 - 歌曲数较少(16 首)
79 - 类型覆盖有限
80 - 录音噪声场景仍偏少
81 - 真实翻唱的多样性不足
82 - 真实设备采集差异不足
83
84 所以本专题应采用 **分阶段训练策略**。
85
86 ---
87
88 ## 3. 当前双流训练架构
89
90 ### 3.1 架构定义
91
92 当前已按双流结构实现:
93
94 ### 流 A:MERT + Melody 分支
95
96 位置:
97
98 - `acr-engine/src/models/ecapa_tdnn.py`
99
100 职责:
101
102 - 建模高层语义与旋律表达
103 - 提高跨音色、跨设备、跨唱法的语义对齐能力
104
105 当前组成:
106
107 - `FrozenMERTFeatureExtractor`
108 - `melody/chroma` 特征投影与融合
109
110 默认模型配置:
111
112 ```yaml
113 model:
114 mert_model_name: m-a-p/MERT-v1-95M
115 ```
116
117 ### 流 B:ECAPA 分支
118
119 职责:
120
121 - 强化局部声学结构与 timbre/韵律相关判别信息
122 - 作为与 MERT 分支互补的检索支路
123
124 ### 双流融合
125
126 - `DualStreamFusion`
127
128 作用:
129
130 - 将语义旋律流与 ECAPA 流融合到统一时序空间
131
132 ### 检索头
133
134 - `CoverHunterHead`
135
136 作用:
137
138 - 将融合后的时序特征进一步编码
139 - 输出最终 embedding 用于对比训练和检索
140
141 ### 训练目标
142
143 - `InfoNCE`
144 - `AAMSoftmax`
145
146 ---
147
148 ## 4. 训练专题的总体思路
149
150 本专题不建议“一步到位”直接上大规模真实全量训练,而建议按三个阶段推进。
151
152 ### 阶段 A:链路验证阶段
153
154 目标:
155
156 - 验证模型结构、数据流、增强链路、权重产物、运行日志是否闭环
157
158 训练数据:
159
160 - `acr-engine/data/synthetic_v2`
161
162 产出:
163
164 - 跑通训练
165 - 确认显存
166 - 确认增强是否有效
167 - 确认权重可以导出并复用
168
169 ### 阶段 B:专题微调阶段
170
171 目标:
172
173 - 在当前专题下引入更多音乐语料
174 - 逐步扩充:原曲、翻唱、录音、哼唱、噪声注入样本
175 - 建立更稳定的双流 CoverHunter embedding
176
177 训练数据规划:
178
179 - 原曲标准音源
180 - 真实或半真实翻唱音源
181 - 设备录音音源
182 - 环境噪声音源
183 - 难负样本音源
184
185 ### 阶段 C:检索权重沉淀阶段
186
187 目标:
188
189 - 固化最优权重
190 - 建立 reference embedding 索引流程
191 - 形成线上/离线检索用权重标准
192
193 ---
194
195 ## 5. 训练数据计划
196
197 后续你提到会补充更多 music 语料,因此建议数据建设按下面结构统一。
198
199 ### 5.1 推荐数据结构
200
201 建议每首歌围绕 `song_id` 组织为:
202
203 - `reference`
204 - `clean`
205 - `cover`
206 - `recording`
207 - `environment`
208 - `humming_like`
209 - `confused`
210
211 ### 5.2 推荐含义
212
213 #### reference
214
215 - 标准原曲版本
216 - 用于构建基准 embedding 和 reference index
217
218 #### clean
219
220 - 质量较好的切片/相对干净音频
221 - 用于稳定正样本训练
222
223 #### cover
224
225 - 真实翻唱版本
226 - 用于训练旋律一致、音色不同的对齐能力
227
228 #### recording
229
230 - 手机/麦克风录制版本
231 - 用于训练设备失真和场景采集鲁棒性
232
233 #### environment
234
235 - 注入环境噪声或真实环境录音
236 - 用于训练抗噪能力
237
238 #### humming_like
239
240 - 哼唱、跟唱、弱伴奏旋律版本
241 - 用于训练旋律驱动检索能力
242
243 #### confused
244
245 - 容易相似但不属于同一首歌的样本
246 - 用于强化难负样本学习
247
248 ### 5.3 当前专题的样本补充建议
249
250 优先补充顺序建议为:
251
252 1. **更多 reference / clean 原曲**
253 2. **更多 recording / environment 样本**
254 3. **更多真实 cover 样本**
255 4. **更多 confused 难负样本**
256 5. **更多 humming_like 样本**
257
258 原因:
259
260 - 当前抗噪与设备泛化是近期最容易拉开效果差异的方向
261 - cover / humming 的价值很高,但数据准备成本更高
262
263 ---
264
265 ## 6. 数据增强计划
266
267 当前代码已实现两大类增强,用于“伪造录音”和“伪造翻唱”。
268
269 位置:
270
271 - `acr-engine/src/utils/augment.py`
272
273 ### 6.1 伪造录音增强
274
275 包括:
276
277 - `AddGaussianNoise`
278 - `AddBackgroundNoise`
279 - `BandPassFilter`
280 - `Mp3Compression`
281
282 作用:
283
284 - 模拟餐厅底噪、街道底噪
285 - 模拟廉价设备频响缺失
286 - 模拟压缩带来的失真
287 - 提高抗噪与抗设备变化能力
288
289 ### 6.2 伪造翻唱增强
290
291 包括:
292
293 - `PitchShift`
294 - `TimeStretch`
295 - `Frequency Masking`
296
297 作用:
298
299 - 模拟升降调翻唱
300 - 模拟节奏变化
301 - 逼迫模型降低音色依赖,关注旋律主线
302
303 ### 6.3 当前专题下的增强原则
304
305 - `reference` 不建议过强增强
306 - `clean` 可做轻增强
307 - `recording / environment` 可做强增强
308 - `humming_like / confused` 应提高采样权重
309
310 ---
311
312 ## 7. 训练流程
313
314 ### 7.1 环境准备
315
316 解释器:
317
318 ```bash
319 /usr/local/miniconda3/bin/python
320 ```
321
322 安装依赖:
323
324 ```bash
325 /usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
326 ```
327
328 当前依赖至少需要:
329
330 - `torch`
331 - `torchaudio`
332 - `transformers`
333 - `huggingface_hub`
334 - `librosa`
335 - `soundfile`
336 - `audiomentations`
337
338 ### 7.2 4GB GPU 专用配置
339
340 当前 GPU 为:
341
342 - `Quadro P1000`
343 - 4GB 显存
344
345 因此我已经新增专用配置:
346
347 - `acr-engine/configs/coverhunter_finetune_4gb.yaml`
348
349 特点:
350
351 - 更小 `batch_size`
352 - 更短片段
353 - 更小通道数
354 - 更浅层数
355 - 更适合当前显存资源
356
357 ### 7.3 首次验证流程
358
359 先 dry-run:
360
361 ```bash
362 cd /mnt/e/hikoon-ACR/acr-engine && \
363 /usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
364 --python /usr/local/miniconda3/bin/python \
365 --config configs/coverhunter_finetune_4gb.yaml \
366 --data data/synthetic_v2 \
367 --device cuda \
368 --segment-strategy hybrid \
369 --dry-run
370 ```
371
372 ### 7.4 小规模试训
373
374 ```bash
375 cd /mnt/e/hikoon-ACR/acr-engine && \
376 /usr/local/miniconda3/bin/python train.py \
377 --config configs/coverhunter_finetune_4gb.yaml \
378 --data data/synthetic_v2 \
379 --output data/training_runs/coverhunter_4gb_trial \
380 --device cuda \
381 --segment-strategy hybrid \
382 --batch-size 2 \
383 --epochs 2
384 ```
385
386 ### 7.5 专题正式训练
387
388 ```bash
389 cd /mnt/e/hikoon-ACR/acr-engine && \
390 /usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
391 --python /usr/local/miniconda3/bin/python \
392 --config configs/coverhunter_finetune_4gb.yaml \
393 --data data/synthetic_v2 \
394 --device cuda \
395 --segment-strategy hybrid \
396 --noise-root data/noise/restaurant \
397 --noise-root data/noise/street
398 ```
399
400 ### 7.6 后续扩容训练
401
402 当你补充新的 music 语料后,建议:
403
404 1. 先保持 `song_id + type + audio_path + duration` 元数据结构一致
405 2. 新语料先做小批量接入
406 3. 先跑 2 epoch 验证
407 4. 再逐步扩大训练轮次
408
409 ---
410
411 ## 8. 训练过程会产生什么产物
412
413 每次训练会生成目录:
414
415 ```text
416 acr-engine/data/training_runs/<run_name>/
417 ```
418
419 标准产物包括:
420
421 - `best_model.pt`
422 - `checkpoint_epoch_*.pt`
423 - `song_to_idx.json`
424 - `training_metrics.json`
425 - `training_manifest.json`
426 - `run_request.json`
427 - `run_summary.json`
428 - `stdout.log`
429 - `stderr.log`
430
431 ### 8.1 各产物的用途
432
433 #### best_model.pt
434
435 - 当前训练过程中最优权重
436 - 后续检索、建库、推理优先使用它
437
438 #### checkpoint_epoch_*.pt
439
440 - 周期性保存点
441 - 用于中断恢复、回溯比较
442
443 #### song_to_idx.json
444
445 - 训练类别到 `song_id` 的映射
446 - 用于解释训练分类头与标签对应关系
447
448 #### training_metrics.json
449
450 - 记录最后一次训练指标
451 - 用于专题对比不同配置
452
453 #### training_manifest.json
454
455 - 记录本次训练的配置、输入、产物路径
456 - 适合作为专题可追溯记录
457
458 #### run_request.json / run_summary.json
459
460 - 记录本次运行命令、解释器、配置与运行结果
461 - 便于回放与专题管理
462
463 ---
464
465 ## 9. 预期权重怎么使用
466
467 这是专题里非常关键的一部分。
468
469 ### 9.1 训练权重的核心用途
470
471 训练出来的 `best_model.pt` 不是只为了看 loss,而是为了后续两类使用:
472
473 1. **离线建库**
474 2. **在线查询 embedding 提取**
475
476 ### 9.2 离线建库
477
478 目标:
479
480 - 使用参考音源(reference)切片提取 embedding
481 - 建立 reference 向量索引
482
483 预期流程:
484
485 1. 读取 `reference` 音源
486 2. 切片
487 3. 用双流模型提 embedding
488 4. 存成 embedding matrix
489 5. 后续接 Faiss / pgvector / Milvus
490
491 ### 9.3 在线查询
492
493 目标:
494
495 - 输入录音、翻唱、哼唱片段
496 - 提取 embedding
497 - 与 reference index 做相似度检索
498
499 预期方式:
500
501 1. 加载 `best_model.pt`
502 2. 对查询音频切片
503 3. 提取 embedding
504 4. 与 reference embedding 做 ANN 检索
505 5. 结合 vote / rerank 输出最终结果
506
507 ### 9.4 推荐使用策略
508
509 #### 最佳权重
510
511 生产或专题评估优先使用:
512
513 - `best_model.pt`
514
515 #### 恢复训练
516
517 继续训练优先使用:
518
519 - `checkpoint_epoch_*.pt`
520
521 #### 对比实验
522
523 建议每个专题 run 保留完整目录,不覆盖历史 run。
524
525 ---
526
527 ## 10. 预计怎么推进专题训练
528
529 ### 第 1 步:先跑通当前 synthetic_v2
530
531 目标:
532
533 - 验证链路
534 - 验证显存
535 - 验证双流结构
536 - 验证 MERT 接口
537
538 ### 第 2 步:补录音噪声语料
539
540 优先补充:
541
542 - 餐厅
543 - 街道
544 - 室内人声背景
545 - 手机录制样本
546
547 目标:
548
549 - 提升抗噪与设备鲁棒性
550
551 ### 第 3 步:补真实翻唱/旋律相近样本
552
553 目标:
554
555 - 强化旋律对齐
556 - 降低音色依赖
557
558 ### 第 4 步:补难负样本
559
560 目标:
561
562 - 降低误识别
563 - 提高边界判别能力
564
565 ### 第 5 步:固化最优专题权重
566
567 目标:
568
569 - 形成一个可用于离线建库与线上检索的标准权重版本
570
571 ---
572
573 ## 11. 当前专题的资源结论
574
575 ### 可以做的事
576
577 - 继续完善训练链路
578 - 用 `synthetic_v2` 做小规模训练
579 - 做双流模型结构验证
580 - 做 4GB GPU 轻量试训
581 - 规范化训练产物与权重使用方式
582
583 ### 当前暂时受限的事
584
585 - 由于环境缺依赖,**还不能直接启动真实训练**
586 - 由于 GPU 只有 4GB,**真实 MERT + ECAPA 双流正式训练需要保守配置**
587 - 当前真实音乐语料仍不足,**暂时更适合专题验证,不适合最终权重定版**
588
589 ---
590
591 ## 12. 本专题当前落地文件
592
593 ### 配置
594
595 - `acr-engine/configs/coverhunter_finetune.yaml`
596 - `acr-engine/configs/coverhunter_finetune_4gb.yaml`
597 - `acr-engine/configs/default.yaml`
598
599 ### 模型与训练
600
601 - `acr-engine/src/models/ecapa_tdnn.py`
602 - `acr-engine/src/models/losses.py`
603 - `acr-engine/src/data/dataset.py`
604 - `acr-engine/src/utils/augment.py`
605 - `acr-engine/train.py`
606 - `acr-engine/scripts/run_coverhunter_finetune.py`
607
608 ### 文档
609
610 - `docs/coverhunter_training_process.md`
611 - `docs/coverhunter_finetune_topic.md`
612
613 ---
614
615 ## 13. 当前专题结论
616
617 当前已经具备:
618
619 - 双流 CoverHunter 微调架构
620 - 4GB GPU 专用轻量配置
621 - 训练流程脚本
622 - 训练产物记录机制
623 - 专题级训练文档
624
625 当前下一步最实际的动作是:
626
627 1. 在 `/usr/local/miniconda3/bin/python` 下补齐依赖
628 2. 用 `coverhunter_finetune_4gb.yaml` 跑 dry-run
629 3. 用 `synthetic_v2` 做 2 epoch 小规模试训
630 4. 再逐步接入更多 music 语料
1 # CoverHunter 双流微调标准流程
2
3 ## 1. 当前架构
4
5 当前训练架构已经调整为双流:
6
7 - **流 A:MERT + Melody 分支**
8 - 代码位置:`acr-engine/src/models/ecapa_tdnn.py`
9 - 逻辑:冻结的 `FrozenMERTFeatureExtractor` + `melody/chroma` 融合
10 - 默认模型:`m-a-p/MERT-v1-95M`
11 - 说明:当前代码已经支持真实 HuggingFace MERT 权重接入;若环境里缺少 `transformers` 或首次拉取失败,则无法启用真实 MERT
12 - **流 B:ECAPA 分支**
13 - 逻辑:保留 ECAPA 特征建模路径
14 - **双流融合**
15 - `DualStreamFusion`
16 - **检索头**
17 - `CoverHunterHead`
18 - **训练目标**
19 - `InfoNCE + AAMSoftmax`
20
21 ## 2. 当前资源检查结论
22
23 ### Python 解释器
24
25 训练入口已固定支持:
26
27 ```bash
28 /usr/local/miniconda3/bin/python
29 ```
30
31 `acr-engine/scripts/run_coverhunter_finetune.py` 已支持 `--python` 参数,默认就是这个解释器。
32
33 ### GPU
34
35 当前检测到 GPU:
36
37 - **Quadro P1000**
38 - 总显存:**4096 MiB**
39 - 空闲显存:约 **3817 MiB**
40
41 结论:
42
43 - **可以跑训练**
44 - 但显存较小,建议:
45 - `batch_size=2~4`
46 - `segment_dur=5.0` 起步
47 - 优先做 dry-run、小批量试跑、再正式训练
48 - 启用真实 MERT 后不要直接上大 batch
49
50 ### 数据
51
52 当前仓库中可直接用于冒烟训练的数据:
53
54 - `acr-engine/data/synthetic_v2/train.json`
55 - 音频切片位于 `acr-engine/data/synthetic_v2/segments/`
56
57 这些数据已经包含:
58
59 - 普通切片
60 - augmented
61 - humming_like
62 - confused
63
64 适合先做流程验证。
65
66 ### 当前环境缺口
67
68 `/usr/local/miniconda3/bin/python` 下当前缺少这些核心包:
69
70 - `torch`
71 - `transformers`
72 - `huggingface_hub`
73 - `torchaudio`
74 - `librosa`
75 - `soundfile`
76 - `audiomentations`
77
78 所以:
79
80 - **GPU 与解释器可用**
81 - **但当前训练环境还不能直接跑**
82 - 需要先补齐依赖
83
84 ## 3. 标准处理流程
85
86 ### Step 1:准备 Python 环境
87
88 进入项目后,先确保用的是目标解释器:
89
90 ```bash
91 /usr/local/miniconda3/bin/python --version
92 ```
93
94 安装依赖:
95
96 ```bash
97 /usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
98 ```
99
100 如需单独补装:
101
102 ```bash
103 /usr/local/miniconda3/bin/python -m pip install torch torchaudio transformers huggingface_hub librosa soundfile audiomentations
104 ```
105
106 ### Step 2:准备 MERT 权重缓存
107
108 首次启用真实 MERT 时,会从 HuggingFace 拉取:
109
110 - `m-a-p/MERT-v1-95M`
111
112 建议先确认网络可访问 HuggingFace,或提前缓存模型。
113
114 如果需要更换默认的 MERT 模型,可以在 `configs/default.yaml` 或 `configs/coverhunter_finetune.yaml` 中调整:
115
116 ```yaml
117 model:
118 mert_model_name: m-a-p/MERT-v1-95M
119 ```
120
121 ### Step 3:准备噪声数据
122
123 为了支持伪造录音增强,建议准备目录,例如:
124
125 ```text
126 acr-engine/data/noise/restaurant/
127 acr-engine/data/noise/street/
128 ```
129
130 里面放公开可用环境音频:
131
132 - 餐厅底噪
133 - 街道底噪
134 - 室内人声背景
135
136 训练时通过:
137
138 ```bash
139 --noise-root acr-engine/data/noise/restaurant \
140 --noise-root acr-engine/data/noise/street
141 ```
142
143 传入。
144
145 ### Step 4:先做 dry-run
146
147 先验证数据、模型、GPU、增强链路是否都通:
148
149 ```bash
150 cd /mnt/e/hikoon-ACR/acr-engine && \
151 /usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
152 --python /usr/local/miniconda3/bin/python \
153 --data data/synthetic_v2 \
154 --device cuda \
155 --segment-strategy hybrid \
156 --dry-run
157 ```
158
159 ### Step 5:小规模试训
160
161 建议先缩小 batch/config,确认显存稳定:
162
163 ```bash
164 cd /mnt/e/hikoon-ACR/acr-engine && \
165 /usr/local/miniconda3/bin/python train.py \
166 --config configs/coverhunter_finetune.yaml \
167 --data data/synthetic_v2 \
168 --output data/training_runs/coverhunter_trial \
169 --device cuda \
170 --segment-strategy hybrid \
171 --batch-size 2 \
172 --epochs 2 \
173 --noise-root data/noise/restaurant \
174 --noise-root data/noise/street
175 ```
176
177 如果显存稳定,再逐步提高到:
178
179 - `batch_size=4`
180 - 必要时再尝试 `batch_size=6`
181
182 ### Step 6:正式专题训练
183
184 标准命令:
185
186 ```bash
187 cd /mnt/e/hikoon-ACR/acr-engine && \
188 /usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
189 --python /usr/local/miniconda3/bin/python \
190 --data data/synthetic_v2 \
191 --device cuda \
192 --segment-strategy hybrid \
193 --noise-root data/noise/restaurant \
194 --noise-root data/noise/street
195 ```
196
197 ### Step 7:检查训练产物
198
199 每次训练会记录到:
200
201 ```text
202 acr-engine/data/training_runs/<run_name>/
203 ```
204
205 标准产物包括:
206
207 - `best_model.pt`
208 - `checkpoint_epoch_*.pt`
209 - `song_to_idx.json`
210 - `training_metrics.json`
211 - `training_manifest.json`
212 - `run_request.json`
213 - `run_summary.json`
214 - `stdout.log`
215 - `stderr.log`
216
217 ## 4. 增强策略说明
218
219 当前代码已经覆盖两类伪造策略:
220
221 ### 伪造录音
222
223 位置:`acr-engine/src/utils/augment.py`
224
225 - `AddGaussianNoise`
226 - `AddBackgroundNoise`
227 - `BandPassFilter`
228 - `Mp3Compression`
229
230 ### 伪造翻唱
231
232 位置:`acr-engine/src/utils/augment.py`
233
234 - `PitchShift`
235 - `TimeStretch`
236 - `Frequency Masking`(作用于 mel)
237
238 ## 5. 资源适配建议
239
240 由于当前 GPU 是 Quadro P1000 4GB,建议按以下梯度推进:
241
242 ### 推荐起步配置
243
244 - `segment_dur=5.0`
245 - `batch_size=2`
246 - `mixed_precision=true`
247 - `num_workers=0`
248
249 ### 稳定后可尝试
250
251 - `batch_size=4`
252 - 如 OOM 则回退
253
254 ### 当前不建议
255
256 - 直接上 8 秒片段 + batch 16
257 - 真实 MERT + 大 batch 同时启用
258
259 ## 6. 当前结论
260
261 当前状态可以概括为:
262
263 - **架构方向已经调整正确**:双流
264 - **真实 MERT 接口已接入**:是
265 - **GPU 可以用于训练**:是
266 - **当前 Python 解释器可用**:是,`/usr/local/miniconda3/bin/python`
267 - **当前环境能否立刻开训**:**还不能**,因为依赖未装全
268 - **现有数据能否支撑一波流程训练**:**可以**,先从 `synthetic_v2` 开始
1 # 音乐翻唱检测与音频片段检索系统 (CSI) 核心能力结构清单
2
3 ## 1. 核心架构逻辑
4 * **底座 (Backbone)**:MERT (冻结预训练权重) - 负责音频语义理解。
5 * **头部 (Head)**:CoverHunter (可训练 Conformer+Attention) - 负责旋律与结构的对比学习。
6 * **对齐方式**:双流融合 (MERT 语义特征 + Melody/Chroma 旋律特征)。
7
8 ## 2. 数据与特征工程 (Data Pipeline)
9 * **数据集结构**:以 `Song_ID` 为唯一键,物理隔离原曲、压缩版、录音与环境音。
10 * **动态增强 (Data Augmentation)**:
11 * 物理扰动:音高平移 (Pitch Shifting)、变速 (Time Stretching)。
12 * 环境注入:背景噪声混入 (Environment Injection)。
13 * 频率掩码:频段擦除 (Frequency Masking) - 逼迫模型脱离音色依赖,转向旋律核心。
14 * **数据对齐**:使用插值 (Interpolation) 将 MERT 序列长度与 Melody 序列长度对齐至一致的 `Time_Steps`。
15
16 ## 3. 训练与优化策略 (Training Strategy)
17 * **样本采样 (Sampler)**:PairSampler - 确保 Batch 中包含强配对的“原曲-翻唱”与精心挑选的“原曲-难负样本”。
18 * **难负样本挖掘 (Hard Negative Mining)**:
19 * 使用冻结 MERT + Faiss 构建初始索引。
20 * 挖掘曲风相似但旋律不同的“假孪生兄弟”歌曲作为 Negative 样本。
21 * **损失函数 (Loss Function)**:InfoNCE Contrastive Loss - 拉近正样本余弦距离,推远负样本余弦距离。
22
23 ## 4. 推理与检索引擎 (Inference & Retrieval)
24 * **离线建库**:全量原曲切片 -> 特征提取 -> 存入向量数据库 (Faiss/Milvus)。
25 * **在线查询**:录音片段 -> 滑动窗口切片 -> 提取 Embedding -> 近似最近邻检索 (ANN)。
26 * **鲁棒性机制**:切片投票机制 (Slice Voting) - 对查询录音切片所得的 Top-K 结果进行统计,按票数加权归一化排序。
27
28 ## 5. 工程化关键节点 (Engineering Checklist)
29 * **计算优化**:离线特征缓存 (预先存储 .npy 减少 GPU 实时计算压力)。
30 * **部署优化**:ONNX/TensorRT 模型编译 + 动态批处理 (Dynamic Batching)。
31 * **数据飞轮**:在线难例挖掘 (基于用户反馈的 False Positives 循环重训)。