-

章晓祥
Commit 7da76864 ... 7da76864361f72a1428d2b36aeea2f283d8945e6 authored 2026-06-08 21:09:05 +0800 by 章晓祥
Showing 34 changed files with 2532 additions and 278 deletions
.claude/settings.json
.claude/settings.json.aiapis
.claude/settings.json.cc
.claude/settings.json.gpt
.claude/settings.json.qwen
acr-engine/configs/coverhunter_finetune.yaml
acr-engine/configs/coverhunter_finetune_4gb.yaml
acr-engine/configs/default.yaml
acr-engine/reports/coverhunter_env_setup_report.json
acr-engine/requirements.txt
acr-engine/scripts/run_coverhunter_finetune.py
acr-engine/scripts/setup_coverhunter_env.py
acr-engine/src/data/dataset.py
acr-engine/src/models/ecapa_tdnn.py
acr-engine/src/models/losses.py
acr-engine/src/utils/augment.py
acr-engine/train.py
data/training_runs/coverhunter_finetune_20260608T130103Z/run_request.json
data/training_runs/coverhunter_finetune_20260608T130103Z/run_summary.json
data/training_runs/coverhunter_finetune_20260608T130103Z/stderr.log
--- a/.claude/settings.json 0 → 100644
View file @7da7686
+++ b/.claude/settings.json 0 → 100644
View file @7da7686
+{
+  "env": {
+    "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
+    "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
+    "ANTHROPIC_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
+    "CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
+    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
+    "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1", 
+    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
+    "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
+  },
+  "permissions": {
+    "allow": [],
+    "deny": []
+  },
+  "model": "sonnet",
+  "enabledPlugins": {
+    "claude-code-setup@claude-plugins-official": true,
+    "typescript-lsp@claude-plugins-official": true,
+    "rust-analyzer-lsp@claude-plugins-official": true,
+    "pr-review-toolkit@claude-plugins-official": true,
+    "ralph-loop@claude-plugins-official": true,
+    "superpowers@claude-plugins-official": true
+  },
+  "alwaysThinkingEnabled": false,
+  "skipDangerousModePermissionPrompt": true,
+  "theme": "dark-ansi",
+  "modelType": "anthropic"
+}
--- a/.claude/settings.json.aiapis 0 → 100644
View file @7da7686
+++ b/.claude/settings.json.aiapis 0 → 100644
View file @7da7686
+{
+  "env": {
+    "ANTHROPIC_AUTH_TOKEN": "sk-GlEnjnf09lXwiJuwDS5Q0nOzGd1ck8YBDERVXv84t9hvtS0U",
+    "ANTHROPIC_BASE_URL": "https://aiapis.help",
+    "ANTHROPIC_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
+    "CLAUDE_CODE_SUBAGENT_MODEL": "gpt-5.4",
+    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
+    "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1", 
+    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
+    "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
+  },
+  "permissions": {
+    "allow": [],
+    "deny": []
+  },
+  "model": "sonnet",
+  "enabledPlugins": {
+    "claude-code-setup@claude-plugins-official": true,
+    "typescript-lsp@claude-plugins-official": true,
+    "rust-analyzer-lsp@claude-plugins-official": true,
+    "pr-review-toolkit@claude-plugins-official": true,
+    "ralph-loop@claude-plugins-official": true,
+    "superpowers@claude-plugins-official": true
+  },
+  "alwaysThinkingEnabled": false,
+  "skipDangerousModePermissionPrompt": true,
+  "theme": "dark-ansi",
+  "modelType": "anthropic"
+}
--- a/.claude/settings.json.cc 0 → 100644
View file @7da7686
+++ b/.claude/settings.json.cc 0 → 100644
View file @7da7686
+{
+  "env": {
+    "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
+    "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
+    "ANTHROPIC_MODEL": "claude-opus-4.6",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4.6",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4.6",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4.5",
+    "CLAUDE_CODE_SUBAGENT_MODEL": "claude-sonnet-4.6",
+    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
+    "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1", 
+    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
+    "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
+  },
+  "permissions": {
+    "allow": [],
+    "deny": []
+  },
+  "model": "sonnet",
+  "enabledPlugins": {
+    "claude-code-setup@claude-plugins-official": true,
+    "typescript-lsp@claude-plugins-official": true,
+    "rust-analyzer-lsp@claude-plugins-official": true,
+    "pr-review-toolkit@claude-plugins-official": true,
+    "ralph-loop@claude-plugins-official": true,
+    "superpowers@claude-plugins-official": true
+  },
+  "alwaysThinkingEnabled": false,
+  "skipDangerousModePermissionPrompt": true,
+  "theme": "dark-ansi",
+  "modelType": "anthropic"
+}
--- a/.claude/settings.json.gpt 0 → 100644
View file @7da7686
+++ b/.claude/settings.json.gpt 0 → 100644
View file @7da7686
+{
+  "env": {
+    "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
+    "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
+    "ANTHROPIC_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "gpt-5.4",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "minimaxai/minimax-m2.7",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "gpt-5.4-mini",
+    "CLAUDE_CODE_SUBAGENT_MODEL": "minimaxai/minimax-m2.7",
+    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
+    "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1", 
+    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
+    "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
+  },
+  "permissions": {
+    "allow": [],
+    "deny": []
+  },
+  "model": "sonnet",
+  "enabledPlugins": {
+    "claude-code-setup@claude-plugins-official": true,
+    "typescript-lsp@claude-plugins-official": true,
+    "rust-analyzer-lsp@claude-plugins-official": true,
+    "pr-review-toolkit@claude-plugins-official": true,
+    "ralph-loop@claude-plugins-official": true,
+    "superpowers@claude-plugins-official": true
+  },
+  "alwaysThinkingEnabled": false,
+  "skipDangerousModePermissionPrompt": true,
+  "theme": "dark-ansi",
+  "modelType": "anthropic"
+}
--- a/.claude/settings.json.qwen 0 → 100644
View file @7da7686
+++ b/.claude/settings.json.qwen 0 → 100644
View file @7da7686
+{
+  "env": {
+    "ANTHROPIC_AUTH_TOKEN": "sk-1yrWrqU7xDxHgz8MIQu3zkeOUb6EqYx2i32jTtwao6780C2o",
+    "ANTHROPIC_BASE_URL": "http://43.155.145.78:65432",
+    "ANTHROPIC_MODEL": "qwen3.7-max",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "qwen3.7-max",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "qwen3.6-plus",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "qwen3.6-plus",
+    "CLAUDE_CODE_SUBAGENT_MODEL": "qwen3.6-plus",
+    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "32000",
+    "CLAUDE_CODE_DISABLE_AUTO_UPDATE": "1", 
+    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0",
+    "CLAUDE_CODE_STOP_HOOK_BLOCK_CAP": 20
+  },
+  "permissions": {
+    "allow": [],
+    "deny": []
+  },
+  "model": "sonnet",
+  "enabledPlugins": {
+    "claude-code-setup@claude-plugins-official": true,
+    "typescript-lsp@claude-plugins-official": true,
+    "rust-analyzer-lsp@claude-plugins-official": true,
+    "pr-review-toolkit@claude-plugins-official": true,
+    "ralph-loop@claude-plugins-official": true,
+    "superpowers@claude-plugins-official": true
+  },
+  "alwaysThinkingEnabled": false,
+  "skipDangerousModePermissionPrompt": true,
+  "theme": "dark-ansi",
+  "modelType": "anthropic"
+}
--- a/acr-engine/configs/coverhunter_finetune.yaml 0 → 100644
View file @7da7686
+++ b/acr-engine/configs/coverhunter_finetune.yaml 0 → 100644
View file @7da7686
+model:
+  name: coverhunter_finetune
+  embed_dim: 256
+  channels: 512
+  se_channels: 128
+  res2net_scale: 8
+  num_blocks: 3
+  n_mels: 128
+  aam_m: 0.2
+  aam_s: 30.0
+  use_band_split: false
+  band_split_channels: 128
+  use_dual_stream: true
+  mert_melody_branch: true
+  ecapa_branch: true
+  coverhunter_heads: 8
+  coverhunter_layers: 4
+  fusion_hidden_dim: 256
+  mert_model_name: m-a-p/MERT-v1-95M
+data:
+  sample_rate: 16000
+  n_fft: 512
+  hop_length: 160
+  segment_dur: 8.0
+  crop_per_song: 6
+training:
+  batch_size: 16
+  epochs: 30
+  lr: 0.0002
+  weight_decay: 0.0001
+  warmup_epochs: 3
+  temperature: 0.05
+  supcon_weight: 1.0
+  aam_weight: 0.2
+  mixed_precision: true
+  gradient_clip: 1.0
+  save_every: 5
+  log_every: 10
+  hard_negative_k: 4
+  sample_type_weights:
+    default: 1
+    compressed: 2
+    recording: 3
+    environment: 4
+  pair_type_weights:
+    default: 1.0
+    compressed: 1.5
+    recording: 2.0
+    environment: 3.0
--- a/acr-engine/configs/coverhunter_finetune_4gb.yaml 0 → 100644
View file @7da7686
+++ b/acr-engine/configs/coverhunter_finetune_4gb.yaml 0 → 100644
View file @7da7686
+model:
+  name: coverhunter_finetune_lowmem
+  embed_dim: 192
+  channels: 256
+  se_channels: 64
+  res2net_scale: 4
+  num_blocks: 2
+  n_mels: 96
+  aam_m: 0.2
+  aam_s: 24.0
+  use_band_split: false
+  band_split_channels: 64
+  use_dual_stream: true
+  mert_melody_branch: true
+  ecapa_branch: true
+  coverhunter_heads: 4
+  coverhunter_layers: 2
+  fusion_hidden_dim: 128
+  mert_model_name: m-a-p/MERT-v1-95M
+data:
+  sample_rate: 16000
+  n_fft: 512
+  hop_length: 160
+  segment_dur: 5.0
+  crop_per_song: 4
+training:
+  batch_size: 2
+  epochs: 20
+  lr: 0.00015
+  weight_decay: 0.0001
+  warmup_epochs: 2
+  temperature: 0.05
+  supcon_weight: 1.0
+  aam_weight: 0.2
+  mixed_precision: true
+  gradient_clip: 1.0
+  save_every: 5
+  log_every: 10
+  hard_negative_k: 2
+  sample_type_weights:
+    default: 1
+    compressed: 2
+    recording: 3
+    environment: 4
+  pair_type_weights:
+    default: 1.0
+    compressed: 1.4
+    recording: 1.8
+    environment: 2.2
--- a/acr-engine/configs/default.yaml
View file @7da7686
+++ b/acr-engine/configs/default.yaml
View file @7da7686
@@ -10,6 +10,13 @@ model:
  aam_s: 30.0
  use_band_split: true
  band_split_channels: 128
+  use_dual_stream: true
+  mert_melody_branch: true
+  ecapa_branch: true
+  coverhunter_heads: 4
+  coverhunter_layers: 2
+  fusion_hidden_dim: 256
+  mert_model_name: m-a-p/MERT-v1-95M
 data:
  sample_rate: 16000
@@ -31,15 +38,17 @@ training:
  gradient_clip: 1.0
  save_every: 10
  log_every: 10
+  hard_negative_k: 2
  sample_type_weights:
    default: 1
-    humming_like: 3
+    compressed: 2
-    confused: 5
+    recording: 3
+    environment: 4
  pair_type_weights:
    default: 1.0
-    augmented: 1.4
+    compressed: 1.5
-    humming_like: 2.5
+    recording: 2.0
-    confused: 4.0
+    environment: 2.5
 engine:
  chromaprint:
--- a/acr-engine/reports/coverhunter_env_setup_report.json 0 → 100644
View file @7da7686
+++ b/acr-engine/reports/coverhunter_env_setup_report.json 0 → 100644
View file @7da7686
+{
+  "python": "/usr/local/miniconda3/bin/python",
+  "cwd": "/mnt/e/hikoon-ACR/acr-engine",
+  "steps": [
+    {
+      "name": "install_requirements",
+      "command": [
+        "/usr/local/miniconda3/bin/python",
+        "-m",
+        "pip",
+        "install",
+        "-r",
+        "requirements.txt"
+      ],
+      "returncode": 0,
+      "stdout": "\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 40.7/40.7 MB 10.9 MB/s  0:00:03\nDownloading nvidia_nvtx-13.0.85-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (148 kB)\nDownloading setuptools-81.0.0-py3-none-any.whl (1.1 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 1.1/1.1 MB 8.6 MB/s  0:00:00\nDownloading audioread-3.1.0-py3-none-any.whl (23 kB)\nDownloading click-8.4.1-py3-none-any.whl (116 kB)\nDownloading cuda_pathfinder-1.5.5-py3-none-any.whl (51 kB)\nDownloading decorator-5.3.1-py3-none-any.whl (10 kB)\nDownloading filelock-3.29.1-py3-none-any.whl (40 kB)\nDownloading fsspec-2026.4.0-py3-none-any.whl (203 kB)\nDownloading joblib-1.5.3-py3-none-any.whl (309 kB)\nDownloading lazy_loader-0.5-py3-none-any.whl (8.0 kB)\nDownloading networkx-3.6.1-py3-none-any.whl (2.1 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 2.1/2.1 MB 10.3 MB/s  0:00:00\nDownloading numba-0.65.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 3.8/3.8 MB 10.3 MB/s  0:00:00\nDownloading llvmlite-0.47.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (56.3 MB)\n   
\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 56.3/56.3 MB 10.8 MB/s  0:00:05\nDownloading pooch-1.9.0-py3-none-any.whl (67 kB)\nDownloading regex-2026.5.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (801 kB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 801.2/801.2 kB 8.5 MB/s  0:00:00\nDownloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (507 kB)\nDownloading scikit_learn-1.9.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (9.1 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 9.1/9.1 MB 10.0 MB/s  0:00:00\nDownloading narwhals-2.22.1-py3-none-any.whl (454 kB)\nDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 6.3/6.3 MB 10.7 MB/s  0:00:00\nDownloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n   \u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501 536.2/536.2 kB 7.1 MB/s  0:00:00\nDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\nDownloading 
jinja2-3.1.6-py3-none-any.whl (134 kB)\nDownloading markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)\nInstalling collected packages: torchaudio, nvidia-cusparselt-cu13, mpmath, cuda-toolkit, triton, threadpoolctl, sympy, setuptools, safetensors, regex, python-stretch, nvidia-nvtx, nvidia-nvshmem-cu13, nvidia-nvjitlink, nvidia-nccl-cu13, nvidia-curand, nvidia-cufile, nvidia-cuda-runtime, nvidia-cuda-nvrtc, nvidia-cuda-cupti, numpy, networkx, narwhals, MarkupSafe, llvmlite, lazy_loader, joblib, hf-xet, fsspec, filelock, decorator, cuda-pathfinder, click, audioread, soxr, soundfile, scipy, pooch, nvidia-cusparse, nvidia-cufft, nvidia-cublas, numpy-rms, numpy-minmax, numba, jinja2, cuda-bindings, scikit-learn, nvidia-cusolver, nvidia-cudnn-cu13, librosa, huggingface_hub, torch, tokenizers, audiomentations, transformers\n\nSuccessfully installed MarkupSafe-3.0.3 audiomentations-0.43.1 audioread-3.1.0 click-8.4.1 cuda-bindings-13.3.1 cuda-pathfinder-1.5.5 cuda-toolkit-13.0.2 decorator-5.3.1 filelock-3.29.1 fsspec-2026.4.0 hf-xet-1.5.0 huggingface_hub-1.18.0 jinja2-3.1.6 joblib-1.5.3 lazy_loader-0.5 librosa-0.11.0 llvmlite-0.47.0 mpmath-1.3.0 narwhals-2.22.1 networkx-3.6.1 numba-0.65.1 numpy-2.4.6 numpy-minmax-0.5.0 numpy-rms-0.6.0 nvidia-cublas-13.1.1.3 nvidia-cuda-cupti-13.0.85 nvidia-cuda-nvrtc-13.0.88 nvidia-cuda-runtime-13.0.96 nvidia-cudnn-cu13-9.20.0.48 nvidia-cufft-12.0.0.61 nvidia-cufile-1.15.1.6 nvidia-curand-10.4.0.35 nvidia-cusolver-12.0.4.66 nvidia-cusparse-12.6.3.3 nvidia-cusparselt-cu13-0.8.1 nvidia-nccl-cu13-2.29.7 nvidia-nvjitlink-13.0.88 nvidia-nvshmem-cu13-3.4.5 nvidia-nvtx-13.0.85 pooch-1.9.0 python-stretch-0.3.1 regex-2026.5.9 safetensors-0.7.0 scikit-learn-1.9.0 scipy-1.17.1 setuptools-81.0.0 soundfile-0.14.0 soxr-0.5.0.post1 sympy-1.14.0 threadpoolctl-3.6.0 tokenizers-0.22.2 torch-2.12.0 torchaudio-2.11.0 transformers-5.10.2 triton-3.7.0\n",
+      "stderr": "  WARNING: The scripts proton and proton-viewer are installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n  WARNING: The script isympy is installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n  WARNING: The scripts f2py and numpy-config are installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n  WARNING: The scripts hf, huggingface-cli and tiny-agents are installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n  WARNING: The scripts torchfrtrace and torchrun are installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n  WARNING: The script transformers is installed in '/home/user/.local/bin' which is not on PATH.\n  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
+    },
+    {
+      "name": "install_extra_packages",
+      "command": [
+        "/usr/local/miniconda3/bin/python",
+        "-m",
+        "pip",
+        "install",
+        "torch",
+        "torchaudio",
+        "transformers",
+        "huggingface_hub",
+        "librosa",
+        "soundfile",
+        "audiomentations"
+      ],
+      "returncode": 0,
+      "stdout": "a3/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface_hub) (0.16.0)\nRequirement already satisfied: shellingham>=1.3.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (1.5.4)\nRequirement already satisfied: rich>=10.11.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from typer->transformers) (14.2.0)\nRequirement already satisfied: audioread>=2.1.9 in /home/user/.local/lib/python3.12/site-packages (from librosa) (3.1.0)\nRequirement already satisfied: numba>=0.51.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.65.1)\nRequirement already satisfied: scipy>=1.6.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.17.1)\nRequirement already satisfied: scikit-learn>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: joblib>=1.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.5.3)\nRequirement already satisfied: decorator>=4.3.0 in /home/user/.local/lib/python3.12/site-packages (from librosa) (5.3.1)\nRequirement already satisfied: pooch>=1.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (1.9.0)\nRequirement already satisfied: soxr>=0.3.2 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5.0.post1)\nRequirement already satisfied: lazy_loader>=0.1 in /home/user/.local/lib/python3.12/site-packages (from librosa) (0.5)\nRequirement already satisfied: msgpack>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from librosa) (1.1.1)\nRequirement already satisfied: cffi>=1.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from soundfile) (1.17.1)\nRequirement already satisfied: numpy-minmax<1,>=0.3.0 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.5.0)\nRequirement already satisfied: numpy-rms<1,>=0.4.2 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) 
(0.6.0)\nRequirement already satisfied: python-stretch<1,>=0.3.1 in /home/user/.local/lib/python3.12/site-packages (from audiomentations) (0.3.1)\nRequirement already satisfied: pycparser in /usr/local/miniconda3/lib/python3.12/site-packages (from cffi>=1.0->soundfile) (3.0)\nRequirement already satisfied: llvmlite<0.48,>=0.47.0dev0 in /home/user/.local/lib/python3.12/site-packages (from numba>=0.51.0->librosa) (0.47.0)\nRequirement already satisfied: platformdirs>=2.5.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (4.9.4)\nRequirement already satisfied: requests>=2.19.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from pooch>=1.1->librosa) (2.33.1)\nRequirement already satisfied: charset_normalizer<4,>=2 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (3.4.4)\nRequirement already satisfied: urllib3<3,>=1.26 in /usr/local/miniconda3/lib/python3.12/site-packages (from requests>=2.19.0->pooch>=1.1->librosa) (2.6.3)\nRequirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (4.0.0)\nRequirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/miniconda3/lib/python3.12/site-packages (from rich>=10.11.0->typer->transformers) (2.20.0)\nRequirement already satisfied: mdurl~=0.1 in /usr/local/miniconda3/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer->transformers) (0.1.2)\nRequirement already satisfied: narwhals>=2.0.1 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (2.22.1)\nRequirement already satisfied: threadpoolctl>=3.5.0 in /home/user/.local/lib/python3.12/site-packages (from scikit-learn>=1.1.0->librosa) (3.6.0)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /home/user/.local/lib/python3.12/site-packages (from sympy>=1.13.3->torch) (1.3.0)\nRequirement already satisfied: MarkupSafe>=2.0 in 
/home/user/.local/lib/python3.12/site-packages (from jinja2->torch) (3.0.3)\n",
+      "stderr": ""
+    },
+    {
+      "name": "verify_environment",
+      "command": [
+        "/usr/local/miniconda3/bin/python",
+        "-c",
+        "import torch, transformers, librosa, soundfile, audiomentations; print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
+      ],
+      "returncode": 0,
+      "stdout": "{'torch': '2.12.0+cu130', 'cuda': False, 'transformers': '5.10.2'}\n",
+      "stderr": "/home/user/.local/lib/python3.12/site-packages/torch/cuda/__init__.py:187: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 12080). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:119.)\n  return torch._C._cuda_getDeviceCount() > 0\n"
+    }
+  ]
+}
\ No newline at end of file
--- a/acr-engine/requirements.txt
View file @7da7686
+++ b/acr-engine/requirements.txt
View file @7da7686
@@ -2,6 +2,10 @@ numpy>=1.26
 PyYAML>=6.0
 soundfile>=0.12
 librosa>=0.10
+audiomentations>=0.37
+transformers>=4.46
+huggingface_hub>=0.26
+torchaudio>=2.3
 tqdm>=4.66
 torch>=2.3
 fastapi>=0.115
--- a/acr-engine/scripts/run_coverhunter_finetune.py 0 → 100644
View file @7da7686
+++ b/acr-engine/scripts/run_coverhunter_finetune.py 0 → 100644
View file @7da7686
+#!/usr/bin/env python3
+import argparse
+import json
+import subprocess
+from datetime import datetime
+from pathlib import Path
+DEFAULT_PYTHON = "/usr/local/miniconda3/bin/python"  # default for --python; NOTE(review): machine-specific absolute path
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--python", default=DEFAULT_PYTHON)
+    parser.add_argument("--config", default="configs/coverhunter_finetune_4gb.yaml")
+    parser.add_argument("--data", required=True)
+    parser.add_argument("--output-root", default="data/training_runs")
+    parser.add_argument("--run-name", default=None)
+    parser.add_argument("--noise-root", action="append", default=[])
+    parser.add_argument("--device", default="auto")
+    parser.add_argument("--segment-strategy", default="hybrid")
+    parser.add_argument("--resume", default=None)
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+    timestamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
+    run_name = args.run_name or f"coverhunter_finetune_{timestamp}"
+    run_dir = Path(args.output_root) / run_name
+    run_dir.mkdir(parents=True, exist_ok=True)
+    command = [
+        args.python,
+        "train.py",
+        "--config",
+        args.config,
+        "--data",
+        args.data,
+        "--output",
+        str(run_dir),
+        "--device",
+        args.device,
+        "--segment-strategy",
+        args.segment_strategy,
+    ]
+    if args.resume:
+        command.extend(["--resume", args.resume])
+    if args.dry_run:
+        command.append("--dry-run")
+    for noise_root in args.noise_root:
+        command.extend(["--noise-root", noise_root])
+    metadata = {
+        "run_name": run_name,
+        "created_at": datetime.utcnow().isoformat() + "Z",
+        "python": args.python,
+        "command": command,
+        "config": args.config,
+        "data": args.data,
+        "noise_roots": args.noise_root,
+        "run_dir": str(run_dir),
+    }
+    with open(run_dir / "run_request.json", "w") as f:
+        json.dump(metadata, f, indent=2)
+    result = subprocess.run(command, cwd=Path(__file__).resolve().parents[1], text=True, capture_output=True)
+    (run_dir / "stdout.log").write_text(result.stdout)
+    (run_dir / "stderr.log").write_text(result.stderr)
+    summary = {
+        **metadata,
+        "returncode": result.returncode,
+        "completed_at": datetime.utcnow().isoformat() + "Z",
+        "artifacts": sorted(path.name for path in run_dir.iterdir()),
+    }
+    with open(run_dir / "run_summary.json", "w") as f:
+        json.dump(summary, f, indent=2)
+    if result.returncode != 0:
+        raise SystemExit(result.returncode)
+if __name__ == "__main__":  # run the launcher only when executed as a script
+    main()
--- a/acr-engine/scripts/setup_coverhunter_env.py 0 → 100644
View file @7da7686
+++ b/acr-engine/scripts/setup_coverhunter_env.py 0 → 100644
View file @7da7686
+#!/usr/bin/env python3
+import argparse
+import json
+import subprocess
+from pathlib import Path
+PYTHON_DEFAULT = "/usr/local/miniconda3/bin/python"  # default for --python; NOTE(review): machine-specific absolute path
+PACKAGES = [  # arguments appended to "pip install" for the first install step
+    "-r", "requirements.txt",
+]
+EXTRA_PACKAGES = [  # package names appended to "pip install" for the second install step
+    "torch",
+    "torchaudio",
+    "transformers",
+    "huggingface_hub",
+    "librosa",
+    "soundfile",
+    "audiomentations",
+]
+def run(command, cwd):
+    return subprocess.run(command, cwd=cwd, text=True, capture_output=True)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--python", default=PYTHON_DEFAULT)
+    parser.add_argument("--skip-install", action="store_true")
+    args = parser.parse_args()
+    root = Path(__file__).resolve().parents[1]
+    report = {
+        "python": args.python,
+        "cwd": str(root),
+        "steps": [],
+    }
+    if not args.skip_install:
+        install_cmd = [args.python, "-m", "pip", "install", *PACKAGES]
+        res = run(install_cmd, root)
+        report["steps"].append({
+            "name": "install_requirements",
+            "command": install_cmd,
+            "returncode": res.returncode,
+            "stdout": res.stdout[-4000:],
+            "stderr": res.stderr[-4000:],
+        })
+        extra_cmd = [args.python, "-m", "pip", "install", *EXTRA_PACKAGES]
+        res = run(extra_cmd, root)
+        report["steps"].append({
+            "name": "install_extra_packages",
+            "command": extra_cmd,
+            "returncode": res.returncode,
+            "stdout": res.stdout[-4000:],
+            "stderr": res.stderr[-4000:],
+        })
+    verify_cmd = [
+        args.python,
+        "-c",
+        (
+            "import torch, transformers, librosa, soundfile, audiomentations; "
+            "print({'torch': torch.__version__, 'cuda': torch.cuda.is_available(), 'transformers': transformers.__version__})"
+        ),
+    ]
+    res = run(verify_cmd, root)
+    report["steps"].append({
+        "name": "verify_environment",
+        "command": verify_cmd,
+        "returncode": res.returncode,
+        "stdout": res.stdout[-4000:],
+        "stderr": res.stderr[-4000:],
+    })
+    report_path = root / "reports" / "coverhunter_env_setup_report.json"
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    report_path.write_text(json.dumps(report, indent=2))
+    print(report_path)
+    if any(step["returncode"] != 0 for step in report["steps"]):
+        raise SystemExit(1)
+if __name__ == "__main__":  # run the setup only when executed as a script
+    main()
--- a/acr-engine/src/data/dataset.py
View file @7da7686
+++ b/acr-engine/src/data/dataset.py
View file @7da7686
@@ -8,6 +8,9 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
+from src.utils.audio import AudioProcessor
+from src.utils.augment import AugmentPipeline
 def compute_candidate_offsets(
    y: np.ndarray,
@@ -124,6 +127,267 @@ def compute_candidate_offsets(
    return []
+class DualStreamFeatureExtractor:
+    """Turn one waveform into time-aligned mel, melody and chroma tensors.
+
+    The mel spectrogram fixes the number of time steps; the melody (f0) track
+    and every chroma row are linearly resampled onto that same time axis.
+    """
+    def __init__(self, sr: int, n_mels: int, n_fft: int, hop_length: int):
+        """Create the ``AudioProcessor`` shared by all three feature streams."""
+        self.audio = AudioProcessor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
+        self.n_mels = n_mels
+    @staticmethod
+    def _resample_track(track: np.ndarray, time_steps: int) -> np.ndarray:
+        """Linearly resample a 1-D track to ``time_steps`` float32 points.
+
+        An empty track has nothing to interpolate from, so it yields zeros
+        instead of letting ``np.interp`` raise.
+        """
+        if track.size == 0:
+            return np.zeros(time_steps, dtype=np.float32)
+        positions = np.linspace(0, track.size - 1, time_steps)
+        return np.interp(positions, np.arange(track.size), track).astype(np.float32)
+    def extract(self, y: np.ndarray) -> Dict[str, torch.Tensor]:
+        """Compute the feature dict for waveform ``y``.
+
+        Returns:
+            A dict with ``"mel"`` (the processor's mel output), ``"melody"``
+            (MIDI-scaled f0 with a leading axis of size 1) and ``"chroma"``
+            (one resampled row per chroma row), all as float tensors whose
+            last axis has ``mel.shape[1]`` entries.
+        """
+        mel = self.audio.to_mel(y)
+        time_steps = mel.shape[1]
+        # Unvoiced frames presumably come back as 0 Hz or NaN (depends on
+        # extract_f0 -- TODO confirm). hz_to_midi turns those into -inf/NaN,
+        # which are zeroed right below, so the numpy warnings are expected.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            melody = librosa.hz_to_midi(self.audio.extract_f0(y))
+        melody = np.nan_to_num(melody, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+        melody = self._resample_track(melody, time_steps)
+        chroma = self.audio.extract_chroma(y).astype(np.float32)
+        # Same empty-safe resampling as the melody, applied row by row.
+        chroma_resized = np.stack(
+            [self._resample_track(chroma_row, time_steps) for chroma_row in chroma],
+            axis=0,
+        ).astype(np.float32)
+        return {
+            "mel": torch.FloatTensor(mel),
+            "melody": torch.FloatTensor(melody).unsqueeze(0),
+            "chroma": torch.FloatTensor(chroma_resized),
+        }
+class PairSamplerDataset(Dataset):
+    """Song-level sampler yielding two clips of one song plus one negative clip.
+
+    Songs are read from ``<data_dir>/<split>.json``; items whose normalized
+    type is ``"reference"`` or whose audio file is missing are dropped. Each
+    song is repeated in the index according to ``sample_type_weights`` so
+    harder sample types are drawn more often.
+    """
+    def __init__(
+        self,
+        data_dir: str,
+        split: str = "train",
+        sr: int = 16000,
+        n_mels: int = 80,
+        n_fft: int = 512,
+        hop_length: int = 160,
+        segment_dur: float = 5.0,
+        augment: bool = True,
+        segment_strategy: str = "random",
+        silence_top_db: int = 30,
+        sample_type_weights: Optional[Dict[str, int]] = None,
+        pair_type_weights: Optional[Dict[str, float]] = None,
+        hard_negative_k: int = 1,
+        noise_roots: Optional[List[str]] = None,
+    ):
+        """Load the manifest, group items by song and precompute hard negatives.
+
+        Args:
+            data_dir: Directory holding ``<split>.json``. When it is named
+                ``manifests`` the audio paths are resolved against its parent.
+            split: Manifest name without the ``.json`` suffix.
+            sr, n_mels, n_fft, hop_length: Feature-extraction settings.
+            segment_dur: Clip length in seconds.
+            augment: Enables waveform and mel augmentation.
+            segment_strategy: ``"random"`` or a strategy understood by
+                ``compute_candidate_offsets`` (``"hybrid"`` adds a fallback pool).
+            silence_top_db: Forwarded to ``compute_candidate_offsets``.
+            sample_type_weights: Overrides for the per-type repeat counts.
+            pair_type_weights: Overrides for the per-type loss weights.
+            hard_negative_k: How many of the most similar songs are eligible
+                as hard negatives.
+            noise_roots: Forwarded to both ``AugmentPipeline`` instances.
+        """
+        self.sr = sr
+        self.n_mels = n_mels
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.segment_len = int(segment_dur * sr)
+        self.augment = augment
+        self.segment_strategy = segment_strategy
+        self.silence_top_db = silence_top_db
+        self.data_dir = Path(data_dir)
+        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
+        self.sample_type_weights = {
+            "default": 1,
+            "compressed": 2,
+            "recording": 3,
+            "environment": 4,
+            **(sample_type_weights or {}),
+        }
+        self.pair_type_weights = {
+            "default": 1.0,
+            "compressed": 1.5,
+            "recording": 2.0,
+            "environment": 2.5,
+            **(pair_type_weights or {}),
+        }
+        self.hard_negative_k = hard_negative_k
+        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
+        self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
+        self.aggressive_augmenter = AugmentPipeline(sr, aggressive=True, noise_roots=noise_roots)
+        with open(self.data_dir / f"{split}.json", encoding="utf-8") as f:
+            metadata = json.load(f)
+        self.by_song: Dict[str, List[Dict]] = {}
+        for item in metadata:
+            if not self._is_training_candidate(item):
+                continue
+            p = self.asset_root / item["audio_path"]
+            if p.exists():
+                self.by_song.setdefault(item["song_id"], []).append(item)
+        self.song_ids = sorted(self.by_song)
+        self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
+        self.sample_song_ids = []
+        self.hard_negative_map: Dict[str, List[str]] = self._build_hard_negative_map()
+        for sid, items in self.by_song.items():
+            item_types = {self._normalize_sample_type(x.get("type")) for x in items}
+            # A song is repeated as often as its most heavily weighted type.
+            weight = self.sample_type_weights.get("default", 1)
+            for item_type in item_types:
+                weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
+            self.sample_song_ids.extend([sid] * weight)
+    @staticmethod
+    def _normalize_sample_type(sample_type: Optional[str]) -> str:
+        """Map a manifest ``type`` onto the weight-table keys.
+
+        Legacy names ``humming_like``/``confused`` fold into
+        ``recording``/``environment``; ``None`` (or any falsy value) becomes
+        ``"default"``; unknown names pass through unchanged.
+        """
+        mapping = {
+            "reference": "reference",
+            "compressed": "compressed",
+            "recording": "recording",
+            "environment": "environment",
+            "humming_like": "recording",
+            "confused": "environment",
+            None: "default",
+        }
+        return mapping.get(sample_type, sample_type or "default")
+    def _is_training_candidate(self, item: Dict) -> bool:
+        """Return True for every item except reference recordings."""
+        sample_type = self._normalize_sample_type(item.get("type"))
+        return sample_type != "reference"
+    def _build_hard_negative_map(self) -> Dict[str, List[str]]:
+        """Rank, for every song, the other songs by mel-profile similarity.
+
+        Each song is summarized by the time-averaged mel of (at most) its
+        first two items, using the first 8 seconds of audio. Songs are then
+        ranked by cosine similarity; the top ``max(hard_negative_k, 1) * 4``
+        are kept. Files that cannot be loaded are skipped with a warning.
+        """
+        import warnings
+        song_features: Dict[str, np.ndarray] = {}
+        for song_id, items in self.by_song.items():
+            feats = []
+            for item in items[:2]:
+                path = self.asset_root / item["audio_path"]
+                try:
+                    y, _ = librosa.load(str(path), sr=self.sr, mono=True, duration=8.0)
+                    mel = self.feature_extractor.audio.to_mel(y)
+                    feats.append(np.mean(mel, axis=1))
+                except Exception as exc:
+                    # Best effort: one unreadable file must not abort dataset
+                    # construction, but it should not vanish silently either.
+                    warnings.warn(f"Skipping {path} while building hard negatives: {exc}", RuntimeWarning)
+                    continue
+            if feats:
+                song_features[song_id] = np.mean(feats, axis=0)
+        # Each norm is needed O(n) times below; compute it once per song.
+        norms = {sid: np.linalg.norm(feat) + 1e-12 for sid, feat in song_features.items()}
+        hard_negative_map: Dict[str, List[str]] = {}
+        song_ids = list(song_features)
+        keep = max(self.hard_negative_k, 1) * 4
+        for song_id in song_ids:
+            anchor = song_features[song_id]
+            anchor_norm = norms[song_id]
+            scored = []
+            for other_song_id in song_ids:
+                if other_song_id == song_id:
+                    continue
+                other = song_features[other_song_id]
+                score = float(np.dot(anchor, other) / (anchor_norm * norms[other_song_id]))
+                scored.append((score, other_song_id))
+            scored.sort(reverse=True)
+            hard_negative_map[song_id] = [other_song_id for _, other_song_id in scored[:keep]]
+        return hard_negative_map
+    def __len__(self):
+        """Number of (weight-repeated) song entries."""
+        return len(self.sample_song_ids)
+    def _pick_offset(self, full_y: np.ndarray, max_offset: float) -> float:
+        """Choose a crop start in seconds within ``[0, max_offset]``.
+
+        ``"random"`` draws uniformly. Other strategies use
+        ``compute_candidate_offsets``; ``"hybrid"`` additionally pools several
+        strategies when the direct call returns nothing, and everything else
+        falls back to a uniform draw.
+        """
+        if self.segment_strategy == "random":
+            return random.uniform(0, max_offset)
+        direct_candidates = compute_candidate_offsets(
+            y=full_y,
+            sr=self.sr,
+            segment_len=self.segment_len,
+            strategy=self.segment_strategy,
+            silence_top_db=self.silence_top_db,
+        )
+        if direct_candidates:
+            return min(random.choice(direct_candidates) / self.sr, max_offset)
+        if self.segment_strategy != "hybrid":
+            return random.uniform(0, max_offset)
+        candidate_pool: List[int] = []
+        for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
+            candidate_pool.extend(
+                compute_candidate_offsets(
+                    y=full_y,
+                    sr=self.sr,
+                    segment_len=self.segment_len,
+                    strategy=strategy,
+                    silence_top_db=self.silence_top_db,
+                )
+            )
+        if candidate_pool and random.random() < 0.75:
+            return min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
+        return random.uniform(0, max_offset)
+    def _load_clip(self, sample: Dict) -> np.ndarray:
+        """Load the sample's audio and return one ``segment_len``-sample crop.
+
+        Short audio is zero-padded at the end.
+        """
+        path = self.asset_root / sample["audio_path"]
+        full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
+        decoded_duration = len(full_y) / self.sr
+        # The manifest duration may overstate the decoded audio; clamping keeps
+        # the crop start inside the waveform instead of yielding pure padding.
+        duration = min(float(sample.get("duration", decoded_duration)), decoded_duration)
+        max_offset = max(0.0, duration - (self.segment_len / self.sr))
+        offset = self._pick_offset(full_y, max_offset) if max_offset > 0 else 0.0
+        start = int(offset * self.sr)
+        y = full_y[start : start + self.segment_len]
+        if len(y) < self.segment_len:
+            y = np.pad(y, (0, self.segment_len - len(y)))
+        return y
+    def _augment_wave(self, sample: Dict, y: np.ndarray) -> np.ndarray:
+        """Apply waveform augmentation; recording/environment types get the aggressive pipeline."""
+        if not self.augment:
+            return y
+        sample_type = self._normalize_sample_type(sample.get("type"))
+        if sample_type in {"recording", "environment"}:
+            return self.aggressive_augmenter(y)
+        return self.augmenter(y)
+    def _load_features(self, sample: Dict) -> Dict[str, torch.Tensor]:
+        """Crop, optionally augment, and extract the feature dict for one sample."""
+        y = self._load_clip(sample)
+        y = self._augment_wave(sample, y)
+        features = self.feature_extractor.extract(y)
+        # Mel-level augmentation follows the same switch as the waveform
+        # augmentation, so augment=False yields unmodified features.
+        if self.augment:
+            features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
+        return features
+    def _pick_positive_pair(self, song_id: str) -> tuple[Dict, Dict]:
+        """Return two distinct items of the song, or the same item twice if it has only one."""
+        choices = self.by_song[song_id]
+        if len(choices) == 1:
+            return choices[0], choices[0]
+        return tuple(random.sample(choices, 2))
+    def _pick_negative(self, song_id: str) -> Dict:
+        """Pick an item from another song, preferring hard negatives 80% of the time.
+
+        Raises:
+            ValueError: If the dataset holds no song other than ``song_id``.
+        """
+        hard_songs = self.hard_negative_map.get(song_id, [])
+        candidate_song_ids = hard_songs[: self.hard_negative_k]
+        if candidate_song_ids and random.random() < 0.8:
+            negative_song_id = random.choice(candidate_song_ids)
+        else:
+            pool = [sid for sid in self.song_ids if sid != song_id]
+            if not pool:
+                # An IndexError escaping __getitem__ would look like "end of
+                # sequence" to plain iteration; fail loudly instead.
+                raise ValueError(
+                    f"Cannot sample a negative for song {song_id!r}: the dataset contains no other song"
+                )
+            negative_song_id = random.choice(pool)
+        return random.choice(self.by_song[negative_song_id])
+    def __getitem__(self, idx):
+        """Build one training triple for the song at ``idx``.
+
+        Returns:
+            A dict whose ``"mel"``, ``"melody"`` and ``"chroma"`` entries stack
+            [positive A, positive B, negative] along a new first axis;
+            ``"song_id"`` holds the three class indices, ``"song_name"`` the
+            anchor song id, and ``"hard_weight"`` one weight per clip.
+        """
+        song_id = self.sample_song_ids[idx]
+        pos_a, pos_b = self._pick_positive_pair(song_id)
+        negative = self._pick_negative(song_id)
+        positive_items = [pos_a, pos_b]
+        positive_features = [self._load_features(sample) for sample in positive_items]
+        negative_features = self._load_features(negative)
+        hard_weights = [
+            self.pair_type_weights.get(self._normalize_sample_type(sample.get("type")), self.pair_type_weights["default"])
+            for sample in positive_items
+        ]
+        # The negative clip always receives the "environment" weight, whatever
+        # its own type is. NOTE(review): confirm this is intended.
+        hard_weights.append(self.pair_type_weights.get("environment", 2.5))
+        label = self.song_to_idx[song_id]
+        negative_label = self.song_to_idx[negative["song_id"]]
+        return {
+            "mel": torch.stack([feat["mel"] for feat in positive_features] + [negative_features["mel"]], dim=0),
+            "melody": torch.stack([feat["melody"] for feat in positive_features] + [negative_features["melody"]], dim=0),
+            "chroma": torch.stack([feat["chroma"] for feat in positive_features] + [negative_features["chroma"]], dim=0),
+            "song_id": torch.tensor([label, label, negative_label], dtype=torch.long),
+            "song_name": song_id,
+            "hard_weight": torch.tensor(hard_weights, dtype=torch.float32),
+        }
 class ACRDataset(Dataset):
    def __init__(
        self,
@@ -140,6 +404,7 @@ class ACRDataset(Dataset):
        references_only: bool = False,
        segment_strategy: str = "random",
        silence_top_db: int = 30,
+        noise_roots: Optional[List[str]] = None,
    ):
        self.sr = sr
        self.n_mels = n_mels
@@ -152,6 +417,8 @@ class ACRDataset(Dataset):
        self.silence_top_db = silence_top_db
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
+        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
+        self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
        meta_path = self.data_dir / f"{split}.json"
        with open(meta_path) as f:
@@ -179,16 +446,6 @@ class ACRDataset(Dataset):
            y = y[: self.segment_len]
        return y
-    def _to_mel(self, y: np.ndarray) -> np.ndarray:
-        mel = librosa.feature.melspectrogram(
-            y=y,
-            sr=self.sr,
-            n_mels=self.n_mels,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-        )
-        return librosa.power_to_db(mel, ref=np.max)
    def _choose_offset(self, sample: Dict, audio_path: Path) -> float:
        duration = float(sample["duration"])
        max_offset = max(0.0, duration - (self.segment_len / self.sr))
@@ -231,24 +488,22 @@ class ACRDataset(Dataset):
    def __getitem__(self, idx):
        sample = self.samples[idx // self.n_crops]
        audio_path = self.asset_root / sample["audio_path"]
        offset = self._choose_offset(sample, audio_path)
        y = self._load_segment(str(audio_path), offset, 5.0)
        if self.augment and sample.get("type") != "reference":
-            from src.utils.augment import AugmentPipeline
+            y = self.augmenter(y)
-            aug = AugmentPipeline(self.sr)
-            y = aug(y)
-        mel = self._to_mel(y)
+        features = self.feature_extractor.extract(y)
-        mel_tensor = torch.FloatTensor(mel)
+        features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
        song_id = sample["song_id"]
        class_id = self.song_to_idx[song_id]
        return {
-            "mel": mel_tensor,
+            "mel": features["mel"],
+            "melody": features["melody"],
+            "chroma": features["chroma"],
            "song_id": torch.tensor(class_id, dtype=torch.long),
            "song_name": song_id,
            "type": sample.get("type", "unknown"),
@@ -272,6 +527,7 @@ class ACRTestDataset(Dataset):
        self.hop_length = hop_length
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
+        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        meta_path = self.data_dir / f"{split}.json"
        with open(meta_path) as f:
@@ -299,171 +555,17 @@ class ACRTestDataset(Dataset):
        else:
            y = y[:seg_len]
-        mel = librosa.power_to_db(
+        features = self.feature_extractor.extract(y)
-            librosa.feature.melspectrogram(
-                y=y,
-                sr=self.sr,
-                n_mels=self.n_mels,
-                n_fft=self.n_fft,
-                hop_length=self.hop_length,
-            ),
-            ref=np.max,
-        )
        class_id = self.song_to_idx[sample["song_id"]]
        return {
-            "mel": torch.FloatTensor(mel),
+            "mel": features["mel"],
+            "melody": features["melody"],
+            "chroma": features["chroma"],
            "song_id": torch.tensor(class_id, dtype=torch.long),
            "song_name": sample["song_id"],
            "type": sample.get("type", "unknown"),
        }
-class SongPairDataset(Dataset):
+class SongPairDataset(PairSamplerDataset):
-    def __init__(
+    pass
-        self,
-        data_dir: str,
-        split: str = "train",
-        sr: int = 16000,
-        n_mels: int = 80,
-        n_fft: int = 512,
-        hop_length: int = 160,
-        segment_dur: float = 5.0,
-        augment: bool = True,
-        segment_strategy: str = "random",
-        silence_top_db: int = 30,
-        sample_type_weights: Optional[Dict[str, int]] = None,
-        pair_type_weights: Optional[Dict[str, float]] = None,
-    ):
-        self.sr = sr
-        self.n_mels = n_mels
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.segment_len = int(segment_dur * sr)
-        self.augment = augment
-        self.segment_strategy = segment_strategy
-        self.silence_top_db = silence_top_db
-        self.data_dir = Path(data_dir)
-        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
-        self.sample_type_weights = {
-            "default": 1,
-            "humming_like": 3,
-            "confused": 5,
-            **(sample_type_weights or {}),
-        }
-        self.pair_type_weights = {
-            "default": 1.0,
-            "augmented": 1.4,
-            "humming_like": 2.5,
-            "confused": 4.0,
-            **(pair_type_weights or {}),
-        }
-        with open(self.data_dir / f"{split}.json") as f:
-            metadata = json.load(f)
-        self.by_song: Dict[str, List[Dict]] = {}
-        for item in metadata:
-            if item.get("type") == "reference":
-                continue
-            p = self.asset_root / item["audio_path"]
-            if p.exists():
-                self.by_song.setdefault(item["song_id"], []).append(item)
-        self.song_ids = sorted(self.by_song)
-        self.sample_song_ids = []
-        for sid, items in self.by_song.items():
-            item_types = {x.get("type") for x in items}
-            weight = self.sample_type_weights.get("default", 1)
-            for item_type in item_types:
-                weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
-            self.sample_song_ids.extend([sid] * weight)
-        self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
-    def __len__(self):
-        return len(self.sample_song_ids)
-    def _load_clip(self, sample: Dict) -> np.ndarray:
-        path = self.asset_root / sample["audio_path"]
-        full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
-        duration = float(sample.get("duration", len(full_y) / self.sr))
-        max_offset = max(0.0, duration - (self.segment_len / self.sr))
-        offset = 0.0
-        if max_offset > 0:
-            if self.segment_strategy == "random":
-                offset = random.uniform(0, max_offset)
-            else:
-                direct_candidates = compute_candidate_offsets(
-                    y=full_y,
-                    sr=self.sr,
-                    segment_len=self.segment_len,
-                    strategy=self.segment_strategy,
-                    silence_top_db=self.silence_top_db,
-                )
-                if direct_candidates:
-                    offset = min(random.choice(direct_candidates) / self.sr, max_offset)
-                elif self.segment_strategy == "hybrid":
-                    candidate_pool: List[int] = []
-                    for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
-                        candidate_pool.extend(
-                            compute_candidate_offsets(
-                                y=full_y,
-                                sr=self.sr,
-                                segment_len=self.segment_len,
-                                strategy=strategy,
-                                silence_top_db=self.silence_top_db,
-                            )
-                        )
-                    if candidate_pool and random.random() < 0.75:
-                        offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
-                    else:
-                        offset = random.uniform(0, max_offset)
-                else:
-                    offset = random.uniform(0, max_offset)
-        start = int(offset * self.sr)
-        y = full_y[start : start + self.segment_len]
-        if len(y) < self.segment_len:
-            y = np.pad(y, (0, self.segment_len - len(y)))
-        return y
-    def _to_mel(self, y: np.ndarray) -> torch.Tensor:
-        mel = librosa.feature.melspectrogram(
-            y=y,
-            sr=self.sr,
-            n_mels=self.n_mels,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-        )
-        mel = librosa.power_to_db(mel, ref=np.max)
-        return torch.FloatTensor(mel)
-    def __getitem__(self, idx):
-        song_id = self.sample_song_ids[idx]
-        choices = self.by_song[song_id]
-        if len(choices) == 1:
-            a = b = choices[0]
-        else:
-            a, b = random.sample(choices, 2)
-        pair_weights = [
-            self.pair_type_weights.get(a.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
-            self.pair_type_weights.get(b.get("type", "unknown"), self.pair_type_weights.get("default", 1.0)),
-        ]
-        wavs = []
-        for sample in (a, b):
-            y = self._load_clip(sample)
-            if self.augment:
-                from src.utils.augment import AugmentPipeline
-                y = AugmentPipeline(self.sr, aggressive=sample.get("type") in {"confused", "humming_like"})(y)
-            wavs.append(self._to_mel(y))
-        max_t = max(w.shape[1] for w in wavs)
-        wavs = [torch.nn.functional.pad(w, (0, max_t - w.shape[1])) if w.shape[1] < max_t else w for w in wavs]
-        label = self.song_to_idx[song_id]
-        return {
-            "mel": torch.stack(wavs, dim=0),
-            "song_id": torch.tensor([label, label], dtype=torch.long),
-            "song_name": song_id,
-            "hard_weight": torch.tensor(pair_weights, dtype=torch.float32),
-        }
--- a/acr-engine/src/models/ecapa_tdnn.py
View file @7da7686
+++ b/acr-engine/src/models/ecapa_tdnn.py
View file @7da7686
@@ -3,6 +3,55 @@ import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Tuple, List
+try:
+    from transformers import AutoModel
+except ImportError:
+    AutoModel = None
+class FrozenMERTFeatureExtractor(nn.Module):
+    """Frozen semantic feature extractor with a convolutional fallback.
+
+    When ``model_name`` can be loaded through ``transformers.AutoModel`` the
+    pretrained backbone is used (frozen) followed by a trainable 1x1
+    projection to ``hidden_dim``. Otherwise a frozen two-layer convolutional
+    projection of the mel input is used and a warning explains why.
+    """
+    def __init__(self, model_name: Optional[str], n_mels: int, hidden_dim: int):
+        """Build the fallback projection and, if possible, load the backbone.
+
+        Args:
+            model_name: Pretrained model identifier, or ``None``/empty to use
+                the convolutional fallback on purpose.
+            n_mels: Channel count of the mel input (fallback path).
+            hidden_dim: Channel count of the produced features.
+        """
+        import warnings
+        super().__init__()
+        self.model_name = model_name
+        self.hidden_dim = hidden_dim
+        self.backbone = None
+        self.proj = nn.Sequential(
+            nn.Conv1d(n_mels, hidden_dim, kernel_size=3, padding=1),
+            nn.GELU(),
+            nn.BatchNorm1d(hidden_dim),
+            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
+            nn.GELU(),
+            nn.BatchNorm1d(hidden_dim),
+        )
+        for parameter in self.proj.parameters():
+            parameter.requires_grad = False
+        if model_name and AutoModel is None:
+            # A requested backbone that silently degrades is hard to debug.
+            warnings.warn(
+                f"transformers is not installed; {model_name!r} is replaced by the frozen convolutional projection.",
+                RuntimeWarning,
+            )
+        if model_name and AutoModel is not None:
+            try:
+                self.backbone = AutoModel.from_pretrained(model_name)
+            except Exception as exc:
+                # Keep the best-effort fallback, but say that it happened.
+                warnings.warn(
+                    f"Could not load pretrained backbone {model_name!r} ({exc}); "
+                    "falling back to the frozen convolutional projection.",
+                    RuntimeWarning,
+                )
+                self.backbone = None
+            if self.backbone is not None:
+                for parameter in self.backbone.parameters():
+                    parameter.requires_grad = False
+                self.backbone.eval()
+                backbone_dim = getattr(self.backbone.config, "hidden_size", hidden_dim)
+                # This projection replaces the frozen fallback and stays trainable.
+                self.proj = nn.Sequential(
+                    nn.Conv1d(backbone_dim, hidden_dim, kernel_size=1),
+                    nn.GELU(),
+                    nn.BatchNorm1d(hidden_dim),
+                )
+    def train(self, mode: bool = True):
+        """Switch train/eval mode while keeping the frozen backbone in eval mode.
+
+        A parent ``.train()`` call would otherwise re-enable train-only
+        behavior (e.g. dropout) inside a backbone that is meant to be frozen.
+        """
+        super().train(mode)
+        if self.backbone is not None:
+            self.backbone.eval()
+        return self
+    def forward(self, mel: torch.Tensor) -> torch.Tensor:
+        """Return ``hidden_dim``-channel features for ``mel``.
+
+        The fallback path runs entirely without gradients; with a backbone
+        only the backbone call is gradient-free.
+        """
+        if self.backbone is None:
+            with torch.no_grad():
+                return self.proj(mel)
+        waveform_like = mel.transpose(1, 2)
+        # NOTE(review): mel frames are passed as `inputs_embeds`. Confirm the
+        # configured backbone's forward accepts that keyword with a last
+        # dimension of n_mels -- audio encoders in transformers commonly
+        # expect raw-waveform `input_values` instead.
+        with torch.no_grad():
+            outputs = self.backbone(inputs_embeds=waveform_like)
+            hidden = outputs.last_hidden_state.transpose(1, 2)
+        return self.proj(hidden)
 class SEModule(nn.Module):
    def __init__(self, channels, se_channels=128):
@@ -123,6 +172,89 @@ class AAMSoftmax(nn.Module):
        return output
+class CoverHunterHead(nn.Module):
+    """Transformer encoder with attention pooling that emits L2-normalized embeddings."""
+    def __init__(self, input_dim: int, embed_dim: int, num_heads: int = 4, num_layers: int = 2, ff_mult: int = 4):
+        """Assemble encoder, attention scorer, projection and output norm.
+
+        Args:
+            input_dim: Feature size of every sequence element.
+            embed_dim: Size of the returned embedding.
+            num_heads: Attention heads per encoder layer.
+            num_layers: Number of stacked encoder layers.
+            ff_mult: Feed-forward width as a multiple of ``input_dim``.
+        """
+        super().__init__()
+        layer_options = dict(
+            d_model=input_dim,
+            nhead=num_heads,
+            dim_feedforward=input_dim * ff_mult,
+            batch_first=True,
+            activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(**layer_options), num_layers=num_layers)
+        # Scores each sequence position with a single scalar.
+        scorer_layers = [nn.Linear(input_dim, input_dim), nn.Tanh(), nn.Linear(input_dim, 1)]
+        self.attention = nn.Sequential(*scorer_layers)
+        self.proj = nn.Linear(input_dim, embed_dim)
+        self.norm = nn.BatchNorm1d(embed_dim, affine=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode ``x`` (batch-first sequence), pool over dim 1 and return unit-norm embeddings."""
+        hidden = self.encoder(x)
+        scores = self.attention(hidden).squeeze(-1)
+        # Softmax over dim 1 turns the per-position scores into pooling weights.
+        pooling = torch.softmax(scores, dim=1).unsqueeze(-1)
+        summary = torch.sum(hidden * pooling, dim=1)
+        return F.normalize(self.norm(self.proj(summary)), p=2, dim=1)
+class MERTMelodyBranch(nn.Module):
+    """Fuse frozen semantic features with projected melody and chroma features."""
+    def __init__(
+        self,
+        n_mels: int,
+        chroma_bins: int = 12,
+        melody_bins: int = 1,
+        hidden_dim: int = 256,
+        mert_model_name: Optional[str] = None,
+    ):
+        """Create the semantic extractor, the melody/chroma projection and the fusion stack.
+
+        Args:
+            n_mels: Channel count of the mel input.
+            chroma_bins: Channel count of the chroma input.
+            melody_bins: Channel count of the melody input.
+            hidden_dim: Channel count of every intermediate and output feature.
+            mert_model_name: Optional pretrained model name forwarded to
+                ``FrozenMERTFeatureExtractor``.
+        """
+        super().__init__()
+        self.mert = FrozenMERTFeatureExtractor(model_name=mert_model_name, n_mels=n_mels, hidden_dim=hidden_dim)
+        melodic_channels = chroma_bins + melody_bins
+        self.melody_proj = nn.Conv1d(melodic_channels, hidden_dim, kernel_size=1)
+        fuse_layers = [
+            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
+            nn.ReLU(),
+            nn.BatchNorm1d(hidden_dim),
+        ]
+        self.fuse = nn.Sequential(*fuse_layers)
+    def forward(self, mert: torch.Tensor, melody: torch.Tensor, chroma: torch.Tensor) -> torch.Tensor:
+        """Return the fused features; ``mert`` is the input handed to the semantic extractor."""
+        semantic = self.mert(mert)
+        # Melody and chroma are stacked along dim 1 before the 1x1 projection.
+        stacked = torch.cat([melody, chroma], dim=1)
+        melodic = self.melody_proj(stacked)
+        combined = torch.cat([semantic, melodic], dim=1)
+        return self.fuse(combined)
+class ECAPABranch(nn.Module):
+    """Mel front-end: optional band-split block followed by a width-5 convolution."""
+    def __init__(self, n_mels: int, channels: int, use_band_split: bool, band_split_channels: int):
+        """Build the optional band-split block and the projection to ``channels``."""
+        super().__init__()
+        if use_band_split:
+            # Input width of the projection is five times band_split_channels;
+            # presumably BandSplitBlock emits five bands -- TODO confirm.
+            front_channels = band_split_channels * 5
+            self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels)
+        else:
+            front_channels = n_mels
+            self.band_split = None
+        projection_layers = [
+            nn.Conv1d(front_channels, channels, kernel_size=5, stride=1, padding=2),
+            nn.ReLU(),
+            nn.BatchNorm1d(channels),
+        ]
+        self.proj = nn.Sequential(*projection_layers)
+    def forward(self, mel: torch.Tensor) -> torch.Tensor:
+        """Project ``mel`` to ``channels`` features, band-splitting first when enabled."""
+        if self.band_split is None:
+            return self.proj(mel)
+        return self.proj(self.band_split(mel))
+class DualStreamFusion(nn.Module):
+    """Project both streams to a common width, concatenate them and fuse with a 1x1 convolution."""
+    def __init__(self, mert_dim: int, ecapa_dim: int, hidden_dim: int):
+        """Create one 1x1 projection per stream and the fusion stack.
+
+        Args:
+            mert_dim: Channel count of the first (MERT/melody) stream.
+            ecapa_dim: Channel count of the second (ECAPA) stream.
+            hidden_dim: Channel count of each projection and of the output.
+        """
+        super().__init__()
+        self.mert_gate = nn.Conv1d(mert_dim, hidden_dim, kernel_size=1)
+        self.ecapa_gate = nn.Conv1d(ecapa_dim, hidden_dim, kernel_size=1)
+        fusion_layers = [
+            nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
+            nn.ReLU(),
+            nn.BatchNorm1d(hidden_dim),
+        ]
+        self.fuse = nn.Sequential(*fusion_layers)
+    def forward(self, mert_stream: torch.Tensor, ecapa_stream: torch.Tensor) -> torch.Tensor:
+        """Return the fused features of the two streams (concatenated along dim 1)."""
+        projected_mert = self.mert_gate(mert_stream)
+        projected_ecapa = self.ecapa_gate(ecapa_stream)
+        stacked = torch.cat([projected_mert, projected_ecapa], dim=1)
+        return self.fuse(stacked)
 class ECAPA_ACR(nn.Module):
    def __init__(
        self,
@@ -137,9 +269,36 @@ class ECAPA_ACR(nn.Module):
        aam_s: float = 30.0,
        use_band_split: bool = True,
        band_split_channels: int = 128,
+        use_dual_stream: bool = True,
+        coverhunter_heads: int = 4,
+        coverhunter_layers: int = 2,
+        fusion_hidden_dim: int = 256,
+        mert_model_name: Optional[str] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
+        self.use_dual_stream = use_dual_stream
+        if use_dual_stream:
+            self.mert_melody_branch = MERTMelodyBranch(
+                n_mels=n_mels,
+                chroma_bins=12,
+                melody_bins=1,
+                hidden_dim=fusion_hidden_dim,
+                mert_model_name=mert_model_name,
+            )
+            self.ecapa_branch = ECAPABranch(
+                n_mels=n_mels,
+                channels=channels,
+                use_band_split=use_band_split,
+                band_split_channels=band_split_channels,
+            )
+            self.stream_fusion = DualStreamFusion(
+                mert_dim=fusion_hidden_dim,
+                ecapa_dim=channels,
+                hidden_dim=channels,
+            )
+            front_channels = channels
+        else:
            front_channels = band_split_channels * 5 if use_band_split else channels
            self.band_split = BandSplitBlock(n_mels=n_mels, out_channels=band_split_channels) if use_band_split else None
@@ -169,24 +328,39 @@ class ECAPA_ACR(nn.Module):
            nn.ReLU(),
            nn.BatchNorm1d(channels * 3),
        )
-        self.pooling = StatisticsPooling()
+        self.coverhunter = CoverHunterHead(
-        self.fc = nn.Linear(channels * 3 * 2, embed_dim)
+            input_dim=channels * 3,
-        self.bn = nn.BatchNorm1d(embed_dim, affine=False)
+            embed_dim=embed_dim,
+            num_heads=coverhunter_heads,
+            num_layers=coverhunter_layers,
+        )
        self.aam = AAMSoftmax(embed_dim, num_classes, m=aam_m, s=aam_s) if num_classes is not None else None
-    def forward(self, mel: torch.Tensor, labels: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    def forward(
+        self,
+        mel: torch.Tensor,
+        labels: Optional[torch.Tensor] = None,
+        melody: Optional[torch.Tensor] = None,
+        chroma: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        if self.use_dual_stream:
+            if melody is None or chroma is None:
+                raise ValueError("melody and chroma are required when dual-stream fusion is enabled")
+            mert_stream = self.mert_melody_branch(mel, melody, chroma)
+            ecapa_stream = self.ecapa_branch(mel)
+            x = self.stream_fusion(mert_stream, ecapa_stream)
+        else:
            x = self.band_split(mel) if self.band_split is not None else mel
            x = self.conv1(x)
+        if self.use_dual_stream:
+            x = self.conv1(x)
        block_outputs = []
        for block in self.blocks:
            x = block(x)
            block_outputs.append(x)
        x = torch.cat(block_outputs, dim=1)
        x = self.mfa(x)
-        x = self.pooling(x)
+        embedding = self.coverhunter(x.transpose(1, 2))
-        x = self.fc(x)
-        x = self.bn(x)
-        embedding = F.normalize(x, p=2, dim=1)
        if labels is not None and self.aam is not None:
            logits = self.aam(embedding, labels)
            return embedding, logits
--- a/acr-engine/src/models/losses.py
View file @7da7686
+++ b/acr-engine/src/models/losses.py
View file @7da7686
@@ -3,30 +3,22 @@ import torch.nn as nn
 import torch.nn.functional as F
-class SupConLoss(nn.Module):
+class InfoNCELoss(nn.Module):
    def __init__(self, temperature: float = 0.07):
        super().__init__()
        self.temperature = temperature
    def forward(self, features: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
-        batch_size = features.shape[0]
-        labels = labels.contiguous().view(-1, 1)
-        mask = torch.eq(labels, labels.T).float().to(features.device)
-        mask = mask - torch.eye(batch_size, device=features.device)
        features = F.normalize(features, dim=1)
-        sim = torch.matmul(features, features.T) / self.temperature
+        logits = torch.matmul(features, features.T) / self.temperature
-        sim_max, _ = torch.max(sim, dim=1, keepdim=True)
+        labels = labels.contiguous().view(-1, 1)
-        sim = sim - sim_max.detach()
+        positive_mask = torch.eq(labels, labels.T).float().to(features.device)
+        positive_mask = positive_mask - torch.eye(features.size(0), device=features.device)
-        exp_sim = torch.exp(sim) * (1 - torch.eye(batch_size, device=features.device))
+        logits = logits - logits.max(dim=1, keepdim=True).values.detach()
-        log_prob = sim - torch.log(exp_sim.sum(dim=1, keepdim=True))
+        exp_logits = torch.exp(logits) * (1 - torch.eye(features.size(0), device=features.device))
+        log_prob = logits - torch.log(exp_logits.sum(dim=1, keepdim=True) + 1e-12)
-        pos_mask = mask
+        positives = positive_mask.sum(dim=1).clamp(min=1)
-        pos_count = pos_mask.sum(dim=1)
+        return -((positive_mask * log_prob).sum(dim=1) / positives)
-        loss = -(log_prob * pos_mask).sum(dim=1)
-        loss = loss / pos_count.clamp(min=1)
-        return loss
 class CombinedLoss(nn.Module):
@@ -37,8 +29,7 @@ class CombinedLoss(nn.Module):
        aam_weight: float = 0.3,
    ):
        super().__init__()
-        self.supcon = SupConLoss(temperature)
+        self.infonce = InfoNCELoss(temperature)
-        self.ce = nn.CrossEntropyLoss()
        self.supcon_weight = supcon_weight
        self.aam_weight = aam_weight
@@ -50,21 +41,20 @@ class CombinedLoss(nn.Module):
        supcon_labels: torch.Tensor,
        hard_weight: torch.Tensor | None = None,
    ) -> dict:
-        loss_supcon = self.supcon(embedding, supcon_labels)
+        loss_infonce = self.infonce(embedding, supcon_labels)
        loss_ce = F.cross_entropy(logits, labels, reduction="none")
        if hard_weight is not None:
            weight = hard_weight.float()
            if weight.dim() == 0:
                weight = weight.unsqueeze(0)
-            loss_supcon = loss_supcon * weight
+            loss_infonce = loss_infonce * weight
            loss_ce = loss_ce * weight
-        loss_supcon = loss_supcon.mean()
+        loss_infonce = loss_infonce.mean()
        loss_ce = loss_ce.mean()
+        total = self.supcon_weight * loss_infonce + self.aam_weight * loss_ce
-        total = self.supcon_weight * loss_supcon + self.aam_weight * loss_ce
        return {
            "loss": total,
-            "supcon_loss": loss_supcon.item(),
+            "supcon_loss": loss_infonce.item(),
            "ce_loss": loss_ce.item(),
        }
--- a/acr-engine/src/utils/augment.py
View file @7da7686
+++ b/acr-engine/src/utils/augment.py
View file @7da7686
 import numpy as np
 import random
-from typing import Optional, Tuple
+from pathlib import Path
+from typing import Iterable, Optional, Tuple
+import librosa
+import soundfile as sf
+try:
+    from audiomentations import AddBackgroundNoise, AddGaussianNoise, BandPassFilter, Compose, Mp3Compression, PitchShift, TimeStretch
+    HAS_AUDIO_AUG = True
+except Exception:
+    AddBackgroundNoise = AddGaussianNoise = BandPassFilter = Compose = Mp3Compression = PitchShift = TimeStretch = None
+    HAS_AUDIO_AUG = False
-class AugmentPipeline:
-    def __init__(self, sr: int = 16000, aggressive: bool = False):
-        self.sr = sr
-        self.noise_snr_range = (5, 30)
-        self.pitch_shift_range = (-6, 6)
-        self.time_stretch_range = (0.85, 1.15)
-        self.mp3_bitrate_range = (32, 128)
-        self.aggressive = aggressive
-    def add_noise(self, y: np.ndarray, snr_db: Optional[float] = None) -> np.ndarray:
+class NoiseLibrary:
-        if snr_db is None:
+    def __init__(self, roots: Optional[Iterable[str]] = None):
-            snr_db = random.uniform(*self.noise_snr_range)
+        self.paths = []
-        signal_power = np.mean(y ** 2)
+        for root in roots or []:
-        noise_power = signal_power / (10 ** (snr_db / 10))
+            base = Path(root)
-        noise = np.random.randn(len(y)) * np.sqrt(noise_power)
+            if not base.exists():
-        return y + noise
+                continue
+            for pattern in ("*.wav", "*.mp3", "*.flac", "*.ogg", "*.m4a"):
+                self.paths.extend(base.rglob(pattern))
-    def pitch_shift(self, y: np.ndarray, semitones: Optional[float] = None) -> np.ndarray:
+    def directories(self) -> list[str]:
-        if semitones is None:
+        if not self.paths:
-            semitones = random.uniform(*self.pitch_shift_range)
+            return []
-        return librosa_shift(y, sr=self.sr, n_steps=semitones)
+        return sorted({str(path.parent) for path in self.paths})
-    def time_stretch(self, y: np.ndarray, rate: Optional[float] = None) -> np.ndarray:
-        if rate is None:
-            rate = random.uniform(*self.time_stretch_range)
-        return librosa_ts(y, sr=self.sr, rate=rate)
-    def add_reverb(self, y: np.ndarray, decay: float = 0.3) -> np.ndarray:
+class AugmentPipeline:
-        ir_len = int(0.1 * self.sr)
+    def __init__(
-        ir = np.exp(-np.arange(ir_len) * decay / ir_len) * np.random.randn(ir_len)
+        self,
-        ir /= np.sqrt(np.sum(ir ** 2))
+        sr: int = 16000,
-        return np.convolve(y, ir, mode='same')[:len(y)]
+        aggressive: bool = False,
+        noise_roots: Optional[Iterable[str]] = None,
+        freq_mask_prob: float = 0.3,
+    ):
+        self.sr = sr
+        self.aggressive = aggressive
+        self.freq_mask_prob = freq_mask_prob
+        self.noise_library = NoiseLibrary(noise_roots)
+        self.wave_augment = self._build_wave_augmenter()
+    def _build_wave_augmenter(self):
+        """Assemble the waveform augmentation chain.
+
+        Returns ``None`` when ``HAS_AUDIO_AUG`` is false (the audiomentations
+        import failed); otherwise a ``Compose`` over the configured transforms.
+        A background-noise stage is appended only when the noise library
+        reports at least one directory.
+        """
+        if not HAS_AUDIO_AUG:
+            return None
+        aggressive = self.aggressive
+        # The aggressive profile raises the probability of every transform.
+        gaussian_p = 0.8 if aggressive else 0.5
+        effect_p = 0.55 if aggressive else 0.35
+        chain = [
+            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=gaussian_p),
+            BandPassFilter(
+                min_center_freq=300.0,
+                max_center_freq=3200.0,
+                min_bandwidth_fraction=0.3,
+                max_bandwidth_fraction=0.8,
+                p=effect_p,
+            ),
+            Mp3Compression(min_bitrate=24, max_bitrate=96, p=effect_p),
+            PitchShift(min_semitones=-5, max_semitones=5, p=effect_p),
+            TimeStretch(min_rate=0.8, max_rate=1.2, p=effect_p),
+        ]
+        noise_dirs = self.noise_library.directories()
+        if not noise_dirs:
+            return Compose(chain)
+        # Band-limit the background noise half of the time before mixing it in.
+        noise_filter = Compose([
+            BandPassFilter(
+                min_center_freq=250.0,
+                max_center_freq=4000.0,
+                min_bandwidth_fraction=0.2,
+                max_bandwidth_fraction=0.9,
+                p=0.5,
+            )
+        ])
+        chain.append(
+            AddBackgroundNoise(
+                sounds_path=noise_dirs,
+                min_snr_db=3.0 if aggressive else 8.0,
+                max_snr_db=20.0 if aggressive else 30.0,
+                noise_transform=noise_filter,
+                p=0.6 if aggressive else 0.35,
+            )
+        )
+        return Compose(chain)
-    def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 8) -> np.ndarray:
+    def apply_spec_augment(self, mel: np.ndarray, max_time_mask: int = 20, max_freq_mask: int = 12) -> np.ndarray:
        mel = mel.copy()
        t = mel.shape[1]
        f = mel.shape[0]
@@ -46,43 +91,21 @@ class AugmentPipeline:
            if t_start < t:
                mel[:, t_start:t_start + t_mask] = 0
        for _ in range(2):
-            f_mask = random.randint(0, max_freq_mask)
+            f_mask = random.randint(max(1, max_freq_mask // 3), max_freq_mask)
            f_start = random.randint(0, max(0, f - f_mask))
            if f_start < f:
                mel[f_start:f_start + f_mask, :] = 0
        return mel
    def apply_to_mel(self, mel: np.ndarray) -> np.ndarray:
-        if random.random() < 0.3:
+        if random.random() < self.freq_mask_prob:
            mel = self.apply_spec_augment(mel)
        return mel
    def __call__(self, y: np.ndarray) -> np.ndarray:
-        noise_p = 0.75 if self.aggressive else 0.5
+        if self.wave_augment is None:
-        stretch_p = 0.55 if self.aggressive else 0.3
-        pitch_p = 0.55 if self.aggressive else 0.3
-        reverb_p = 0.35 if self.aggressive else 0.2
-        if random.random() < noise_p:
-            y = self.add_noise(y, snr_db=random.uniform(0, 18) if self.aggressive else None)
-        if random.random() < stretch_p:
-            y = self.time_stretch(y, rate=random.uniform(0.8, 1.2) if self.aggressive else None)
-        if random.random() < pitch_p:
-            y = self.pitch_shift(y, semitones=random.uniform(-8, 8) if self.aggressive else None)
-        if random.random() < reverb_p:
-            y = self.add_reverb(y, decay=random.uniform(0.2, 0.6))
            return y
-def librosa_shift(y, sr=16000, n_steps=0):
-    return librosa_impl(y, lambda: __import__('librosa').effects.pitch_shift(y, sr=sr, n_steps=n_steps))
-def librosa_ts(y, sr=16000, rate=1.0):
-    return librosa_impl(y, lambda: __import__('librosa').effects.time_stretch(y, rate=rate))
-def librosa_impl(y, fn):
        try:
-        return fn()
+            return self.wave_augment(samples=y.astype(np.float32), sample_rate=self.sr)
        except Exception:
            return y
--- a/acr-engine/train.py
View file @7da7686
+++ b/acr-engine/train.py
View file @7da7686
@@ -4,6 +4,7 @@
 import argparse
 import json
 import sys
+from datetime import datetime
 from pathlib import Path
 import torch
@@ -21,15 +22,23 @@ from src.models.losses import CombinedLoss
 def collate_fn(batch):
    mels = []
+    melodies = []
+    chromas = []
    song_ids = []
    song_names = []
    hard_weights = []
    for b in batch:
        mel = b["mel"]
+        melody = b.get("melody")
+        chroma = b.get("chroma")
        hw = b.get("hard_weight", torch.tensor(1.0))
        if mel.dim() == 3:
            for i in range(mel.shape[0]):
                mels.append(mel[i])
+                if melody is not None:
+                    melodies.append(melody[i])
+                if chroma is not None:
+                    chromas.append(chroma[i])
                song_ids.append(b["song_id"][i])
                song_names.append(b["song_name"])
                if torch.is_tensor(hw) and hw.dim() > 0:
@@ -38,24 +47,45 @@ def collate_fn(batch):
                    hard_weights.append(hw)
        else:
            mels.append(mel)
+            if melody is not None:
+                melodies.append(melody)
+            if chroma is not None:
+                chromas.append(chroma)
            song_ids.append(b["song_id"])
            song_names.append(b["song_name"])
            hard_weights.append(hw)
    max_t = max(m.shape[1] for m in mels)
    mels_padded = []
-    for m in mels:
+    melodies_padded = []
+    chromas_padded = []
+    for idx, m in enumerate(mels):
        pad = max_t - m.shape[1]
        if pad > 0:
            m = torch.nn.functional.pad(m, (0, pad))
        mels_padded.append(m.unsqueeze(0))
+        if melodies:
-    return {
+            melody = melodies[idx]
+            if melody.shape[1] < max_t:
+                melody = torch.nn.functional.pad(melody, (0, max_t - melody.shape[1]))
+            melodies_padded.append(melody.unsqueeze(0))
+        if chromas:
+            chroma = chromas[idx]
+            if chroma.shape[1] < max_t:
+                chroma = torch.nn.functional.pad(chroma, (0, max_t - chroma.shape[1]))
+            chromas_padded.append(chroma.unsqueeze(0))
+    payload = {
        "mel": torch.cat(mels_padded, dim=0),
        "song_id": torch.stack(song_ids),
        "song_name": song_names,
        "hard_weight": torch.stack(hard_weights),
    }
+    if melodies_padded:
+        payload["melody"] = torch.cat(melodies_padded, dim=0)
+    if chromas_padded:
+        payload["chroma"] = torch.cat(chromas_padded, dim=0)
+    return payload
 def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg):
@@ -64,10 +94,14 @@ def train_epoch(model, loader, optimizer, criterion, scaler, device, epoch, cfg)
    pbar = tqdm(loader, desc=f"Epoch {epoch}")
    for batch in pbar:
        mel = batch["mel"].to(device)
+        melody = batch.get("melody")
+        chroma = batch.get("chroma")
+        melody = melody.to(device) if melody is not None else None
+        chroma = chroma.to(device) if chroma is not None else None
        labels = batch["song_id"].to(device)
        with torch.amp.autocast("cuda", enabled=cfg["training"]["mixed_precision"] and device.type == "cuda"):
-            embedding, logits = model(mel, labels)
+            embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
            loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
        optimizer.zero_grad()
@@ -115,6 +149,28 @@ def save_checkpoint(output_dir, epoch, model, optimizer, best_metric, cfg, name)
    print(f"  Saved: {path}")
+def write_training_artifacts(output_dir: Path, cfg: dict, train_metrics: dict, train_dataset, args):
+    """Write the training metrics and a run manifest as JSON into ``output_dir``.
+
+    Two files are produced: ``training_metrics.json`` (``train_metrics`` as
+    given) and ``training_manifest.json`` (timestamp, config, dataset counts,
+    CLI options, artifact paths and the same metrics).
+
+    Args:
+        output_dir: Existing directory that receives both JSON files.
+        cfg: Training configuration, embedded verbatim in the manifest.
+        train_metrics: Metrics dict supplied by the caller.
+        train_dataset: Object exposing ``song_ids`` and supporting ``len()``.
+        args: Parsed CLI arguments; ``segment_strategy`` and ``noise_root`` are recorded.
+    """
+    # Function-scope import: the module header only imports ``datetime`` itself.
+    from datetime import timezone
+
+    # datetime.utcnow() is deprecated since Python 3.12. Read an aware UTC
+    # clock and drop tzinfo so the emitted "...Z" format is unchanged.
+    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
+    manifest = {
+        "timestamp": timestamp,
+        "config": cfg,
+        "output_dir": str(output_dir),
+        "train_song_count": len(train_dataset.song_ids),
+        "sample_count": len(train_dataset),
+        "segment_strategy": args.segment_strategy,
+        "noise_roots": args.noise_root,
+        "artifacts": {
+            "best_model": str(output_dir / "best_model.pt"),
+            "song_to_idx": str(output_dir / "song_to_idx.json"),
+            "metrics": str(output_dir / "training_metrics.json"),
+        },
+        "final_metrics": train_metrics,
+    }
+    # Explicit encoding keeps the files independent of the platform default.
+    with open(output_dir / "training_metrics.json", "w", encoding="utf-8") as f:
+        json.dump(train_metrics, f, indent=2)
+    with open(output_dir / "training_manifest.json", "w", encoding="utf-8") as f:
+        json.dump(manifest, f, indent=2)
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="configs/default.yaml")
@@ -125,6 +181,7 @@ def main():
    parser.add_argument("--epochs", type=int, default=None)
    parser.add_argument("--batch-size", type=int, default=None)
    parser.add_argument("--lr", type=float, default=None)
+    parser.add_argument("--noise-root", action="append", default=[])
    parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
    parser.add_argument("--silence-top-db", type=int, default=30)
    parser.add_argument("--dry-run", action="store_true")
@@ -159,6 +216,8 @@ def main():
        silence_top_db=args.silence_top_db,
        sample_type_weights=cfg["training"].get("sample_type_weights"),
        pair_type_weights=cfg["training"].get("pair_type_weights"),
+        hard_negative_k=cfg["training"].get("hard_negative_k", 2),
+        noise_roots=args.noise_root,
    )
    catalog_dataset = ACRDataset(
@@ -174,6 +233,7 @@ def main():
        song_to_idx=train_dataset.song_to_idx,
        segment_strategy=args.segment_strategy,
        silence_top_db=args.silence_top_db,
+        noise_roots=args.noise_root,
    )
    train_loader = DataLoader(
@@ -205,6 +265,11 @@ def main():
        aam_s=cfg["model"]["aam_s"],
        use_band_split=cfg["model"].get("use_band_split", True),
        band_split_channels=cfg["model"].get("band_split_channels", 128),
+        use_dual_stream=cfg["model"].get("use_dual_stream", True),
+        coverhunter_heads=cfg["model"].get("coverhunter_heads", 4),
+        coverhunter_layers=cfg["model"].get("coverhunter_layers", 2),
+        fusion_hidden_dim=cfg["model"].get("fusion_hidden_dim", 256),
+        mert_model_name=cfg["model"].get("mert_model_name"),
    ).to(device)
    criterion = CombinedLoss(
@@ -219,8 +284,12 @@ def main():
        print("Dry run: running one batch through forward/backward...")
        batch = next(iter(train_loader))
        mel = batch["mel"].to(device)
+        melody = batch.get("melody")
+        chroma = batch.get("chroma")
+        melody = melody.to(device) if melody is not None else None
+        chroma = chroma.to(device) if chroma is not None else None
        labels = batch["song_id"].to(device)
-        embedding, logits = model(mel, labels)
+        embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
        loss_dict = criterion(embedding, logits, labels, labels, batch.get("hard_weight", None).to(device) if "hard_weight" in batch else None)
        loss_dict["loss"].backward()
        print(f"  Forward/backward OK. Loss: {loss_dict['loss']:.4f}")
@@ -242,6 +311,7 @@ def main():
    output_dir.mkdir(parents=True, exist_ok=True)
    print("Starting training...")
+    train_metrics = None
    for epoch in range(start_epoch, cfg["training"]["epochs"] + 1):
        train_metrics = train_epoch(model, train_loader, optimizer, criterion, scaler, device, epoch, cfg)
        scheduler.step()
@@ -254,6 +324,7 @@ def main():
    with open(output_dir / "song_to_idx.json", "w") as f:
        json.dump(train_dataset.song_to_idx, f, indent=2)
+    write_training_artifacts(output_dir, cfg, train_metrics or {}, train_dataset, args)
    print(f"\nTraining complete. Best training loss: {best_loss:.4f}")
    print(f"Model saved to: {output_dir / 'best_model.pt'}")
    print(f"Catalog references available: {len(catalog_dataset.samples)}")
--- a/data/training_runs/coverhunter_finetune_20260608T130103Z/run_request.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130103Z/run_request.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130103Z",
+  "created_at": "2026-06-08T13:01:03.023371Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130103Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z"
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130103Z/run_summary.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130103Z/run_summary.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130103Z",
+  "created_at": "2026-06-08T13:01:03.023371Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130103Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130103Z",
+  "returncode": 1,
+  "completed_at": "2026-06-08T13:01:32.762576Z",
+  "artifacts": [
+    "run_request.json",
+    "stderr.log",
+    "stdout.log"
+  ]
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130103Z/stderr.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130103Z/stderr.log 0 → 100644
View file @7da7686
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Traceback (most recent call last):
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
+    main()
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 249, in main
+    batch = next(iter(train_loader))
+            ^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 718, in __next__
+    data = self._next_data()
+           ^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 778, in _next_data
+    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
+    data = [self.dataset[idx] for idx in possibly_batched_index]
+            ~~~~~~~~~~~~^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 370, in __getitem__
+    positive_features = [self._load_features(sample) for sample in positive_items]
+                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 344, in _load_features
+    features = self.feature_extractor.extract(y)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/data/dataset.py", line 138, in extract
+    melody = librosa.hz_to_midi(melody, bins_per_octave=12)
+             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+TypeError: hz_to_midi() got an unexpected keyword argument 'bins_per_octave'
--- a/data/training_runs/coverhunter_finetune_20260608T130103Z/stdout.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130103Z/stdout.log 0 → 100644
View file @7da7686
+Device: cpu
--- a/data/training_runs/coverhunter_finetune_20260608T130306Z/run_request.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130306Z/run_request.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130306Z",
+  "created_at": "2026-06-08T13:03:06.790814Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130306Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z"
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130306Z/run_summary.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130306Z/run_summary.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130306Z",
+  "created_at": "2026-06-08T13:03:06.790814Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130306Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130306Z",
+  "returncode": 1,
+  "completed_at": "2026-06-08T13:04:34.035140Z",
+  "artifacts": [
+    "run_request.json",
+    "stderr.log",
+    "stdout.log"
+  ]
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130306Z/stderr.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130306Z/stderr.log 0 → 100644
View file @7da7686
+/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
+  midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
+  midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
+Retrying in 1s [Retry 1/5].
+Traceback (most recent call last):
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
+    main()
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 256, in main
+    model = ECAPA_ACR(
+            ^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 280, in __init__
+    self.mert_melody_branch = MERTMelodyBranch(
+                              ^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 211, in __init__
+    self.mert = FrozenMERTFeatureExtractor(model_name=mert_model_name, n_mels=n_mels, hidden_dim=hidden_dim)
+                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 21, in __init__
+    self.backbone = AutoModel.from_pretrained(model_name)
+                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 289, in from_pretrained
+    resolved_config_file = cached_file(
+                           ^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 293, in cached_file
+    file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 527, in cached_files
+    raise e
+  File "/home/user/.local/lib/python3.12/site-packages/transformers/utils/hub.py", line 437, in cached_files
+    hf_hub_download(
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
+    return fn(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1019, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1152, in _hf_hub_download_to_cache_dir
+    _get_metadata_or_catch_error(
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1694, in _get_metadata_or_catch_error
+    metadata = get_hf_file_metadata(
+               ^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 88, in _inner_fn
+    return fn(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1616, in get_hf_file_metadata
+    response = _httpx_follow_relative_redirects_with_backoff(
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 685, in _httpx_follow_relative_redirects_with_backoff
+    response = http_backoff(
+               ^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 559, in http_backoff
+    return next(
+           ^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 467, in _http_backoff_base
+    response = client.request(method=method, url=url, **kwargs)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 825, in request
+    return self.send(request, auth=auth, follow_redirects=follow_redirects)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/miniconda3/lib/python3.12/site-packages/httpx/_client.py", line 901, in send
+    raise RuntimeError("Cannot send a request, as the client has been closed.")
+RuntimeError: Cannot send a request, as the client has been closed.
--- a/data/training_runs/coverhunter_finetune_20260608T130306Z/stdout.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130306Z/stdout.log 0 → 100644
View file @7da7686
+Device: cpu
+Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
+Classes: 16
+Train songs: 64
--- a/data/training_runs/coverhunter_finetune_20260608T130514Z/run_request.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130514Z/run_request.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130514Z",
+  "created_at": "2026-06-08T13:05:14.591209Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130514Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z"
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130514Z/run_summary.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130514Z/run_summary.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130514Z",
+  "created_at": "2026-06-08T13:05:14.591209Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130514Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130514Z",
+  "returncode": 1,
+  "completed_at": "2026-06-08T13:06:50.272162Z",
+  "artifacts": [
+    "run_request.json",
+    "stderr.log",
+    "stdout.log"
+  ]
+}
\ No newline at end of file
--- a/data/training_runs/coverhunter_finetune_20260608T130514Z/stderr.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130514Z/stderr.log 0 → 100644
View file @7da7686
+/home/user/.local/lib/python3.12/site-packages/librosa/core/convert.py:1094: RuntimeWarning: divide by zero encountered in log2
+  midi: np.ndarray = 12 * (np.log2(np.asanyarray(frequencies)) - np.log2(440.0)) + 69
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/m-a-p/MERT-v1-95M/resolve/main/config.json
+Retrying in 1s [Retry 1/5].
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Failed to import fast_mp3_augment. Maybe it is not installed? To install the optional fast_mp3_augment dependency of audiomentations, run `pip install audiomentations[extras]` or simply `pip install fast_mp3_augment`
+Traceback (most recent call last):
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 334, in <module>
+    main()
+  File "/mnt/e/hikoon-ACR/acr-engine/train.py", line 292, in main
+    embedding, logits = model(mel, labels, melody=melody, chroma=chroma)
+                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 351, in forward
+    mert_stream = self.mert_melody_branch(mel, melody, chroma)
+                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 224, in forward
+    semantic = self.mert(mert)
+               ^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1778, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/user/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1789, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/mnt/e/hikoon-ACR/acr-engine/src/models/ecapa_tdnn.py", line 49, in forward
+    return self.proj(mel)
+           ^^^^^^^^^^^^^^
+TypeError: 'NoneType' object is not callable
--- a/data/training_runs/coverhunter_finetune_20260608T130514Z/stdout.log 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130514Z/stdout.log 0 → 100644
View file @7da7686
+Device: cpu
+Dry batch shape: torch.Size([6, 96, 501]) torch.Size([6])
+Classes: 16
+Train songs: 64
+Dry run: running one batch through forward/backward...
--- a/data/training_runs/coverhunter_finetune_20260608T130731Z/run_request.json 0 → 100644
View file @7da7686
+++ b/data/training_runs/coverhunter_finetune_20260608T130731Z/run_request.json 0 → 100644
View file @7da7686
+{
+  "run_name": "coverhunter_finetune_20260608T130731Z",
+  "created_at": "2026-06-08T13:07:31.311447Z",
+  "python": "/usr/local/miniconda3/bin/python",
+  "command": [
+    "/usr/local/miniconda3/bin/python",
+    "train.py",
+    "--config",
+    "configs/coverhunter_finetune_4gb.yaml",
+    "--data",
+    "data/synthetic_v2",
+    "--output",
+    "data/training_runs/coverhunter_finetune_20260608T130731Z",
+    "--device",
+    "cpu",
+    "--segment-strategy",
+    "hybrid",
+    "--dry-run"
+  ],
+  "config": "configs/coverhunter_finetune_4gb.yaml",
+  "data": "data/synthetic_v2",
+  "noise_roots": [],
+  "run_dir": "data/training_runs/coverhunter_finetune_20260608T130731Z"
+}
\ No newline at end of file
--- a/docs/coverhunter_env_setup.md 0 → 100644
View file @7da7686
+++ b/docs/coverhunter_env_setup.md 0 → 100644
View file @7da7686
+# CoverHunter 环境安装与验证
+## 1. 目标解释器
+本专题统一使用：
+```bash
+/usr/local/miniconda3/bin/python
+```
+## 2. 自动化脚本
+已新增环境安装与验证脚本：
+```text
+acr-engine/scripts/setup_coverhunter_env.py
+```
+执行方式：
+```bash
+/usr/local/miniconda3/bin/python acr-engine/scripts/setup_coverhunter_env.py
+```
+它会自动：
+1. 安装 `requirements.txt`
+2. 补充训练依赖：
+   - `torch`
+   - `torchaudio`
+   - `transformers`
+   - `huggingface_hub`
+   - `librosa`
+   - `soundfile`
+   - `audiomentations`
+3. 进行环境验证
+4. 生成报告：
+```text
+acr-engine/reports/coverhunter_env_setup_report.json
+```
+## 3. 当前自动化执行结果
+本次已经自动执行完成。
+报告文件：
+```text
+acr-engine/reports/coverhunter_env_setup_report.json
+```
+当前结论：
+- Python 包安装：**成功**
+- `torch` / `transformers` / `librosa` / `soundfile` / `audiomentations`：**已安装**
+- 但 `torch.cuda.is_available()` 当前返回：**False**
+## 4. 当前 GPU 阻塞点
+虽然系统存在 NVIDIA GPU，且 `nvidia-smi` 可见设备，但当前 PyTorch CUDA 初始化失败。
+报告中的核心告警是：
+- **The NVIDIA driver on your system is too old**
+这说明：
+- 当前安装到环境里的 `torch 2.12.0+cu130`
+- 与当前系统驱动版本不兼容
+也就是说：
+- **环境依赖已经安装好了**
+- **但当前 GPU 训练还不能真正启用**
+- 原因不是代码问题，而是 **PyTorch CUDA 版本与驱动版本不匹配**
+## 5. 当前状态怎么理解
+现在的环境状态可以分成两部分：
+### 已经完成的
+- 训练依赖已安装
+- 训练脚本可执行
+- MERT / ECAPA 双流代码可 import
+- 文档和配置已准备好
+### 仍未完成的
+- CUDA 版 torch 与当前 NVIDIA driver 的匹配
+## 6. 下一步建议
+要让 GPU 真正可用，需要二选一：
+### 方案 A：升级 NVIDIA 驱动
+优点：
+- 可以保留当前较新的 torch/cu130 组合
+- 后续兼容性更好
+### 方案 B：安装与当前驱动兼容的更低 CUDA 版本 torch
+优点：
+- 不改系统驱动
+- 更适合当前机器直接落地
+对当前项目而言，我更建议：
+- **优先采用方案 B**
+- 安装与当前驱动兼容的 torch 版本
+## 7. 当前专题与环境文档关系
+配套文件如下：
+- 训练专题：`docs/coverhunter_finetune_topic.md`
+- 训练流程：`docs/coverhunter_training_process.md`
+- 环境文档：`docs/coverhunter_env_setup.md`
+- 环境报告：`acr-engine/reports/coverhunter_env_setup_report.json`
+## 8. 当前结论
+当前已经自动完成：
+- 环境依赖安装
+- 环境验证
+- 结果记录
+目前唯一阻塞 GPU 训练的点是：
+- **CUDA / 驱动 / torch 版本不匹配**
--- a/docs/coverhunter_finetune_topic.md 0 → 100644
View file @7da7686
+++ b/docs/coverhunter_finetune_topic.md 0 → 100644
View file @7da7686
+# CoverHunter 双流微调专题方案
+## 1. 专题目标
+本专题目标是围绕当前仓库，建立一套可持续扩展的 **CoverHunter 双流微调方案**，用于音乐翻唱识别、哼唱检索、录音片段检索和抗噪 ACR 检索。
+专题的核心方向不是一次性跑通训练，而是建立一条可反复扩展的训练专题链路：
+1. 明确现有音源与数据资产
+2. 定义双流训练架构
+3. 设计分阶段训练计划
+4. 形成标准训练流程
+5. 规范训练产物与权重使用方式
+6. 为后续补充更多 music 语料预留稳定入口
+---
+## 2. 当前已有音源与数据资产
+### 2.1 当前仓库内可直接使用的数据
+当前可直接用于训练与冒烟验证的数据位于：
+```text
+acr-engine/data/synthetic_v2/
+```
+其中包含：
+- `train.json`
+- `test.json`
+- `segments/*.wav`
+### 2.2 当前训练集统计
+基于 `acr-engine/data/synthetic_v2/train.json` 的统计结果：
+- 样本总数：**96**
+- `song_id` 数量：**16**
+- 类型分布：
+  - `reference`: **16**
+  - `clean`: **32**
+  - `augmented`: **16**
+  - `humming_like`: **16**
+  - `confused`: **16**
+### 2.3 当前音源的含义
+按现有数据结构，可以理解为每首歌目前至少对应以下几类样本：
+1. **reference**
+   - 作为标准原曲/参考版本
+   - 用于建立稳定的正样本锚点
+2. **clean**
+   - 较干净的切片
+   - 代表相对理想的检索输入
+3. **augmented**
+   - 已经经过部分增强的样本
+   - 用于初步提升泛化能力
+4. **humming_like**
+   - 偏哼唱/偏旋律化表达的近似样本
+   - 用于强化“忽略音色、聚焦旋律”能力
+5. **confused**
+   - 易混淆样本
+   - 用于构建难负样本与边界学习能力
+### 2.4 当前音源的局限性
+当前 `synthetic_v2` 更适合做：
+- 训练链路验证
+- 双流结构验证
+- 小规模参数/损失/显存调优
+- 产物定义与使用方式验证
+当前它还不适合直接视为最终生产训练集，原因包括：
+- 歌曲数较少（16 首）
+- 类型覆盖有限
+- 录音噪声场景仍偏少
+- 真实翻唱的多样性不足
+- 真实设备采集差异不足
+所以本专题应采用 **分阶段训练策略**。
+---
+## 3. 当前双流训练架构
+### 3.1 架构定义
+当前已按双流结构实现：
+### 流 A：MERT + Melody 分支
+位置：
+- `acr-engine/src/models/ecapa_tdnn.py`
+职责：
+- 建模高层语义与旋律表达
+- 提高跨音色、跨设备、跨唱法的语义对齐能力
+当前组成：
+- `FrozenMERTFeatureExtractor`
+- `melody/chroma` 特征投影与融合
+默认模型配置：
+```yaml
+model:
+  mert_model_name: m-a-p/MERT-v1-95M
+```
+### 流 B：ECAPA 分支
+职责：
+- 强化局部声学结构与 timbre/韵律相关判别信息
+- 作为与 MERT 分支互补的检索支路
+### 双流融合
+- `DualStreamFusion`
+作用：
+- 将语义旋律流与 ECAPA 流融合到统一时序空间
+### 检索头
+- `CoverHunterHead`
+作用：
+- 将融合后的时序特征进一步编码
+- 输出最终 embedding 用于对比训练和检索
+### 训练目标
+- `InfoNCE`
+- `AAMSoftmax`
+---
+## 4. 训练专题的总体思路
+本专题不建议“一步到位”直接上大规模真实全量训练，而建议按三个阶段推进。
+### 阶段 A：链路验证阶段
+目标：
+- 验证模型结构、数据流、增强链路、权重产物、运行日志是否闭环
+训练数据：
+- `acr-engine/data/synthetic_v2`
+产出：
+- 跑通训练
+- 确认显存
+- 确认增强是否有效
+- 确认权重可以导出并复用
+### 阶段 B：专题微调阶段
+目标：
+- 在当前专题下引入更多音乐语料
+- 逐步扩充：原曲、翻唱、录音、哼唱、噪声注入样本
+- 建立更稳定的双流 CoverHunter embedding
+训练数据规划：
+- 原曲标准音源
+- 真实或半真实翻唱音源
+- 设备录音音源
+- 环境噪声音源
+- 难负样本音源
+### 阶段 C：检索权重沉淀阶段
+目标：
+- 固化最优权重
+- 建立 reference embedding 索引流程
+- 形成线上/离线检索用权重标准
+---
+## 5. 训练数据计划
+你提到后续会补充更多 music 语料，因此建议数据建设统一按下面的结构组织。
+### 5.1 推荐数据结构
+建议每首歌围绕 `song_id` 组织为：
+- `reference`
+- `clean`
+- `cover`
+- `recording`
+- `environment`
+- `humming_like`
+- `confused`
+### 5.2 推荐含义
+#### reference
+- 标准原曲版本
+- 用于构建基准 embedding 和 reference index
+#### clean
+- 质量较好的切片/相对干净音频
+- 用于稳定正样本训练
+#### cover
+- 真实翻唱版本
+- 用于训练旋律一致、音色不同的对齐能力
+#### recording
+- 手机/麦克风录制版本
+- 用于训练设备失真和场景采集鲁棒性
+#### environment
+- 注入环境噪声或真实环境录音
+- 用于训练抗噪能力
+#### humming_like
+- 哼唱、跟唱、弱伴奏旋律版本
+- 用于训练旋律驱动检索能力
+#### confused
+- 容易相似但不属于同一首歌的样本
+- 用于强化难负样本学习
+### 5.3 当前专题的样本补充建议
+优先补充顺序建议为：
+1. **更多 reference / clean 原曲**
+2. **更多 recording / environment 样本**
+3. **更多真实 cover 样本**
+4. **更多 confused 难负样本**
+5. **更多 humming_like 样本**
+原因：
+- 当前抗噪与设备泛化是近期最容易拉开效果差异的方向
+- cover / humming 的价值很高，但数据准备成本更高
+---
+## 6. 数据增强计划
+当前代码已实现两大类增强，用于“伪造录音”和“伪造翻唱”。
+位置：
+- `acr-engine/src/utils/augment.py`
+### 6.1 伪造录音增强
+包括：
+- `AddGaussianNoise`
+- `AddBackgroundNoise`
+- `BandPassFilter`
+- `Mp3Compression`
+作用：
+- 模拟餐厅底噪、街道底噪
+- 模拟廉价设备频响缺失
+- 模拟压缩带来的失真
+- 提高抗噪与抗设备变化能力
+### 6.2 伪造翻唱增强
+包括：
+- `PitchShift`
+- `TimeStretch`
+- `Frequency Masking`
+作用：
+- 模拟升降调翻唱
+- 模拟节奏变化
+- 逼迫模型降低音色依赖，关注旋律主线
+### 6.3 当前专题下的增强原则
+- `reference` 不建议过强增强
+- `clean` 可做轻增强
+- `recording / environment` 可做强增强
+- `humming_like / confused` 应提高采样权重
+---
+## 7. 训练流程
+### 7.1 环境准备
+解释器：
+```bash
+/usr/local/miniconda3/bin/python
+```
+安装依赖：
+```bash
+/usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
+```
+当前依赖至少需要：
+- `torch`
+- `torchaudio`
+- `transformers`
+- `huggingface_hub`
+- `librosa`
+- `soundfile`
+- `audiomentations`
+### 7.2 4GB GPU 专用配置
+当前 GPU 为：
+- `Quadro P1000`
+- 4GB 显存
+因此我已经新增专用配置：
+- `acr-engine/configs/coverhunter_finetune_4gb.yaml`
+特点：
+- 更小 `batch_size`
+- 更短片段
+- 更小通道数
+- 更浅层数
+- 更适合当前显存资源
+### 7.3 首次验证流程
+先 dry-run：
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
+  --python /usr/local/miniconda3/bin/python \
+  --config configs/coverhunter_finetune_4gb.yaml \
+  --data data/synthetic_v2 \
+  --device cuda \
+  --segment-strategy hybrid \
+  --dry-run
+```
+### 7.4 小规模试训
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python train.py \
+  --config configs/coverhunter_finetune_4gb.yaml \
+  --data data/synthetic_v2 \
+  --output data/training_runs/coverhunter_4gb_trial \
+  --device cuda \
+  --segment-strategy hybrid \
+  --batch-size 2 \
+  --epochs 2
+```
+### 7.5 专题正式训练
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
+  --python /usr/local/miniconda3/bin/python \
+  --config configs/coverhunter_finetune_4gb.yaml \
+  --data data/synthetic_v2 \
+  --device cuda \
+  --segment-strategy hybrid \
+  --noise-root data/noise/restaurant \
+  --noise-root data/noise/street
+```
+### 7.6 后续扩容训练
+当你补充新的 music 语料后，建议：
+1. 先保持 `song_id + type + audio_path + duration` 元数据结构一致
+2. 新语料先做小批量接入
+3. 先跑 2 epoch 验证
+4. 再逐步扩大训练轮次
+---
+## 8. 训练过程会产生什么产物
+每次训练会生成目录：
+```text
+acr-engine/data/training_runs/<run_name>/
+```
+标准产物包括：
+- `best_model.pt`
+- `checkpoint_epoch_*.pt`
+- `song_to_idx.json`
+- `training_metrics.json`
+- `training_manifest.json`
+- `run_request.json`
+- `run_summary.json`
+- `stdout.log`
+- `stderr.log`
+### 8.1 各产物的用途
+#### best_model.pt
+- 当前训练过程中最优权重
+- 后续检索、建库、推理优先使用它
+#### checkpoint_epoch_*.pt
+- 周期性保存点
+- 用于中断恢复、回溯比较
+#### song_to_idx.json
+- `song_id` 到训练类别索引（idx）的映射
+- 用于解释训练分类头与标签对应关系
+#### training_metrics.json
+- 记录最后一次训练指标
+- 用于专题对比不同配置
+#### training_manifest.json
+- 记录本次训练的配置、输入、产物路径
+- 适合作为专题可追溯记录
+#### run_request.json / run_summary.json
+- 记录本次运行命令、解释器、配置与运行结果
+- 便于回放与专题管理
+---
+## 9. 预期权重怎么使用
+这是专题里非常关键的一部分。
+### 9.1 训练权重的核心用途
+训练出来的 `best_model.pt` 不是只为了看 loss，而是为了后续两类使用：
+1. **离线建库**
+2. **在线查询 embedding 提取**
+### 9.2 离线建库
+目标：
+- 使用参考音源（reference）切片提取 embedding
+- 建立 reference 向量索引
+预期流程：
+1. 读取 `reference` 音源
+2. 切片
+3. 用双流模型提 embedding
+4. 存成 embedding matrix
+5. 后续接 Faiss / pgvector / Milvus
+### 9.3 在线查询
+目标：
+- 输入录音、翻唱、哼唱片段
+- 提取 embedding
+- 与 reference index 做相似度检索
+预期方式：
+1. 加载 `best_model.pt`
+2. 对查询音频切片
+3. 提取 embedding
+4. 与 reference embedding 做 ANN 检索
+5. 结合 vote / rerank 输出最终结果
+### 9.4 推荐使用策略
+#### 最佳权重
+生产或专题评估优先使用：
+- `best_model.pt`
+#### 恢复训练
+继续训练优先使用：
+- `checkpoint_epoch_*.pt`
+#### 对比实验
+建议每个专题 run 保留完整目录，不覆盖历史 run。
+---
+## 10. 预计怎么推进专题训练
+### 第 1 步：先跑通当前 synthetic_v2
+目标：
+- 验证链路
+- 验证显存
+- 验证双流结构
+- 验证 MERT 接口
+### 第 2 步：补录音噪声语料
+优先补充：
+- 餐厅
+- 街道
+- 室内人声背景
+- 手机录制样本
+目标：
+- 提升抗噪与设备鲁棒性
+### 第 3 步：补真实翻唱/旋律相近样本
+目标：
+- 强化旋律对齐
+- 降低音色依赖
+### 第 4 步：补难负样本
+目标：
+- 降低误识别
+- 提高边界判别能力
+### 第 5 步：固化最优专题权重
+目标：
+- 形成一个可用于离线建库与线上检索的标准权重版本
+---
+## 11. 当前专题的资源结论
+### 可以做的事
+- 继续完善训练链路
+- 用 `synthetic_v2` 做小规模训练
+- 做双流模型结构验证
+- 做 4GB GPU 轻量试训
+- 规范化训练产物与权重使用方式
+### 当前暂时受限的事
+- 由于环境依赖与 GPU 驱动尚未完全就绪（依赖安装结果以及 torch CUDA 版本与 NVIDIA 驱动的匹配情况见 `docs/coverhunter_env_setup.md`），**还不能直接启动真实训练**
+- 由于 GPU 只有 4GB，**真实 MERT + ECAPA 双流正式训练需要保守配置**
+- 当前真实音乐语料仍不足，**暂时更适合专题验证，不适合最终权重定版**
+---
+## 12. 本专题当前落地文件
+### 配置
+- `acr-engine/configs/coverhunter_finetune.yaml`
+- `acr-engine/configs/coverhunter_finetune_4gb.yaml`
+- `acr-engine/configs/default.yaml`
+### 模型与训练
+- `acr-engine/src/models/ecapa_tdnn.py`
+- `acr-engine/src/models/losses.py`
+- `acr-engine/src/data/dataset.py`
+- `acr-engine/src/utils/augment.py`
+- `acr-engine/train.py`
+- `acr-engine/scripts/run_coverhunter_finetune.py`
+### 文档
+- `docs/coverhunter_training_process.md`
+- `docs/coverhunter_finetune_topic.md`
+---
+## 13. 当前专题结论
+当前已经具备：
+- 双流 CoverHunter 微调架构
+- 4GB GPU 专用轻量配置
+- 训练流程脚本
+- 训练产物记录机制
+- 专题级训练文档
+当前下一步最实际的动作是：
+1. 在 `/usr/local/miniconda3/bin/python` 下补齐依赖
+2. 用 `coverhunter_finetune_4gb.yaml` 跑 dry-run
+3. 用 `synthetic_v2` 做 2 epoch 小规模试训
+4. 再逐步接入更多 music 语料
--- a/docs/coverhunter_training_process.md 0 → 100644
View file @7da7686
+++ b/docs/coverhunter_training_process.md 0 → 100644
View file @7da7686
+# CoverHunter 双流微调标准流程
+## 1. 当前架构
+当前训练架构已经调整为双流：
+- **流 A：MERT + Melody 分支**
+  - 代码位置：`acr-engine/src/models/ecapa_tdnn.py`
+  - 逻辑：冻结的 `FrozenMERTFeatureExtractor` + `melody/chroma` 融合
+  - 默认模型：`m-a-p/MERT-v1-95M`
+  - 说明：当前代码已经支持真实 HuggingFace MERT 权重接入；若环境里缺少 `transformers` 或首次拉取失败，则无法启用真实 MERT
+- **流 B：ECAPA 分支**
+  - 逻辑：保留 ECAPA 特征建模路径
+- **双流融合**
+  - `DualStreamFusion`
+- **检索头**
+  - `CoverHunterHead`
+- **训练目标**
+  - `InfoNCE + AAMSoftmax`
+## 2. 当前资源检查结论
+### Python 解释器
+训练入口已固定支持：
+```bash
+/usr/local/miniconda3/bin/python
+```
+`acr-engine/scripts/run_coverhunter_finetune.py` 已支持 `--python` 参数，默认就是这个解释器。
+### GPU
+当前检测到 GPU：
+- **Quadro P1000**
+- 总显存：**4096 MiB**
+- 空闲显存：约 **3817 MiB**
+结论：
+- **可以跑训练**
+- 但显存较小，建议：
+  - `batch_size=2~4`
+  - `segment_dur=5.0` 起步
+  - 优先做 dry-run、小批量试跑、再正式训练
+  - 启用真实 MERT 后不要直接上大 batch
+### 数据
+当前仓库中可直接用于冒烟训练的数据：
+- `acr-engine/data/synthetic_v2/train.json`
+- 音频切片位于 `acr-engine/data/synthetic_v2/segments/`
+这些数据已经包含：
+- 普通切片
+- augmented
+- humming_like
+- confused
+适合先做流程验证。
+### 当前环境缺口
+`/usr/local/miniconda3/bin/python` 下当前缺少这些核心包：
+- `torch`
+- `transformers`
+- `huggingface_hub`
+- `torchaudio`
+- `librosa`
+- `soundfile`
+- `audiomentations`
+所以：
+- **GPU 与解释器可用**
+- **但当前训练环境还不能直接跑**
+- 需要先补齐依赖
+## 3. 标准处理流程
+### Step 1：准备 Python 环境
+进入项目后，先确保用的是目标解释器：
+```bash
+/usr/local/miniconda3/bin/python --version
+```
+安装依赖：
+```bash
+/usr/local/miniconda3/bin/python -m pip install -r acr-engine/requirements.txt
+```
+如需单独补装：
+```bash
+/usr/local/miniconda3/bin/python -m pip install torch torchaudio transformers huggingface_hub librosa soundfile audiomentations
+```
+### Step 2：准备 MERT 权重缓存
+首次启用真实 MERT 时，会从 HuggingFace 拉取：
+- `m-a-p/MERT-v1-95M`
+建议先确认网络可访问 HuggingFace，或提前缓存模型。
+如果需要更换默认的 MERT 模型，可以在 `configs/default.yaml` 或 `configs/coverhunter_finetune.yaml` 中调整：
+```yaml
+model:
+  mert_model_name: m-a-p/MERT-v1-95M
+```
+### Step 3：准备噪声数据
+为了支持伪造录音增强，建议准备目录，例如：
+```text
+acr-engine/data/noise/restaurant/
+acr-engine/data/noise/street/
+```
+里面放公开可用环境音频：
+- 餐厅底噪
+- 街道底噪
+- 室内人声背景
+训练时通过（以下路径相对于仓库根目录；若已 `cd` 到 `acr-engine/` 下执行，请去掉 `acr-engine/` 前缀，即使用 `data/noise/...`）：
+```bash
+--noise-root acr-engine/data/noise/restaurant \
+--noise-root acr-engine/data/noise/street
+```
+传入。
+### Step 4：先做 dry-run
+先验证数据、模型、GPU、增强链路是否都通：
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
+  --python /usr/local/miniconda3/bin/python \
+  --data data/synthetic_v2 \
+  --device cuda \
+  --segment-strategy hybrid \
+  --dry-run
+```
+### Step 5：小规模试训
+建议先缩小 batch/config，确认显存稳定：
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python train.py \
+  --config configs/coverhunter_finetune.yaml \
+  --data data/synthetic_v2 \
+  --output data/training_runs/coverhunter_trial \
+  --device cuda \
+  --segment-strategy hybrid \
+  --batch-size 2 \
+  --epochs 2 \
+  --noise-root data/noise/restaurant \
+  --noise-root data/noise/street
+```
+如果显存稳定，再逐步提高到：
+- `batch_size=4`
+- 必要时再尝试 `batch_size=6`
+### Step 6：正式专题训练
+标准命令：
+```bash
+cd /mnt/e/hikoon-ACR/acr-engine && \
+/usr/local/miniconda3/bin/python scripts/run_coverhunter_finetune.py \
+  --python /usr/local/miniconda3/bin/python \
+  --data data/synthetic_v2 \
+  --device cuda \
+  --segment-strategy hybrid \
+  --noise-root data/noise/restaurant \
+  --noise-root data/noise/street
+```
+### Step 7：检查训练产物
+每次训练会记录到：
+```text
+acr-engine/data/training_runs/<run_name>/
+```
+标准产物包括：
+- `best_model.pt`
+- `checkpoint_epoch_*.pt`
+- `song_to_idx.json`
+- `training_metrics.json`
+- `training_manifest.json`
+- `run_request.json`
+- `run_summary.json`
+- `stdout.log`
+- `stderr.log`
+## 4. 增强策略说明
+当前代码已经覆盖两类伪造策略：
+### 伪造录音
+位置：`acr-engine/src/utils/augment.py`
+- `AddGaussianNoise`
+- `AddBackgroundNoise`
+- `BandPassFilter`
+- `Mp3Compression`
+### 伪造翻唱
+位置：`acr-engine/src/utils/augment.py`
+- `PitchShift`
+- `TimeStretch`
+- `Frequency Masking`（作用于 mel）
+## 5. 资源适配建议
+由于当前 GPU 是 Quadro P1000 4GB，建议按以下梯度推进：
+### 推荐起步配置
+- `segment_dur=5.0`
+- `batch_size=2`
+- `mixed_precision=true`
+- `num_workers=0`
+### 稳定后可尝试
+- `batch_size=4`
+- 如 OOM 则回退
+### 当前不建议
+- 直接上 8 秒片段 + batch 16
+- 真实 MERT + 大 batch 同时启用
+## 6. 当前结论
+当前状态可以概括为：
+- **架构方向已经调整正确**：双流
+- **真实 MERT 接口已接入**：是
+- **GPU 可以用于训练**：是
+- **当前 Python 解释器可用**：是，`/usr/local/miniconda3/bin/python`
+- **当前环境能否立刻开训**：**还不能**，因为依赖未装全
+- **现有数据能否支撑一波流程训练**：**可以**，先从 `synthetic_v2` 开始
--- a/docs/mert_pretrain.md 0 → 100644
View file @7da7686
+++ b/docs/mert_pretrain.md 0 → 100644
View file @7da7686
+# 音乐翻唱检测与音频片段检索系统 (CSI) 核心能力结构清单
+## 1. 核心架构逻辑
+* **底座 (Backbone)**：MERT (冻结预训练权重) - 负责音频语义理解。
+* **头部 (Head)**：CoverHunter (可训练 Conformer+Attention) - 负责旋律与结构的对比学习。
+* **对齐方式**：双流融合 (MERT 语义特征 + Melody/Chroma 旋律特征)。
+## 2. 数据与特征工程 (Data Pipeline)
+* **数据集结构**：以 `Song_ID` 为唯一键，物理隔离原曲、压缩版、录音与环境音。
+* **动态增强 (Data Augmentation)**：
+    * 物理扰动：音高平移 (Pitch Shifting)、变速 (Time Stretching)。
+    * 环境注入：背景噪声混入 (Environment Injection)。
+    * 频率掩码：频段擦除 (Frequency Masking) - 逼迫模型脱离音色依赖，转向旋律核心。
+* **数据对齐**：使用插值 (Interpolation) 将 MERT 序列长度与 Melody 序列长度对齐至一致的 `Time_Steps`。
+## 3. 训练与优化策略 (Training Strategy)
+* **样本采样 (Sampler)**：PairSampler - 确保 Batch 中包含强配对的“原曲-翻唱”与精心挑选的“原曲-难负样本”。
+* **难负样本挖掘 (Hard Negative Mining)**：
+    * 使用冻结 MERT + Faiss 构建初始索引。
+    * 挖掘曲风相似但旋律不同的“假孪生兄弟”歌曲作为 Negative 样本。
+* **损失函数 (Loss Function)**：InfoNCE Contrastive Loss - 拉近正样本余弦距离，推远负样本余弦距离。
+## 4. 推理与检索引擎 (Inference & Retrieval)
+* **离线建库**：全量原曲切片 -> 特征提取 -> 存入向量数据库 (Faiss/Milvus)。
+* **在线查询**：录音片段 -> 滑动窗口切片 -> 提取 Embedding -> 近似最近邻检索 (ANN)。
+* **鲁棒性机制**：切片投票机制 (Slice Voting) - 对查询录音切片所得的 Top-K 结果进行统计，按票数加权归一化排序。
+## 5. 工程化关键节点 (Engineering Checklist)
+* **计算优化**：离线特征缓存 (预先存储 .npy 减少 GPU 实时计算压力)。
+* **部署优化**：ONNX/TensorRT 模型编译 + 动态批处理 (Dynamic Batching)。
+* **数据飞轮**：在线难例挖掘 (基于用户反馈的 False Positives 循环重训)。