添加曲结构去重

沈秋雨
Commit 8413944a ... 8413944ad675bb85c114f5f012b4257a140fef8e authored 2026-06-11 13:14:21 +0800 by 沈秋雨
Showing 10 changed files with 695 additions and 0 deletions
composition_dedup/__init__.py
composition_dedup/dejavu_fingerprinter.py
composition_dedup/extractor.py
composition_dedup/service.py
composition_dedup/similarity.py
scripts/evaluate_composition.py
scripts/generate_composition_testset.py
scripts/import_audio_composition.py
scripts/postgres_schema.sql
tests/test_composition_dedup.py
--- a/composition_dedup/__init__.py 0 → 100644
View file @8413944
+++ b/composition_dedup/__init__.py 0 → 100644
View file @8413944
+from .service import CompositionCandidate, CompositionConfig, CompositionDedupService
+from .dejavu_fingerprinter import fingerprint_audio
+
+# Names re-exported as the package's public API; each one is imported above
+# from .service or .dejavu_fingerprinter.
+__all__ = [
+    "CompositionCandidate",
+    "CompositionConfig",
+    "CompositionDedupService",
+    "fingerprint_audio",
+]
--- a/composition_dedup/dejavu_fingerprinter.py 0 → 100644
View file @8413944
+++ b/composition_dedup/dejavu_fingerprinter.py 0 → 100644
View file @8413944
+"""Dejavu 风格的音频指纹生成。
+
+基于 worldveil/dejavu 的指纹算法提取实现，不依赖 Dejavu 的数据库层。
+使用 scipy.signal.spectrogram 替代已废弃的 matplotlib.mlab.specgram。
+
+流程：
+1. 音频标准化：ffmpeg 转 44100Hz / Mono / WAV
+2. librosa 加载音频
+3. 短时傅里叶变换（STFT）→ 对数频谱图
+4. 2D 峰值检测：在频谱图中找局部极大值
+5. 指纹哈希：对峰值对 (freq1, freq2, time_delta) 做 SHA1，取前 20 位
+"""
+
+import hashlib
+import logging
+import os
+import subprocess
+import tempfile
+from operator import itemgetter
+from pathlib import Path
+
+import librosa
+import numpy as np
+from scipy.ndimage import (
+    binary_erosion,
+    generate_binary_structure,
+    iterate_structure,
+    maximum_filter,
+)
+from scipy.signal import spectrogram
+
+logger = logging.getLogger(__name__)
+
+
+def _load_env_file() -> None:
+    """Load ``KEY=VALUE`` pairs from the project-root ``.env`` file into ``os.environ``.
+
+    The file is looked up one directory above the directory containing this
+    module.  Variables that already exist in the process environment are
+    never overwritten.
+
+    Blank lines, lines starting with ``#`` and lines without ``=`` are
+    skipped.  Lines whose key is empty (for example ``=value``) are skipped
+    too: ``os.environ`` rejects an empty variable name, and because this
+    function runs at import time such a line would otherwise make the whole
+    module fail to import.
+    """
+    env_path = Path(__file__).resolve().parent.parent / ".env"
+    if not env_path.exists():
+        return
+    with env_path.open(encoding="utf-8") as file:
+        for raw_line in file:
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            # Split on the first "=" only, so values may themselves contain "=".
+            key, value = line.split("=", 1)
+            key = key.strip()
+            if not key:
+                # Malformed line with nothing before "="; ignore it.
+                continue
+            # Trim whitespace, then any leading/trailing quote characters.
+            os.environ.setdefault(key, value.strip().strip('"').strip("'"))
+
+
+_load_env_file()
+
+# ===== Constants (some can be overridden through environment variables) =====
+
+DEFAULT_FS = 44100  # sample rate in Hz requested from ffmpeg and librosa
+DEFAULT_WINDOW_SIZE = 4096  # spectrogram segment length in samples (nperseg)
+# Fraction of the window shared by consecutive spectrogram segments.
+DEFAULT_OVERLAP_RATIO = float(os.environ.get("COMPOSITION_DEJAVU_OVERLAP_RATIO", "0.3"))
+# Pairing reach: a peak is combined with the peaks at offsets 1 .. fan_value - 1 after it.
+DEFAULT_FAN_VALUE = int(os.environ.get("COMPOSITION_DEJAVU_FAN_VALUE", "10"))
+# Peaks whose spectrogram value is not strictly greater than this are dropped.
+DEFAULT_AMP_MIN = float(os.environ.get("COMPOSITION_DEJAVU_AMP_MIN", "20"))
+PEAK_NEIGHBORHOOD_SIZE = 20  # iteration count used to grow the peak-detection footprint
+MIN_HASH_TIME_DELTA = 0  # inclusive lower bound on the time-index gap of a hashed peak pair
+MAX_HASH_TIME_DELTA = 200  # inclusive upper bound on the time-index gap of a hashed peak pair
+PEAK_SORT = True  # sort peaks by time index before pairing them
+FINGERPRINT_REDUCTION = 20  # number of leading SHA1 hex characters kept per hash
+MAX_DURATION_SEC = float(os.environ.get("COMPOSITION_DEJAVU_MAX_DURATION", "120"))  # 0 = no limit
+
+
+def _normalize_audio(audio_path: str, max_duration: float = MAX_DURATION_SEC) -> tuple[np.ndarray, int]:
+    """Convert an audio file to mono WAV with ffmpeg and load it as a sample array.
+
+    ffmpeg resamples the input to ``DEFAULT_FS`` with one channel and writes
+    it to a temporary WAV file, which is then read with ``librosa.load`` and
+    deleted again.
+
+    Args:
+        audio_path: Path of the input audio file, passed to ffmpeg via ``-i``.
+        max_duration: When greater than 0, passed to ffmpeg as ``-t`` to cap
+            the length of the converted audio in seconds.  Values <= 0 add no
+            limit.
+
+    Returns:
+        The ``(samples, sample_rate)`` pair returned by ``librosa.load``.
+
+    Raises:
+        RuntimeError: ffmpeg exited with a non-zero return code.
+    """
+    # Reserve a temp path; the handle is closed on leaving the ``with`` block
+    # and ffmpeg overwrites the empty file because of ``-y``.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        tmp_wav = tmp.name
+
+    try:
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-i", audio_path,
+            "-ar", str(DEFAULT_FS),
+            "-ac", "1",
+            "-f", "wav",
+        ]
+        if max_duration > 0:
+            cmd += ["-t", str(max_duration)]
+        cmd.append(tmp_wav)
+
+        # errors="replace": ffmpeg's stderr is only used for the error message,
+        # so undecodable bytes in it must not raise UnicodeDecodeError and mask
+        # an otherwise successful conversion.
+        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg 转换失败: {result.stderr}")
+
+        y, sr = librosa.load(tmp_wav, sr=DEFAULT_FS, mono=True)
+        return y, sr
+    finally:
+        # Always remove the temporary WAV, whether conversion succeeded or not.
+        if os.path.exists(tmp_wav):
+            os.remove(tmp_wav)
+
+
+def _specgram(samples: np.ndarray, fs: int, window_size: int, overlap_ratio: float):
+    """Compute a shifted log-scale spectrogram with ``scipy.signal.spectrogram``.
+
+    Args:
+        samples: Audio samples.
+        fs: Sample rate passed to ``spectrogram``.
+        window_size: Length of the Hann window and of each segment.
+        overlap_ratio: Fraction of ``window_size`` used as segment overlap
+            (truncated to an integer number of samples).
+
+    Returns:
+        The spectrogram in decibels, shifted so that its largest value is 80
+        and floored at -100.  Axis 0 is frequency and axis 1 is time, as
+        returned by ``scipy.signal.spectrogram``.
+    """
+    hann = np.hanning(window_size)
+    _, _, power = spectrogram(
+        samples,
+        fs=fs,
+        window=hann,
+        nperseg=window_size,
+        noverlap=int(window_size * overlap_ratio),
+    )
+
+    # Decibel conversion; the small constant keeps log10 finite for zero power.
+    log_spec = 10 * np.log10(power + 1e-10)
+    # Normalize to the loudest bin, then offset by 80 so that bin sits at 80.
+    log_spec = (log_spec - log_spec.max()) + 80
+    # Floor anything that ended up below -100.
+    log_spec[log_spec < -100] = -100
+    return log_spec
+
+
+def _get_2d_peaks(arr2D: np.ndarray, amp_min: float = DEFAULT_AMP_MIN):
+    """Find local maxima of a spectrogram and keep those above ``amp_min``.
+
+    Args:
+        arr2D: 2-D spectrogram; rows are treated as frequency indices and
+            columns as time indices.
+        amp_min: Peaks must be strictly greater than this value to be kept.
+
+    Returns:
+        ``(frequency_idx, time_idx)``: two parallel lists with the row and
+        column index of every retained peak.
+    """
+    footprint = iterate_structure(generate_binary_structure(2, 1), PEAK_NEIGHBORHOOD_SIZE)
+
+    # A cell is a candidate when it equals the maximum over its footprint.
+    is_local_max = maximum_filter(arr2D, footprint=footprint) == arr2D
+    # Cells that are exactly 0 count as background; the eroded background is
+    # XOR-ed out of the candidate mask below.
+    is_background = arr2D == 0
+    background_core = binary_erosion(is_background, structure=footprint, border_value=1)
+    peak_mask = is_local_max ^ background_core
+
+    freq_rows, time_cols = np.where(peak_mask)
+    amplitudes = arr2D[peak_mask]
+
+    # Apply the amplitude threshold in a single pass over the candidates.
+    kept = [
+        (freq, frame)
+        for freq, frame, amp in zip(freq_rows, time_cols, amplitudes)
+        if amp > amp_min
+    ]
+    frequency_idx = [freq for freq, _ in kept]
+    time_idx = [frame for _, frame in kept]
+    return frequency_idx, time_idx
+
+
+def _generate_hashes(peaks: list[tuple[int, int]], fan_value: int = DEFAULT_FAN_VALUE):
+    """Yield fingerprint hashes built from pairs of spectrogram peaks.
+
+    Each peak (the anchor) is paired with the peaks at positions 1 through
+    ``fan_value - 1`` after it in the (optionally time-sorted) sequence.  A
+    pair is hashed only when its time-index difference lies within
+    ``[MIN_HASH_TIME_DELTA, MAX_HASH_TIME_DELTA]``.
+
+    The input list is not modified: when ``PEAK_SORT`` is set, a sorted copy
+    is used instead of sorting ``peaks`` in place.
+
+    Args:
+        peaks: ``[(freq_idx, time_idx), ...]`` list.
+        fan_value: Pairing reach; at most ``fan_value - 1`` following peaks
+            are combined with each anchor.  Values below 2 yield nothing.
+
+    Yields:
+        ``(hash_bytes, time_offset)`` tuples.  ``hash_bytes`` is the first
+        ``FINGERPRINT_REDUCTION`` hex characters of the SHA1 of
+        ``"freq1|freq2|t_delta"``, encoded to bytes; ``time_offset`` is the
+        anchor's time index.
+    """
+    # sorted() is stable like list.sort(), so the pairing order is unchanged.
+    ordered = sorted(peaks, key=itemgetter(1)) if PEAK_SORT else peaks
+    # Clamp so a fan_value below 1 cannot turn the slice end into a negative index.
+    reach = max(fan_value, 1)
+
+    for i, anchor in enumerate(ordered):
+        freq1, t1 = anchor[0], anchor[1]
+        # The slice stops at the end of the list on its own.
+        for target in ordered[i + 1 : i + reach]:
+            freq2, t2 = target[0], target[1]
+            t_delta = t2 - t1
+
+            if MIN_HASH_TIME_DELTA <= t_delta <= MAX_HASH_TIME_DELTA:
+                digest = hashlib.sha1(f"{freq1}|{freq2}|{t_delta}".encode()).hexdigest()
+                yield (digest[:FINGERPRINT_REDUCTION].encode(), t1)
+
+
+def fingerprint_audio(audio_path: str) -> tuple[str, list[tuple[bytes, int]]]:
+    """Generate Dejavu-style fingerprints for one audio file.
+
+    Args:
+        audio_path: Path of the audio file to fingerprint.
+
+    Returns:
+        A ``(file_sha1, fingerprints)`` tuple.  ``file_sha1`` is the first 16
+        hex characters of the SHA1 of the decoded sample buffer (not of the
+        file's raw bytes).  ``fingerprints`` is a list of
+        ``(hash_bytes, offset)`` pairs.
+
+    Raises:
+        FileNotFoundError: ``audio_path`` is not an existing file.
+        RuntimeError: ffmpeg conversion failed.
+    """
+    if not os.path.isfile(audio_path):
+        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
+
+    # Decode to mono samples (length-limited by the module default).
+    samples, fs = _normalize_audio(audio_path)
+
+    # Identifier derived from the decoded samples.
+    file_sha1 = hashlib.sha1(samples.tobytes()).hexdigest()[:16]
+
+    # Spectrogram -> peak coordinates -> (freq_idx, time_idx) pairs.
+    spectrum = _specgram(samples, fs, DEFAULT_WINDOW_SIZE, DEFAULT_OVERLAP_RATIO)
+    peaks = list(zip(*_get_2d_peaks(spectrum)))
+
+    # Materialize the hash generator so the count can be logged.
+    fingerprints = [*_generate_hashes(peaks)]
+
+    logger.info("指纹生成完成: audio=%s, 指纹数=%d", audio_path, len(fingerprints))
+    return file_sha1, fingerprints
--- a/composition_dedup/extractor.py 0 → 100644
View file @8413944
+++ b/composition_dedup/extractor.py 0 → 100644
View file @8413944
+"""Chromagram 特征提取。
+
+流程：
+1. 音频标准化：ffmpeg 转 22050Hz / Mono / WAV
+2. librosa 加载音频
+3. librosa.feature.chroma_cens() 提取 12×T Chromagram（CENS，对速度/音色鲁棒）
+4. 主音对齐：将能量最大的音级滚至第 0 行，实现转调不变性
+5. scipy.signal.resample(chroma, 128, axis=1) 时间归一化到 12×128
+6. .flatten() 展开为 1536 维向量
+"""
+
+import logging
+import os
+import subprocess
+import tempfile
+
+import librosa
+import numpy as np
+from scipy.signal import resample
+
+logger = logging.getLogger(__name__)
+
+# Target sample rate (Hz) and number of time frames after time normalization
+TARGET_SR = 22050
+TARGET_FRAMES = 128
+VECTOR_DIM = 12 * TARGET_FRAMES  # 1536: 12 chroma rows x 128 frames, flattened
+
+
+def _normalize_audio_ffmpeg(audio_path: str, output_path: str) -> None:
+    """Convert an audio file to a ``TARGET_SR`` Hz mono WAV file using ffmpeg.
+
+    Args:
+        audio_path: Path of the input audio file, passed to ffmpeg via ``-i``.
+        output_path: Destination WAV path; overwritten if it exists (``-y``).
+
+    Raises:
+        RuntimeError: ffmpeg exited with a non-zero return code.
+    """
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i", audio_path,
+        "-ar", str(TARGET_SR),
+        "-ac", "1",
+        "-f", "wav",
+        output_path,
+    ]
+    # errors="replace": stderr is only used for the error message, so bytes
+    # that cannot be decoded must not raise UnicodeDecodeError and hide an
+    # otherwise successful conversion.
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        errors="replace",
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg 转换失败: {result.stderr}")
+
+
+def extract_chroma_feature(audio_path: str) -> np.ndarray:
+    """Extract a 1536-dimensional chroma feature vector from an audio file.
+
+    The audio is converted to a temporary WAV via ffmpeg, loaded with
+    librosa, turned into a CENS chromagram, rolled so the pitch class with
+    the largest summed energy is row 0, resampled to ``TARGET_FRAMES``
+    columns and flattened.
+
+    Args:
+        audio_path: Path of the audio file.
+
+    Returns:
+        A float32 numpy array of shape ``(1536,)``.
+
+    Raises:
+        FileNotFoundError: ``audio_path`` is not an existing file.
+        RuntimeError: ffmpeg conversion failed.
+    """
+    if not os.path.isfile(audio_path):
+        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
+
+    # Reserve a temporary WAV path for the ffmpeg output.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
+        wav_path = handle.name
+
+    try:
+        _normalize_audio_ffmpeg(audio_path, wav_path)
+        signal, _ = librosa.load(wav_path, sr=TARGET_SR, mono=True)
+
+        # CENS chromagram: 12 pitch-class rows by T frames.
+        cens = librosa.feature.chroma_cens(y=signal, sr=TARGET_SR)
+
+        # Rotate rows so the pitch class with the largest total energy comes
+        # first (a shift of 0 leaves the values unchanged).
+        strongest = int(np.argmax(cens.sum(axis=1)))
+        cens = np.roll(cens, -strongest, axis=0)
+
+        # Bring the time axis to exactly TARGET_FRAMES columns when needed.
+        already_sized = cens.shape[1] == TARGET_FRAMES
+        fixed = cens if already_sized else resample(cens, TARGET_FRAMES, axis=1)
+
+        # Row-major flatten to a single float32 vector.
+        feature = fixed.reshape(-1).astype(np.float32)
+
+        assert feature.shape == (VECTOR_DIM,), (
+            f"特征维度错误: 期望 {VECTOR_DIM}, 实际 {feature.shape}"
+        )
+
+        return feature
+    finally:
+        # Remove the temporary WAV regardless of the outcome.
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+
+
+def extract_chroma_matrix(audio_path: str) -> np.ndarray:
+    """Return the chroma feature of an audio file as an unflattened matrix.
+
+    This reshapes the vector produced by ``extract_chroma_feature`` instead
+    of recomputing anything, so the same alignment and resampling apply.
+
+    Returns:
+        A numpy array of shape ``(12, TARGET_FRAMES)``.
+    """
+    return extract_chroma_feature(audio_path).reshape(12, TARGET_FRAMES)
--- a/composition_dedup/service.py 0 → 100644
View file @8413944
+++ b/composition_dedup/service.py 0 → 100644
View file @8413944
--- a/composition_dedup/similarity.py 0 → 100644
View file @8413944
+++ b/composition_dedup/similarity.py 0 → 100644
View file @8413944
+"""Cosine 相似度计算与去重判定。"""
+
+from enum import Enum
+
+import numpy as np
+
+# Similarity at or above this value is classified as DUPLICATE.
+DUPLICATE_THRESHOLD = 0.95
+# Similarity at or above this value (but below DUPLICATE_THRESHOLD) is SUSPECTED.
+SUSPECTED_THRESHOLD = 0.85
+
+
+class SimilarityDecision(Enum):
+    """Outcome of classifying a similarity score against the two thresholds."""
+
+    # score >= DUPLICATE_THRESHOLD
+    DUPLICATE = "duplicate"
+    # SUSPECTED_THRESHOLD <= score < DUPLICATE_THRESHOLD
+    SUSPECTED = "suspected"
+    # everything else
+    NEW = "new"
+
+
+class CompositionSimilarity:
+    """Stateless helpers for cosine similarity and duplicate classification."""
+
+    @staticmethod
+    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+        """Return the cosine similarity of ``a`` and ``b``.
+
+        Returns 0.0 when either vector has zero norm, which avoids a
+        division by zero.
+        """
+        magnitude_a = np.linalg.norm(a)
+        magnitude_b = np.linalg.norm(b)
+        if magnitude_a == 0.0 or magnitude_b == 0.0:
+            return 0.0
+        cosine = np.dot(a, b) / (magnitude_a * magnitude_b)
+        return float(cosine)
+
+    @staticmethod
+    def classify_similarity(similarity: float) -> SimilarityDecision:
+        """Map a similarity score to a decision using the module thresholds."""
+        # Ordered from the strictest threshold down; the first match wins.
+        tiers = (
+            (DUPLICATE_THRESHOLD, SimilarityDecision.DUPLICATE),
+            (SUSPECTED_THRESHOLD, SimilarityDecision.SUSPECTED),
+        )
+        for threshold, decision in tiers:
+            if similarity >= threshold:
+                return decision
+        return SimilarityDecision.NEW
+
+    @staticmethod
+    def compare(a: np.ndarray, b: np.ndarray) -> tuple[float, SimilarityDecision]:
+        """Return ``(similarity, decision)`` for the two vectors."""
+        score = CompositionSimilarity.cosine_similarity(a, b)
+        decision = CompositionSimilarity.classify_similarity(score)
+        return score, decision
--- a/scripts/evaluate_composition.py 0 → 100644
View file @8413944
+++ b/scripts/evaluate_composition.py 0 → 100644
View file @8413944
--- a/scripts/generate_composition_testset.py 0 → 100644
View file @8413944
+++ b/scripts/generate_composition_testset.py 0 → 100644
View file @8413944
--- a/scripts/import_audio_composition.py 0 → 100644
View file @8413944
+++ b/scripts/import_audio_composition.py 0 → 100644
View file @8413944
+"""批量导入音频文件到 composition_feature 表。
+
+用法:
+python scripts/import_audio_composition.py \
+    --dsn "postgresql:///lyric_dedup" \
+    --audio-dir /Volumes/移动硬盘/composition_test \
+    --ext .wav
+
+支持通过 --file-list 指定一个包含音频路径的文本文件（每行一个路径）。
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from tqdm import tqdm
+
+from composition_dedup.service import CompositionConfig, CompositionDedupService
+
+logger = logging.getLogger(__name__)
+
+# Audio file extensions listed as supported.
+# NOTE(review): not referenced by the visible part of this script; file
+# discovery filters on the --ext argument instead. Confirm whether it is used.
+SUPPORTED_EXTENSIONS = {".mp3", ".wav", ".flac", ".ogg", ".m4a", ".aac", ".wma"}
+
+
+def discover_audio_files(audio_dir: str | None, file_list: str | None, ext: str) -> list[tuple[str, str]]:
+    """Collect the audio files to import as ``[(song_id, audio_path), ...]``.
+
+    ``file_list`` takes precedence: each non-empty line of that text file is
+    used as a path.  Otherwise ``audio_dir`` is searched recursively for
+    names ending in ``ext``, in sorted order, skipping non-files and names
+    starting with ``._``.  When neither argument is given, an error is
+    printed and the process exits with status 1.
+
+    The song id of every path comes from ``_extract_song_id``.
+    """
+    if file_list:
+        with open(file_list, "r", encoding="utf-8") as handle:
+            stripped = (raw.strip() for raw in handle)
+            return [(_extract_song_id(entry), entry) for entry in stripped if entry]
+
+    if audio_dir:
+        candidates = sorted(Path(audio_dir).rglob(f"*{ext}"))
+        return [
+            (_extract_song_id(str(candidate)), str(candidate))
+            for candidate in candidates
+            if candidate.is_file() and not candidate.name.startswith("._")
+        ]
+
+    print("错误: 请指定 --audio-dir 或 --file-list")
+    sys.exit(1)
+
+
+def _extract_song_id(path: str) -> str:
+    """Derive a song id string from a file path.
+
+    The part of the file stem before the first underscore is returned when
+    it consists only of digits.  Otherwise the id is the decimal value of
+    the first 4 bytes (8 hex characters) of the MD5 of the full path string.
+    """
+    import hashlib
+
+    leading = Path(path).stem.partition("_")[0]
+    if leading.isdigit():
+        return leading
+    # First 4 digest bytes read big-endian equal the first 8 hex digits.
+    first_four_bytes = hashlib.md5(path.encode()).digest()[:4]
+    return str(int.from_bytes(first_four_bytes, "big"))
+
+
+def main() -> None:
+    """Command-line entry point: import audio files through the dedup service.
+
+    Parses the arguments, optionally truncates the ``composition_feature``
+    and ``dejavu_fingerprints`` tables, discovers the audio files and calls
+    ``service.ingest`` for each one.  A failure on one file is logged and
+    counted; it does not stop the run.
+    """
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+
+    parser = argparse.ArgumentParser(description="批量导入音频文件到 composition_feature 表")
+    parser.add_argument("--dsn", required=True, help="PostgreSQL DSN 连接串")
+    parser.add_argument("--audio-dir", help="音频文件目录")
+    parser.add_argument("--file-list", help="音频文件路径列表文件")
+    parser.add_argument("--ext", default=".wav", help="音频文件扩展名（默认 .wav）")
+    parser.add_argument("--batch-size", type=int, default=10, help="批次大小（默认 10）")
+    parser.add_argument("--clear", action="store_true", help="导入前清空 composition_feature 和 dejavu_fingerprints 表数据（保留表结构）")
+    args = parser.parse_args()
+
+    service = CompositionDedupService(config=CompositionConfig(dsn=args.dsn))
+
+    if args.clear:
+        # psycopg is imported only when a truncate was actually requested.
+        import psycopg
+        with psycopg.connect(args.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.execute("TRUNCATE TABLE composition_feature, dejavu_fingerprints")
+            conn.commit()
+        logger.info("已清空 composition_feature 和 dejavu_fingerprints 表")
+
+    audio_files = discover_audio_files(args.audio_dir, args.file_list, args.ext)
+    logger.info("发现 %d 个音频文件", len(audio_files))
+
+    imported = 0
+    failed = 0
+
+    # The progress bar advances once per batch of --batch-size files.
+    batch_starts = range(0, len(audio_files), args.batch_size)
+    for start in tqdm(batch_starts, desc="导入进度"):
+        for song_id, audio_path in audio_files[start:start + args.batch_size]:
+            try:
+                service.ingest(song_id=int(song_id), audio_path=audio_path)
+            except Exception as e:
+                logger.error("导入失败: song_id=%s, path=%s, error=%s", song_id, audio_path, e)
+                failed += 1
+            else:
+                imported += 1
+
+    logger.info("导入完成: 成功 %d, 失败 %d", imported, failed)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/postgres_schema.sql
View file @8413944
+++ b/scripts/postgres_schema.sql
View file @8413944
@@ -40,3 +40,33 @@ on lyric_lines (line_hash);

 create index if not exists lyric_lines_lyric_id_idx
 on lyric_lines (lyric_id);
+
+create extension if not exists vector;  -- provides the vector column type and hnsw index method used below
+
+create table if not exists composition_feature (  -- one feature row per song
+  id bigserial primary key,
+  song_id bigint not null unique,  -- unique; referenced by dejavu_fingerprints.song_id
+  feature_vector vector(1536) not null,  -- 1536-dimensional feature vector
+  created_at timestamptz not null default now()
+);
+
+create index if not exists composition_feature_hnsw_idx  -- hnsw index over feature_vector with cosine operator class
+on composition_feature
+using hnsw (feature_vector vector_cosine_ops)
+with (m = 16, ef_construction = 64);
+
+create table if not exists dejavu_fingerprints (  -- one row per (hash, offset) fingerprint of a song
+  id      bigserial primary key,
+  song_id bigint not null references composition_feature(song_id) on delete cascade,  -- deleting a feature row removes its fingerprints
+  hash    bytea not null,
+  "offset"  int not null  -- quoted because offset is an SQL keyword
+);
+
+create index if not exists idx_fingerprints_hash  -- NOTE(review): hash is also the leading column of the composite index below; confirm both are needed
+on dejavu_fingerprints (hash);
+
+create index if not exists idx_fingerprints_hash_song_offset  -- composite index on (hash, song_id, offset)
+on dejavu_fingerprints (hash, song_id, "offset");
+
+create index if not exists idx_fingerprints_song_id  -- lookup of fingerprints by song
+on dejavu_fingerprints (song_id);
--- a/tests/test_composition_dedup.py 0 → 100644
View file @8413944
+++ b/tests/test_composition_dedup.py 0 → 100644
View file @8413944
+"""作曲去重模块测试。
+
+测试覆盖：
+- Chromagram 提取
+- 时间归一化输出维度
+- Cosine 相似度计算
+- 向量展开维度为 1536
+"""
+
+import os
+import tempfile
+import wave
+
+import numpy as np
+import pytest
+from scipy.signal import resample
+
+from composition_dedup.extractor import extract_chroma_feature, _normalize_audio_ffmpeg
+from composition_dedup.similarity import (
+    CompositionSimilarity,
+    SimilarityDecision,
+    DUPLICATE_THRESHOLD,
+    SUSPECTED_THRESHOLD,
+)
+
+
+def _generate_test_wav(duration_sec: float = 1.0, sample_rate: int = 22050, frequency: float = 440.0) -> str:
+    """Write a mono 16-bit sine-wave WAV file to a temporary path.
+
+    Args:
+        duration_sec: Duration in seconds.
+        sample_rate: Sample rate in Hz.
+        frequency: Sine frequency in Hz.
+
+    Returns:
+        Path of the temporary WAV file.  The caller is responsible for
+        deleting it.
+    """
+    t = np.linspace(0, duration_sec, int(sample_rate * duration_sec), endpoint=False)
+    audio_data = (0.5 * np.sin(2 * np.pi * frequency * t)).astype(np.float32)
+
+    # mkstemp creates the file itself; tempfile.mktemp only returns a name
+    # and is deprecated because another process can claim that name first.
+    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        with wave.open(tmp_path, "wb") as wf:
+            wf.setnchannels(1)
+            wf.setsampwidth(2)  # 16-bit
+            wf.setframerate(sample_rate)
+            # Scale the float samples to the int16 range before writing.
+            wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
+    except BaseException:
+        # Do not leave a half-written file behind when writing fails.
+        os.remove(tmp_path)
+        raise
+
+    return tmp_path
+
+
+class TestChromaExtraction:
+    """Tests for chroma feature extraction."""
+
+    @staticmethod
+    def _remove_files(*paths):
+        """Delete every given path that still exists."""
+        for path in paths:
+            if os.path.exists(path):
+                os.remove(path)
+
+    def test_extract_chroma_returns_1536_dim(self):
+        """Extraction returns a float32 vector of shape (1536,)."""
+        wav_path = _generate_test_wav(duration_sec=2.0, frequency=440.0)
+        try:
+            feature = extract_chroma_feature(wav_path)
+            assert isinstance(feature, np.ndarray)
+            assert feature.shape == (1536,), f"期望 (1536,), 实际 {feature.shape}"
+            assert feature.dtype == np.float32
+        finally:
+            self._remove_files(wav_path)
+
+    def test_extract_chroma_file_not_found(self):
+        """A missing audio file raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError):
+            extract_chroma_feature("/nonexistent/path/audio.mp3")
+
+    def test_extract_chroma_different_frequencies(self):
+        """Audio at different frequencies yields different features."""
+        wav_a = _generate_test_wav(duration_sec=2.0, frequency=440.0)
+        wav_b = _generate_test_wav(duration_sec=2.0, frequency=880.0)
+        try:
+            feature_a = extract_chroma_feature(wav_a)
+            feature_b = extract_chroma_feature(wav_b)
+            # The two feature vectors must not be (approximately) identical.
+            assert not np.allclose(feature_a, feature_b)
+        finally:
+            self._remove_files(wav_a, wav_b)
+
+    def test_extract_chroma_same_audio_consistent(self):
+        """Extracting the same file twice gives matching results."""
+        wav_path = _generate_test_wav(duration_sec=1.0, frequency=440.0)
+        try:
+            first = extract_chroma_feature(wav_path)
+            second = extract_chroma_feature(wav_path)
+            np.testing.assert_array_almost_equal(first, second, decimal=5)
+        finally:
+            self._remove_files(wav_path)
+
+
+class TestTimeNormalization:
+    """Tests for time normalization of the chromagram."""
+
+    def test_resample_chroma_to_128_frames(self):
+        """Chromagrams of various lengths are resampled to 128 frames."""
+        for num_frames in (100, 256, 512, 1000, 2000):
+            source = np.random.rand(12, num_frames).astype(np.float32)
+            # Resample only when the frame count is not already 128.
+            normalized = source if source.shape[1] == 128 else resample(source, 128, axis=1)
+            assert normalized.shape == (12, 128), f"帧数归一化失败: {normalized.shape}"
+
+    def test_flatten_to_1536(self):
+        """Flattening a 12x128 matrix gives 1536 elements."""
+        flat = np.random.rand(12, 128).astype(np.float32).flatten()
+        assert flat.shape[0] == 12 * 128 == 1536
+
+
+class TestCosineSimilarity:
+    """Tests for cosine similarity and classification."""
+
+    def test_identical_vectors(self):
+        """A vector compared with itself has similarity 1."""
+        sample = np.random.rand(1536).astype(np.float32)
+        score = CompositionSimilarity.cosine_similarity(sample, sample)
+        assert abs(score - 1.0) < 1e-6
+
+    def test_orthogonal_vectors(self):
+        """Orthogonal vectors have similarity close to 0."""
+        first_axis = np.zeros(1536)
+        second_axis = np.zeros(1536)
+        first_axis[0] = 1.0
+        second_axis[1] = 1.0
+        score = CompositionSimilarity.cosine_similarity(first_axis, second_axis)
+        assert abs(score) < 1e-6
+
+    def test_zero_vector(self):
+        """A zero vector gives similarity 0."""
+        sample = np.random.rand(1536).astype(np.float32)
+        score = CompositionSimilarity.cosine_similarity(sample, np.zeros(1536))
+        assert score == 0.0
+
+    def test_similarity_range(self):
+        """Similarity of two non-negative random vectors lies in [0, 1]."""
+        left = np.random.rand(1536).astype(np.float32)
+        right = np.random.rand(1536).astype(np.float32)
+        score = CompositionSimilarity.cosine_similarity(left, right)
+        assert 0.0 <= score <= 1.0
+
+    def test_classify_duplicate(self):
+        """Scores at or above the duplicate threshold are DUPLICATE."""
+        for score in (0.96, 0.95):
+            assert CompositionSimilarity.classify_similarity(score) == SimilarityDecision.DUPLICATE
+
+    def test_classify_suspected(self):
+        """Scores between the two thresholds are SUSPECTED."""
+        for score in (0.94, 0.85):
+            assert CompositionSimilarity.classify_similarity(score) == SimilarityDecision.SUSPECTED
+
+    def test_classify_new(self):
+        """Scores below the suspected threshold are NEW."""
+        for score in (0.84, 0.5):
+            assert CompositionSimilarity.classify_similarity(score) == SimilarityDecision.NEW
+
+    def test_compare_method(self):
+        """compare returns both the similarity and the decision."""
+        sample = np.random.rand(1536).astype(np.float32)
+        score, decision = CompositionSimilarity.compare(sample, sample)
+        assert abs(score - 1.0) < 1e-6
+        assert decision == SimilarityDecision.DUPLICATE
+
+
+class TestThresholds:
+    """Tests for the threshold constants."""
+
+    def test_threshold_order(self):
+        """The duplicate threshold is stricter than the suspected one."""
+        assert SUSPECTED_THRESHOLD < DUPLICATE_THRESHOLD
+
+    def test_threshold_values(self):
+        """The thresholds match their design values."""
+        expected = {"duplicate": 0.95, "suspected": 0.85}
+        assert DUPLICATE_THRESHOLD == expected["duplicate"]
+        assert SUSPECTED_THRESHOLD == expected["suspected"]