简化去重链路，仅保留以 PostgreSQL（pg）作为数据库的链路

使用 OpenCC 进行简繁转换

简化去重链路，仅保留以 PostgreSQL（pg）作为数据库的链路
使用 OpenCC 进行简繁转换
沈秋雨
Commit fec2556e ... fec2556ea008688f2ceac576f400a5d1cc9c22d7 authored 2026-06-04 13:48:52 +0800 by 沈秋雨
Showing 15 changed files with 136 additions and 156 deletions
README.md
TEST_WORKFLOW.md
lyric_dedup/checker.py
lyric_dedup/cli.py
lyric_dedup/eval_dataset.py
lyric_dedup/minhash_lsh.py
lyric_dedup/normalization.py
lyric_dedup_server/__init__.py
lyric_dedup_server/config.py
lyric_dedup_server/service.py
requirements.txt
scripts/evaluate_postgres.py
scripts/process_library.py
test_api/dedup_samples/README.md
tests/test_lyric_dedup.py
--- a/README.md
View file @fec2556
+++ b/README.md
View file @fec2556
--- a/TEST_WORKFLOW.md
View file @fec2556
+++ b/TEST_WORKFLOW.md
View file @fec2556
--- a/lyric_dedup/checker.py
View file @fec2556
+++ b/lyric_dedup/checker.py
View file @fec2556
--- a/lyric_dedup/cli.py
View file @fec2556
+++ b/lyric_dedup/cli.py
View file @fec2556
--- a/lyric_dedup/eval_dataset.py
View file @fec2556
+++ b/lyric_dedup/eval_dataset.py
View file @fec2556
@@ -12,7 +12,6 @@ from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path

-from lyric_dedup.checker import DuplicateChecker
 from lyric_dedup.checker import LyricRecord
 from lyric_dedup.file_import import iter_lyric_files
 from lyric_dedup.file_import import record_from_file
@@ -133,8 +132,6 @@ def generate_eval_set(
    )
    holdout_ids = {profile.record_id for profile in holdout_profiles}
    indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
-    eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
-    _build_eval_index(indexed_profiles, eval_index_path)
    groups = _profile_groups(indexed_profiles)
    samples: list[GeneratedSample] = []

@@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran
    return _stratified_sample(profiles, min(count, len(profiles)), rng)


-def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
-    _progress(f"build eval index excluding holdout: {index_path}")
-    checker = DuplicateChecker()
-    total = len(profiles)
-    for index, profile in enumerate(profiles, start=1):
-        checker.add_normalized_record(
-            LyricRecord(
-                record_id=profile.record_id,
-                lyrics=profile.raw_text,
-                title=profile.title or None,
-                artist=profile.artist or None,
-            ),
-            profile.normalized,
-        )
-        _progress_count("build eval index", index, total, step=5000)
-    index_path.parent.mkdir(parents=True, exist_ok=True)
-    checker.save(index_path)
-
-
 def _build_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
@@ -889,7 +867,7 @@ def _write_manifest(
        "sample_size": len(samples),
        "plan": plan,
        "source_index": str(index_path) if index_path else "",
-        "eval_index": str(eval_index_path),
+        "eval_index": str(eval_index_path) if eval_index_path else "",
        "holdout_records": holdout_count,
        "lyrics_dir": str(output_dir),
        "csv": str(csv_path),
--- a/lyric_dedup/minhash_lsh.py deleted 100644 → 0
View file @d39197d
+++ b/lyric_dedup/minhash_lsh.py deleted 100644 → 0
View file @d39197d
-"""Small in-memory MinHash LSH index for incremental lyric lookup."""
-
-from __future__ import annotations
-
-import hashlib
-from collections import defaultdict
-from dataclasses import dataclass
-
-
-_MAX_HASH = (1 << 64) - 1
-
-
-@dataclass(frozen=True)
-class MinHashConfig:
-    num_perm: int = 96
-    bands: int = 24
-    seed: int = 17
-
-    @property
-    def rows_per_band(self) -> int:
-        if self.num_perm % self.bands != 0:
-            raise ValueError("num_perm must be divisible by bands")
-        return self.num_perm // self.bands
-
-
-class MinHashLSH:
-    def __init__(self, config: MinHashConfig | None = None) -> None:
-        self.config = config or MinHashConfig()
-        self._buckets: dict[tuple[int, tuple[int, ...]], set[str]] = defaultdict(set)
-
-    def signature(self, tokens: set[str]) -> tuple[int, ...]:
-        if not tokens:
-            return tuple([_MAX_HASH] * self.config.num_perm)
-
-        signature = [_MAX_HASH] * self.config.num_perm
-        for token in tokens:
-            encoded = token.encode("utf-8")
-            for idx in range(self.config.num_perm):
-                digest = hashlib.blake2b(
-                    encoded,
-                    digest_size=8,
-                    person=f"lyr{self.config.seed + idx:05d}".encode("ascii")[:16],
-                ).digest()
-                value = int.from_bytes(digest, "big")
-                if value < signature[idx]:
-                    signature[idx] = value
-        return tuple(signature)
-
-    def add(self, record_id: str, signature: tuple[int, ...]) -> None:
-        for key in self._band_keys(signature):
-            self._buckets[key].add(record_id)
-
-    def query(self, signature: tuple[int, ...]) -> set[str]:
-        candidates: set[str] = set()
-        for key in self._band_keys(signature):
-            candidates.update(self._buckets.get(key, set()))
-        return candidates
-
-    def _band_keys(self, signature: tuple[int, ...]) -> list[tuple[int, tuple[int, ...]]]:
-        rows = self.config.rows_per_band
-        return [(band, signature[band * rows : (band + 1) * rows]) for band in range(self.config.bands)]
--- a/lyric_dedup/normalization.py
View file @fec2556
+++ b/lyric_dedup/normalization.py
View file @fec2556
@@ -8,69 +8,10 @@ import unicodedata
 from collections import Counter
 from dataclasses import dataclass

+import opencc

-_TRADITIONAL_TO_SIMPLIFIED = str.maketrans(
-    {
-        "愛": "爱",
-        "會": "会",
-        "個": "个",
-        "妳": "你",
-        "們": "们",
-        "麼": "么",
-        "夢": "梦",
-        "憶": "忆",
-        "風": "风",
-        "無": "无",
-        "與": "与",
-        "聽": "听",
-        "說": "说",
-        "見": "见",
-        "話": "话",
-        "還": "还",
-        "這": "这",
-        "那": "那",
-        "裡": "里",
-        "裏": "里",
-        "過": "过",
-        "來": "来",
-        "進": "进",
-        "去": "去",
-        "給": "给",
-        "讓": "让",
-        "嗎": "吗",
-        "為": "为",
-        "誰": "谁",
-        "對": "对",
-        "錯": "错",
-        "淚": "泪",
-        "寫": "写",
-        "雲": "云",
-        "藍": "蓝",
-        "紅": "红",
-        "綠": "绿",
-        "黃": "黄",
-        "長": "长",
-        "遠": "远",
-        "燈": "灯",
-        "臺": "台",
-        "台": "台",
-        "後": "后",
-        "從": "从",
-        "時": "时",
-        "間": "间",
-        "葉": "叶",
-        "歲": "岁",
-        "聲": "声",
-        "邊": "边",
-        "歡": "欢",
-        "繼": "继",
-        "續": "续",
-        "難": "难",
-        "雙": "双",
-        "舊": "旧",
-        "離": "离",
-    }
-)
+
+_T2S_CONVERTER = opencc.OpenCC("t2s.json")

 _TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]")
 _BRACKET_RE = re.compile(r"[\[(（【<《].{0,40}?[\])）】>》]")
@@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in

 def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]:
    line = _BRACKET_RE.sub("", text)
-    line = line.strip().lower().translate(_TRADITIONAL_TO_SIMPLIFIED)
+    line = _T2S_CONVERTER.convert(line.strip().lower())
    if not line or _is_noise_line(line):
        return []
    line = _strip_symbols(line)
--- a/lyric_dedup_server/__init__.py
View file @fec2556
+++ b/lyric_dedup_server/__init__.py
View file @fec2556
 from .config import ServerConfig
-from .service import DedupService

-__all__ = ["ServerConfig", "DedupService"]
+__all__ = ["ServerConfig"]
--- a/lyric_dedup_server/config.py
View file @fec2556
+++ b/lyric_dedup_server/config.py
View file @fec2556
@@ -4,14 +4,101 @@ from __future__ import annotations

 import os
 from dataclasses import dataclass
+from pathlib import Path
+
+
+def _load_env_file() -> None:
+    """Load root .env values without overriding real environment variables."""
+    env_path = Path(__file__).resolve().parent.parent / ".env"
+    if not env_path.exists():
+        return
+    with env_path.open(encoding="utf-8") as file:
+        for raw_line in file:
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, value = line.split("=", 1)
+            os.environ.setdefault(key.strip(), value.strip().strip('"').strip("'"))
+
+
+_load_env_file()


 @dataclass
 class ServerConfig:
+    # PostgreSQL DSN used by the dedup service.
    dsn: str = os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup")
+
+    # Maximum ranked candidates returned in the final API result.
    max_candidates: int = int(os.getenv("LYRIC_DEDUP_MAX_CANDIDATES", "5"))
+
+    # Maximum candidates recalled from each PostgreSQL recall tier.
    recall_limit: int = int(os.getenv("LYRIC_DEDUP_RECALL_LIMIT", "100"))
+
+    # Whether to use pg_trgm similarity recall in addition to exact hash and line hash recall.
    enable_trgm: bool = os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true"
+
+    # PostgreSQL pg_trgm recall threshold; lower values recall more candidates and cost more.
    trgm_threshold: float = float(os.getenv("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3"))
+
+    # PostgreSQL statement timeout for one dedup check, in milliseconds.
    statement_timeout_ms: int = int(os.getenv("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000"))
+
+    # HTTP download timeout for fetching lyric URLs, in seconds.
    download_timeout: int = int(os.getenv("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10"))
+
+    # Minimum primary n-gram Jaccard similarity required for automatic duplicate.
+    # Raising this makes automatic duplicate stricter; lowering it may increase false positives.
+    duplicate_jaccard_threshold: float = float(os.getenv("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", "0.78"))
+
+    # Minimum line coverage required for automatic duplicate.
+    # This is the main guard against treating partial lyric fragments as full duplicates.
+    duplicate_line_coverage_threshold: float = float(
+        os.getenv("LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", "0.72")
+    )
+
+    # Alternate automatic duplicate path: a candidate with lower/normal Jaccard can still be judged a duplicate when line coverage is very high.
+    # Keep this aligned with duplicate_jaccard_threshold to avoid an unintended duplicate backdoor.
+    duplicate_high_coverage_jaccard_threshold: float = float(
+        os.getenv("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", "0.78")
+    )
+
+    # Line coverage required by the alternate high-coverage duplicate path.
+    # Raising this makes the alternate duplicate path stricter for near-complete variants.
+    duplicate_high_coverage_line_coverage_threshold: float = float(
+        os.getenv("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", "0.90")
+    )
+
+    # Minimum primary/full n-gram Jaccard similarity that can send a candidate to review.
+    # Raising this reduces review volume; lowering it catches weaker suspicious overlaps.
+    review_jaccard_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", "0.45"))
+
+    # Minimum line coverage that can send a candidate to review when query coverage is also material.
+    # Raising this reduces fragment/short-overlap reviews; lowering it increases suspicious recall.
+    review_line_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", "0.35"))
+
+    # Minimum share of query lines that must match before line coverage alone can trigger review.
+    # Raising this makes partial-fragment review stricter.
+    review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40"))
+
+    # Line-count threshold for treating a query lyric as very short; for such short queries, repeated-chorus overlap can be forced into review.
+    # Raising this catches more short chorus-like inputs; lowering it reduces review volume.
+    chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6"))
+
+    # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
+    # Raising this makes chorus-only review stricter.
+    chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20"))
+
+    # Minimum query-side coverage for repeated-chorus overlap to be considered material.
+    # Raising this reduces review decisions caused by small shared chorus fragments.
+    chorus_material_query_coverage_threshold: float = float(
+        os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40")
+    )
+
+    # Weight assigned to primary n-gram Jaccard when computing confidence.
+    # This affects the reported confidence score, not the duplicate/review threshold checks directly.
+    confidence_jaccard_weight: float = float(os.getenv("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", "0.58"))
+
+    # Weight assigned to primary line coverage when computing confidence.
+    # Keep this coordinated with confidence_jaccard_weight; defaults sum to 1.0.
+    confidence_line_coverage_weight: float = float(os.getenv("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", "0.42"))
--- a/lyric_dedup_server/service.py
View file @fec2556
+++ b/lyric_dedup_server/service.py
View file @fec2556
@@ -189,10 +189,25 @@ class DedupService:
        candidates: list[LyricRecord],
    ) -> CheckResult:
        """Run DuplicateChecker against recalled candidates."""
-        checker = DuplicateChecker()
-        for candidate in candidates:
-            checker.add_record(candidate)
-        result = checker.check_record(record, max_candidates=self.config.max_candidates)
+        checker = DuplicateChecker(
+            duplicate_jaccard_threshold=self.config.duplicate_jaccard_threshold,
+            duplicate_line_coverage_threshold=self.config.duplicate_line_coverage_threshold,
+            duplicate_high_coverage_jaccard_threshold=self.config.duplicate_high_coverage_jaccard_threshold,
+            duplicate_high_coverage_line_coverage_threshold=self.config.duplicate_high_coverage_line_coverage_threshold,
+            review_jaccard_threshold=self.config.review_jaccard_threshold,
+            review_line_coverage_threshold=self.config.review_line_coverage_threshold,
+            review_query_coverage_threshold=self.config.review_query_coverage_threshold,
+            chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold,
+            chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold,
+            chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold,
+            confidence_jaccard_weight=self.config.confidence_jaccard_weight,
+            confidence_line_coverage_weight=self.config.confidence_line_coverage_weight,
+        )
+        result = checker.check_record_against_candidates(
+            record,
+            candidates,
+            max_candidates=self.config.max_candidates,
+        )
        return CheckResult(
            duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
            decision=result.decision.value,
--- a/requirements.txt
View file @fec2556
+++ b/requirements.txt
View file @fec2556
@@ -3,6 +3,7 @@ pytest>=8.0

 # PostgreSQL storage prototype
 psycopg[binary]>=3.2
+OpenCC>=1.3.1

 # Existing MySQL/COS lyric download utilities
 pymysql>=1.1
--- a/scripts/evaluate_postgres.py
View file @fec2556
+++ b/scripts/evaluate_postgres.py
View file @fec2556
@@ -249,9 +249,7 @@ def _check_against_candidates(
    max_candidates: int,
 ):
    checker = DuplicateChecker()
-    for candidate in candidates:
-        checker.add_record(candidate)
-    return checker.check_record(record, max_candidates=max_candidates)
+    return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates)


 def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
--- a/scripts/process_library.py deleted 100644 → 0
View file @d39197d
+++ b/scripts/process_library.py deleted 100644 → 0
View file @d39197d
--- a/test_api/dedup_samples/README.md 0 → 100644
View file @fec2556
+++ b/test_api/dedup_samples/README.md 0 → 100644
View file @fec2556
+# Lyric Dedup Sample Set
+
+基准歌词: `test_api/test_lyric.txt`
+
+这些样本用于检查当前去重系统的两类行为:
+
+- `positive_*`: 应被判定为与基准歌词重复或高度重复。
+- `negative_*`: 不应被判定为重复，用于检查主题、关键词或风格相似时的误杀。
+
+## 样本说明
+
+| 文件 | 期望 | 测试点 |
+| --- | --- | --- |
+| `positive_01_format_spacing_punctuation_duplicate.txt` | 去重命中 | 去掉标题/分隔线、改变空行、弱化标点后的同文变体 |
+| `positive_02_minor_wording_typos_duplicate.txt` | 去重命中 | 少量错字、近义词、语序微调后的近重复 |
+| `positive_03_section_order_shift_duplicate.txt` | 去重命中 | 段落顺序变化但核心文本大量重合 |
+| `positive_04_partial_core_chorus_duplicate.txt` | 去重命中 | 只提交核心副歌/高潮片段时的局部重复检测 |
+| `negative_01_same_theme_new_lyrics_not_duplicate.txt` | 不应命中 | 同样是凌晨、长安、雪、追梦，但逐句原创 |
+| `negative_02_same_keywords_different_scene_not_duplicate.txt` | 不应命中 | 复用高频关键词，叙事场景和句法明显不同 |
+| `negative_03_style_similar_low_overlap_not_duplicate.txt` | 不应命中 | 国风+Rap+都市融合风格相似，但文本低重合 |
+| `negative_04_common_hook_phrases_not_duplicate.txt` | 不应命中 | 只含常见短语/意象，防止短文本公共表达误杀 |
+
--- a/tests/test_lyric_dedup.py
View file @fec2556
+++ b/tests/test_lyric_dedup.py
View file @fec2556