Commit fec2556e fec2556ea008688f2ceac576f400a5d1cc9c22d7 by 沈秋雨

简化去重链路,仅保留以 PostgreSQL(pg)作为数据库的链路

使用 OpenCC 进行繁体到简体的转换
1 parent d39197d3
......@@ -12,7 +12,6 @@ from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import LyricRecord
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import record_from_file
......@@ -133,8 +132,6 @@ def generate_eval_set(
)
holdout_ids = {profile.record_id for profile in holdout_profiles}
indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
_build_eval_index(indexed_profiles, eval_index_path)
groups = _profile_groups(indexed_profiles)
samples: list[GeneratedSample] = []
......@@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran
return _stratified_sample(profiles, min(count, len(profiles)), rng)
def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Index every profile in a fresh ``DuplicateChecker`` and save it.

    Args:
        profiles: Profiles to add; each one supplies the record fields and its
            already-normalized form.
        index_path: Destination passed to ``DuplicateChecker.save``; missing
            parent directories are created first.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    dedup_index = DuplicateChecker()
    profile_count = len(profiles)
    position = 0
    for item in profiles:
        position += 1
        record = LyricRecord(
            record_id=item.record_id,
            lyrics=item.raw_text,
            # Empty title/artist strings are stored as None.
            title=item.title or None,
            artist=item.artist or None,
        )
        dedup_index.add_normalized_record(record, item.normalized)
        _progress_count("build eval index", position, profile_count, step=5000)
    target_dir = index_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    dedup_index.save(index_path)
def _build_positive_samples(
profiles: list[LyricProfile],
output_dir: Path,
......@@ -889,7 +867,7 @@ def _write_manifest(
"sample_size": len(samples),
"plan": plan,
"source_index": str(index_path) if index_path else "",
"eval_index": str(eval_index_path),
"eval_index": str(eval_index_path) if eval_index_path else "",
"holdout_records": holdout_count,
"lyrics_dir": str(output_dir),
"csv": str(csv_path),
......
"""Small in-memory MinHash LSH index for incremental lyric lookup."""
from __future__ import annotations
import hashlib
from collections import defaultdict
from dataclasses import dataclass
_MAX_HASH = (1 << 64) - 1
@dataclass(frozen=True)
class MinHashConfig:
    """Immutable parameters for MinHash signatures and their LSH banding."""

    # Number of hash functions, i.e. the length of every signature.
    num_perm: int = 96
    # Number of bands the signature is split into for bucketing.
    bands: int = 24
    # Offset added to the permutation index when deriving each hash function.
    seed: int = 17

    @property
    def rows_per_band(self) -> int:
        """Return how many consecutive signature values make up one band.

        Raises:
            ValueError: If ``bands`` is not positive, or if ``num_perm`` is not
                an exact multiple of ``bands``.
        """
        # Guard first: bands == 0 would otherwise surface as ZeroDivisionError,
        # and a negative band count would yield a meaningless negative row count.
        if self.bands <= 0:
            raise ValueError("bands must be a positive integer")
        if self.num_perm % self.bands != 0:
            raise ValueError("num_perm must be divisible by bands")
        return self.num_perm // self.bands
class MinHashLSH:
    """In-memory LSH index that buckets record ids by MinHash signature bands.

    A signature is split into ``config.bands`` equal slices; a record is stored
    under every ``(band index, slice)`` key, and a query returns each record
    that shares at least one such key with the query signature.
    """

    def __init__(self, config: MinHashConfig | None = None) -> None:
        """Create an empty index using ``config`` or a default ``MinHashConfig()``."""
        self.config = config or MinHashConfig()
        # (band index, band slice of a signature) -> ids stored under that key.
        self._buckets: dict[tuple[int, tuple[int, ...]], set[str]] = defaultdict(set)

    def signature(self, tokens: set[str]) -> tuple[int, ...]:
        """Return the MinHash signature of ``tokens``.

        Each of the ``config.num_perm`` positions holds the smallest 64-bit
        BLAKE2b digest over all tokens, using a personalization derived from
        ``config.seed`` plus the position. An empty token set yields a
        signature filled with ``_MAX_HASH``.
        """
        num_perm = self.config.num_perm
        if not tokens:
            return tuple([_MAX_HASH] * num_perm)

        seed = self.config.seed
        # Built once per call instead of once per token; BLAKE2b accepts at
        # most 16 personalization bytes, hence the truncation.
        persons = [f"lyr{seed + idx:05d}".encode("ascii")[:16] for idx in range(num_perm)]
        encoded_tokens = [token.encode("utf-8") for token in tokens]
        return tuple(
            min(
                int.from_bytes(
                    hashlib.blake2b(encoded, digest_size=8, person=person).digest(),
                    "big",
                )
                for encoded in encoded_tokens
            )
            for person in persons
        )

    def add(self, record_id: str, signature: tuple[int, ...]) -> None:
        """Store ``record_id`` under every band key of ``signature``.

        Raises:
            ValueError: If ``signature`` does not have ``config.num_perm`` values.
        """
        for key in self._band_keys(signature):
            self._buckets[key].add(record_id)

    def query(self, signature: tuple[int, ...]) -> set[str]:
        """Return ids of stored records sharing at least one band with ``signature``.

        Raises:
            ValueError: If ``signature`` does not have ``config.num_perm`` values.
        """
        candidates: set[str] = set()
        for key in self._band_keys(signature):
            # .get() keeps lookups from creating empty buckets in the defaultdict.
            candidates.update(self._buckets.get(key, set()))
        return candidates

    def _band_keys(self, signature: tuple[int, ...]) -> list[tuple[int, tuple[int, ...]]]:
        """Split ``signature`` into ``(band index, band slice)`` bucket keys.

        Raises:
            ValueError: If ``signature`` does not have ``config.num_perm`` values.
        """
        rows = self.config.rows_per_band
        expected = self.config.num_perm
        # A short signature would slice into empty/partial tuples and make
        # unrelated records collide on keys such as (band, ()).
        if len(signature) != expected:
            raise ValueError(f"signature must contain {expected} values, got {len(signature)}")
        return [(band, signature[band * rows : (band + 1) * rows]) for band in range(self.config.bands)]
......@@ -8,69 +8,10 @@ import unicodedata
from collections import Counter
from dataclasses import dataclass
import opencc
# Character-level translation table for str.translate(): each key is replaced
# by its value. Only the characters listed here are converted; anything else
# passes through unchanged.
_TRADITIONAL_TO_SIMPLIFIED = str.maketrans(
{
"愛": "爱",
"會": "会",
"個": "个",
"妳": "你",
"們": "们",
"麼": "么",
"夢": "梦",
"憶": "忆",
"風": "风",
"無": "无",
"與": "与",
"聽": "听",
"說": "说",
"見": "见",
"話": "话",
"還": "还",
"這": "这",
"那": "那",  # identity entry: maps the character to itself (no-op)
"裡": "里",
"裏": "里",  # both variant forms map to the same simplified character
"過": "过",
"來": "来",
"進": "进",
"去": "去",  # identity entry: maps the character to itself (no-op)
"給": "给",
"讓": "让",
"嗎": "吗",
"為": "为",
"誰": "谁",
"對": "对",
"錯": "错",
"淚": "泪",
"寫": "写",
"雲": "云",
"藍": "蓝",
"紅": "红",
"綠": "绿",
"黃": "黄",
"長": "长",
"遠": "远",
"燈": "灯",
"臺": "台",
"台": "台",  # identity entry: maps the character to itself (no-op)
"後": "后",
"從": "从",
"時": "时",
"間": "间",
"葉": "叶",
"歲": "岁",
"聲": "声",
"邊": "边",
"歡": "欢",
"繼": "继",
"續": "续",
"難": "难",
"雙": "双",
"舊": "旧",
"離": "离",
}
)
_T2S_CONVERTER = opencc.OpenCC("t2s.json")
_TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]")
_BRACKET_RE = re.compile(r"[\[((【<《].{0,40}?[\]))】>》]")
......@@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in
def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]:
line = _BRACKET_RE.sub("", text)
line = line.strip().lower().translate(_TRADITIONAL_TO_SIMPLIFIED)
line = _T2S_CONVERTER.convert(line.strip().lower())
if not line or _is_noise_line(line):
return []
line = _strip_symbols(line)
......
from .config import ServerConfig
from .service import DedupService
__all__ = ["ServerConfig", "DedupService"]
__all__ = ["ServerConfig"]
......
......@@ -4,14 +4,101 @@ from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
def _load_env_file() -> None:
    """Load root .env values without overriding real environment variables.

    The file is looked up as ``.env`` two directory levels above this module.
    Blank lines, ``#`` comment lines, lines without ``=`` and lines whose key
    is empty are ignored. A value wrapped in one matching pair of single or
    double quotes has that pair removed; other quote characters are kept.
    Nothing happens when the file is missing or is not a regular file.
    """
    env_path = Path(__file__).resolve().parent.parent / ".env"
    # is_file() also covers the case where ".env" exists but is a directory.
    if not env_path.is_file():
        return
    # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise become part of the first key.
    with env_path.open(encoding="utf-8-sig") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            key, separator, value = line.partition("=")
            key = key.strip()
            # An empty variable name is rejected by the environment API, so a
            # stray "=value" line must not reach os.environ.
            if not separator or not key:
                continue
            value = value.strip()
            # Remove only a balanced surrounding pair, so values that merely
            # start or end with a quote character are left intact.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            # setdefault keeps any value already present in the environment.
            os.environ.setdefault(key, value)
_load_env_file()
@dataclass
class ServerConfig:
    """Settings for the dedup service.

    Every default is read from a ``LYRIC_DEDUP_*`` environment variable when the
    class body is executed, falling back to the literal given alongside it.
    """

    # --- Storage and recall -------------------------------------------------
    # DSN of the PostgreSQL database the dedup service connects to.
    dsn: str = os.environ.get("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup")
    # Upper bound on ranked candidates included in the final API result.
    max_candidates: int = int(os.environ.get("LYRIC_DEDUP_MAX_CANDIDATES", "5"))
    # Upper bound on candidates recalled from each PostgreSQL recall tier.
    recall_limit: int = int(os.environ.get("LYRIC_DEDUP_RECALL_LIMIT", "100"))
    # Enables pg_trgm similarity recall on top of exact-hash and line-hash recall.
    enable_trgm: bool = os.environ.get("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true"
    # pg_trgm recall threshold; a lower value recalls more candidates at higher cost.
    trgm_threshold: float = float(os.environ.get("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3"))
    # PostgreSQL statement timeout applied to a single dedup check (milliseconds).
    statement_timeout_ms: int = int(os.environ.get("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000"))
    # Timeout for downloading lyric URLs over HTTP (seconds).
    download_timeout: int = int(os.environ.get("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10"))

    # --- Automatic duplicate decision ---------------------------------------
    # Primary n-gram Jaccard similarity needed for an automatic duplicate.
    # Higher is stricter; lower may produce more false positives.
    duplicate_jaccard_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", "0.78")
    )
    # Line coverage needed for an automatic duplicate. Main safeguard against
    # treating a partial lyric fragment as a full duplicate.
    duplicate_line_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", "0.72")
    )
    # Jaccard threshold of the alternate duplicate path, which applies when line
    # coverage is very high. Keep it aligned with duplicate_jaccard_threshold so
    # the alternate path does not become an unintended duplicate backdoor.
    duplicate_high_coverage_jaccard_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", "0.78")
    )
    # Line coverage demanded by the alternate high-coverage duplicate path.
    # Higher makes that path stricter for near-complete variants.
    duplicate_high_coverage_line_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", "0.90")
    )

    # --- Review decision ----------------------------------------------------
    # Primary/full n-gram Jaccard similarity that can route a candidate to review.
    # Higher lowers review volume; lower catches weaker suspicious overlaps.
    review_jaccard_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", "0.45")
    )
    # Line coverage that can route a candidate to review, provided query
    # coverage is material too. Higher cuts fragment/short-overlap reviews;
    # lower increases suspicious recall.
    review_line_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", "0.35")
    )
    # Share of query lines that must match before line coverage alone can
    # trigger review. Higher makes partial-fragment review stricter.
    review_query_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")
    )

    # --- Repeated-chorus handling -------------------------------------------
    # Line count of a very short query lyric for which repeated-chorus overlap
    # can be forced into review. Higher catches more short chorus-like inputs;
    # lower reduces review volume.
    chorus_short_line_count_threshold: int = int(
        os.environ.get("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")
    )
    # Similarity/coverage signal needed for repeated-chorus overlap to count as
    # material. Higher makes chorus-only review stricter.
    chorus_material_overlap_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")
    )
    # Query-side coverage needed for repeated-chorus overlap to count as
    # material. Higher reduces reviews caused by small shared chorus fragments.
    chorus_material_query_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40")
    )

    # --- Confidence score ---------------------------------------------------
    # Weight of primary n-gram Jaccard in the confidence score. Influences the
    # reported confidence only, not the duplicate/review threshold checks directly.
    confidence_jaccard_weight: float = float(
        os.environ.get("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", "0.58")
    )
    # Weight of primary line coverage in the confidence score. Coordinate with
    # confidence_jaccard_weight; the defaults add up to 1.0.
    confidence_line_coverage_weight: float = float(
        os.environ.get("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", "0.42")
    )
......
......@@ -189,10 +189,25 @@ class DedupService:
candidates: list[LyricRecord],
) -> CheckResult:
"""Run DuplicateChecker against recalled candidates."""
checker = DuplicateChecker()
for candidate in candidates:
checker.add_record(candidate)
result = checker.check_record(record, max_candidates=self.config.max_candidates)
checker = DuplicateChecker(
duplicate_jaccard_threshold=self.config.duplicate_jaccard_threshold,
duplicate_line_coverage_threshold=self.config.duplicate_line_coverage_threshold,
duplicate_high_coverage_jaccard_threshold=self.config.duplicate_high_coverage_jaccard_threshold,
duplicate_high_coverage_line_coverage_threshold=self.config.duplicate_high_coverage_line_coverage_threshold,
review_jaccard_threshold=self.config.review_jaccard_threshold,
review_line_coverage_threshold=self.config.review_line_coverage_threshold,
review_query_coverage_threshold=self.config.review_query_coverage_threshold,
chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold,
chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold,
chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold,
confidence_jaccard_weight=self.config.confidence_jaccard_weight,
confidence_line_coverage_weight=self.config.confidence_line_coverage_weight,
)
result = checker.check_record_against_candidates(
record,
candidates,
max_candidates=self.config.max_candidates,
)
return CheckResult(
duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
decision=result.decision.value,
......
......@@ -3,6 +3,7 @@ pytest>=8.0
# PostgreSQL storage prototype
psycopg[binary]>=3.2
OpenCC>=1.3.1
# Existing MySQL/COS lyric download utilities
pymysql>=1.1
......
......@@ -249,9 +249,7 @@ def _check_against_candidates(
max_candidates: int,
):
checker = DuplicateChecker()
for candidate in candidates:
checker.add_record(candidate)
return checker.check_record(record, max_candidates=max_candidates)
return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates)
def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
......
# Lyric Dedup Sample Set
基准歌词: `test_api/test_lyric.txt`
这些样本用于检查当前去重系统的两类行为:
- `positive_*`: 应被判定为与基准歌词重复或高度重复。
- `negative_*`: 不应被判定为重复,用于检查主题、关键词或风格相似时的误杀。
## 样本说明
| 文件 | 期望 | 测试点 |
| --- | --- | --- |
| `positive_01_format_spacing_punctuation_duplicate.txt` | 去重命中 | 去掉标题/分隔线、改变空行、弱化标点后的同文变体 |
| `positive_02_minor_wording_typos_duplicate.txt` | 去重命中 | 少量错字、近义词、语序微调后的近重复 |
| `positive_03_section_order_shift_duplicate.txt` | 去重命中 | 段落顺序变化但核心文本大量重合 |
| `positive_04_partial_core_chorus_duplicate.txt` | 去重命中 | 只提交核心副歌/高潮片段时的局部重复检测 |
| `negative_01_same_theme_new_lyrics_not_duplicate.txt` | 不应命中 | 同样是凌晨、长安、雪、追梦,但逐句原创 |
| `negative_02_same_keywords_different_scene_not_duplicate.txt` | 不应命中 | 复用高频关键词,叙事场景和句法明显不同 |
| `negative_03_style_similar_low_overlap_not_duplicate.txt` | 不应命中 | 国风+Rap+都市融合风格相似,但文本低重合 |
| `negative_04_common_hook_phrases_not_duplicate.txt` | 不应命中 | 只含常见短语/意象,防止短文本公共表达误杀 |