Commit fec2556e fec2556ea008688f2ceac576f400a5d1cc9c22d7 by 沈秋雨

简化去重链路,仅保留使用pg作为数据库的链路

使用opencc作为简繁转换
1 parent d39197d3
...@@ -12,7 +12,6 @@ from collections import Counter ...@@ -12,7 +12,6 @@ from collections import Counter
12 from dataclasses import dataclass 12 from dataclasses import dataclass
13 from pathlib import Path 13 from pathlib import Path
14 14
15 from lyric_dedup.checker import DuplicateChecker
16 from lyric_dedup.checker import LyricRecord 15 from lyric_dedup.checker import LyricRecord
17 from lyric_dedup.file_import import iter_lyric_files 16 from lyric_dedup.file_import import iter_lyric_files
18 from lyric_dedup.file_import import record_from_file 17 from lyric_dedup.file_import import record_from_file
...@@ -133,8 +132,6 @@ def generate_eval_set( ...@@ -133,8 +132,6 @@ def generate_eval_set(
133 ) 132 )
134 holdout_ids = {profile.record_id for profile in holdout_profiles} 133 holdout_ids = {profile.record_id for profile in holdout_profiles}
135 indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles 134 indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
136 eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
137 _build_eval_index(indexed_profiles, eval_index_path)
138 groups = _profile_groups(indexed_profiles) 135 groups = _profile_groups(indexed_profiles)
139 samples: list[GeneratedSample] = [] 136 samples: list[GeneratedSample] = []
140 137
...@@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran ...@@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran
373 return _stratified_sample(profiles, min(count, len(profiles)), rng) 370 return _stratified_sample(profiles, min(count, len(profiles)), rng)
374 371
375 372
def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Build a DuplicateChecker index from *profiles* and save it to *index_path*.

    Each profile is registered together with its already-normalized form, and
    progress is reported through the module's progress helpers. The parent
    directory of *index_path* is created when it does not exist yet.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    dedup_index = DuplicateChecker()
    profile_count = len(profiles)
    for position, entry in enumerate(profiles, start=1):
        # Empty title/artist strings are stored as None.
        lyric_record = LyricRecord(
            record_id=entry.record_id,
            lyrics=entry.raw_text,
            title=entry.title or None,
            artist=entry.artist or None,
        )
        dedup_index.add_normalized_record(lyric_record, entry.normalized)
        _progress_count("build eval index", position, profile_count, step=5000)

    target_dir = index_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    dedup_index.save(index_path)
393
394
395 def _build_positive_samples( 373 def _build_positive_samples(
396 profiles: list[LyricProfile], 374 profiles: list[LyricProfile],
397 output_dir: Path, 375 output_dir: Path,
...@@ -889,7 +867,7 @@ def _write_manifest( ...@@ -889,7 +867,7 @@ def _write_manifest(
889 "sample_size": len(samples), 867 "sample_size": len(samples),
890 "plan": plan, 868 "plan": plan,
891 "source_index": str(index_path) if index_path else "", 869 "source_index": str(index_path) if index_path else "",
892 "eval_index": str(eval_index_path), 870 "eval_index": str(eval_index_path) if eval_index_path else "",
893 "holdout_records": holdout_count, 871 "holdout_records": holdout_count,
894 "lyrics_dir": str(output_dir), 872 "lyrics_dir": str(output_dir),
895 "csv": str(csv_path), 873 "csv": str(csv_path),
......
1 """Small in-memory MinHash LSH index for incremental lyric lookup."""
2
3 from __future__ import annotations
4
5 import hashlib
6 from collections import defaultdict
7 from dataclasses import dataclass
8
9
10 _MAX_HASH = (1 << 64) - 1
11
12
@dataclass(frozen=True)
class MinHashConfig:
    """Immutable parameters for MinHash signatures and LSH banding."""

    # Number of hash permutations, i.e. the length of every signature.
    num_perm: int = 96
    # Number of bands a signature is split into for bucket lookup.
    bands: int = 24
    # Base offset added to the permutation index when salting each hash.
    seed: int = 17

    @property
    def rows_per_band(self) -> int:
        """Return how many signature rows each band covers.

        Raises:
            ValueError: if ``num_perm`` or ``bands`` is not positive, or if
                ``num_perm`` is not an exact multiple of ``bands``.
        """
        # Guard first: bands == 0 would otherwise surface as ZeroDivisionError,
        # and negative/zero sizes would silently yield a degenerate banding.
        if self.num_perm <= 0 or self.bands <= 0:
            raise ValueError("num_perm and bands must be positive")
        if self.num_perm % self.bands != 0:
            raise ValueError("num_perm must be divisible by bands")
        return self.num_perm // self.bands
24
25
class MinHashLSH:
    """In-memory banded MinHash index that maps signatures to record ids."""

    def __init__(self, config: MinHashConfig | None = None) -> None:
        """Create an empty index; *config* defaults to ``MinHashConfig()``."""
        self.config = config or MinHashConfig()
        # Bucket key is (band number, that band's slice of the signature).
        self._buckets: dict[tuple[int, tuple[int, ...]], set[str]] = defaultdict(set)

    def signature(self, tokens: set[str]) -> tuple[int, ...]:
        """Return the MinHash signature of *tokens*.

        The signature has ``config.num_perm`` entries; entry ``idx`` is the
        smallest 64-bit BLAKE2b digest over all tokens, salted per permutation.
        An empty token set yields a signature filled with ``_MAX_HASH``.
        """
        num_perm = self.config.num_perm
        if not tokens:
            return (_MAX_HASH,) * num_perm

        # The salts depend only on the configuration, so build them once
        # instead of once per token. BLAKE2b limits ``person`` to 16 bytes.
        salts = [
            f"lyr{self.config.seed + idx:05d}".encode("ascii")[:16]
            for idx in range(num_perm)
        ]
        minimums = [_MAX_HASH] * num_perm
        for token in tokens:
            encoded = token.encode("utf-8")
            for idx, salt in enumerate(salts):
                digest = hashlib.blake2b(encoded, digest_size=8, person=salt).digest()
                value = int.from_bytes(digest, "big")
                if value < minimums[idx]:
                    minimums[idx] = value
        return tuple(minimums)

    def add(self, record_id: str, signature: tuple[int, ...]) -> None:
        """Register *record_id* in every band bucket derived from *signature*."""
        for key in self._band_keys(signature):
            self._buckets[key].add(record_id)

    def query(self, signature: tuple[int, ...]) -> set[str]:
        """Return ids of records sharing at least one band bucket with *signature*."""
        candidates: set[str] = set()
        for key in self._band_keys(signature):
            # .get() keeps the defaultdict from creating empty buckets on lookup.
            candidates.update(self._buckets.get(key, set()))
        return candidates

    def _band_keys(self, signature: tuple[int, ...]) -> list[tuple[int, tuple[int, ...]]]:
        """Split *signature* into ``config.bands`` keys of ``rows_per_band`` values each."""
        rows = self.config.rows_per_band
        return [
            (band, signature[band * rows : (band + 1) * rows])
            for band in range(self.config.bands)
        ]
...@@ -8,69 +8,10 @@ import unicodedata ...@@ -8,69 +8,10 @@ import unicodedata
8 from collections import Counter 8 from collections import Counter
9 from dataclasses import dataclass 9 from dataclasses import dataclass
10 10
11 import opencc
11 12
12 _TRADITIONAL_TO_SIMPLIFIED = str.maketrans( 13
13 { 14 _T2S_CONVERTER = opencc.OpenCC("t2s.json")
14 "愛": "爱",
15 "會": "会",
16 "個": "个",
17 "妳": "你",
18 "們": "们",
19 "麼": "么",
20 "夢": "梦",
21 "憶": "忆",
22 "風": "风",
23 "無": "无",
24 "與": "与",
25 "聽": "听",
26 "說": "说",
27 "見": "见",
28 "話": "话",
29 "還": "还",
30 "這": "这",
31 "那": "那",
32 "裡": "里",
33 "裏": "里",
34 "過": "过",
35 "來": "来",
36 "進": "进",
37 "去": "去",
38 "給": "给",
39 "讓": "让",
40 "嗎": "吗",
41 "為": "为",
42 "誰": "谁",
43 "對": "对",
44 "錯": "错",
45 "淚": "泪",
46 "寫": "写",
47 "雲": "云",
48 "藍": "蓝",
49 "紅": "红",
50 "綠": "绿",
51 "黃": "黄",
52 "長": "长",
53 "遠": "远",
54 "燈": "灯",
55 "臺": "台",
56 "台": "台",
57 "後": "后",
58 "從": "从",
59 "時": "时",
60 "間": "间",
61 "葉": "叶",
62 "歲": "岁",
63 "聲": "声",
64 "邊": "边",
65 "歡": "欢",
66 "繼": "继",
67 "續": "续",
68 "難": "难",
69 "雙": "双",
70 "舊": "旧",
71 "離": "离",
72 }
73 )
74 15
75 _TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]") 16 _TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]")
76 _BRACKET_RE = re.compile(r"[\[((【<《].{0,40}?[\]))】>》]") 17 _BRACKET_RE = re.compile(r"[\[((【<《].{0,40}?[\]))】>》]")
...@@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in ...@@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in
212 153
213 def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]: 154 def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]:
214 line = _BRACKET_RE.sub("", text) 155 line = _BRACKET_RE.sub("", text)
215 line = line.strip().lower().translate(_TRADITIONAL_TO_SIMPLIFIED) 156 line = _T2S_CONVERTER.convert(line.strip().lower())
216 if not line or _is_noise_line(line): 157 if not line or _is_noise_line(line):
217 return [] 158 return []
218 line = _strip_symbols(line) 159 line = _strip_symbols(line)
......
1 from .config import ServerConfig 1 from .config import ServerConfig
2 from .service import DedupService
3 2
4 __all__ = ["ServerConfig", "DedupService"] 3 __all__ = ["ServerConfig"]
......
...@@ -4,14 +4,101 @@ from __future__ import annotations ...@@ -4,14 +4,101 @@ from __future__ import annotations
4 4
5 import os 5 import os
6 from dataclasses import dataclass 6 from dataclasses import dataclass
7 from pathlib import Path
8
9
def _load_env_file(env_path: Path | None = None) -> None:
    """Load ``KEY=VALUE`` pairs from a .env file into ``os.environ``.

    Variables that already exist in the process environment are never
    overridden. Blank lines, ``#`` comment lines, lines without ``=`` and
    lines with an empty key are ignored. Surrounding whitespace and quote
    characters are stripped from values.

    Args:
        env_path: File to read. Defaults to ``.env`` in the parent directory
            of this module's directory.
    """
    if env_path is None:
        env_path = Path(__file__).resolve().parent.parent / ".env"
    # is_file() also skips a directory that happens to be named ".env".
    if not env_path.is_file():
        return
    # utf-8-sig reads plain UTF-8 too, and drops a BOM that would otherwise
    # become part of the first key.
    with env_path.open(encoding="utf-8-sig") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            key, separator, value = line.partition("=")
            key = key.strip()
            # An empty name is not a valid environment variable; skip it
            # rather than letting a stray "=value" line abort the import.
            if not separator or not key:
                continue
            os.environ.setdefault(key, value.strip().strip('"').strip("'"))
22
23
24 _load_env_file()
7 25
8 26
def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as int, or *default* when unset.

    Raises:
        ValueError: if the variable is set but is not a valid integer.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError as exc:
        # Name the variable so a bad deployment value is easy to locate.
        raise ValueError(f"environment variable {name} must be an integer, got {raw!r}") from exc


def _env_float(name: str, default: float) -> float:
    """Return environment variable *name* parsed as float, or *default* when unset.

    Raises:
        ValueError: if the variable is set but is not a valid number.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return float(raw)
    except ValueError as exc:
        raise ValueError(f"environment variable {name} must be a number, got {raw!r}") from exc


@dataclass
class ServerConfig:
    """Settings for the dedup service.

    Every default is read from a ``LYRIC_DEDUP_*`` environment variable once,
    when this class is defined; explicit constructor arguments override it.
    """

    # PostgreSQL DSN used by the dedup service.
    dsn: str = os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup")

    # Maximum ranked candidates returned in the final API result.
    max_candidates: int = _env_int("LYRIC_DEDUP_MAX_CANDIDATES", 5)

    # Maximum candidates recalled from each PostgreSQL recall tier.
    recall_limit: int = _env_int("LYRIC_DEDUP_RECALL_LIMIT", 100)

    # Whether to use pg_trgm similarity recall in addition to exact hash and line hash recall.
    # Only the value "true" (case-insensitive) enables it.
    enable_trgm: bool = os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true"

    # PostgreSQL pg_trgm recall threshold; lower values recall more candidates and cost more.
    trgm_threshold: float = _env_float("LYRIC_DEDUP_TRGM_THRESHOLD", 0.3)

    # PostgreSQL statement timeout for one dedup check, in milliseconds.
    statement_timeout_ms: int = _env_int("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", 5000)

    # HTTP download timeout for fetching lyric URLs, in seconds.
    download_timeout: int = _env_int("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", 10)

    # Minimum primary n-gram Jaccard similarity required for automatic duplicate.
    # Raising this makes automatic duplicate stricter; lowering it may increase false positives.
    duplicate_jaccard_threshold: float = _env_float("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", 0.78)

    # Minimum line coverage required for automatic duplicate.
    # This is the main guard against treating partial lyric fragments as full duplicates.
    duplicate_line_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", 0.72
    )

    # Alternate automatic duplicate path: lower/normal Jaccard can still duplicate when line coverage is very high.
    # Keep this aligned with duplicate_jaccard_threshold to avoid an unintended duplicate backdoor.
    duplicate_high_coverage_jaccard_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", 0.78
    )

    # Line coverage required by the alternate high-coverage duplicate path.
    # Raising this makes the alternate duplicate path stricter for near-complete variants.
    duplicate_high_coverage_line_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", 0.90
    )

    # Minimum primary/full n-gram Jaccard similarity that can send a candidate to review.
    # Raising this reduces review volume; lowering it catches weaker suspicious overlaps.
    review_jaccard_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", 0.45)

    # Minimum line coverage that can send a candidate to review when query coverage is also material.
    # Raising this reduces fragment/short-overlap reviews; lowering it increases suspicious recall.
    review_line_coverage_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", 0.35)

    # Minimum share of query lines that must match before line coverage alone can trigger review.
    # Raising this makes partial-fragment review stricter.
    review_query_coverage_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", 0.40)

    # Very short query lyric line count that can force repeated-chorus overlap into review.
    # Raising this catches more short chorus-like inputs; lowering it reduces review volume.
    chorus_short_line_count_threshold: int = _env_int("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", 6)

    # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
    # Raising this makes chorus-only review stricter.
    chorus_material_overlap_threshold: float = _env_float("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", 0.20)

    # Minimum query-side coverage for repeated-chorus overlap to be considered material.
    # Raising this reduces review decisions caused by small shared chorus fragments.
    chorus_material_query_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", 0.40
    )

    # Weight assigned to primary n-gram Jaccard when computing confidence.
    # This affects the reported confidence score, not the duplicate/review threshold checks directly.
    confidence_jaccard_weight: float = _env_float("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", 0.58)

    # Weight assigned to primary line coverage when computing confidence.
    # Keep this coordinated with confidence_jaccard_weight; defaults sum to 1.0.
    confidence_line_coverage_weight: float = _env_float("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", 0.42)
......
...@@ -189,10 +189,25 @@ class DedupService: ...@@ -189,10 +189,25 @@ class DedupService:
189 candidates: list[LyricRecord], 189 candidates: list[LyricRecord],
190 ) -> CheckResult: 190 ) -> CheckResult:
191 """Run DuplicateChecker against recalled candidates.""" 191 """Run DuplicateChecker against recalled candidates."""
192 checker = DuplicateChecker() 192 checker = DuplicateChecker(
193 for candidate in candidates: 193 duplicate_jaccard_threshold=self.config.duplicate_jaccard_threshold,
194 checker.add_record(candidate) 194 duplicate_line_coverage_threshold=self.config.duplicate_line_coverage_threshold,
195 result = checker.check_record(record, max_candidates=self.config.max_candidates) 195 duplicate_high_coverage_jaccard_threshold=self.config.duplicate_high_coverage_jaccard_threshold,
196 duplicate_high_coverage_line_coverage_threshold=self.config.duplicate_high_coverage_line_coverage_threshold,
197 review_jaccard_threshold=self.config.review_jaccard_threshold,
198 review_line_coverage_threshold=self.config.review_line_coverage_threshold,
199 review_query_coverage_threshold=self.config.review_query_coverage_threshold,
200 chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold,
201 chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold,
202 chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold,
203 confidence_jaccard_weight=self.config.confidence_jaccard_weight,
204 confidence_line_coverage_weight=self.config.confidence_line_coverage_weight,
205 )
206 result = checker.check_record_against_candidates(
207 record,
208 candidates,
209 max_candidates=self.config.max_candidates,
210 )
196 return CheckResult( 211 return CheckResult(
197 duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW), 212 duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
198 decision=result.decision.value, 213 decision=result.decision.value,
......
...@@ -3,6 +3,7 @@ pytest>=8.0 ...@@ -3,6 +3,7 @@ pytest>=8.0
3 3
4 # PostgreSQL storage prototype 4 # PostgreSQL storage prototype
5 psycopg[binary]>=3.2 5 psycopg[binary]>=3.2
6 OpenCC>=1.3.1
6 7
7 # Existing MySQL/COS lyric download utilities 8 # Existing MySQL/COS lyric download utilities
8 pymysql>=1.1 9 pymysql>=1.1
......
...@@ -249,9 +249,7 @@ def _check_against_candidates( ...@@ -249,9 +249,7 @@ def _check_against_candidates(
249 max_candidates: int, 249 max_candidates: int,
250 ): 250 ):
251 checker = DuplicateChecker() 251 checker = DuplicateChecker()
252 for candidate in candidates: 252 return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates)
253 checker.add_record(candidate)
254 return checker.check_record(record, max_candidates=max_candidates)
255 253
256 254
257 def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: 255 def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
......
1 # Lyric Dedup Sample Set
2
3 基准歌词: `test_api/test_lyric.txt`
4
5 这些样本用于检查当前去重系统的两类行为:
6
7 - `positive_*`: 应被判定为与基准歌词重复或高度重复。
8 - `negative_*`: 不应被判定为重复,用于检查主题、关键词或风格相似时的误杀。
9
10 ## 样本说明
11
12 | 文件 | 期望 | 测试点 |
13 | --- | --- | --- |
14 | `positive_01_format_spacing_punctuation_duplicate.txt` | 去重命中 | 去掉标题/分隔线、改变空行、弱化标点后的同文变体 |
15 | `positive_02_minor_wording_typos_duplicate.txt` | 去重命中 | 少量错字、近义词、语序微调后的近重复 |
16 | `positive_03_section_order_shift_duplicate.txt` | 去重命中 | 段落顺序变化但核心文本大量重合 |
17 | `positive_04_partial_core_chorus_duplicate.txt` | 去重命中 | 只提交核心副歌/高潮片段时的局部重复检测 |
18 | `negative_01_same_theme_new_lyrics_not_duplicate.txt` | 不应命中 | 同样是凌晨、长安、雪、追梦,但逐句原创 |
19 | `negative_02_same_keywords_different_scene_not_duplicate.txt` | 不应命中 | 复用高频关键词,叙事场景和句法明显不同 |
20 | `negative_03_style_similar_low_overlap_not_duplicate.txt` | 不应命中 | 国风+Rap+都市融合风格相似,但文本低重合 |
21 | `negative_04_common_hook_phrases_not_duplicate.txt` | 不应命中 | 只含常见短语/意象,防止短文本公共表达误杀 |
22