简化去重链路,仅保留以 PostgreSQL(pg)作为数据库的链路
使用 OpenCC 进行繁体到简体的转换
Showing 15 changed files with 136 additions and 156 deletions
This diff is collapsed.
Click to expand it.
This diff is collapsed.
Click to expand it.
This diff is collapsed.
Click to expand it.
This diff is collapsed.
Click to expand it.
| ... | @@ -12,7 +12,6 @@ from collections import Counter | ... | @@ -12,7 +12,6 @@ from collections import Counter |
| 12 | from dataclasses import dataclass | 12 | from dataclasses import dataclass |
| 13 | from pathlib import Path | 13 | from pathlib import Path |
| 14 | 14 | ||
| 15 | from lyric_dedup.checker import DuplicateChecker | ||
| 16 | from lyric_dedup.checker import LyricRecord | 15 | from lyric_dedup.checker import LyricRecord |
| 17 | from lyric_dedup.file_import import iter_lyric_files | 16 | from lyric_dedup.file_import import iter_lyric_files |
| 18 | from lyric_dedup.file_import import record_from_file | 17 | from lyric_dedup.file_import import record_from_file |
| ... | @@ -133,8 +132,6 @@ def generate_eval_set( | ... | @@ -133,8 +132,6 @@ def generate_eval_set( |
| 133 | ) | 132 | ) |
| 134 | holdout_ids = {profile.record_id for profile in holdout_profiles} | 133 | holdout_ids = {profile.record_id for profile in holdout_profiles} |
| 135 | indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles | 134 | indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles |
| 136 | eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl") | ||
| 137 | _build_eval_index(indexed_profiles, eval_index_path) | ||
| 138 | groups = _profile_groups(indexed_profiles) | 135 | groups = _profile_groups(indexed_profiles) |
| 139 | samples: list[GeneratedSample] = [] | 136 | samples: list[GeneratedSample] = [] |
| 140 | 137 | ||
| ... | @@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran | ... | @@ -373,25 +370,6 @@ def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: ran |
| 373 | return _stratified_sample(profiles, min(count, len(profiles)), rng) | 370 | return _stratified_sample(profiles, min(count, len(profiles)), rng) |
| 374 | 371 | ||
| 375 | 372 | ||
| 376 | def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None: | ||
| 377 | _progress(f"build eval index excluding holdout: {index_path}") | ||
| 378 | checker = DuplicateChecker() | ||
| 379 | total = len(profiles) | ||
| 380 | for index, profile in enumerate(profiles, start=1): | ||
| 381 | checker.add_normalized_record( | ||
| 382 | LyricRecord( | ||
| 383 | record_id=profile.record_id, | ||
| 384 | lyrics=profile.raw_text, | ||
| 385 | title=profile.title or None, | ||
| 386 | artist=profile.artist or None, | ||
| 387 | ), | ||
| 388 | profile.normalized, | ||
| 389 | ) | ||
| 390 | _progress_count("build eval index", index, total, step=5000) | ||
| 391 | index_path.parent.mkdir(parents=True, exist_ok=True) | ||
| 392 | checker.save(index_path) | ||
| 393 | |||
| 394 | |||
| 395 | def _build_positive_samples( | 373 | def _build_positive_samples( |
| 396 | profiles: list[LyricProfile], | 374 | profiles: list[LyricProfile], |
| 397 | output_dir: Path, | 375 | output_dir: Path, |
| ... | @@ -889,7 +867,7 @@ def _write_manifest( | ... | @@ -889,7 +867,7 @@ def _write_manifest( |
| 889 | "sample_size": len(samples), | 867 | "sample_size": len(samples), |
| 890 | "plan": plan, | 868 | "plan": plan, |
| 891 | "source_index": str(index_path) if index_path else "", | 869 | "source_index": str(index_path) if index_path else "", |
| 892 | "eval_index": str(eval_index_path), | 870 | "eval_index": str(eval_index_path) if eval_index_path else "", |
| 893 | "holdout_records": holdout_count, | 871 | "holdout_records": holdout_count, |
| 894 | "lyrics_dir": str(output_dir), | 872 | "lyrics_dir": str(output_dir), |
| 895 | "csv": str(csv_path), | 873 | "csv": str(csv_path), | ... | ... |
lyric_dedup/minhash_lsh.py
deleted
100644 → 0
| 1 | """Small in-memory MinHash LSH index for incremental lyric lookup.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
| 5 | import hashlib | ||
| 6 | from collections import defaultdict | ||
| 7 | from dataclasses import dataclass | ||
| 8 | |||
| 9 | |||
| 10 | _MAX_HASH = (1 << 64) - 1 | ||
| 11 | |||
| 12 | |||
| 13 | @dataclass(frozen=True) | ||
| 14 | class MinHashConfig: | ||
| 15 | num_perm: int = 96 | ||
| 16 | bands: int = 24 | ||
| 17 | seed: int = 17 | ||
| 18 | |||
| 19 | @property | ||
| 20 | def rows_per_band(self) -> int: | ||
| 21 | if self.num_perm % self.bands != 0: | ||
| 22 | raise ValueError("num_perm must be divisible by bands") | ||
| 23 | return self.num_perm // self.bands | ||
| 24 | |||
| 25 | |||
| 26 | class MinHashLSH: | ||
| 27 | def __init__(self, config: MinHashConfig | None = None) -> None: | ||
| 28 | self.config = config or MinHashConfig() | ||
| 29 | self._buckets: dict[tuple[int, tuple[int, ...]], set[str]] = defaultdict(set) | ||
| 30 | |||
| 31 | def signature(self, tokens: set[str]) -> tuple[int, ...]: | ||
| 32 | if not tokens: | ||
| 33 | return tuple([_MAX_HASH] * self.config.num_perm) | ||
| 34 | |||
| 35 | signature = [_MAX_HASH] * self.config.num_perm | ||
| 36 | for token in tokens: | ||
| 37 | encoded = token.encode("utf-8") | ||
| 38 | for idx in range(self.config.num_perm): | ||
| 39 | digest = hashlib.blake2b( | ||
| 40 | encoded, | ||
| 41 | digest_size=8, | ||
| 42 | person=f"lyr{self.config.seed + idx:05d}".encode("ascii")[:16], | ||
| 43 | ).digest() | ||
| 44 | value = int.from_bytes(digest, "big") | ||
| 45 | if value < signature[idx]: | ||
| 46 | signature[idx] = value | ||
| 47 | return tuple(signature) | ||
| 48 | |||
| 49 | def add(self, record_id: str, signature: tuple[int, ...]) -> None: | ||
| 50 | for key in self._band_keys(signature): | ||
| 51 | self._buckets[key].add(record_id) | ||
| 52 | |||
| 53 | def query(self, signature: tuple[int, ...]) -> set[str]: | ||
| 54 | candidates: set[str] = set() | ||
| 55 | for key in self._band_keys(signature): | ||
| 56 | candidates.update(self._buckets.get(key, set())) | ||
| 57 | return candidates | ||
| 58 | |||
| 59 | def _band_keys(self, signature: tuple[int, ...]) -> list[tuple[int, tuple[int, ...]]]: | ||
| 60 | rows = self.config.rows_per_band | ||
| 61 | return [(band, signature[band * rows : (band + 1) * rows]) for band in range(self.config.bands)] |
| ... | @@ -8,69 +8,10 @@ import unicodedata | ... | @@ -8,69 +8,10 @@ import unicodedata |
| 8 | from collections import Counter | 8 | from collections import Counter |
| 9 | from dataclasses import dataclass | 9 | from dataclasses import dataclass |
| 10 | 10 | ||
| 11 | import opencc | ||
| 11 | 12 | ||
| 12 | _TRADITIONAL_TO_SIMPLIFIED = str.maketrans( | 13 | |
| 13 | { | 14 | _T2S_CONVERTER = opencc.OpenCC("t2s.json") |
| 14 | "愛": "爱", | ||
| 15 | "會": "会", | ||
| 16 | "個": "个", | ||
| 17 | "妳": "你", | ||
| 18 | "們": "们", | ||
| 19 | "麼": "么", | ||
| 20 | "夢": "梦", | ||
| 21 | "憶": "忆", | ||
| 22 | "風": "风", | ||
| 23 | "無": "无", | ||
| 24 | "與": "与", | ||
| 25 | "聽": "听", | ||
| 26 | "說": "说", | ||
| 27 | "見": "见", | ||
| 28 | "話": "话", | ||
| 29 | "還": "还", | ||
| 30 | "這": "这", | ||
| 31 | "那": "那", | ||
| 32 | "裡": "里", | ||
| 33 | "裏": "里", | ||
| 34 | "過": "过", | ||
| 35 | "來": "来", | ||
| 36 | "進": "进", | ||
| 37 | "去": "去", | ||
| 38 | "給": "给", | ||
| 39 | "讓": "让", | ||
| 40 | "嗎": "吗", | ||
| 41 | "為": "为", | ||
| 42 | "誰": "谁", | ||
| 43 | "對": "对", | ||
| 44 | "錯": "错", | ||
| 45 | "淚": "泪", | ||
| 46 | "寫": "写", | ||
| 47 | "雲": "云", | ||
| 48 | "藍": "蓝", | ||
| 49 | "紅": "红", | ||
| 50 | "綠": "绿", | ||
| 51 | "黃": "黄", | ||
| 52 | "長": "长", | ||
| 53 | "遠": "远", | ||
| 54 | "燈": "灯", | ||
| 55 | "臺": "台", | ||
| 56 | "台": "台", | ||
| 57 | "後": "后", | ||
| 58 | "從": "从", | ||
| 59 | "時": "时", | ||
| 60 | "間": "间", | ||
| 61 | "葉": "叶", | ||
| 62 | "歲": "岁", | ||
| 63 | "聲": "声", | ||
| 64 | "邊": "边", | ||
| 65 | "歡": "欢", | ||
| 66 | "繼": "继", | ||
| 67 | "續": "续", | ||
| 68 | "難": "难", | ||
| 69 | "雙": "双", | ||
| 70 | "舊": "旧", | ||
| 71 | "離": "离", | ||
| 72 | } | ||
| 73 | ) | ||
| 74 | 15 | ||
| 75 | _TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]") | 16 | _TIMESTAMP_RE = re.compile(r"\[((?:\d{1,2}:)?\d{1,2}:\d{2}(?:[.:]\d{1,3})?)\]") |
| 76 | _BRACKET_RE = re.compile(r"[\[((【<《].{0,40}?[\]))】>》]") | 17 | _BRACKET_RE = re.compile(r"[\[((【<《].{0,40}?[\]))】>》]") |
| ... | @@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in | ... | @@ -212,7 +153,7 @@ def _split_inline_translation(line: str, timestamp: str | None, source_index: in |
| 212 | 153 | ||
| 213 | def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]: | 154 | def _entry_from_text(text: str, timestamp: str | None, source_index: int) -> list[_LineEntry]: |
| 214 | line = _BRACKET_RE.sub("", text) | 155 | line = _BRACKET_RE.sub("", text) |
| 215 | line = line.strip().lower().translate(_TRADITIONAL_TO_SIMPLIFIED) | 156 | line = _T2S_CONVERTER.convert(line.strip().lower()) |
| 216 | if not line or _is_noise_line(line): | 157 | if not line or _is_noise_line(line): |
| 217 | return [] | 158 | return [] |
| 218 | line = _strip_symbols(line) | 159 | line = _strip_symbols(line) | ... | ... |
| ... | @@ -4,14 +4,101 @@ from __future__ import annotations | ... | @@ -4,14 +4,101 @@ from __future__ import annotations |
| 4 | 4 | ||
| 5 | import os | 5 | import os |
| 6 | from dataclasses import dataclass | 6 | from dataclasses import dataclass |
| 7 | from pathlib import Path | ||
| 8 | |||
| 9 | |||
| 10 | def _load_env_file() -> None: | ||
| 11 | """Load root .env values without overriding real environment variables.""" | ||
| 12 | env_path = Path(__file__).resolve().parent.parent / ".env" | ||
| 13 | if not env_path.exists(): | ||
| 14 | return | ||
| 15 | with env_path.open(encoding="utf-8") as file: | ||
| 16 | for raw_line in file: | ||
| 17 | line = raw_line.strip() | ||
| 18 | if not line or line.startswith("#") or "=" not in line: | ||
| 19 | continue | ||
| 20 | key, value = line.split("=", 1) | ||
| 21 | os.environ.setdefault(key.strip(), value.strip().strip('"').strip("'")) | ||
| 22 | |||
| 23 | |||
| 24 | _load_env_file() | ||
| 7 | 25 | ||
| 8 | 26 | ||
| 9 | @dataclass | 27 | @dataclass |
| 10 | class ServerConfig: | 28 | class ServerConfig: |
| 29 | # PostgreSQL DSN used by the dedup service. | ||
| 11 | dsn: str = os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup") | 30 | dsn: str = os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup") |
| 31 | |||
| 32 | # Maximum ranked candidates returned in the final API result. | ||
| 12 | max_candidates: int = int(os.getenv("LYRIC_DEDUP_MAX_CANDIDATES", "5")) | 33 | max_candidates: int = int(os.getenv("LYRIC_DEDUP_MAX_CANDIDATES", "5")) |
| 34 | |||
| 35 | # Maximum candidates recalled from each PostgreSQL recall tier. | ||
| 13 | recall_limit: int = int(os.getenv("LYRIC_DEDUP_RECALL_LIMIT", "100")) | 36 | recall_limit: int = int(os.getenv("LYRIC_DEDUP_RECALL_LIMIT", "100")) |
| 37 | |||
| 38 | # Whether to use pg_trgm similarity recall in addition to exact hash and line hash recall. | ||
| 14 | enable_trgm: bool = os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true" | 39 | enable_trgm: bool = os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true" |
| 40 | |||
| 41 | # PostgreSQL pg_trgm recall threshold; lower values recall more candidates and cost more. | ||
| 15 | trgm_threshold: float = float(os.getenv("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3")) | 42 | trgm_threshold: float = float(os.getenv("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3")) |
| 43 | |||
| 44 | # PostgreSQL statement timeout for one dedup check, in milliseconds. | ||
| 16 | statement_timeout_ms: int = int(os.getenv("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000")) | 45 | statement_timeout_ms: int = int(os.getenv("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000")) |
| 46 | |||
| 47 | # HTTP download timeout for fetching lyric URLs, in seconds. | ||
| 17 | download_timeout: int = int(os.getenv("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10")) | 48 | download_timeout: int = int(os.getenv("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10")) |
| 49 | |||
| 50 | # Minimum primary n-gram Jaccard similarity required for automatic duplicate. | ||
| 51 | # Raising this makes automatic duplicate stricter; lowering it may increase false positives. | ||
| 52 | duplicate_jaccard_threshold: float = float(os.getenv("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", "0.78")) | ||
| 53 | |||
| 54 | # Minimum line coverage required for automatic duplicate. | ||
| 55 | # This is the main guard against treating partial lyric fragments as full duplicates. | ||
| 56 | duplicate_line_coverage_threshold: float = float( | ||
| 57 | os.getenv("LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", "0.72") | ||
| 58 | ) | ||
| 59 | |||
| 60 | # Alternate automatic duplicate path: a candidate with lower/normal Jaccard can still be classified as a duplicate when line coverage is very high. | ||
| 61 | # Keep this aligned with duplicate_jaccard_threshold to avoid an unintended duplicate backdoor. | ||
| 62 | duplicate_high_coverage_jaccard_threshold: float = float( | ||
| 63 | os.getenv("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", "0.78") | ||
| 64 | ) | ||
| 65 | |||
| 66 | # Line coverage required by the alternate high-coverage duplicate path. | ||
| 67 | # Raising this makes the alternate duplicate path stricter for near-complete variants. | ||
| 68 | duplicate_high_coverage_line_coverage_threshold: float = float( | ||
| 69 | os.getenv("LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", "0.90") | ||
| 70 | ) | ||
| 71 | |||
| 72 | # Minimum primary/full n-gram Jaccard similarity that can send a candidate to review. | ||
| 73 | # Raising this reduces review volume; lowering it catches weaker suspicious overlaps. | ||
| 74 | review_jaccard_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", "0.45")) | ||
| 75 | |||
| 76 | # Minimum line coverage that can send a candidate to review when query coverage is also material. | ||
| 77 | # Raising this reduces fragment/short-overlap reviews; lowering it increases suspicious recall. | ||
| 78 | review_line_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", "0.35")) | ||
| 79 | |||
| 80 | # Minimum share of query lines that must match before line coverage alone can trigger review. | ||
| 81 | # Raising this makes partial-fragment review stricter. | ||
| 82 | review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")) | ||
| 83 | |||
| 84 | # Very short query lyric line count that can force repeated-chorus overlap into review. | ||
| 85 | # Raising this catches more short chorus-like inputs; lowering it reduces review volume. | ||
| 86 | chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")) | ||
| 87 | |||
| 88 | # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material. | ||
| 89 | # Raising this makes chorus-only review stricter. | ||
| 90 | chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")) | ||
| 91 | |||
| 92 | # Minimum query-side coverage for repeated-chorus overlap to be considered material. | ||
| 93 | # Raising this reduces review decisions caused by small shared chorus fragments. | ||
| 94 | chorus_material_query_coverage_threshold: float = float( | ||
| 95 | os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40") | ||
| 96 | ) | ||
| 97 | |||
| 98 | # Weight assigned to primary n-gram Jaccard when computing confidence. | ||
| 99 | # This affects the reported confidence score, not the duplicate/review threshold checks directly. | ||
| 100 | confidence_jaccard_weight: float = float(os.getenv("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", "0.58")) | ||
| 101 | |||
| 102 | # Weight assigned to primary line coverage when computing confidence. | ||
| 103 | # Keep this coordinated with confidence_jaccard_weight; defaults sum to 1.0. | ||
| 104 | confidence_line_coverage_weight: float = float(os.getenv("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", "0.42")) | ... | ... |
| ... | @@ -189,10 +189,25 @@ class DedupService: | ... | @@ -189,10 +189,25 @@ class DedupService: |
| 189 | candidates: list[LyricRecord], | 189 | candidates: list[LyricRecord], |
| 190 | ) -> CheckResult: | 190 | ) -> CheckResult: |
| 191 | """Run DuplicateChecker against recalled candidates.""" | 191 | """Run DuplicateChecker against recalled candidates.""" |
| 192 | checker = DuplicateChecker() | 192 | checker = DuplicateChecker( |
| 193 | for candidate in candidates: | 193 | duplicate_jaccard_threshold=self.config.duplicate_jaccard_threshold, |
| 194 | checker.add_record(candidate) | 194 | duplicate_line_coverage_threshold=self.config.duplicate_line_coverage_threshold, |
| 195 | result = checker.check_record(record, max_candidates=self.config.max_candidates) | 195 | duplicate_high_coverage_jaccard_threshold=self.config.duplicate_high_coverage_jaccard_threshold, |
| 196 | duplicate_high_coverage_line_coverage_threshold=self.config.duplicate_high_coverage_line_coverage_threshold, | ||
| 197 | review_jaccard_threshold=self.config.review_jaccard_threshold, | ||
| 198 | review_line_coverage_threshold=self.config.review_line_coverage_threshold, | ||
| 199 | review_query_coverage_threshold=self.config.review_query_coverage_threshold, | ||
| 200 | chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold, | ||
| 201 | chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold, | ||
| 202 | chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold, | ||
| 203 | confidence_jaccard_weight=self.config.confidence_jaccard_weight, | ||
| 204 | confidence_line_coverage_weight=self.config.confidence_line_coverage_weight, | ||
| 205 | ) | ||
| 206 | result = checker.check_record_against_candidates( | ||
| 207 | record, | ||
| 208 | candidates, | ||
| 209 | max_candidates=self.config.max_candidates, | ||
| 210 | ) | ||
| 196 | return CheckResult( | 211 | return CheckResult( |
| 197 | duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW), | 212 | duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW), |
| 198 | decision=result.decision.value, | 213 | decision=result.decision.value, | ... | ... |
| ... | @@ -3,6 +3,7 @@ pytest>=8.0 | ... | @@ -3,6 +3,7 @@ pytest>=8.0 |
| 3 | 3 | ||
| 4 | # PostgreSQL storage prototype | 4 | # PostgreSQL storage prototype |
| 5 | psycopg[binary]>=3.2 | 5 | psycopg[binary]>=3.2 |
| 6 | OpenCC>=1.3.1 | ||
| 6 | 7 | ||
| 7 | # Existing MySQL/COS lyric download utilities | 8 | # Existing MySQL/COS lyric download utilities |
| 8 | pymysql>=1.1 | 9 | pymysql>=1.1 | ... | ... |
| ... | @@ -249,9 +249,7 @@ def _check_against_candidates( | ... | @@ -249,9 +249,7 @@ def _check_against_candidates( |
| 249 | max_candidates: int, | 249 | max_candidates: int, |
| 250 | ): | 250 | ): |
| 251 | checker = DuplicateChecker() | 251 | checker = DuplicateChecker() |
| 252 | for candidate in candidates: | 252 | return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates) |
| 253 | checker.add_record(candidate) | ||
| 254 | return checker.check_record(record, max_candidates=max_candidates) | ||
| 255 | 253 | ||
| 256 | 254 | ||
| 257 | def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: | 255 | def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: | ... | ... |
scripts/process_library.py
deleted
100644 → 0
This diff is collapsed.
Click to expand it.
test_api/dedup_samples/README.md
0 → 100644
| 1 | # Lyric Dedup Sample Set | ||
| 2 | |||
| 3 | 基准歌词: `test_api/test_lyric.txt` | ||
| 4 | |||
| 5 | 这些样本用于检查当前去重系统的两类行为: | ||
| 6 | |||
| 7 | - `positive_*`: 应被判定为与基准歌词重复或高度重复。 | ||
| 8 | - `negative_*`: 不应被判定为重复,用于检查主题、关键词或风格相似时的误杀。 | ||
| 9 | |||
| 10 | ## 样本说明 | ||
| 11 | |||
| 12 | | 文件 | 期望 | 测试点 | | ||
| 13 | | --- | --- | --- | | ||
| 14 | | `positive_01_format_spacing_punctuation_duplicate.txt` | 去重命中 | 去掉标题/分隔线、改变空行、弱化标点后的同文变体 | | ||
| 15 | | `positive_02_minor_wording_typos_duplicate.txt` | 去重命中 | 少量错字、近义词、语序微调后的近重复 | | ||
| 16 | | `positive_03_section_order_shift_duplicate.txt` | 去重命中 | 段落顺序变化但核心文本大量重合 | | ||
| 17 | | `positive_04_partial_core_chorus_duplicate.txt` | 去重命中 | 只提交核心副歌/高潮片段时的局部重复检测 | | ||
| 18 | | `negative_01_same_theme_new_lyrics_not_duplicate.txt` | 不应命中 | 同样是凌晨、长安、雪、追梦,但逐句原创 | | ||
| 19 | | `negative_02_same_keywords_different_scene_not_duplicate.txt` | 不应命中 | 复用高频关键词,叙事场景和句法明显不同 | | ||
| 20 | | `negative_03_style_similar_low_overlap_not_duplicate.txt` | 不应命中 | 国风+Rap+都市融合风格相似,但文本低重合 | | ||
| 21 | | `negative_04_common_hook_phrases_not_duplicate.txt` | 不应命中 | 只含常见短语/意象,防止短文本公共表达误杀 | | ||
| 22 |
This diff is collapsed.
Click to expand it.
-
Please register or sign in to post a comment