更新测试脚本
Showing 10 changed files with 211 additions and 124 deletions
| ... | @@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \ | ... | @@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \ |
| 105 | 105 | ||
| 106 | ```text | 106 | ```text |
| 107 | decision duplicate / review / new | 107 | decision duplicate / review / new |
| 108 | duplicate duplicate 或 review 时为 true,new 时为 false | 108 | duplicate 仅 decision=duplicate 时为 true,review/new 时为 false |
| 109 | confidence 当前判定置信度 | 109 | confidence 当前判定置信度 |
| 110 | reason 中文判定原因 | 110 | reason 中文判定原因 |
| 111 | candidate_count 参与最终排序的候选数 | 111 | candidate_count 参与最终排序的候选数 | ... | ... |
| ... | @@ -75,6 +75,9 @@ class DuplicateChecker: | ... | @@ -75,6 +75,9 @@ class DuplicateChecker: |
| 75 | review_jaccard_threshold: float = 0.45, | 75 | review_jaccard_threshold: float = 0.45, |
| 76 | review_line_coverage_threshold: float = 0.35, | 76 | review_line_coverage_threshold: float = 0.35, |
| 77 | review_query_coverage_threshold: float = 0.40, | 77 | review_query_coverage_threshold: float = 0.40, |
| 78 | fragment_query_coverage_threshold: float = 0.80, | ||
| 79 | fragment_max_line_ratio: float = 0.75, | ||
| 80 | fragment_min_matched_lines: int = 3, | ||
| 78 | chorus_short_line_count_threshold: int = 6, | 81 | chorus_short_line_count_threshold: int = 6, |
| 79 | chorus_material_overlap_threshold: float = 0.20, | 82 | chorus_material_overlap_threshold: float = 0.20, |
| 80 | chorus_material_query_coverage_threshold: float = 0.40, | 83 | chorus_material_query_coverage_threshold: float = 0.40, |
| ... | @@ -88,6 +91,9 @@ class DuplicateChecker: | ... | @@ -88,6 +91,9 @@ class DuplicateChecker: |
| 88 | self.review_jaccard_threshold = review_jaccard_threshold | 91 | self.review_jaccard_threshold = review_jaccard_threshold |
| 89 | self.review_line_coverage_threshold = review_line_coverage_threshold | 92 | self.review_line_coverage_threshold = review_line_coverage_threshold |
| 90 | self.review_query_coverage_threshold = review_query_coverage_threshold | 93 | self.review_query_coverage_threshold = review_query_coverage_threshold |
| 94 | self.fragment_query_coverage_threshold = fragment_query_coverage_threshold | ||
| 95 | self.fragment_max_line_ratio = fragment_max_line_ratio | ||
| 96 | self.fragment_min_matched_lines = fragment_min_matched_lines | ||
| 91 | self.chorus_short_line_count_threshold = chorus_short_line_count_threshold | 97 | self.chorus_short_line_count_threshold = chorus_short_line_count_threshold |
| 92 | self.chorus_material_overlap_threshold = chorus_material_overlap_threshold | 98 | self.chorus_material_overlap_threshold = chorus_material_overlap_threshold |
| 93 | self.chorus_material_query_coverage_threshold = chorus_material_query_coverage_threshold | 99 | self.chorus_material_query_coverage_threshold = chorus_material_query_coverage_threshold |
| ... | @@ -237,6 +243,14 @@ class DuplicateChecker: | ... | @@ -237,6 +243,14 @@ class DuplicateChecker: |
| 237 | query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" | 243 | query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" |
| 238 | ) | 244 | ) |
| 239 | query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines) | 245 | query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines) |
| 246 | is_plain_fragment = _is_plain_fragment( | ||
| 247 | query.normalized.primary_lines, | ||
| 248 | candidate.normalized.primary_lines, | ||
| 249 | primary_matched_lines, | ||
| 250 | min_query_coverage=self.fragment_query_coverage_threshold, | ||
| 251 | max_line_ratio=self.fragment_max_line_ratio, | ||
| 252 | min_matched_lines=self.fragment_min_matched_lines, | ||
| 253 | ) | ||
| 240 | has_review_level_overlap = ( | 254 | has_review_level_overlap = ( |
| 241 | primary_jaccard >= self.review_jaccard_threshold | 255 | primary_jaccard >= self.review_jaccard_threshold |
| 242 | or jaccard >= self.review_jaccard_threshold | 256 | or jaccard >= self.review_jaccard_threshold |
| ... | @@ -275,7 +289,10 @@ class DuplicateChecker: | ... | @@ -275,7 +289,10 @@ class DuplicateChecker: |
| 275 | + (self.confidence_line_coverage_weight * primary_coverage), | 289 | + (self.confidence_line_coverage_weight * primary_coverage), |
| 276 | 4, | 290 | 4, |
| 277 | ) | 291 | ) |
| 278 | if ( | 292 | if is_plain_fragment: |
| 293 | decision = DuplicateDecision.NEW | ||
| 294 | reason = "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理" | ||
| 295 | elif ( | ||
| 279 | ( | 296 | ( |
| 280 | primary_jaccard >= self.duplicate_jaccard_threshold | 297 | primary_jaccard >= self.duplicate_jaccard_threshold |
| 281 | or ( | 298 | or ( |
| ... | @@ -293,17 +310,17 @@ class DuplicateChecker: | ... | @@ -293,17 +310,17 @@ class DuplicateChecker: |
| 293 | reason = "原文歌词高度一致,翻译行未参与自动判重" | 310 | reason = "原文歌词高度一致,翻译行未参与自动判重" |
| 294 | else: | 311 | else: |
| 295 | reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" | 312 | reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" |
| 313 | elif has_material_chorus_overlap: | ||
| 314 | decision = DuplicateDecision.NEW | ||
| 315 | reason = "重合内容主要集中在重复副歌行,按片段歌词处理" | ||
| 296 | elif ( | 316 | elif ( |
| 297 | has_material_chorus_overlap | 317 | translation_only |
| 298 | or translation_only | ||
| 299 | or has_low_confidence_split_overlap | 318 | or has_low_confidence_split_overlap |
| 300 | or has_review_level_overlap | 319 | or has_review_level_overlap |
| 301 | ): | 320 | ): |
| 302 | decision = DuplicateDecision.REVIEW | 321 | decision = DuplicateDecision.REVIEW |
| 303 | reason = "候选相似度达到复核阈值,需要人工确认" | 322 | reason = "候选相似度达到复核阈值,需要人工确认" |
| 304 | if has_material_chorus_overlap: | 323 | if translation_only: |
| 305 | reason = "重合内容主要集中在重复副歌行,不自动判重" | ||
| 306 | elif translation_only: | ||
| 307 | reason = "仅翻译行相似,原文字面重合不足,不自动判重" | 324 | reason = "仅翻译行相似,原文字面重合不足,不自动判重" |
| 308 | elif has_low_confidence_split_overlap: | 325 | elif has_low_confidence_split_overlap: |
| 309 | reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" | 326 | reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" |
| ... | @@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[ | ... | @@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[ |
| 430 | return len(set(matched_lines)) / len(query_unique_lines) | 447 | return len(set(matched_lines)) / len(query_unique_lines) |
| 431 | 448 | ||
| 432 | 449 | ||
| 450 | def _is_plain_fragment( | ||
| 451 | query_lines: tuple[str, ...], | ||
| 452 | candidate_lines: tuple[str, ...], | ||
| 453 | matched_lines: list[str], | ||
| 454 | *, | ||
| 455 | min_query_coverage: float, | ||
| 456 | max_line_ratio: float, | ||
| 457 | min_matched_lines: int, | ||
| 458 | ) -> bool: | ||
| 459 | query_unique_lines = set(query_lines) | ||
| 460 | candidate_unique_lines = set(candidate_lines) | ||
| 461 | matched_unique_lines = set(matched_lines) | ||
| 462 | if not query_unique_lines or not candidate_unique_lines: | ||
| 463 | return False | ||
| 464 | if len(matched_unique_lines) < min_matched_lines: | ||
| 465 | return False | ||
| 466 | line_ratio = len(query_unique_lines) / len(candidate_unique_lines) | ||
| 467 | query_coverage = len(matched_unique_lines) / len(query_unique_lines) | ||
| 468 | return line_ratio <= max_line_ratio and query_coverage >= min_query_coverage | ||
| 469 | |||
| 470 | |||
| 433 | def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: | 471 | def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: |
| 434 | if not matched_lines: | 472 | if not matched_lines: |
| 435 | return False | 473 | return False | ... | ... |
| ... | @@ -50,6 +50,7 @@ class CheckResponse(BaseModel): | ... | @@ -50,6 +50,7 @@ class CheckResponse(BaseModel): |
| 50 | decision: str | None = None | 50 | decision: str | None = None |
| 51 | confidence: float | None = None | 51 | confidence: float | None = None |
| 52 | reason: str | None = None | 52 | reason: str | None = None |
| 53 | record_ids: list[str] = [] | ||
| 53 | 54 | ||
| 54 | 55 | ||
| 55 | class HealthResponse(BaseModel): | 56 | class HealthResponse(BaseModel): |
| ... | @@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any: | ... | @@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any: |
| 108 | decision=result.decision, | 109 | decision=result.decision, |
| 109 | confidence=result.confidence, | 110 | confidence=result.confidence, |
| 110 | reason=result.reason, | 111 | reason=result.reason, |
| 112 | record_ids=result.record_ids, | ||
| 111 | ) | 113 | ) |
| 112 | 114 | ||
| 113 | 115 | ... | ... |
| ... | @@ -81,16 +81,28 @@ class ServerConfig: | ... | @@ -81,16 +81,28 @@ class ServerConfig: |
| 81 | # Raising this makes partial-fragment review stricter. | 81 | # Raising this makes partial-fragment review stricter. |
| 82 | review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")) | 82 | review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")) |
| 83 | 83 | ||
| 84 | # Very short query lyric line count that can force repeated-chorus overlap into review. | 84 | # Plain fragment guard: query-side match ratio required to treat the input as a lyric fragment. |
| 85 | # Raising this catches more short chorus-like inputs; lowering it reduces review volume. | 85 | # When this is met together with fragment_max_line_ratio, the result is new instead of review/duplicate. |
| 86 | fragment_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_QUERY_COVERAGE_THRESHOLD", "0.80")) | ||
| 87 | |||
| 88 | # Plain fragment guard: maximum query/candidate line-count ratio still considered a fragment. | ||
| 89 | # Lower values protect only shorter fragments; higher values treat longer partial uploads as new. | ||
| 90 | fragment_max_line_ratio: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_MAX_LINE_RATIO", "0.75")) | ||
| 91 | |||
| 92 | # Plain fragment guard: minimum matched unique lyric lines before fragment protection can apply. | ||
| 93 | # This avoids classifying tiny common phrases as meaningful fragments. | ||
| 94 | fragment_min_matched_lines: int = int(os.getenv("LYRIC_DEDUP_FRAGMENT_MIN_MATCHED_LINES", "3")) | ||
| 95 | |||
| 96 | # Very short query lyric line count that can force repeated-chorus overlap into fragment protection. | ||
| 97 | # Matches protected by this path return new instead of duplicate/review. | ||
| 86 | chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")) | 98 | chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")) |
| 87 | 99 | ||
| 88 | # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material. | 100 | # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material. |
| 89 | # Raising this makes chorus-only review stricter. | 101 | # Raising this makes chorus-only fragment protection stricter. |
| 90 | chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")) | 102 | chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")) |
| 91 | 103 | ||
| 92 | # Minimum query-side coverage for repeated-chorus overlap to be considered material. | 104 | # Minimum query-side coverage for repeated-chorus overlap to be considered material. |
| 93 | # Raising this reduces review decisions caused by small shared chorus fragments. | 105 | # Raising this reduces fragment protection caused by small shared chorus fragments. |
| 94 | chorus_material_query_coverage_threshold: float = float( | 106 | chorus_material_query_coverage_threshold: float = float( |
| 95 | os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40") | 107 | os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40") |
| 96 | ) | 108 | ) | ... | ... |
| ... | @@ -27,6 +27,7 @@ class CheckResult: | ... | @@ -27,6 +27,7 @@ class CheckResult: |
| 27 | confidence: float = 0.0 | 27 | confidence: float = 0.0 |
| 28 | reason: str = "" | 28 | reason: str = "" |
| 29 | candidate_count: int = 0 | 29 | candidate_count: int = 0 |
| 30 | record_ids: list[str] = field(default_factory=list) | ||
| 30 | 31 | ||
| 31 | 32 | ||
| 32 | @dataclass | 33 | @dataclass |
| ... | @@ -197,6 +198,9 @@ class DedupService: | ... | @@ -197,6 +198,9 @@ class DedupService: |
| 197 | review_jaccard_threshold=self.config.review_jaccard_threshold, | 198 | review_jaccard_threshold=self.config.review_jaccard_threshold, |
| 198 | review_line_coverage_threshold=self.config.review_line_coverage_threshold, | 199 | review_line_coverage_threshold=self.config.review_line_coverage_threshold, |
| 199 | review_query_coverage_threshold=self.config.review_query_coverage_threshold, | 200 | review_query_coverage_threshold=self.config.review_query_coverage_threshold, |
| 201 | fragment_query_coverage_threshold=self.config.fragment_query_coverage_threshold, | ||
| 202 | fragment_max_line_ratio=self.config.fragment_max_line_ratio, | ||
| 203 | fragment_min_matched_lines=self.config.fragment_min_matched_lines, | ||
| 200 | chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold, | 204 | chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold, |
| 201 | chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold, | 205 | chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold, |
| 202 | chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold, | 206 | chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold, |
| ... | @@ -208,12 +212,18 @@ class DedupService: | ... | @@ -208,12 +212,18 @@ class DedupService: |
| 208 | candidates, | 212 | candidates, |
| 209 | max_candidates=self.config.max_candidates, | 213 | max_candidates=self.config.max_candidates, |
| 210 | ) | 214 | ) |
| 215 | # 收集 duplicate/review 决策下的候选 record_id | ||
| 216 | matched_ids = [ | ||
| 217 | c.record_id for c in result.candidates | ||
| 218 | if c.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW) | ||
| 219 | ] | ||
| 211 | return CheckResult( | 220 | return CheckResult( |
| 212 | duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW), | 221 | duplicate=result.decision == DuplicateDecision.DUPLICATE, |
| 213 | decision=result.decision.value, | 222 | decision=result.decision.value, |
| 214 | confidence=result.confidence, | 223 | confidence=result.confidence, |
| 215 | reason=result.reason, | 224 | reason=result.reason, |
| 216 | candidate_count=len(result.candidates), | 225 | candidate_count=len(result.candidates), |
| 226 | record_ids=matched_ids, | ||
| 217 | ) | 227 | ) |
| 218 | 228 | ||
| 219 | 229 | ... | ... |
| ... | @@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file | ... | @@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file |
| 22 | from lyric_dedup.file_import import record_from_file | 22 | from lyric_dedup.file_import import record_from_file |
| 23 | from lyric_dedup.normalization import fingerprint_text | 23 | from lyric_dedup.normalization import fingerprint_text |
| 24 | from lyric_dedup.normalization import normalize_lyrics | 24 | from lyric_dedup.normalization import normalize_lyrics |
| 25 | from lyric_dedup_server.config import ServerConfig | ||
| 25 | 26 | ||
| 26 | 27 | ||
| 27 | def main() -> None: | 28 | def main() -> None: |
| 28 | parser = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.") | 29 | parser = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.") |
| 29 | parser.add_argument("--dsn", required=True) | ||
| 30 | parser.add_argument("--csv", required=True) | 30 | parser.add_argument("--csv", required=True) |
| 31 | parser.add_argument("--out", required=True) | 31 | parser.add_argument("--out", required=True) |
| 32 | parser.add_argument("--base-dir", default="") | 32 | parser.add_argument("--base-dir", default="") |
| 33 | parser.add_argument("--positive-decisions", default="duplicate") | ||
| 34 | parser.add_argument("--max-candidates", type=int, default=5) | ||
| 35 | parser.add_argument("--recall-limit", type=int, default=100) | ||
| 36 | parser.add_argument("--enable-trgm", action="store_true", help="Enable pg_trgm full-text recall. Slower; exact + line recall is used by default.") | ||
| 37 | parser.add_argument("--trgm-threshold", type=float, default=0.3) | ||
| 38 | parser.add_argument("--statement-timeout-ms", type=int, default=5000) | ||
| 39 | parser.add_argument("--profile-every", type=int, default=100) | 33 | parser.add_argument("--profile-every", type=int, default=100) |
| 40 | args = parser.parse_args() | 34 | args = parser.parse_args() |
| 41 | 35 | ||
| 42 | psycopg = _import_psycopg() | 36 | psycopg = _import_psycopg() |
| 37 | config = ServerConfig() | ||
| 43 | csv_path = Path(args.csv) | 38 | csv_path = Path(args.csv) |
| 44 | out_path = Path(args.out) | 39 | out_path = Path(args.out) |
| 45 | base_dir = Path(args.base_dir) if args.base_dir else None | 40 | base_dir = Path(args.base_dir) if args.base_dir else None |
| 46 | positive_decisions = {item.strip() for item in args.positive_decisions.split(",") if item.strip()} | 41 | positive_decisions = {"duplicate"} |
| 47 | 42 | ||
| 48 | total = _csv_data_row_count(csv_path) | 43 | total = _csv_data_row_count(csv_path) |
| 49 | rows: list[dict[str, object]] = [] | 44 | rows: list[dict[str, object]] = [] |
| 50 | profile_stats = _new_profile_stats() | 45 | profile_stats = _new_profile_stats() |
| 51 | out_path.parent.mkdir(parents=True, exist_ok=True) | 46 | out_path.parent.mkdir(parents=True, exist_ok=True) |
| 52 | _progress(f"evaluate postgres csv: 0/{total}") | 47 | _progress(f"evaluate postgres csv: 0/{total}") |
| 53 | with psycopg.connect(args.dsn) as conn: | 48 | with psycopg.connect(config.dsn) as conn: |
| 54 | with conn.cursor() as cursor: | 49 | with conn.cursor() as cursor: |
| 55 | cursor.execute("select set_config('statement_timeout', %s, false)", (str(args.statement_timeout_ms),)) | 50 | cursor.execute("select set_config('statement_timeout', %s, false)", (str(config.statement_timeout_ms),)) |
| 56 | cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(args.trgm_threshold),)) | 51 | cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(config.trgm_threshold),)) |
| 57 | with csv_path.open(encoding="utf-8-sig", newline="") as in_file, out_path.open( | 52 | with csv_path.open(encoding="utf-8-sig", newline="") as in_file, out_path.open( |
| 58 | "w", encoding="utf-8", newline="" | 53 | "w", encoding="utf-8", newline="" |
| 59 | ) as out_file: | 54 | ) as out_file: |
| ... | @@ -70,9 +65,7 @@ def main() -> None: | ... | @@ -70,9 +65,7 @@ def main() -> None: |
| 70 | csv_path=csv_path, | 65 | csv_path=csv_path, |
| 71 | base_dir=base_dir, | 66 | base_dir=base_dir, |
| 72 | positive_decisions=positive_decisions, | 67 | positive_decisions=positive_decisions, |
| 73 | max_candidates=args.max_candidates, | 68 | config=config, |
| 74 | recall_limit=args.recall_limit, | ||
| 75 | enable_trgm=args.enable_trgm, | ||
| 76 | ) | 69 | ) |
| 77 | rows.append(row_out) | 70 | rows.append(row_out) |
| 78 | writer.writerow(row_out) | 71 | writer.writerow(row_out) |
| ... | @@ -96,9 +89,7 @@ def _evaluate_row( | ... | @@ -96,9 +89,7 @@ def _evaluate_row( |
| 96 | csv_path: Path, | 89 | csv_path: Path, |
| 97 | base_dir: Path | None, | 90 | base_dir: Path | None, |
| 98 | positive_decisions: set[str], | 91 | positive_decisions: set[str], |
| 99 | max_candidates: int, | 92 | config: ServerConfig, |
| 100 | recall_limit: int, | ||
| 101 | enable_trgm: bool, | ||
| 102 | ) -> dict[str, object]: | 93 | ) -> dict[str, object]: |
| 103 | parse_started = time.perf_counter() | 94 | parse_started = time.perf_counter() |
| 104 | sample_id = row.get("id") or row.get("sample_id") or str(row_number) | 95 | sample_id = row.get("id") or row.get("sample_id") or str(row_number) |
| ... | @@ -108,12 +99,12 @@ def _evaluate_row( | ... | @@ -108,12 +99,12 @@ def _evaluate_row( |
| 108 | candidates, timings = _recall_candidates( | 99 | candidates, timings = _recall_candidates( |
| 109 | conn, | 100 | conn, |
| 110 | record, | 101 | record, |
| 111 | recall_limit=recall_limit, | 102 | recall_limit=config.recall_limit, |
| 112 | enable_trgm=enable_trgm, | 103 | enable_trgm=config.enable_trgm, |
| 113 | exclude_record_ids=_exclude_record_ids_for_eval_row(row), | 104 | exclude_record_ids=_exclude_record_ids_for_eval_row(row), |
| 114 | ) | 105 | ) |
| 115 | rank_started = time.perf_counter() | 106 | rank_started = time.perf_counter() |
| 116 | result = _check_against_candidates(record, candidates, max_candidates=max_candidates) | 107 | result = _check_against_candidates(record, candidates, config=config) |
| 117 | rank_ms = round((time.perf_counter() - rank_started) * 1000, 2) | 108 | rank_ms = round((time.perf_counter() - rank_started) * 1000, 2) |
| 118 | recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2) | 109 | recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2) |
| 119 | predicted_duplicate = result.decision.value in positive_decisions | 110 | predicted_duplicate = result.decision.value in positive_decisions |
| ... | @@ -127,7 +118,7 @@ def _evaluate_row( | ... | @@ -127,7 +118,7 @@ def _evaluate_row( |
| 127 | "correct": expected_duplicate == predicted_duplicate, | 118 | "correct": expected_duplicate == predicted_duplicate, |
| 128 | "confidence": result.confidence, | 119 | "confidence": result.confidence, |
| 129 | "reason": result.reason, | 120 | "reason": result.reason, |
| 130 | "candidate_count": len(candidates), | 121 | "candidate_count": len(result.candidates), |
| 131 | "parse_ms": parse_ms, | 122 | "parse_ms": parse_ms, |
| 132 | "recall_ms": recall_ms, | 123 | "recall_ms": recall_ms, |
| 133 | "exact_ms": timings["exact_ms"], | 124 | "exact_ms": timings["exact_ms"], |
| ... | @@ -246,10 +237,26 @@ def _check_against_candidates( | ... | @@ -246,10 +237,26 @@ def _check_against_candidates( |
| 246 | record: LyricRecord, | 237 | record: LyricRecord, |
| 247 | candidates: list[LyricRecord], | 238 | candidates: list[LyricRecord], |
| 248 | *, | 239 | *, |
| 249 | max_candidates: int, | 240 | config: ServerConfig, |
| 250 | ): | 241 | ): |
| 251 | checker = DuplicateChecker() | 242 | checker = DuplicateChecker( |
| 252 | return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates) | 243 | duplicate_jaccard_threshold=config.duplicate_jaccard_threshold, |
| 244 | duplicate_line_coverage_threshold=config.duplicate_line_coverage_threshold, | ||
| 245 | duplicate_high_coverage_jaccard_threshold=config.duplicate_high_coverage_jaccard_threshold, | ||
| 246 | duplicate_high_coverage_line_coverage_threshold=config.duplicate_high_coverage_line_coverage_threshold, | ||
| 247 | review_jaccard_threshold=config.review_jaccard_threshold, | ||
| 248 | review_line_coverage_threshold=config.review_line_coverage_threshold, | ||
| 249 | review_query_coverage_threshold=config.review_query_coverage_threshold, | ||
| 250 | fragment_query_coverage_threshold=config.fragment_query_coverage_threshold, | ||
| 251 | fragment_max_line_ratio=config.fragment_max_line_ratio, | ||
| 252 | fragment_min_matched_lines=config.fragment_min_matched_lines, | ||
| 253 | chorus_short_line_count_threshold=config.chorus_short_line_count_threshold, | ||
| 254 | chorus_material_overlap_threshold=config.chorus_material_overlap_threshold, | ||
| 255 | chorus_material_query_coverage_threshold=config.chorus_material_query_coverage_threshold, | ||
| 256 | confidence_jaccard_weight=config.confidence_jaccard_weight, | ||
| 257 | confidence_line_coverage_weight=config.confidence_line_coverage_weight, | ||
| 258 | ) | ||
| 259 | return checker.check_record_against_candidates(record, candidates, max_candidates=config.max_candidates) | ||
| 253 | 260 | ||
| 254 | 261 | ||
| 255 | def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: | 262 | def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: | ... | ... |
| ... | @@ -110,6 +110,7 @@ def main(): | ... | @@ -110,6 +110,7 @@ def main(): |
| 110 | print(f" decision: {result.get('decision', 'N/A')}") | 110 | print(f" decision: {result.get('decision', 'N/A')}") |
| 111 | print(f" confidence: {result.get('confidence', 'N/A')}") | 111 | print(f" confidence: {result.get('confidence', 'N/A')}") |
| 112 | print(f" reason: {result.get('reason', 'N/A')}") | 112 | print(f" reason: {result.get('reason', 'N/A')}") |
| 113 | print(f" record_ids: {result.get('record_ids', [])}") | ||
| 113 | 114 | ||
| 114 | 115 | ||
| 115 | if __name__ == "__main__": | 116 | if __name__ == "__main__": | ... | ... |
| 1 | ## 消失的波段 | ||
| 2 | |||
| 3 | ### 【主歌 1】 — *(压抑、低沉的叙事)* | ||
| 4 | |||
| 5 | 霓虹灯……在车窗外退后, | ||
| 6 | 霓虹——和夜色融为一体。 | ||
| 7 | 收音机里,只剩沙沙的电流…… | ||
| 8 | (像你在旧地址留下的呼吸……) | ||
| 9 | 有些习惯……总是很难去修正, | ||
| 10 | 比如——在人群中,辨认你的背影。 | ||
| 11 | |||
| 12 | ### 【主歌 2】 — *(情绪渐进,带有一丝无奈)* | ||
| 13 | |||
| 14 | 朋友圈里……你更新了风景, | ||
| 15 | 坐标是——没听过的、陌、生、城、市。 | ||
| 16 | 我们从无话不说……退回到【静音】, | ||
| 17 | 像两条失去交集的——平行线。 | ||
| 18 | 那些没有寄出的长信…… | ||
| 19 | 最后都变成,草稿箱里的——灰、尘。 | ||
| 20 | |||
| 21 | ### 【副歌】 —— *(情感爆发,高亢而撕裂)* | ||
| 22 | |||
| 23 | 我们成了彼此消 逝 的 波 段 !! | ||
| 24 | 在同一个频段……却再也无法呼喊! | ||
| 25 | 那些同频共振的夜晚…… | ||
| 26 | 最终被淹没在——嘈杂的市中心!! | ||
| 27 | 我调整着微弱的接收信号…… | ||
| 28 | 却只听到——时光断裂的声音!!! | ||
| 29 | |||
| 30 | ### 【桥段】 —— *(节奏加快,连续的内心追问)* | ||
| 31 | |||
| 32 | 是不是所有的连接……都有保质期?! | ||
| 33 | 到期后……就自动切断了所有联系?! | ||
| 34 | 我们在各自的轨道里——加!速!运!行! | ||
| 35 | 再也找不到……那天傍晚的引力。 | ||
| 36 | |||
| 37 | ### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)* | ||
| 38 | |||
| 39 | 我们成了彼此消 逝 的 波 段 ——!! | ||
| 40 | 在同一个频段……却再也无法呼喊! | ||
| 41 | 那些同频共振的夜晚…… | ||
| 42 | 最终被淹没在——嘈杂的市中心!! | ||
| 43 | 我调整着微弱的接收信号…… | ||
| 44 | 却只听到……(时光断裂的声音)…… | ||
| 45 | |||
| 46 | ### 【尾奏】 —— *(情绪下沉,最终归于死寂)* | ||
| 47 | |||
| 48 | 【信号中断……请勿追赶。】 | ||
| 49 | 城市入睡……灯光渐暗…… | ||
| 50 | 一个人的波段。 | ||
| 51 | (查……无……此……人……) | ||
| 52 | 【 挂 断 。】 | ||
| 53 | ### 副歌 | ||
| 54 | |||
| 55 | 我们成了彼此消失的波段 | ||
| 56 | 在同一个频段却再也无法呼喊 | ||
| 57 | 那些同频共振的夜晚 | ||
| 58 | 最终被淹没在嘈杂的市中心 | ||
| 59 | 我调整着微弱的接收信号 | ||
| 60 | 却只听到时光断裂的声音 | ||
| 61 | |||
| 62 | ### 桥段 | ||
| 63 | |||
| 64 | 是不是所有的连接都有保质期 | ||
| 65 | 到期后就自动切断了所有联系 | ||
| 66 | 我们在各自的轨道里加速运行 | ||
| 67 | 再也找不到那天傍晚的引力 | ||
| 68 | |||
| 69 | ### 副歌 | ||
| 70 | |||
| 71 | 我们成了彼此消失的波段 | ||
| 72 | 在同一个频段却再也无法呼喊 | ||
| 73 | 那些同频共振的夜晚 | ||
| 74 | 最终被淹没在嘈杂的市中心 | ||
| 75 | 我调整着微弱的接收信号 | ||
| 76 | 却只听到时光断裂的声音 | ||
| 77 | |||
| 78 | ### 尾奏 | ||
| 79 | |||
| 80 | 信号中断,请勿追赶 | ||
| 81 | 城市入睡,灯光渐暗 | ||
| 82 | 一个人的波段 | ||
| 83 | 查无此人 | ||
| 84 | 挂断 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | 歌曲题目:《星空大冒险》 | ||
| 2 | |||
| 3 | 【主歌 2】 | ||
| 4 | 小兔子,在划船, | ||
| 5 | 它的浆是胡萝卜。 | ||
| 6 | 小熊坐在树枝上, | ||
| 7 | 正把蜂蜜涂面包。 | ||
| 8 | 风儿吹过坏脾气, | ||
| 9 | 在这儿变成甜泡泡。 | ||
| 10 | 没有作业和烦恼, | ||
| 11 | 大家都在哈哈笑。 | ||
| 12 | |||
| 13 | 【副歌 2】 | ||
| 14 | 飞呀飞,飞向大月亮, | ||
| 15 | 月亮像个大香蕉,挂在夜空上。 | ||
| 16 | 摇呀摇,摇到银河旁, | ||
| 17 | 捞起一颗小星星,放在手心里亮。 | ||
| 18 | |||
| 19 | 【桥段(Bridge)】 | ||
| 20 | (节奏放慢,变温柔) | ||
| 21 | 天上的城堡亮晶晶, | ||
| 22 | 那是梦里的风景。 | ||
| 23 | 玩累的小孩要睡了, | ||
| 24 | 听一听,风的呼吸。 | ||
| 25 | 呼——噜——呼——噜—— | ||
| 26 | 做个好梦到天明。 | ||
| 27 | |||
| 28 | 【副歌 3】 | ||
| 29 | (节奏恢复,渐弱结束) | ||
| 30 | 飞呀飞,飞向大月亮, | ||
| 31 | 月亮像个大香蕉,挂在夜空上。 | ||
| 32 | 摇呀摇,摇到银河旁, | ||
| 33 | 捞起一颗小星星, | ||
| 34 | 抱在怀里……睡着啦。 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord | ... | @@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord |
| 7 | from lyric_dedup.eval_dataset import generate_eval_set | 7 | from lyric_dedup.eval_dataset import generate_eval_set |
| 8 | from lyric_dedup.file_import import record_from_file | 8 | from lyric_dedup.file_import import record_from_file |
| 9 | from lyric_dedup.normalization import normalize_lyrics | 9 | from lyric_dedup.normalization import normalize_lyrics |
| 10 | from lyric_dedup_server.config import ServerConfig | ||
| 11 | from lyric_dedup_server.service import DedupService | ||
| 10 | 12 | ||
| 11 | 13 | ||
| 12 | BASE_LYRIC = """ | 14 | BASE_LYRIC = """ |
| ... | @@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c | ... | @@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c |
| 55 | assert result.candidates[0].record_id == "song-1" | 57 | assert result.candidates[0].record_id == "song-1" |
| 56 | 58 | ||
| 57 | 59 | ||
| 58 | def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None: | 60 | def test_short_shared_repeated_chorus_is_new_fragment() -> None: |
| 59 | result = check_against( | 61 | result = check_against( |
| 60 | [ | 62 | [ |
| 61 | LyricRecord( | 63 | LyricRecord( |
| ... | @@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None: | ... | @@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None: |
| 78 | """ | 80 | """ |
| 79 | ) | 81 | ) |
| 80 | 82 | ||
| 81 | assert result.decision == DuplicateDecision.REVIEW | 83 | assert result.decision == DuplicateDecision.NEW |
| 82 | assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,不自动判重" | 84 | assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,按片段歌词处理" |
| 85 | |||
| 86 | |||
| 87 | def test_service_short_chorus_fragment_result_is_new() -> None: | ||
| 88 | service = DedupService(config=ServerConfig()) | ||
| 89 | |||
| 90 | result = service._check_against_candidates( | ||
| 91 | LyricRecord( | ||
| 92 | "__query__", | ||
| 93 | """ | ||
| 94 | 山谷的雨落在清晨 | ||
| 95 | 我把名字交给星辰 | ||
| 96 | 啦啦啦 我们不分离 | ||
| 97 | 啦啦啦 我们不分离 | ||
| 98 | 世界安静等一个人 | ||
| 99 | """, | ||
| 100 | ), | ||
| 101 | [ | ||
| 102 | LyricRecord( | ||
| 103 | "song-1", | ||
| 104 | """ | ||
| 105 | 海边的风吹过旧信 | ||
| 106 | 你说夏天不会远去 | ||
| 107 | 啦啦啦 我们不分离 | ||
| 108 | 啦啦啦 我们不分离 | ||
| 109 | 转身以后各自旅行 | ||
| 110 | """, | ||
| 111 | ) | ||
| 112 | ], | ||
| 113 | ) | ||
| 114 | |||
| 115 | assert result.decision == DuplicateDecision.NEW.value | ||
| 116 | assert result.duplicate is False | ||
| 117 | assert result.record_ids == [] | ||
| 83 | 118 | ||
| 84 | 119 | ||
| 85 | def test_substantial_line_overlap_is_duplicate_after_pg_recall() -> None: | 120 | def test_substantial_line_overlap_is_duplicate_after_pg_recall() -> None: |
| ... | @@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: | ... | @@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: |
| 110 | """ | 145 | """ |
| 111 | ) | 146 | ) |
| 112 | 147 | ||
| 113 | assert result.decision != DuplicateDecision.DUPLICATE | 148 | assert result.decision == DuplicateDecision.NEW |
| 149 | assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理" | ||
| 114 | assert result.candidates[0].primary_line_coverage < 0.72 | 150 | assert result.candidates[0].primary_line_coverage < 0.72 |
| 115 | 151 | ||
| 116 | 152 | ||
| 153 | def test_long_plain_fragment_of_full_song_is_new_not_review() -> None: | ||
| 154 | full_song = """ | ||
| 155 | 第一行写给凌晨的风 | ||
| 156 | 第二行写给远处的灯 | ||
| 157 | 第三行写给没有寄出的信 | ||
| 158 | 第四行写给还醒着的人 | ||
| 159 | 第五行写给旧车站 | ||
| 160 | 第六行写给长街尽头 | ||
| 161 | 第七行写给明天的太阳 | ||
| 162 | 第八行写给重新出发 | ||
| 163 | 第九行写给路过的雨 | ||
| 164 | 第十行写给沉默的月光 | ||
| 165 | """ | ||
| 166 | result = check_against( | ||
| 167 | [LyricRecord("song-1", full_song)], | ||
| 168 | """ | ||
| 169 | 第二行写给远处的灯 | ||
| 170 | 第三行写给没有寄出的信 | ||
| 171 | 第四行写给还醒着的人 | ||
| 172 | 第五行写给旧车站 | ||
| 173 | 第六行写给长街尽头 | ||
| 174 | 第七行写给明天的太阳 | ||
| 175 | """, | ||
| 176 | ) | ||
| 177 | |||
| 178 | assert result.decision == DuplicateDecision.NEW | ||
| 179 | assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理" | ||
| 180 | |||
| 181 | |||
| 117 | def test_catalog_mashup_fragments_are_new_not_review() -> None: | 182 | def test_catalog_mashup_fragments_are_new_not_review() -> None: |
| 118 | result = check_against( | 183 | result = check_against( |
| 119 | [ | 184 | [ | ... | ... |
-
Please register or sign in to post a comment