Commit cdfa3a58 cdfa3a581738a209e66f43513055a41c70831af1 by 沈秋雨

更新测试脚本

1 parent fec2556e
...@@ -23,3 +23,5 @@ text-dedup-main/ ...@@ -23,3 +23,5 @@ text-dedup-main/
23 venv/ 23 venv/
24 .idea/ 24 .idea/
25 .vscode/ 25 .vscode/
26
27 test_api
......
...@@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \ ...@@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \
105 105
106 ```text 106 ```text
107 decision duplicate / review / new 107 decision duplicate / review / new
108 duplicate duplicate 或 review 时为 true,new 时为 false 108 duplicate 仅 decision=duplicate 时为 true,review/new 时为 false
109 confidence 当前判定置信度 109 confidence 当前判定置信度
110 reason 中文判定原因 110 reason 中文判定原因
111 candidate_count 参与最终排序的候选数 111 candidate_count 参与最终排序的候选数
......
...@@ -75,6 +75,9 @@ class DuplicateChecker: ...@@ -75,6 +75,9 @@ class DuplicateChecker:
75 review_jaccard_threshold: float = 0.45, 75 review_jaccard_threshold: float = 0.45,
76 review_line_coverage_threshold: float = 0.35, 76 review_line_coverage_threshold: float = 0.35,
77 review_query_coverage_threshold: float = 0.40, 77 review_query_coverage_threshold: float = 0.40,
78 fragment_query_coverage_threshold: float = 0.80,
79 fragment_max_line_ratio: float = 0.75,
80 fragment_min_matched_lines: int = 3,
78 chorus_short_line_count_threshold: int = 6, 81 chorus_short_line_count_threshold: int = 6,
79 chorus_material_overlap_threshold: float = 0.20, 82 chorus_material_overlap_threshold: float = 0.20,
80 chorus_material_query_coverage_threshold: float = 0.40, 83 chorus_material_query_coverage_threshold: float = 0.40,
...@@ -88,6 +91,9 @@ class DuplicateChecker: ...@@ -88,6 +91,9 @@ class DuplicateChecker:
88 self.review_jaccard_threshold = review_jaccard_threshold 91 self.review_jaccard_threshold = review_jaccard_threshold
89 self.review_line_coverage_threshold = review_line_coverage_threshold 92 self.review_line_coverage_threshold = review_line_coverage_threshold
90 self.review_query_coverage_threshold = review_query_coverage_threshold 93 self.review_query_coverage_threshold = review_query_coverage_threshold
94 self.fragment_query_coverage_threshold = fragment_query_coverage_threshold
95 self.fragment_max_line_ratio = fragment_max_line_ratio
96 self.fragment_min_matched_lines = fragment_min_matched_lines
91 self.chorus_short_line_count_threshold = chorus_short_line_count_threshold 97 self.chorus_short_line_count_threshold = chorus_short_line_count_threshold
92 self.chorus_material_overlap_threshold = chorus_material_overlap_threshold 98 self.chorus_material_overlap_threshold = chorus_material_overlap_threshold
93 self.chorus_material_query_coverage_threshold = chorus_material_query_coverage_threshold 99 self.chorus_material_query_coverage_threshold = chorus_material_query_coverage_threshold
...@@ -237,6 +243,14 @@ class DuplicateChecker: ...@@ -237,6 +243,14 @@ class DuplicateChecker:
237 query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" 243 query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low"
238 ) 244 )
239 query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines) 245 query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines)
246 is_plain_fragment = _is_plain_fragment(
247 query.normalized.primary_lines,
248 candidate.normalized.primary_lines,
249 primary_matched_lines,
250 min_query_coverage=self.fragment_query_coverage_threshold,
251 max_line_ratio=self.fragment_max_line_ratio,
252 min_matched_lines=self.fragment_min_matched_lines,
253 )
240 has_review_level_overlap = ( 254 has_review_level_overlap = (
241 primary_jaccard >= self.review_jaccard_threshold 255 primary_jaccard >= self.review_jaccard_threshold
242 or jaccard >= self.review_jaccard_threshold 256 or jaccard >= self.review_jaccard_threshold
...@@ -275,7 +289,10 @@ class DuplicateChecker: ...@@ -275,7 +289,10 @@ class DuplicateChecker:
275 + (self.confidence_line_coverage_weight * primary_coverage), 289 + (self.confidence_line_coverage_weight * primary_coverage),
276 4, 290 4,
277 ) 291 )
278 if ( 292 if is_plain_fragment:
293 decision = DuplicateDecision.NEW
294 reason = "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
295 elif (
279 ( 296 (
280 primary_jaccard >= self.duplicate_jaccard_threshold 297 primary_jaccard >= self.duplicate_jaccard_threshold
281 or ( 298 or (
...@@ -293,17 +310,17 @@ class DuplicateChecker: ...@@ -293,17 +310,17 @@ class DuplicateChecker:
293 reason = "原文歌词高度一致,翻译行未参与自动判重" 310 reason = "原文歌词高度一致,翻译行未参与自动判重"
294 else: 311 else:
295 reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" 312 reason = "原文 n-gram 字面相似度高,且行级覆盖范围广"
313 elif has_material_chorus_overlap:
314 decision = DuplicateDecision.NEW
315 reason = "重合内容主要集中在重复副歌行,按片段歌词处理"
296 elif ( 316 elif (
297 has_material_chorus_overlap 317 translation_only
298 or translation_only
299 or has_low_confidence_split_overlap 318 or has_low_confidence_split_overlap
300 or has_review_level_overlap 319 or has_review_level_overlap
301 ): 320 ):
302 decision = DuplicateDecision.REVIEW 321 decision = DuplicateDecision.REVIEW
303 reason = "候选相似度达到复核阈值,需要人工确认" 322 reason = "候选相似度达到复核阈值,需要人工确认"
304 if has_material_chorus_overlap: 323 if translation_only:
305 reason = "重合内容主要集中在重复副歌行,不自动判重"
306 elif translation_only:
307 reason = "仅翻译行相似,原文字面重合不足,不自动判重" 324 reason = "仅翻译行相似,原文字面重合不足,不自动判重"
308 elif has_low_confidence_split_overlap: 325 elif has_low_confidence_split_overlap:
309 reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" 326 reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核"
...@@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[ ...@@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[
430 return len(set(matched_lines)) / len(query_unique_lines) 447 return len(set(matched_lines)) / len(query_unique_lines)
431 448
432 449
450 def _is_plain_fragment(
451 query_lines: tuple[str, ...],
452 candidate_lines: tuple[str, ...],
453 matched_lines: list[str],
454 *,
455 min_query_coverage: float,
456 max_line_ratio: float,
457 min_matched_lines: int,
458 ) -> bool:
459 query_unique_lines = set(query_lines)
460 candidate_unique_lines = set(candidate_lines)
461 matched_unique_lines = set(matched_lines)
462 if not query_unique_lines or not candidate_unique_lines:
463 return False
464 if len(matched_unique_lines) < min_matched_lines:
465 return False
466 line_ratio = len(query_unique_lines) / len(candidate_unique_lines)
467 query_coverage = len(matched_unique_lines) / len(query_unique_lines)
468 return line_ratio <= max_line_ratio and query_coverage >= min_query_coverage
469
470
433 def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: 471 def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool:
434 if not matched_lines: 472 if not matched_lines:
435 return False 473 return False
......
...@@ -50,6 +50,7 @@ class CheckResponse(BaseModel): ...@@ -50,6 +50,7 @@ class CheckResponse(BaseModel):
50 decision: str | None = None 50 decision: str | None = None
51 confidence: float | None = None 51 confidence: float | None = None
52 reason: str | None = None 52 reason: str | None = None
53 record_ids: list[str] = []
53 54
54 55
55 class HealthResponse(BaseModel): 56 class HealthResponse(BaseModel):
...@@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any: ...@@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any:
108 decision=result.decision, 109 decision=result.decision,
109 confidence=result.confidence, 110 confidence=result.confidence,
110 reason=result.reason, 111 reason=result.reason,
112 record_ids=result.record_ids,
111 ) 113 )
112 114
113 115
......
...@@ -81,16 +81,28 @@ class ServerConfig: ...@@ -81,16 +81,28 @@ class ServerConfig:
81 # Raising this makes partial-fragment review stricter. 81 # Raising this makes partial-fragment review stricter.
82 review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")) 82 review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40"))
83 83
84 # Very short query lyric line count that can force repeated-chorus overlap into review. 84 # Plain fragment guard: query-side match ratio required to treat the input as a lyric fragment.
85 # Raising this catches more short chorus-like inputs; lowering it reduces review volume. 85 # When this is met together with fragment_max_line_ratio and fragment_min_matched_lines, the result is new instead of review/duplicate.
86 fragment_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_QUERY_COVERAGE_THRESHOLD", "0.80"))
87
88 # Plain fragment guard: maximum query/candidate line-count ratio still considered a fragment.
89 # Lower values protect only shorter fragments; higher values treat longer partial uploads as new.
90 fragment_max_line_ratio: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_MAX_LINE_RATIO", "0.75"))
91
92 # Plain fragment guard: minimum matched unique lyric lines before fragment protection can apply.
93 # This avoids classifying tiny common phrases as meaningful fragments.
94 fragment_min_matched_lines: int = int(os.getenv("LYRIC_DEDUP_FRAGMENT_MIN_MATCHED_LINES", "3"))
95
96 # Very short query lyric line count that can force repeated-chorus overlap into fragment protection.
97 # Matches protected by this path return new instead of duplicate/review.
86 chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")) 98 chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6"))
87 99
88 # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material. 100 # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
89 # Raising this makes chorus-only review stricter. 101 # Raising this makes chorus-only fragment protection stricter.
90 chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")) 102 chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20"))
91 103
92 # Minimum query-side coverage for repeated-chorus overlap to be considered material. 104 # Minimum query-side coverage for repeated-chorus overlap to be considered material.
93 # Raising this reduces review decisions caused by small shared chorus fragments. 105 # Raising this reduces fragment protection caused by small shared chorus fragments.
94 chorus_material_query_coverage_threshold: float = float( 106 chorus_material_query_coverage_threshold: float = float(
95 os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40") 107 os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40")
96 ) 108 )
......
...@@ -27,6 +27,7 @@ class CheckResult: ...@@ -27,6 +27,7 @@ class CheckResult:
27 confidence: float = 0.0 27 confidence: float = 0.0
28 reason: str = "" 28 reason: str = ""
29 candidate_count: int = 0 29 candidate_count: int = 0
30 record_ids: list[str] = field(default_factory=list)
30 31
31 32
32 @dataclass 33 @dataclass
...@@ -197,6 +198,9 @@ class DedupService: ...@@ -197,6 +198,9 @@ class DedupService:
197 review_jaccard_threshold=self.config.review_jaccard_threshold, 198 review_jaccard_threshold=self.config.review_jaccard_threshold,
198 review_line_coverage_threshold=self.config.review_line_coverage_threshold, 199 review_line_coverage_threshold=self.config.review_line_coverage_threshold,
199 review_query_coverage_threshold=self.config.review_query_coverage_threshold, 200 review_query_coverage_threshold=self.config.review_query_coverage_threshold,
201 fragment_query_coverage_threshold=self.config.fragment_query_coverage_threshold,
202 fragment_max_line_ratio=self.config.fragment_max_line_ratio,
203 fragment_min_matched_lines=self.config.fragment_min_matched_lines,
200 chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold, 204 chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold,
201 chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold, 205 chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold,
202 chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold, 206 chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold,
...@@ -208,12 +212,18 @@ class DedupService: ...@@ -208,12 +212,18 @@ class DedupService:
208 candidates, 212 candidates,
209 max_candidates=self.config.max_candidates, 213 max_candidates=self.config.max_candidates,
210 ) 214 )
215 # 收集 duplicate/review 决策下的候选 record_id
216 matched_ids = [
217 c.record_id for c in result.candidates
218 if c.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW)
219 ]
211 return CheckResult( 220 return CheckResult(
212 duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW), 221 duplicate=result.decision == DuplicateDecision.DUPLICATE,
213 decision=result.decision.value, 222 decision=result.decision.value,
214 confidence=result.confidence, 223 confidence=result.confidence,
215 reason=result.reason, 224 reason=result.reason,
216 candidate_count=len(result.candidates), 225 candidate_count=len(result.candidates),
226 record_ids=matched_ids,
217 ) 227 )
218 228
219 229
......
...@@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file ...@@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file
22 from lyric_dedup.file_import import record_from_file 22 from lyric_dedup.file_import import record_from_file
23 from lyric_dedup.normalization import fingerprint_text 23 from lyric_dedup.normalization import fingerprint_text
24 from lyric_dedup.normalization import normalize_lyrics 24 from lyric_dedup.normalization import normalize_lyrics
25 from lyric_dedup_server.config import ServerConfig
25 26
26 27
27 def main() -> None: 28 def main() -> None:
28 parser = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.") 29 parser = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.")
29 parser.add_argument("--dsn", required=True)
30 parser.add_argument("--csv", required=True) 30 parser.add_argument("--csv", required=True)
31 parser.add_argument("--out", required=True) 31 parser.add_argument("--out", required=True)
32 parser.add_argument("--base-dir", default="") 32 parser.add_argument("--base-dir", default="")
33 parser.add_argument("--positive-decisions", default="duplicate")
34 parser.add_argument("--max-candidates", type=int, default=5)
35 parser.add_argument("--recall-limit", type=int, default=100)
36 parser.add_argument("--enable-trgm", action="store_true", help="Enable pg_trgm full-text recall. Slower; exact + line recall is used by default.")
37 parser.add_argument("--trgm-threshold", type=float, default=0.3)
38 parser.add_argument("--statement-timeout-ms", type=int, default=5000)
39 parser.add_argument("--profile-every", type=int, default=100) 33 parser.add_argument("--profile-every", type=int, default=100)
40 args = parser.parse_args() 34 args = parser.parse_args()
41 35
42 psycopg = _import_psycopg() 36 psycopg = _import_psycopg()
37 config = ServerConfig()
43 csv_path = Path(args.csv) 38 csv_path = Path(args.csv)
44 out_path = Path(args.out) 39 out_path = Path(args.out)
45 base_dir = Path(args.base_dir) if args.base_dir else None 40 base_dir = Path(args.base_dir) if args.base_dir else None
46 positive_decisions = {item.strip() for item in args.positive_decisions.split(",") if item.strip()} 41 positive_decisions = {"duplicate"}
47 42
48 total = _csv_data_row_count(csv_path) 43 total = _csv_data_row_count(csv_path)
49 rows: list[dict[str, object]] = [] 44 rows: list[dict[str, object]] = []
50 profile_stats = _new_profile_stats() 45 profile_stats = _new_profile_stats()
51 out_path.parent.mkdir(parents=True, exist_ok=True) 46 out_path.parent.mkdir(parents=True, exist_ok=True)
52 _progress(f"evaluate postgres csv: 0/{total}") 47 _progress(f"evaluate postgres csv: 0/{total}")
53 with psycopg.connect(args.dsn) as conn: 48 with psycopg.connect(config.dsn) as conn:
54 with conn.cursor() as cursor: 49 with conn.cursor() as cursor:
55 cursor.execute("select set_config('statement_timeout', %s, false)", (str(args.statement_timeout_ms),)) 50 cursor.execute("select set_config('statement_timeout', %s, false)", (str(config.statement_timeout_ms),))
56 cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(args.trgm_threshold),)) 51 cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(config.trgm_threshold),))
57 with csv_path.open(encoding="utf-8-sig", newline="") as in_file, out_path.open( 52 with csv_path.open(encoding="utf-8-sig", newline="") as in_file, out_path.open(
58 "w", encoding="utf-8", newline="" 53 "w", encoding="utf-8", newline=""
59 ) as out_file: 54 ) as out_file:
...@@ -70,9 +65,7 @@ def main() -> None: ...@@ -70,9 +65,7 @@ def main() -> None:
70 csv_path=csv_path, 65 csv_path=csv_path,
71 base_dir=base_dir, 66 base_dir=base_dir,
72 positive_decisions=positive_decisions, 67 positive_decisions=positive_decisions,
73 max_candidates=args.max_candidates, 68 config=config,
74 recall_limit=args.recall_limit,
75 enable_trgm=args.enable_trgm,
76 ) 69 )
77 rows.append(row_out) 70 rows.append(row_out)
78 writer.writerow(row_out) 71 writer.writerow(row_out)
...@@ -96,9 +89,7 @@ def _evaluate_row( ...@@ -96,9 +89,7 @@ def _evaluate_row(
96 csv_path: Path, 89 csv_path: Path,
97 base_dir: Path | None, 90 base_dir: Path | None,
98 positive_decisions: set[str], 91 positive_decisions: set[str],
99 max_candidates: int, 92 config: ServerConfig,
100 recall_limit: int,
101 enable_trgm: bool,
102 ) -> dict[str, object]: 93 ) -> dict[str, object]:
103 parse_started = time.perf_counter() 94 parse_started = time.perf_counter()
104 sample_id = row.get("id") or row.get("sample_id") or str(row_number) 95 sample_id = row.get("id") or row.get("sample_id") or str(row_number)
...@@ -108,12 +99,12 @@ def _evaluate_row( ...@@ -108,12 +99,12 @@ def _evaluate_row(
108 candidates, timings = _recall_candidates( 99 candidates, timings = _recall_candidates(
109 conn, 100 conn,
110 record, 101 record,
111 recall_limit=recall_limit, 102 recall_limit=config.recall_limit,
112 enable_trgm=enable_trgm, 103 enable_trgm=config.enable_trgm,
113 exclude_record_ids=_exclude_record_ids_for_eval_row(row), 104 exclude_record_ids=_exclude_record_ids_for_eval_row(row),
114 ) 105 )
115 rank_started = time.perf_counter() 106 rank_started = time.perf_counter()
116 result = _check_against_candidates(record, candidates, max_candidates=max_candidates) 107 result = _check_against_candidates(record, candidates, config=config)
117 rank_ms = round((time.perf_counter() - rank_started) * 1000, 2) 108 rank_ms = round((time.perf_counter() - rank_started) * 1000, 2)
118 recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2) 109 recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2)
119 predicted_duplicate = result.decision.value in positive_decisions 110 predicted_duplicate = result.decision.value in positive_decisions
...@@ -127,7 +118,7 @@ def _evaluate_row( ...@@ -127,7 +118,7 @@ def _evaluate_row(
127 "correct": expected_duplicate == predicted_duplicate, 118 "correct": expected_duplicate == predicted_duplicate,
128 "confidence": result.confidence, 119 "confidence": result.confidence,
129 "reason": result.reason, 120 "reason": result.reason,
130 "candidate_count": len(candidates), 121 "candidate_count": len(result.candidates),
131 "parse_ms": parse_ms, 122 "parse_ms": parse_ms,
132 "recall_ms": recall_ms, 123 "recall_ms": recall_ms,
133 "exact_ms": timings["exact_ms"], 124 "exact_ms": timings["exact_ms"],
...@@ -246,10 +237,26 @@ def _check_against_candidates( ...@@ -246,10 +237,26 @@ def _check_against_candidates(
246 record: LyricRecord, 237 record: LyricRecord,
247 candidates: list[LyricRecord], 238 candidates: list[LyricRecord],
248 *, 239 *,
249 max_candidates: int, 240 config: ServerConfig,
250 ): 241 ):
251 checker = DuplicateChecker() 242 checker = DuplicateChecker(
252 return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates) 243 duplicate_jaccard_threshold=config.duplicate_jaccard_threshold,
244 duplicate_line_coverage_threshold=config.duplicate_line_coverage_threshold,
245 duplicate_high_coverage_jaccard_threshold=config.duplicate_high_coverage_jaccard_threshold,
246 duplicate_high_coverage_line_coverage_threshold=config.duplicate_high_coverage_line_coverage_threshold,
247 review_jaccard_threshold=config.review_jaccard_threshold,
248 review_line_coverage_threshold=config.review_line_coverage_threshold,
249 review_query_coverage_threshold=config.review_query_coverage_threshold,
250 fragment_query_coverage_threshold=config.fragment_query_coverage_threshold,
251 fragment_max_line_ratio=config.fragment_max_line_ratio,
252 fragment_min_matched_lines=config.fragment_min_matched_lines,
253 chorus_short_line_count_threshold=config.chorus_short_line_count_threshold,
254 chorus_material_overlap_threshold=config.chorus_material_overlap_threshold,
255 chorus_material_query_coverage_threshold=config.chorus_material_query_coverage_threshold,
256 confidence_jaccard_weight=config.confidence_jaccard_weight,
257 confidence_line_coverage_weight=config.confidence_line_coverage_weight,
258 )
259 return checker.check_record_against_candidates(record, candidates, max_candidates=config.max_candidates)
253 260
254 261
255 def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]: 262 def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
......
...@@ -110,6 +110,7 @@ def main(): ...@@ -110,6 +110,7 @@ def main():
110 print(f" decision: {result.get('decision', 'N/A')}") 110 print(f" decision: {result.get('decision', 'N/A')}")
111 print(f" confidence: {result.get('confidence', 'N/A')}") 111 print(f" confidence: {result.get('confidence', 'N/A')}")
112 print(f" reason: {result.get('reason', 'N/A')}") 112 print(f" reason: {result.get('reason', 'N/A')}")
113 print(f" record_ids: {result.get('record_ids', [])}")
113 114
114 115
115 if __name__ == "__main__": 116 if __name__ == "__main__":
......
1 ## 消失的波段
2
3 ### 【主歌 1】 — *(压抑、低沉的叙事)*
4
5 霓虹灯……在车窗外退后,
6 霓虹——和夜色融为一体。
7 收音机里,只剩沙沙的电流……
8 (像你在旧地址留下的呼吸……)
9 有些习惯……总是很难去修正,
10 比如——在人群中,辨认你的背影。
11
12 ### 【主歌 2】 — *(情绪渐进,带有一丝无奈)*
13
14 朋友圈里……你更新了风景,
15 坐标是——没听过的、陌、生、城、市。
16 我们从无话不说……退回到【静音】,
17 像两条失去交集的——平行线。
18 那些没有寄出的长信……
19 最后都变成,草稿箱里的——灰、尘。
20
21 ### 【副歌】 —— *(情感爆发,高亢而撕裂)*
22
23 我们成了彼此消 逝 的 波 段 !!
24 在同一个频段……却再也无法呼喊!
25 那些同频共振的夜晚……
26 最终被淹没在——嘈杂的市中心!!
27 我调整着微弱的接收信号……
28 却只听到——时光断裂的声音!!!
29
30 ### 【桥段】 —— *(节奏加快,连续的内心追问)*
31
32 是不是所有的连接……都有保质期?!
33 到期后……就自动切断了所有联系?!
34 我们在各自的轨道里——加!速!运!行!
35 再也找不到……那天傍晚的引力。
36
37 ### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)*
38
39 我们成了彼此消 逝 的 波 段 ——!!
40 在同一个频段……却再也无法呼喊!
41 那些同频共振的夜晚……
42 最终被淹没在——嘈杂的市中心!!
43 我调整着微弱的接收信号……
44 却只听到……(时光断裂的声音)……
45
46 ### 【尾奏】 —— *(情绪下沉,最终归于死寂)*
47
48 【信号中断……请勿追赶。】
49 城市入睡……灯光渐暗……
50 一个人的波段。
51 (查……无……此……人……)
52 【 挂 断 。】
53 ### 副歌
54
55 我们成了彼此消失的波段
56 在同一个频段却再也无法呼喊
57 那些同频共振的夜晚
58 最终被淹没在嘈杂的市中心
59 我调整着微弱的接收信号
60 却只听到时光断裂的声音
61
62 ### 桥段
63
64 是不是所有的连接都有保质期
65 到期后就自动切断了所有联系
66 我们在各自的轨道里加速运行
67 再也找不到那天傍晚的引力
68
69 ### 副歌
70
71 我们成了彼此消失的波段
72 在同一个频段却再也无法呼喊
73 那些同频共振的夜晚
74 最终被淹没在嘈杂的市中心
75 我调整着微弱的接收信号
76 却只听到时光断裂的声音
77
78 ### 尾奏
79
80 信号中断,请勿追赶
81 城市入睡,灯光渐暗
82 一个人的波段
83 查无此人
84 挂断
...\ No newline at end of file ...\ No newline at end of file
1 歌曲题目:《星空大冒险》
2
3 【主歌 2】
4 小兔子,在划船,
5 它的浆是胡萝卜。
6 小熊坐在树枝上,
7 正把蜂蜜涂面包。
8 风儿吹过坏脾气,
9 在这儿变成甜泡泡。
10 没有作业和烦恼,
11 大家都在哈哈笑。
12
13 【副歌 2】
14 飞呀飞,飞向大月亮,
15 月亮像个大香蕉,挂在夜空上。
16 摇呀摇,摇到银河旁,
17 捞起一颗小星星,放在手心里亮。
18
19 【桥段(Bridge)】
20 (节奏放慢,变温柔)
21 天上的城堡亮晶晶,
22 那是梦里的风景。
23 玩累的小孩要睡了,
24 听一听,风的呼吸。
25 呼——噜——呼——噜——
26 做个好梦到天明。
27
28 【副歌 3】
29 (节奏恢复,渐弱结束)
30 飞呀飞,飞向大月亮,
31 月亮像个大香蕉,挂在夜空上。
32 摇呀摇,摇到银河旁,
33 捞起一颗小星星,
34 抱在怀里……睡着啦。
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord ...@@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord
7 from lyric_dedup.eval_dataset import generate_eval_set 7 from lyric_dedup.eval_dataset import generate_eval_set
8 from lyric_dedup.file_import import record_from_file 8 from lyric_dedup.file_import import record_from_file
9 from lyric_dedup.normalization import normalize_lyrics 9 from lyric_dedup.normalization import normalize_lyrics
10 from lyric_dedup_server.config import ServerConfig
11 from lyric_dedup_server.service import DedupService
10 12
11 13
12 BASE_LYRIC = """ 14 BASE_LYRIC = """
...@@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c ...@@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c
55 assert result.candidates[0].record_id == "song-1" 57 assert result.candidates[0].record_id == "song-1"
56 58
57 59
58 def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None: 60 def test_short_shared_repeated_chorus_is_new_fragment() -> None:
59 result = check_against( 61 result = check_against(
60 [ 62 [
61 LyricRecord( 63 LyricRecord(
...@@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None: ...@@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None:
78 """ 80 """
79 ) 81 )
80 82
81 assert result.decision == DuplicateDecision.REVIEW 83 assert result.decision == DuplicateDecision.NEW
82 assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,不自动判重" 84 assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,按片段歌词处理"
85
86
def test_service_short_chorus_fragment_result_is_new() -> None:
    """A short query sharing only a repeated chorus line is reported as new by the service.

    Exercises the service-level result: the decision is ``new``, the
    ``duplicate`` flag is false, and no candidate record ids are returned.
    """
    service = DedupService(config=ServerConfig())

    result = service._check_against_candidates(
        LyricRecord(
            "__query__",
            """
            山谷的雨落在清晨
            我把名字交给星辰
            啦啦啦 我们不分离
            啦啦啦 我们不分离
            世界安静等一个人
            """,
        ),
        [
            LyricRecord(
                "song-1",
                """
                海边的风吹过旧信
                你说夏天不会远去
                啦啦啦 我们不分离
                啦啦啦 我们不分离
                转身以后各自旅行
                """,
            )
        ],
    )

    # The service result stores the decision as the enum's value, hence `.value`.
    assert result.decision == DuplicateDecision.NEW.value
    assert result.duplicate is False
    assert result.record_ids == []
83 118
84 119
85 def test_substantial_line_overlap_is_duplicate_after_pg_recall() -> None: 120 def test_substantial_line_overlap_is_duplicate_after_pg_recall() -> None:
...@@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: ...@@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None:
110 """ 145 """
111 ) 146 )
112 147
113 assert result.decision != DuplicateDecision.DUPLICATE 148 assert result.decision == DuplicateDecision.NEW
149 assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
114 assert result.candidates[0].primary_line_coverage < 0.72 150 assert result.candidates[0].primary_line_coverage < 0.72
115 151
116 152
def test_long_plain_fragment_of_full_song_is_new_not_review() -> None:
    """A six-line excerpt of a ten-line song is classified as new, not review.

    The query is lines two through seven of the stored song, so every query
    line matches while the query stays well short of the full lyric.
    """
    full_song = """
    第一行写给凌晨的风
    第二行写给远处的灯
    第三行写给没有寄出的信
    第四行写给还醒着的人
    第五行写给旧车站
    第六行写给长街尽头
    第七行写给明天的太阳
    第八行写给重新出发
    第九行写给路过的雨
    第十行写给沉默的月光
    """
    result = check_against(
        [LyricRecord("song-1", full_song)],
        """
        第二行写给远处的灯
        第三行写给没有寄出的信
        第四行写给还醒着的人
        第五行写给旧车站
        第六行写给长街尽头
        第七行写给明天的太阳
        """,
    )

    assert result.decision == DuplicateDecision.NEW
    assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
180
181
117 def test_catalog_mashup_fragments_are_new_not_review() -> None: 182 def test_catalog_mashup_fragments_are_new_not_review() -> None:
118 result = check_against( 183 result = check_against(
119 [ 184 [
......