Commit ba39ce6a ba39ce6aa50b5bc45d24bc32bcbc7c51b870922a by 沈秋雨

添加测试集内部去重

1 parent f8ad329c
...@@ -80,6 +80,7 @@ python -m lyric_dedup.cli generate-eval-set \ ...@@ -80,6 +80,7 @@ python -m lyric_dedup.cli generate-eval-set \
80 --lyrics-dir data/generated_eval/incoming \ 80 --lyrics-dir data/generated_eval/incoming \
81 --csv data/generated_eval/eval_50000.csv \ 81 --csv data/generated_eval/eval_50000.csv \
82 --index outputs/indexes/lyrics.pkl \ 82 --index outputs/indexes/lyrics.pkl \
83 --eval-index data/generated_eval/eval_50000.csv.index.pkl \
83 --size 50000 \ 84 --size 50000 \
84 --positive-ratio 0.3 85 --positive-ratio 0.3
85 ``` 86 ```
...@@ -88,10 +89,10 @@ python -m lyric_dedup.cli generate-eval-set \ ...@@ -88,10 +89,10 @@ python -m lyric_dedup.cli generate-eval-set \
88 89
89 - 先扫描整个曲库,按有效歌词行数、语言类型、文件来源前缀做分层采样,不再按排序前缀取样。 90 - 先扫描整个曲库,按有效歌词行数、语言类型、文件来源前缀做分层采样,不再按排序前缀取样。
90 - `应去重` 样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。 91 - `应去重` 样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。
91 - `不应去重` 样本包含同主题新歌词、hard negative、片段歌词、重复副歌碰撞、仅翻译相似、短歌词/占位边界样本。 92 - `不应去重` 样本以真实 holdout 完整歌词为主,也包含片段歌词、重复副歌碰撞、仅翻译相似、同主题新歌词、短歌词/占位边界样本。
92 - 片段歌词即使命中已有歌曲的一部分,也不应该输出 `duplicate`;最多进入 `review` 93 - 片段歌词即使命中已有歌曲的一部分,也不应该输出 `duplicate`;最多进入 `review`
93 - 如果传入 `--index`,生成器会用现有索引构造更接近线上召回风险的 hard negative 94 - 生成器会额外写出 `--eval-index`,这个索引排除了 holdout 歌,评估生成 CSV 时应使用它
94 - 同时会生成 `*.manifest.json`,记录 seed、曲库规模、样本类型分布、语言/来源分桶和样本来源覆盖数。 95 - 同时会生成 `*.manifest.json`,记录 seed、曲库规模、holdout 数、样本类型分布、语言/来源分桶和样本来源覆盖数。
95 96
96 先准备一个 CSV,例如 `data/eval/eval.csv` 97 先准备一个 CSV,例如 `data/eval/eval.csv`
97 98
......
...@@ -103,6 +103,7 @@ python -m lyric_dedup.cli generate-eval-set \ ...@@ -103,6 +103,7 @@ python -m lyric_dedup.cli generate-eval-set \
103 --lyrics-dir data/generated_eval/incoming \ 103 --lyrics-dir data/generated_eval/incoming \
104 --csv data/generated_eval/eval_50000.csv \ 104 --csv data/generated_eval/eval_50000.csv \
105 --index outputs/indexes/library_lyrics.pkl \ 105 --index outputs/indexes/library_lyrics.pkl \
106 --eval-index data/generated_eval/eval_50000.csv.index.pkl \
106 --size 50000 \ 107 --size 50000 \
107 --positive-ratio 0.3 108 --positive-ratio 0.3
108 ``` 109 ```
...@@ -120,24 +121,26 @@ python -m lyric_dedup.cli generate-eval-set \ ...@@ -120,24 +121,26 @@ python -m lyric_dedup.cli generate-eval-set \
120 121
121 ```text 122 ```text
122 positive_* = 应去重,全曲歌词样式变化 123 positive_* = 应去重,全曲歌词样式变化
123 negative_random_unrelated = 不应去重,同主题新歌词 124 negative_real_holdout_full_song = 不应去重,完整真实歌词,已从评估索引中排除
124 negative_hard_candidate = 不应去重,系统容易召回的短句/局部重合样本
125 negative_fragment = 不应去重,单曲片段 125 negative_fragment = 不应去重,单曲片段
126 negative_shared_chorus = 不应去重,重复副歌碰撞 126 negative_shared_chorus = 不应去重,重复副歌碰撞
127 negative_translation_only = 不应去重,仅翻译相似 127 negative_translation_only = 不应去重,仅翻译相似
128 negative_same_theme_synthetic = 不应去重,同主题新歌词
128 edge_short_or_placeholder = 不应去重,短歌词/占位边界样本 129 edge_short_or_placeholder = 不应去重,短歌词/占位边界样本
129 ``` 130 ```
130 131
131 生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。传入 `--index` 后会用现有索引生成 hard negative。每次还会输出: 132 生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。它会分出一批 holdout 完整歌词作为真实新歌负样本,并生成一个排除 holdout 的评估索引。每次还会输出:
132 133
133 ```text 134 ```text
134 data/generated_eval/eval_50000.csv.manifest.json 135 data/generated_eval/eval_50000.csv.manifest.json
136 data/generated_eval/eval_50000.csv.index.pkl
135 ``` 137 ```
136 138
137 manifest 里重点看: 139 manifest 里重点看:
138 140
139 ```text 141 ```text
140 library_files 曲库歌词文件数 142 library_files 曲库歌词文件数
143 holdout_records 从评估索引中排除、作为真实新歌负样本的数量
141 sample_type_counts 各样本类型数量 144 sample_type_counts 各样本类型数量
142 line_count_bucket_counts / language_bucket_counts / source_bucket_counts 145 line_count_bucket_counts / language_bucket_counts / source_bucket_counts
143 unique_source_records 本次评估覆盖了多少真实源文件 146 unique_source_records 本次评估覆盖了多少真实源文件
...@@ -147,7 +150,7 @@ unique_source_records 本次评估覆盖了多少真实源文件 ...@@ -147,7 +150,7 @@ unique_source_records 本次评估覆盖了多少真实源文件
147 150
148 ```bash 151 ```bash
149 python -m lyric_dedup.cli evaluate-csv \ 152 python -m lyric_dedup.cli evaluate-csv \
150 --index outputs/indexes/library_lyrics.pkl \ 153 --index data/generated_eval/eval_50000.csv.index.pkl \
151 --csv data/generated_eval/eval_50000.csv \ 154 --csv data/generated_eval/eval_50000.csv \
152 --base-dir data/generated_eval \ 155 --base-dir data/generated_eval \
153 --out outputs/results/library_eval_50000.csv 156 --out outputs/results/library_eval_50000.csv
...@@ -171,7 +174,7 @@ false_positive ...@@ -171,7 +174,7 @@ false_positive
171 174
172 ```bash 175 ```bash
173 python -m lyric_dedup.cli evaluate-csv \ 176 python -m lyric_dedup.cli evaluate-csv \
174 --index outputs/indexes/library_lyrics.pkl \ 177 --index data/generated_eval/eval_50000.csv.index.pkl \
175 --csv data/generated_eval/eval_50000.csv \ 178 --csv data/generated_eval/eval_50000.csv \
176 --base-dir data/generated_eval \ 179 --base-dir data/generated_eval \
177 --positive-decisions duplicate,review \ 180 --positive-decisions duplicate,review \
......
...@@ -96,16 +96,24 @@ class DuplicateChecker: ...@@ -96,16 +96,24 @@ class DuplicateChecker:
96 96
97 def add_record(self, record: LyricRecord) -> None: 97 def add_record(self, record: LyricRecord) -> None:
98 indexed = self._index(record) 98 indexed = self._index(record)
99 self._records[record.record_id] = indexed 99 self._add_indexed(record.record_id, indexed)
100 self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record.record_id) 100
def add_normalized_record(self, record: LyricRecord, normalized: NormalizedLyrics) -> None:
    """Index ``record`` using lyrics the caller has already normalized.

    The supplied ``normalized`` value is passed straight to
    ``_index_normalized`` instead of being derived from ``record.lyrics``,
    and the resulting entry is registered under ``record.record_id``.

    Args:
        record: The lyric record to add to the index.
        normalized: Normalized lyrics for ``record``; presumably the result
            of normalizing ``record.lyrics`` — the caller is responsible
            for keeping the two consistent.
    """
    entry = self._index_normalized(record, normalized)
    self._add_indexed(record.record_id, entry)
105
106 def _add_indexed(self, record_id: str, indexed: _IndexedRecord) -> None:
107 self._records[record_id] = indexed
108 self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record_id)
101 for line in indexed.normalized.unique_lines: 109 for line in indexed.normalized.unique_lines:
102 if len(line) >= 4: 110 if len(line) >= 4:
103 self._line_to_ids.setdefault(line, set()).add(record.record_id) 111 self._line_to_ids.setdefault(line, set()).add(record_id)
104 for token in indexed.tokens: 112 for token in indexed.tokens:
105 self._token_to_ids.setdefault(token, set()).add(record.record_id) 113 self._token_to_ids.setdefault(token, set()).add(record_id)
106 for token in indexed.fallback_tokens: 114 for token in indexed.fallback_tokens:
107 self._token_to_ids.setdefault(token, set()).add(record.record_id) 115 self._token_to_ids.setdefault(token, set()).add(record_id)
108 self._lsh.add(record.record_id, indexed.signature) 116 self._lsh.add(record_id, indexed.signature)
109 117
110 def save(self, path: str | Path) -> None: 118 def save(self, path: str | Path) -> None:
111 """Persist the in-memory index for later checks.""" 119 """Persist the in-memory index for later checks."""
...@@ -187,6 +195,9 @@ class DuplicateChecker: ...@@ -187,6 +195,9 @@ class DuplicateChecker:
187 195
188 def _index(self, record: LyricRecord) -> _IndexedRecord: 196 def _index(self, record: LyricRecord) -> _IndexedRecord:
189 normalized = normalize_lyrics(record.lyrics) 197 normalized = normalize_lyrics(record.lyrics)
198 return self._index_normalized(record, normalized)
199
200 def _index_normalized(self, record: LyricRecord, normalized: NormalizedLyrics) -> _IndexedRecord:
190 tokens = lyric_tokens(normalized) 201 tokens = lyric_tokens(normalized)
191 primary_tokens = lyric_tokens(normalized, lines=normalized.primary_lines) 202 primary_tokens = lyric_tokens(normalized, lines=normalized.primary_lines)
192 translation_tokens = lyric_tokens(normalized, lines=normalized.translation_lines) 203 translation_tokens = lyric_tokens(normalized, lines=normalized.translation_lines)
......
...@@ -5,6 +5,7 @@ from __future__ import annotations ...@@ -5,6 +5,7 @@ from __future__ import annotations
5 import argparse 5 import argparse
6 import csv 6 import csv
7 import json 7 import json
8 import sys
8 from pathlib import Path 9 from pathlib import Path
9 10
10 from lyric_dedup.checker import DuplicateChecker 11 from lyric_dedup.checker import DuplicateChecker
...@@ -50,7 +51,8 @@ def main() -> None: ...@@ -50,7 +51,8 @@ def main() -> None:
50 generate.add_argument("--size", type=int, default=100) 51 generate.add_argument("--size", type=int, default=100)
51 generate.add_argument("--positive-ratio", type=float, default=0.3) 52 generate.add_argument("--positive-ratio", type=float, default=0.3)
52 generate.add_argument("--seed", type=int, default=20260602) 53 generate.add_argument("--seed", type=int, default=20260602)
53 generate.add_argument("--index", default="", help="optional existing index for hard-negative generation") 54 generate.add_argument("--index", default="", help="optional source index path recorded in the manifest")
55 generate.add_argument("--eval-index", default="", help="output index built from non-holdout records for this eval set")
54 56
55 args = parser.parse_args() 57 args = parser.parse_args()
56 if args.command == "build-index": 58 if args.command == "build-index":
...@@ -77,6 +79,7 @@ def main() -> None: ...@@ -77,6 +79,7 @@ def main() -> None:
77 positive_ratio=args.positive_ratio, 79 positive_ratio=args.positive_ratio,
78 seed=args.seed, 80 seed=args.seed,
79 index_path=Path(args.index) if args.index else None, 81 index_path=Path(args.index) if args.index else None,
82 eval_index_path=Path(args.eval_index) if args.eval_index else None,
80 ) 83 )
81 print(json.dumps(summary, ensure_ascii=False)) 84 print(json.dumps(summary, ensure_ascii=False))
82 85
...@@ -155,52 +158,58 @@ def evaluate_csv( ...@@ -155,52 +158,58 @@ def evaluate_csv(
155 positive_decisions: set[str], 158 positive_decisions: set[str],
156 max_candidates: int, 159 max_candidates: int,
157 ) -> None: 160 ) -> None:
161 _progress(f"load index: {index_path}")
158 checker = DuplicateChecker.load(index_path) 162 checker = DuplicateChecker.load(index_path)
159 rows: list[dict[str, object]] = [] 163 rows: list[dict[str, object]] = []
164 total = _csv_data_row_count(csv_path)
165 _progress(f"evaluate csv: 0/{total}")
166 out_path.parent.mkdir(parents=True, exist_ok=True)
160 with csv_path.open(encoding="utf-8-sig", newline="") as file: 167 with csv_path.open(encoding="utf-8-sig", newline="") as file:
161 reader = csv.DictReader(file) 168 reader = csv.DictReader(file)
162 if reader.fieldnames is None: 169 if reader.fieldnames is None:
163 raise ValueError("评估 CSV 需要表头") 170 raise ValueError("评估 CSV 需要表头")
164 for row_number, row in enumerate(reader, start=2): 171 fieldnames = [
165 sample_id = row.get("id") or row.get("sample_id") or str(row_number) 172 "id",
166 record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir) 173 "source",
167 expected_duplicate = _parse_expected(row.get("expected") or row.get("label") or row.get("target")) 174 "expected_duplicate",
168 result = checker.check_record(record, max_candidates=max_candidates) 175 "decision",
169 predicted_duplicate = result.decision.value in positive_decisions 176 "predicted_duplicate",
170 best = result.candidates[0] if result.candidates else None 177 "correct",
171 rows.append( 178 "confidence",
172 { 179 "reason",
173 "id": sample_id, 180 "best_candidate_id",
174 "source": source, 181 "best_candidate_decision",
175 "expected_duplicate": expected_duplicate, 182 "best_candidate_confidence",
176 "decision": result.decision.value, 183 "best_candidate_jaccard",
177 "predicted_duplicate": predicted_duplicate, 184 "best_candidate_line_coverage",
178 "correct": expected_duplicate == predicted_duplicate, 185 "best_candidate_primary_jaccard",
179 "confidence": result.confidence, 186 "best_candidate_primary_line_coverage",
180 "reason": result.reason, 187 "best_candidate_translation_jaccard",
181 "best_candidate_id": best.record_id if best else "", 188 "best_candidate_translation_line_coverage",
182 "best_candidate_decision": best.decision.value if best else "", 189 "best_candidate_reason",
183 "best_candidate_confidence": best.confidence if best else "", 190 "matched_unique_lines",
184 "best_candidate_jaccard": best.jaccard if best else "", 191 ]
185 "best_candidate_line_coverage": best.line_coverage if best else "", 192 with out_path.open("w", encoding="utf-8", newline="") as out_file:
186 "best_candidate_primary_jaccard": best.primary_jaccard if best else "", 193 writer = csv.DictWriter(out_file, fieldnames=fieldnames)
187 "best_candidate_primary_line_coverage": best.primary_line_coverage if best else "",
188 "best_candidate_translation_jaccard": best.translation_jaccard if best else "",
189 "best_candidate_translation_line_coverage": best.translation_line_coverage if best else "",
190 "best_candidate_reason": best.reason if best else "",
191 "matched_unique_lines": " | ".join(best.matched_unique_lines) if best else "",
192 }
193 )
194
195 out_path.parent.mkdir(parents=True, exist_ok=True)
196 with out_path.open("w", encoding="utf-8", newline="") as file:
197 writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()) if rows else ["id"])
198 writer.writeheader() 194 writer.writeheader()
199 writer.writerows(rows) 195 for index, row in enumerate(reader, start=1):
196 row_out = _evaluate_row(
197 row,
198 row_number=index + 1,
199 checker=checker,
200 csv_path=csv_path,
201 base_dir=base_dir,
202 positive_decisions=positive_decisions,
203 max_candidates=max_candidates,
204 )
205 rows.append(row_out)
206 writer.writerow(row_out)
207 _progress_count("evaluate csv", index, total, step=1000)
200 208
201 summary = _evaluation_summary(rows, positive_decisions=positive_decisions, out_path=out_path) 209 summary = _evaluation_summary(rows, positive_decisions=positive_decisions, out_path=out_path)
202 summary_path = out_path.with_suffix(out_path.suffix + ".summary.json") 210 summary_path = out_path.with_suffix(out_path.suffix + ".summary.json")
203 summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") 211 summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
212 _progress("evaluation complete")
204 print(json.dumps(summary, ensure_ascii=False)) 213 print(json.dumps(summary, ensure_ascii=False))
205 214
206 215
...@@ -229,6 +238,45 @@ def _result_to_dict(result, *, source: str) -> dict[str, object]: ...@@ -229,6 +238,45 @@ def _result_to_dict(result, *, source: str) -> dict[str, object]:
229 } 238 }
230 239
231 240
def _evaluate_row(
    row: dict[str, str],
    *,
    row_number: int,
    checker: DuplicateChecker,
    csv_path: Path,
    base_dir: Path | None,
    positive_decisions: set[str],
    max_candidates: int,
) -> dict[str, object]:
    """Run the duplicate checker on one evaluation CSV row.

    Args:
        row: One parsed CSV row. The sample id is read from ``id`` or
            ``sample_id``; the expected label from ``expected``, ``label``
            or ``target``.
        row_number: Fallback sample id (as a string) when the row carries
            no id column value.
        checker: Checker used to look up duplicate candidates.
        csv_path: Path of the evaluation CSV, forwarded to the row loader.
        base_dir: Optional base directory, forwarded to the row loader.
        positive_decisions: Decision values that count as "predicted
            duplicate".
        max_candidates: Candidate limit forwarded to ``check_record``.

    Returns:
        A flat dict with the sample id, source, expected/predicted flags,
        the checker's decision, confidence and reason, and the metrics of
        the first candidate (empty strings when there is no candidate).
    """
    sample_id = row.get("id") or row.get("sample_id") or str(row_number)
    record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
    raw_expected = row.get("expected") or row.get("label") or row.get("target")
    expected_duplicate = _parse_expected(raw_expected)

    result = checker.check_record(record, max_candidates=max_candidates)
    decision = result.decision.value
    predicted_duplicate = decision in positive_decisions

    candidates = result.candidates
    best = candidates[0] if candidates else None

    outcome: dict[str, object] = {
        "id": sample_id,
        "source": source,
        "expected_duplicate": expected_duplicate,
        "decision": decision,
        "predicted_duplicate": predicted_duplicate,
        "correct": expected_duplicate == predicted_duplicate,
        "confidence": result.confidence,
        "reason": result.reason,
    }

    if best:
        outcome["best_candidate_id"] = best.record_id
        outcome["best_candidate_decision"] = best.decision.value
        outcome["best_candidate_confidence"] = best.confidence
        outcome["best_candidate_jaccard"] = best.jaccard
        outcome["best_candidate_line_coverage"] = best.line_coverage
        outcome["best_candidate_primary_jaccard"] = best.primary_jaccard
        outcome["best_candidate_primary_line_coverage"] = best.primary_line_coverage
        outcome["best_candidate_translation_jaccard"] = best.translation_jaccard
        outcome["best_candidate_translation_line_coverage"] = best.translation_line_coverage
        outcome["best_candidate_reason"] = best.reason
        outcome["matched_unique_lines"] = " | ".join(best.matched_unique_lines)
    else:
        # No candidate: keep every column present so the CSV stays rectangular.
        for column in (
            "best_candidate_id",
            "best_candidate_decision",
            "best_candidate_confidence",
            "best_candidate_jaccard",
            "best_candidate_line_coverage",
            "best_candidate_primary_jaccard",
            "best_candidate_primary_line_coverage",
            "best_candidate_translation_jaccard",
            "best_candidate_translation_line_coverage",
            "best_candidate_reason",
            "matched_unique_lines",
        ):
            outcome[column] = ""
    return outcome
278
279
232 def _lyrics_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[str, str]: 280 def _lyrics_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[str, str]:
233 lyrics = (row.get("lyrics") or "").strip() 281 lyrics = (row.get("lyrics") or "").strip()
234 if lyrics: 282 if lyrics:
...@@ -322,5 +370,23 @@ def _evaluation_summary( ...@@ -322,5 +370,23 @@ def _evaluation_summary(
322 } 370 }
323 371
324 372
def _csv_data_row_count(csv_path: Path) -> int:
    """Count the data rows of a CSV file, excluding the header row.

    Blank lines are not counted. ``csv.reader`` yields an empty list for a
    blank line, whereas ``csv.DictReader`` silently skips such rows, so
    counting them would report a total larger than the number of rows a
    ``DictReader`` loop actually visits and the "current == total" progress
    message would never be reached.

    Args:
        csv_path: CSV file to scan; read as UTF-8 with an optional BOM.

    Returns:
        Number of non-empty rows after the first (header) row; ``0`` for an
        empty or header-only file.
    """
    with csv_path.open(encoding="utf-8-sig", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # drop the header; the default tolerates an empty file
        return sum(1 for row in reader if row)
378
379
def _progress(message: str) -> None:
    """Write one ``[eval]``-prefixed progress line to stderr and flush it."""
    sys.stderr.write(f"[eval] {message}\n")
    sys.stderr.flush()


def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``current/total`` progress at selected milestones.

    A line is emitted for the first item, the last item, and every
    ``step``-th item. Nothing is emitted when ``total`` is not positive.

    Args:
        label: Text placed before the counter.
        current: 1-based position of the item just processed.
        total: Expected number of items.
        step: Interval between intermediate progress lines.
    """
    if total <= 0:
        return
    at_milestone = current in (1, total) or current % step == 0
    if at_milestone:
        _progress(f"{label}: {current}/{total}")
389
390
325 if __name__ == "__main__": 391 if __name__ == "__main__":
326 main() 392 main()
......
...@@ -7,14 +7,14 @@ import hashlib ...@@ -7,14 +7,14 @@ import hashlib
7 import json 7 import json
8 import random 8 import random
9 import re 9 import re
10 import sys
10 from collections import Counter 11 from collections import Counter
11 from dataclasses import dataclass 12 from dataclasses import dataclass
12 from pathlib import Path 13 from pathlib import Path
13 14
14 from lyric_dedup.checker import DuplicateChecker 15 from lyric_dedup.checker import DuplicateChecker
15 from lyric_dedup.checker import DuplicateDecision 16 from lyric_dedup.checker import LyricRecord
16 from lyric_dedup.file_import import iter_lyric_files 17 from lyric_dedup.file_import import iter_lyric_files
17 from lyric_dedup.file_import import read_lyric_file
18 from lyric_dedup.file_import import record_from_file 18 from lyric_dedup.file_import import record_from_file
19 from lyric_dedup.normalization import NormalizedLyrics 19 from lyric_dedup.normalization import NormalizedLyrics
20 from lyric_dedup.normalization import fingerprint_text 20 from lyric_dedup.normalization import fingerprint_text
...@@ -23,19 +23,31 @@ from lyric_dedup.normalization import normalize_lyrics ...@@ -23,19 +23,31 @@ from lyric_dedup.normalization import normalize_lyrics
23 23
24 DEFAULT_SAMPLE_MIX = { 24 DEFAULT_SAMPLE_MIX = {
25 "positive_full_duplicate": 0.30, 25 "positive_full_duplicate": 0.30,
26 "negative_random_unrelated": 0.20, 26 "negative_real_holdout_full_song": 0.40,
27 "negative_hard_candidate": 0.25,
28 "negative_fragment": 0.10, 27 "negative_fragment": 0.10,
29 "negative_shared_chorus": 0.05, 28 "negative_shared_chorus": 0.05,
30 "negative_translation_only": 0.05, 29 "negative_translation_only": 0.05,
30 "negative_same_theme_synthetic": 0.05,
31 "edge_short_or_placeholder": 0.05, 31 "edge_short_or_placeholder": 0.05,
32 } 32 }
33 33
34 34
def _progress(message: str) -> None:
    """Emit a single ``[eval-gen]``-tagged status line on stderr, flushed immediately."""
    line = f"[eval-gen] {message}"
    print(line, file=sys.stderr, flush=True)


def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Log a ``current/total`` counter for ``label`` at milestone positions.

    Milestones are the first item, the final item, and each multiple of
    ``step``. A non-positive ``total`` suppresses all output.

    Args:
        label: Prefix describing the phase being counted.
        current: 1-based count of items handled so far.
        total: Number of items expected overall.
        step: Spacing between intermediate reports.
    """
    if total <= 0:
        return
    if not (current == 1 or current == total or current % step == 0):
        return
    _progress(f"{label}: {current}/{total}")
44
45
35 @dataclass(frozen=True) 46 @dataclass(frozen=True)
36 class LyricProfile: 47 class LyricProfile:
37 path: Path 48 path: Path
38 record_id: str 49 record_id: str
50 raw_text: str
39 title: str 51 title: str
40 artist: str 52 artist: str
41 normalized: NormalizedLyrics 53 normalized: NormalizedLyrics
...@@ -74,6 +86,7 @@ def generate_eval_set( ...@@ -74,6 +86,7 @@ def generate_eval_set(
74 positive_ratio: float = 0.30, 86 positive_ratio: float = 0.30,
75 seed: int = 20260602, 87 seed: int = 20260602,
76 index_path: Path | None = None, 88 index_path: Path | None = None,
89 eval_index_path: Path | None = None,
77 ) -> dict[str, object]: 90 ) -> dict[str, object]:
78 """Generate a stratified production evaluation set. 91 """Generate a stratified production evaluation set.
79 92
...@@ -83,6 +96,7 @@ def generate_eval_set( ...@@ -83,6 +96,7 @@ def generate_eval_set(
83 if size <= 0: 96 if size <= 0:
84 raise ValueError("size must be positive") 97 raise ValueError("size must be positive")
85 98
99 _progress(f"start generation: size={size}, positive_ratio={positive_ratio}, seed={seed}")
86 rng = random.Random(seed) 100 rng = random.Random(seed)
87 profiles = profile_library(library_dir) 101 profiles = profile_library(library_dir)
88 if not profiles: 102 if not profiles:
...@@ -90,13 +104,25 @@ def generate_eval_set( ...@@ -90,13 +104,25 @@ def generate_eval_set(
90 104
91 output_dir.mkdir(parents=True, exist_ok=True) 105 output_dir.mkdir(parents=True, exist_ok=True)
92 csv_path.parent.mkdir(parents=True, exist_ok=True) 106 csv_path.parent.mkdir(parents=True, exist_ok=True)
107 _progress(f"clean output dir: {output_dir}")
93 _clean_generated_output_dir(output_dir) 108 _clean_generated_output_dir(output_dir)
94 109
95 checker = DuplicateChecker.load(index_path) if index_path else None
96 plan = _sample_plan(size, positive_ratio=positive_ratio) 110 plan = _sample_plan(size, positive_ratio=positive_ratio)
97 groups = _profile_groups(profiles) 111 _progress(f"sample plan: {plan}")
112 holdout_count = min(plan["negative_real_holdout_full_song"], max(1, len(profiles) // 2))
113 holdout_profiles = _stratified_unique_sample(
114 profiles,
115 holdout_count,
116 rng,
117 )
118 holdout_ids = {profile.record_id for profile in holdout_profiles}
119 indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
120 eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
121 _build_eval_index(indexed_profiles, eval_index_path)
122 groups = _profile_groups(indexed_profiles)
98 samples: list[GeneratedSample] = [] 123 samples: list[GeneratedSample] = []
99 124
125 _progress("build positive_full_duplicate samples")
100 samples.extend( 126 samples.extend(
101 _build_positive_samples( 127 _build_positive_samples(
102 _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng), 128 _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
...@@ -106,53 +132,62 @@ def generate_eval_set( ...@@ -106,53 +132,62 @@ def generate_eval_set(
106 start_index=len(samples) + 1, 132 start_index=len(samples) + 1,
107 ) 133 )
108 ) 134 )
135 _progress(f"built samples: {len(samples)}/{size}")
136 _progress("build negative_real_holdout_full_song samples")
109 samples.extend( 137 samples.extend(
110 _build_random_unrelated_samples( 138 _build_holdout_full_song_samples(
111 plan["negative_random_unrelated"], 139 holdout_profiles,
112 output_dir, 140 output_dir,
113 csv_path.parent, 141 csv_path.parent,
114 rng,
115 start_index=len(samples) + 1, 142 start_index=len(samples) + 1,
116 ) 143 )
117 ) 144 )
145 _progress(f"built samples: {len(samples)}/{size}")
146 _progress("build negative_fragment samples")
118 samples.extend( 147 samples.extend(
119 _build_hard_candidate_samples( 148 _build_fragment_samples(
120 groups["normal"], 149 _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
121 plan["negative_hard_candidate"],
122 output_dir, 150 output_dir,
123 csv_path.parent, 151 csv_path.parent,
124 rng, 152 rng,
125 checker=checker,
126 start_index=len(samples) + 1, 153 start_index=len(samples) + 1,
127 ) 154 )
128 ) 155 )
156 _progress(f"built samples: {len(samples)}/{size}")
157 _progress("build negative_shared_chorus samples")
129 samples.extend( 158 samples.extend(
130 _build_fragment_samples( 159 _build_shared_chorus_samples(
131 _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng), 160 _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
132 output_dir, 161 output_dir,
133 csv_path.parent, 162 csv_path.parent,
134 rng, 163 rng,
135 start_index=len(samples) + 1, 164 start_index=len(samples) + 1,
136 ) 165 )
137 ) 166 )
167 _progress(f"built samples: {len(samples)}/{size}")
168 _progress("build negative_translation_only samples")
138 samples.extend( 169 samples.extend(
139 _build_shared_chorus_samples( 170 _build_translation_only_samples(
140 _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng), 171 _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
141 output_dir, 172 output_dir,
142 csv_path.parent, 173 csv_path.parent,
143 rng, 174 rng,
144 start_index=len(samples) + 1, 175 start_index=len(samples) + 1,
145 ) 176 )
146 ) 177 )
178 _progress(f"built samples: {len(samples)}/{size}")
179 _progress("build negative_same_theme_synthetic samples")
147 samples.extend( 180 samples.extend(
148 _build_translation_only_samples( 181 _build_same_theme_synthetic_samples(
149 _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng), 182 plan["negative_same_theme_synthetic"],
150 output_dir, 183 output_dir,
151 csv_path.parent, 184 csv_path.parent,
152 rng, 185 rng,
153 start_index=len(samples) + 1, 186 start_index=len(samples) + 1,
154 ) 187 )
155 ) 188 )
189 _progress(f"built samples: {len(samples)}/{size}")
190 _progress("build edge_short_or_placeholder samples")
156 samples.extend( 191 samples.extend(
157 _build_edge_samples( 192 _build_edge_samples(
158 _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng), 193 _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
...@@ -162,10 +197,12 @@ def generate_eval_set( ...@@ -162,10 +197,12 @@ def generate_eval_set(
162 start_index=len(samples) + 1, 197 start_index=len(samples) + 1,
163 ) 198 )
164 ) 199 )
200 _progress(f"built samples: {len(samples)}/{size}")
165 201
166 if len(samples) < size: 202 if len(samples) < size:
203 _progress(f"top up with negative_same_theme_synthetic samples: {size - len(samples)}")
167 samples.extend( 204 samples.extend(
168 _build_random_unrelated_samples( 205 _build_same_theme_synthetic_samples(
169 size - len(samples), 206 size - len(samples),
170 output_dir, 207 output_dir,
171 csv_path.parent, 208 csv_path.parent,
...@@ -176,7 +213,9 @@ def generate_eval_set( ...@@ -176,7 +213,9 @@ def generate_eval_set(
176 samples = samples[:size] 213 samples = samples[:size]
177 rng.shuffle(samples) 214 rng.shuffle(samples)
178 215
216 _progress(f"write csv: {csv_path}")
179 _write_csv(samples, csv_path, seed=seed) 217 _write_csv(samples, csv_path, seed=seed)
218 _progress("write manifest")
180 manifest = _write_manifest( 219 manifest = _write_manifest(
181 profiles=profiles, 220 profiles=profiles,
182 samples=samples, 221 samples=samples,
...@@ -185,15 +224,21 @@ def generate_eval_set( ...@@ -185,15 +224,21 @@ def generate_eval_set(
185 seed=seed, 224 seed=seed,
186 plan=plan, 225 plan=plan,
187 index_path=index_path, 226 index_path=index_path,
227 eval_index_path=eval_index_path,
228 holdout_count=len(holdout_profiles),
188 ) 229 )
230 _progress("generation complete")
189 return manifest 231 return manifest
190 232
191 233
192 def profile_library(library_dir: Path) -> list[LyricProfile]: 234 def profile_library(library_dir: Path) -> list[LyricProfile]:
193 profiles: list[LyricProfile] = [] 235 profiles: list[LyricProfile] = []
194 for path in iter_lyric_files(library_dir): 236 paths = iter_lyric_files(library_dir)
237 _progress(f"profile library: 0/{len(paths)}")
238 for index, path in enumerate(paths, start=1):
195 record = record_from_file(path, base_dir=library_dir) 239 record = record_from_file(path, base_dir=library_dir)
196 normalized = normalize_lyrics(record.lyrics) 240 raw_text = record.lyrics
241 normalized = normalize_lyrics(raw_text)
197 lines = normalized.primary_lines or normalized.unique_lines 242 lines = normalized.primary_lines or normalized.unique_lines
198 line_count = len(lines) 243 line_count = len(lines)
199 normalized_text = fingerprint_text(normalized) or normalized.normalized_full_text 244 normalized_text = fingerprint_text(normalized) or normalized.normalized_full_text
...@@ -202,6 +247,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]: ...@@ -202,6 +247,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
202 LyricProfile( 247 LyricProfile(
203 path=path, 248 path=path,
204 record_id=record.record_id, 249 record_id=record.record_id,
250 raw_text=raw_text,
205 title=record.title or "", 251 title=record.title or "",
206 artist=record.artist or "", 252 artist=record.artist or "",
207 normalized=normalized, 253 normalized=normalized,
...@@ -214,6 +260,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]: ...@@ -214,6 +260,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
214 has_translation=bool(normalized.translation_lines), 260 has_translation=bool(normalized.translation_lines),
215 ) 261 )
216 ) 262 )
263 _progress_count("profile library", index, len(paths), step=5000)
217 return profiles 264 return profiles
218 265
219 266
...@@ -283,6 +330,31 @@ def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Ran ...@@ -283,6 +330,31 @@ def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Ran
283 return selected 330 return selected
284 331
285 332
def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Draw a stratified sample of at most ``len(profiles)`` profiles.

    The requested ``count`` is capped at the population size before being
    handed to ``_stratified_sample``.

    NOTE(review): the "unique" guarantee relies on ``_stratified_sample``
    not repeating profiles when asked for no more than ``len(profiles)``
    items — confirm against that helper.

    Args:
        profiles: Candidate profiles to sample from.
        count: Number of profiles requested.
        rng: Random source forwarded to ``_stratified_sample``.

    Returns:
        The sampled profiles, or an empty list when ``profiles`` is empty
        or ``count`` is not positive.
    """
    if not profiles or count <= 0:
        return []
    capped = min(count, len(profiles))
    return _stratified_sample(profiles, capped, rng)
337
338
def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Build a fresh duplicate-check index from ``profiles`` and save it.

    Each profile is added with its already-computed normalized lyrics, so
    the raw text is not normalized a second time. Empty title/artist
    strings are stored as ``None``.

    Args:
        profiles: Profiles to index; the caller is expected to have removed
            holdout profiles already.
        index_path: Destination file; parent directories are created when
            missing.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    checker = DuplicateChecker()
    profile_total = len(profiles)
    for position, profile in enumerate(profiles, start=1):
        record = LyricRecord(
            record_id=profile.record_id,
            lyrics=profile.raw_text,
            title=profile.title or None,
            artist=profile.artist or None,
        )
        checker.add_normalized_record(record, profile.normalized)
        _progress_count("build eval index", position, profile_total, step=5000)

    # Create the target directory only after indexing succeeded.
    index_path.parent.mkdir(parents=True, exist_ok=True)
    checker.save(index_path)
356
357
286 def _build_positive_samples( 358 def _build_positive_samples(
287 profiles: list[LyricProfile], 359 profiles: list[LyricProfile],
288 output_dir: Path, 360 output_dir: Path,
...@@ -293,7 +365,7 @@ def _build_positive_samples( ...@@ -293,7 +365,7 @@ def _build_positive_samples(
293 ) -> list[GeneratedSample]: 365 ) -> list[GeneratedSample]:
294 samples: list[GeneratedSample] = [] 366 samples: list[GeneratedSample] = []
295 for offset, profile in enumerate(profiles): 367 for offset, profile in enumerate(profiles):
296 raw = read_lyric_file(profile.path) 368 raw = profile.raw_text
297 lines = _content_lines(raw) 369 lines = _content_lines(raw)
298 variants = [ 370 variants = [
299 ("positive_exact_copy", raw), 371 ("positive_exact_copy", raw),
...@@ -308,80 +380,62 @@ def _build_positive_samples( ...@@ -308,80 +380,62 @@ def _build_positive_samples(
308 index = start_index + offset 380 index = start_index + offset
309 path = _write_sample_file(output_dir, f"pos_{index:05d}_{sample_type}.txt", text) 381 path = _write_sample_file(output_dir, f"pos_{index:05d}_{sample_type}.txt", text)
310 samples.append(_sample_from_profile(index, path, csv_base, "应去重", sample_type, profile)) 382 samples.append(_sample_from_profile(index, path, csv_base, "应去重", sample_type, profile))
383 _progress_count("positive_full_duplicate", len(samples), len(profiles))
311 return samples 384 return samples
312 385
313 386
def _build_holdout_full_song_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Emit one "不应去重" (must-not-dedup) sample per held-out profile.

    Each profile's ``raw_text`` is handed unchanged to ``_write_sample_file``
    under the name ``neg_<index>_negative_real_holdout_full_song.txt``, and the
    matching CSV row is built with ``_sample_from_profile``.

    Args:
        profiles: Held-out lyric profiles, processed in the given order.
        output_dir: Directory passed to ``_write_sample_file``.
        csv_base: Base path passed to ``_sample_from_profile``.
        start_index: Sample number assigned to the first profile; later
            profiles get consecutive numbers.

    Returns:
        The generated samples, in the same order as ``profiles``.
    """
    sample_type = "negative_real_holdout_full_song"
    _progress("build negative_real_holdout_full_song samples")
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        path = _write_sample_file(
            output_dir,
            f"neg_{index:05d}_{sample_type}.txt",
            profile.raw_text,
        )
        samples.append(
            _sample_from_profile(
                index,
                path,
                csv_base,
                "不应去重",
                sample_type,
                profile,
                notes="full real lyric held out from the generated eval index",
            )
        )
        # NOTE(review): assumed to report per iteration, like the index-based
        # progress calls elsewhere in this file - confirm against the repo.
        _progress_count(sample_type, len(samples), total)
    return samples
338 413
339 414
def _build_same_theme_synthetic_samples(
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Emit ``count`` synthetic "不应去重" (must-not-dedup) samples.

    The text of every sample comes from ``_same_theme_synthetic(index, rng)``
    and is stored through ``_write_sample_file`` as
    ``neg_<index>_negative_same_theme_synthetic.txt``.

    Args:
        count: Number of samples to generate; zero or less yields no samples.
        output_dir: Directory passed to ``_write_sample_file``.
        csv_base: Base path the recorded ``file`` value is made relative to.
        rng: Random source forwarded to ``_same_theme_synthetic``.
        start_index: Sample number of the first generated sample.

    Returns:
        The generated samples, numbered consecutively from ``start_index``.
    """
    sample_type = "negative_same_theme_synthetic"
    samples: list[GeneratedSample] = []
    for index in range(start_index, start_index + count):
        text = _same_theme_synthetic(index, rng)
        path = _write_sample_file(output_dir, f"neg_{index:05d}_{sample_type}.txt", text)
        samples.append(
            GeneratedSample(
                sample_id=f"sample-{index:05d}",
                file=str(path.relative_to(csv_base)),
                expected="不应去重",
                sample_type=sample_type,
                source="synthetic",
                notes="same-theme synthetic full lyric not copied from library",
            )
        )
        # NOTE(review): assumed to report per iteration - confirm against the repo.
        _progress_count(sample_type, len(samples), count)
    return samples
386 440
387 441
...@@ -410,6 +464,7 @@ def _build_fragment_samples( ...@@ -410,6 +464,7 @@ def _build_fragment_samples(
410 notes="partial lyric fragment only", 464 notes="partial lyric fragment only",
411 ) 465 )
412 ) 466 )
467 _progress_count("negative_fragment", len(samples), len(profiles))
413 return samples 468 return samples
414 469
415 470
...@@ -447,6 +502,7 @@ def _build_shared_chorus_samples( ...@@ -447,6 +502,7 @@ def _build_shared_chorus_samples(
447 notes="shared repeated lines with new surrounding content", 502 notes="shared repeated lines with new surrounding content",
448 ) 503 )
449 ) 504 )
505 _progress_count("negative_shared_chorus", len(samples), len(profiles))
450 return samples 506 return samples
451 507
452 508
...@@ -478,6 +534,7 @@ def _build_translation_only_samples( ...@@ -478,6 +534,7 @@ def _build_translation_only_samples(
478 notes="translation-like text without matching original lyric", 534 notes="translation-like text without matching original lyric",
479 ) 535 )
480 ) 536 )
537 _progress_count("negative_translation_only", len(samples), len(profiles))
481 return samples 538 return samples
482 539
483 540
...@@ -511,6 +568,7 @@ def _build_edge_samples( ...@@ -511,6 +568,7 @@ def _build_edge_samples(
511 notes=notes, 568 notes=notes,
512 ) 569 )
513 ) 570 )
571 _progress_count("edge_short_or_placeholder", len(samples), len(profiles))
514 return samples 572 return samples
515 573
516 574
...@@ -598,13 +656,17 @@ def _write_manifest( ...@@ -598,13 +656,17 @@ def _write_manifest(
598 seed: int, 656 seed: int,
599 plan: dict[str, int], 657 plan: dict[str, int],
600 index_path: Path | None, 658 index_path: Path | None,
659 eval_index_path: Path,
660 holdout_count: int,
601 ) -> dict[str, object]: 661 ) -> dict[str, object]:
602 manifest = { 662 manifest = {
603 "seed": seed, 663 "seed": seed,
604 "library_files": len(profiles), 664 "library_files": len(profiles),
605 "sample_size": len(samples), 665 "sample_size": len(samples),
606 "plan": plan, 666 "plan": plan,
607 "index": str(index_path) if index_path else "", 667 "source_index": str(index_path) if index_path else "",
668 "eval_index": str(eval_index_path),
669 "holdout_records": holdout_count,
608 "lyrics_dir": str(output_dir), 670 "lyrics_dir": str(output_dir),
609 "csv": str(csv_path), 671 "csv": str(csv_path),
610 "manifest": str(csv_path.with_suffix(csv_path.suffix + ".manifest.json")), 672 "manifest": str(csv_path.with_suffix(csv_path.suffix + ".manifest.json")),
......
...@@ -4,8 +4,9 @@ This script is intended for the recurring workflow after adding files to ...@@ -4,8 +4,9 @@ This script is intended for the recurring workflow after adding files to
4 ``data/library``: 4 ``data/library``:
5 5
6 1. Move pure-music placeholder lyric files out of the active library. 6 1. Move pure-music placeholder lyric files out of the active library.
7 2. Rebuild the duplicate-checking index. 7 2. Move duplicate lyric files out of the active library.
8 3. Optionally regenerate and evaluate a synthetic regression set. 8 3. Rebuild the duplicate-checking index from retained files.
9 4. Optionally regenerate and evaluate a production-style eval set.
9 """ 10 """
10 11
11 from __future__ import annotations 12 from __future__ import annotations
...@@ -15,6 +16,7 @@ import csv ...@@ -15,6 +16,7 @@ import csv
15 import json 16 import json
16 import shutil 17 import shutil
17 import sys 18 import sys
19 from dataclasses import dataclass
18 from datetime import datetime 20 from datetime import datetime
19 from pathlib import Path 21 from pathlib import Path
20 22
...@@ -23,11 +25,14 @@ if str(PROJECT_ROOT) not in sys.path: ...@@ -23,11 +25,14 @@ if str(PROJECT_ROOT) not in sys.path:
23 sys.path.insert(0, str(PROJECT_ROOT)) 25 sys.path.insert(0, str(PROJECT_ROOT))
24 26
25 from lyric_dedup.checker import DuplicateChecker 27 from lyric_dedup.checker import DuplicateChecker
28 from lyric_dedup.checker import DuplicateDecision
29 from lyric_dedup.checker import LyricRecord
26 from lyric_dedup.cli import evaluate_csv 30 from lyric_dedup.cli import evaluate_csv
27 from lyric_dedup.eval_dataset import generate_eval_set 31 from lyric_dedup.eval_dataset import generate_eval_set
28 from lyric_dedup.file_import import iter_lyric_files 32 from lyric_dedup.file_import import iter_lyric_files
29 from lyric_dedup.file_import import read_lyric_file 33 from lyric_dedup.file_import import read_lyric_file
30 from lyric_dedup.file_import import records_from_dir 34 from lyric_dedup.file_import import record_from_file
35 from lyric_dedup.normalization import NormalizedLyrics
31 from lyric_dedup.normalization import normalize_lyrics 36 from lyric_dedup.normalization import normalize_lyrics
32 37
33 38
...@@ -37,13 +42,25 @@ PLACEHOLDER_MARKERS = ( ...@@ -37,13 +42,25 @@ PLACEHOLDER_MARKERS = (
37 ) 42 )
38 43
39 44
@dataclass(frozen=True)
class LibraryProfile:
    """Immutable per-file snapshot so each lyric file is read and normalized once."""

    # Location of the lyric file on disk.
    path: Path
    # Record built from the file by ``record_from_file``.
    record: LyricRecord
    # Result of ``normalize_lyrics`` for the record's lyrics.
    normalized: NormalizedLyrics
    # Number of primary lines, or of unique lines when there are no primary lines.
    line_count: int
    # Length of the normalized full text.
    char_count: int
52
53
40 def main() -> None: 54 def main() -> None:
41 parser = argparse.ArgumentParser(description="Process lyric library additions.") 55 parser = argparse.ArgumentParser(description="Process lyric library additions.")
42 parser.add_argument("--library-dir", default="data/library") 56 parser.add_argument("--library-dir", default="data/library")
43 parser.add_argument("--index", default="outputs/indexes/library_lyrics.pkl") 57 parser.add_argument("--index", default="outputs/indexes/library_lyrics.pkl")
44 parser.add_argument("--quarantine-dir", default="data/quarantine/no_lyrics_placeholders") 58 parser.add_argument("--quarantine-dir", default="data/quarantine/no_lyrics_placeholders")
59 parser.add_argument("--duplicate-quarantine-dir", default="data/quarantine/duplicates")
45 parser.add_argument("--dry-run", action="store_true", help="Only report placeholder files; do not move or write outputs.") 60 parser.add_argument("--dry-run", action="store_true", help="Only report placeholder files; do not move or write outputs.")
46 parser.add_argument("--delete-placeholders", action="store_true", help="Delete matched placeholder files instead of moving them.") 61 parser.add_argument("--delete-placeholders", action="store_true", help="Delete matched placeholder files instead of moving them.")
62 parser.add_argument("--delete-duplicates", action="store_true", help="Delete duplicate lyric files instead of moving them.")
63 parser.add_argument("--skip-library-dedup", action="store_true", help="Skip internal duplicate cleanup before rebuilding the index.")
47 parser.add_argument("--eval-size", type=int, default=0, help="Generate and evaluate this many synthetic samples. 0 disables eval.") 64 parser.add_argument("--eval-size", type=int, default=0, help="Generate and evaluate this many synthetic samples. 0 disables eval.")
48 parser.add_argument("--positive-ratio", type=float, default=0.2) 65 parser.add_argument("--positive-ratio", type=float, default=0.2)
49 parser.add_argument("--eval-dir", default="data/generated_eval/incoming") 66 parser.add_argument("--eval-dir", default="data/generated_eval/incoming")
...@@ -54,13 +71,18 @@ def main() -> None: ...@@ -54,13 +71,18 @@ def main() -> None:
54 71
55 library_dir = Path(args.library_dir) 72 library_dir = Path(args.library_dir)
56 quarantine_dir = Path(args.quarantine_dir) 73 quarantine_dir = Path(args.quarantine_dir)
74 duplicate_quarantine_dir = Path(args.duplicate_quarantine_dir)
57 report_path = Path(args.report) 75 report_path = Path(args.report)
58 76
59 files_before = iter_lyric_files(library_dir) 77 files_before = iter_lyric_files(library_dir)
60 placeholders = _find_placeholder_files(library_dir) 78 placeholders = _find_placeholder_files(library_dir)
61 short_effective = _effective_line_report(library_dir) 79 duplicate_report_path = report_path.with_suffix(".duplicates.csv")
62 80
63 moved_or_deleted: list[str] = [] 81 moved_or_deleted: list[str] = []
82 duplicate_actions: list[str] = []
83 duplicate_rows: list[dict[str, object]] = []
84 short_effective: dict[str, int]
85 retained_count = 0
64 if not args.dry_run: 86 if not args.dry_run:
65 moved_or_deleted = _handle_placeholders( 87 moved_or_deleted = _handle_placeholders(
66 placeholders, 88 placeholders,
...@@ -68,9 +90,25 @@ def main() -> None: ...@@ -68,9 +90,25 @@ def main() -> None:
68 quarantine_dir=quarantine_dir, 90 quarantine_dir=quarantine_dir,
69 delete=args.delete_placeholders, 91 delete=args.delete_placeholders,
70 ) 92 )
71 _build_index(library_dir, Path(args.index)) 93 if args.skip_library_dedup:
94 profiles = _profile_library(library_dir)
95 short_effective = _effective_line_report_from_profiles(profiles)
96 retained_count = _build_index_from_profiles(profiles, Path(args.index))
97 else:
98 profiles = _profile_library(library_dir)
99 short_effective = _effective_line_report_from_profiles(profiles)
100 retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
101 profiles,
102 library_dir=library_dir,
103 index_path=Path(args.index),
104 duplicate_quarantine_dir=duplicate_quarantine_dir,
105 delete=args.delete_duplicates,
106 dry_run=False,
107 )
108 _write_duplicate_report(duplicate_rows, duplicate_report_path)
72 109
73 if args.eval_size > 0: 110 if args.eval_size > 0:
111 eval_index_path = Path(args.eval_csv).with_suffix(".index.pkl")
74 generate_eval_set( 112 generate_eval_set(
75 library_dir=library_dir, 113 library_dir=library_dir,
76 output_dir=Path(args.eval_dir), 114 output_dir=Path(args.eval_dir),
...@@ -78,9 +116,10 @@ def main() -> None: ...@@ -78,9 +116,10 @@ def main() -> None:
78 size=args.eval_size, 116 size=args.eval_size,
79 positive_ratio=args.positive_ratio, 117 positive_ratio=args.positive_ratio,
80 index_path=Path(args.index), 118 index_path=Path(args.index),
119 eval_index_path=eval_index_path,
81 ) 120 )
82 evaluate_csv( 121 evaluate_csv(
83 Path(args.index), 122 eval_index_path,
84 Path(args.eval_csv), 123 Path(args.eval_csv),
85 Path(args.eval_out), 124 Path(args.eval_out),
86 base_dir=Path(args.eval_csv).parent, 125 base_dir=Path(args.eval_csv).parent,
...@@ -88,13 +127,27 @@ def main() -> None: ...@@ -88,13 +127,27 @@ def main() -> None:
88 max_candidates=5, 127 max_candidates=5,
89 ) 128 )
90 evaluate_csv( 129 evaluate_csv(
91 Path(args.index), 130 eval_index_path,
92 Path(args.eval_csv), 131 Path(args.eval_csv),
93 Path(args.eval_out).with_name(Path(args.eval_out).stem + "_review_positive.csv"), 132 Path(args.eval_out).with_name(Path(args.eval_out).stem + "_review_positive.csv"),
94 base_dir=Path(args.eval_csv).parent, 133 base_dir=Path(args.eval_csv).parent,
95 positive_decisions={"duplicate", "review"}, 134 positive_decisions={"duplicate", "review"},
96 max_candidates=5, 135 max_candidates=5,
97 ) 136 )
137 else:
138 profiles = _profile_library(library_dir)
139 short_effective = _effective_line_report_from_profiles(profiles)
140 if not args.skip_library_dedup:
141 retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
142 profiles,
143 library_dir=library_dir,
144 index_path=Path(args.index),
145 duplicate_quarantine_dir=duplicate_quarantine_dir,
146 delete=args.delete_duplicates,
147 dry_run=True,
148 )
149 else:
150 retained_count = len(profiles)
98 151
99 report = { 152 report = {
100 "timestamp": datetime.now().isoformat(timespec="seconds"), 153 "timestamp": datetime.now().isoformat(timespec="seconds"),
...@@ -104,11 +157,18 @@ def main() -> None: ...@@ -104,11 +157,18 @@ def main() -> None:
104 "placeholder_matches": len(placeholders), 157 "placeholder_matches": len(placeholders),
105 "placeholder_files": [str(path) for path in placeholders], 158 "placeholder_files": [str(path) for path in placeholders],
106 "handled_placeholder_files": moved_or_deleted, 159 "handled_placeholder_files": moved_or_deleted,
160 "library_dedup_skipped": args.skip_library_dedup,
161 "duplicate_matches": len(duplicate_rows),
162 "duplicate_report": str(duplicate_report_path) if duplicate_rows else "",
163 "handled_duplicate_files": duplicate_actions[:1000],
164 "handled_duplicate_files_truncated": len(duplicate_actions) > 1000,
165 "retained_index_records": retained_count,
107 "files_after": len(iter_lyric_files(library_dir)), 166 "files_after": len(iter_lyric_files(library_dir)),
108 "index": str(args.index), 167 "index": str(args.index),
109 "eval_size": args.eval_size, 168 "eval_size": args.eval_size,
110 "eval_csv": str(args.eval_csv) if args.eval_size > 0 else "", 169 "eval_csv": str(args.eval_csv) if args.eval_size > 0 else "",
111 "eval_out": str(args.eval_out) if args.eval_size > 0 else "", 170 "eval_out": str(args.eval_out) if args.eval_size > 0 else "",
171 "eval_index": str(Path(args.eval_csv).with_suffix(".index.pkl")) if args.eval_size > 0 else "",
112 "short_effective_line_counts": short_effective, 172 "short_effective_line_counts": short_effective,
113 } 173 }
114 174
...@@ -154,15 +214,133 @@ def _handle_placeholders( ...@@ -154,15 +214,133 @@ def _handle_placeholders(
154 return handled 214 return handled
155 215
156 216
def _profile_library(library_dir: Path) -> list[LibraryProfile]:
    """Read and normalize every lyric file under ``library_dir`` exactly once.

    Args:
        library_dir: Root of the active library; also used as ``base_dir``
            when building each record.

    Returns:
        One ``LibraryProfile`` per file, in the order ``iter_lyric_files``
        yields them.
    """
    files = iter_lyric_files(library_dir)
    total = len(files)
    _progress(f"profile active library: 0/{total}")

    profiles: list[LibraryProfile] = []
    for position, path in enumerate(files, start=1):
        record = record_from_file(path, base_dir=library_dir)
        normalized = normalize_lyrics(record.lyrics)
        # Fall back to unique lines when no primary lines were extracted.
        effective_lines = normalized.primary_lines or normalized.unique_lines
        profiles.append(
            LibraryProfile(
                path=path,
                record=record,
                normalized=normalized,
                line_count=len(effective_lines),
                char_count=len(normalized.normalized_full_text),
            )
        )
        _progress_count("profile active library", position, total, step=5000)
    return profiles
237
238
def _build_index_from_profiles(profiles: list[LibraryProfile], index_path: Path) -> int:
    """Index every profile without deduplication and save the result.

    Args:
        profiles: Profiles whose already-normalized lyrics are added as-is.
        index_path: Destination file; missing parent directories are created.

    Returns:
        The checker's ``record_count`` after all profiles were added.
    """
    checker = DuplicateChecker()
    total = len(profiles)
    for position, profile in enumerate(profiles, start=1):
        # Reuse the cached normalization instead of normalizing again.
        checker.add_normalized_record(profile.record, profile.normalized)
        _progress_count("build index", position, total, step=5000)
    index_path.parent.mkdir(parents=True, exist_ok=True)
    checker.save(index_path)
    return checker.record_count
247
248
def _duplicate_report_row(profile: LibraryProfile, result, best) -> dict[str, object]:
    """Flatten one duplicate verdict and its best candidate into a report row."""
    return {
        "duplicate_path": str(profile.path),
        "duplicate_record_id": profile.record.record_id,
        "kept_record_id": best.record_id,
        "decision": result.decision.value,
        "confidence": result.confidence,
        "reason": result.reason,
        "best_candidate_jaccard": best.jaccard,
        "best_candidate_line_coverage": best.line_coverage,
        "best_candidate_primary_jaccard": best.primary_jaccard,
        "best_candidate_primary_line_coverage": best.primary_line_coverage,
        "matched_unique_lines": " | ".join(best.matched_unique_lines),
        "line_count": profile.line_count,
        "char_count": profile.char_count,
    }


def _deduplicate_and_build_index(
    profiles: list[LibraryProfile],
    *,
    library_dir: Path,
    index_path: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
    dry_run: bool,
) -> tuple[int, list[dict[str, object]], list[str]]:
    """Drop internal duplicates from the library and index what is retained.

    Profiles are visited in ``_profile_quality_key`` order, so the preferred
    copy of a lyric is indexed first and later copies are checked against it.
    A profile counts as a duplicate only when the checker's decision is
    ``DUPLICATE`` and it returned at least one candidate; every other profile
    is added to the index.

    Args:
        profiles: Profiles of the active library.
        library_dir: Library root, forwarded to ``_handle_duplicate_file``.
        index_path: Where the index is saved (skipped on dry runs).
        duplicate_quarantine_dir: Quarantine root for moved duplicates.
        delete: Delete duplicate files instead of moving them.
        dry_run: Only report duplicates; touch no files and save no index.

    Returns:
        ``(record_count, duplicate_rows, duplicate_actions)``: the checker's
        record count, one report row per duplicate, and one action string per
        handled file (empty on dry runs).
    """
    checker = DuplicateChecker()
    duplicate_rows: list[dict[str, object]] = []
    duplicate_actions: list[str] = []

    ordered = sorted(profiles, key=_profile_quality_key)
    total = len(ordered)
    _progress(f"deduplicate active library: 0/{total}")

    for position, profile in enumerate(ordered, start=1):
        result = checker.check_record(profile.record, max_candidates=1)
        best = result.candidates[0] if result.candidates else None
        if result.decision == DuplicateDecision.DUPLICATE and best is not None:
            duplicate_rows.append(_duplicate_report_row(profile, result, best))
            if not dry_run:
                duplicate_actions.append(
                    _handle_duplicate_file(
                        profile.path,
                        library_dir=library_dir,
                        duplicate_quarantine_dir=duplicate_quarantine_dir,
                        delete=delete,
                    )
                )
        else:
            # Retained: make it visible to the checks of the remaining profiles.
            checker.add_normalized_record(profile.record, profile.normalized)
        _progress_count("deduplicate active library", position, total, step=5000)

    if not dry_run:
        index_path.parent.mkdir(parents=True, exist_ok=True)
        checker.save(index_path)
    return checker.record_count, duplicate_rows, duplicate_actions
301
302
def _handle_duplicate_file(
    path: Path,
    *,
    library_dir: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
) -> str:
    """Delete a duplicate lyric file or move it into the quarantine tree.

    A moved file keeps its path relative to ``library_dir`` underneath
    ``duplicate_quarantine_dir``. If that destination is already taken, a
    ``_YYYYmmddHHMMSS`` timestamp is appended to the stem; if the timestamped
    name is taken as well, a numeric suffix is added so that an earlier
    quarantined file is never overwritten.

    Args:
        path: The duplicate file.
        library_dir: Library root that ``path`` must live under.
        duplicate_quarantine_dir: Root of the quarantine tree.
        delete: Remove the file instead of moving it.

    Returns:
        ``"deleted:<path>"`` or ``"moved:<path>-><destination>"``.

    Raises:
        ValueError: If ``path`` does not resolve to a location under
            ``library_dir`` (only checked when moving).
    """
    if delete:
        path.unlink()
        return f"deleted:{path}"

    duplicate_quarantine_dir.mkdir(parents=True, exist_ok=True)
    relative = path.resolve().relative_to(library_dir.resolve())
    destination = duplicate_quarantine_dir / relative
    destination.parent.mkdir(parents=True, exist_ok=True)

    if destination.exists():
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        stamped = destination.with_name(f"{destination.stem}_{stamp}{destination.suffix}")
        destination = stamped
        # The timestamp has one-second resolution; keep probing so a second
        # collision inside the same second cannot clobber a quarantined file.
        attempt = 1
        while destination.exists():
            destination = stamped.with_name(f"{stamped.stem}_{attempt}{stamped.suffix}")
            attempt += 1

    shutil.move(str(path), str(destination))
    return f"moved:{path}->{destination}"
321
322
def _profile_quality_key(profile: LibraryProfile) -> tuple[int, int, int, str]:
    """Return an ascending sort key that places the preferred copy first.

    Order of preference: file names that do not start with ``"None_"``, then
    more effective lines, then more normalized characters, and finally the
    path string as a deterministic tie-breaker.
    """
    # 1 demotes "None_"-prefixed file names behind all others.
    name_penalty = int(profile.path.name.startswith("None_"))
    # Counts are negated because the sort is ascending: larger counts win.
    return (name_penalty, -profile.line_count, -profile.char_count, str(profile.path))
327
328
def _write_duplicate_report(rows: list[dict[str, object]], report_path: Path) -> None:
    """Write the duplicate rows to ``report_path`` as UTF-8 CSV.

    Nothing is written, and no directory is created, when ``rows`` is empty.
    The header is taken from the keys of the first row.

    Args:
        rows: Report rows; all rows are expected to share the first row's keys.
        report_path: Destination file; missing parent directories are created.
    """
    if not rows:
        return
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with report_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)
163 337
164 338
def _effective_line_report(library_dir: Path) -> dict[str, int]:
    """Profile ``library_dir`` and return its effective-line bucket counts.

    Convenience wrapper for callers that only have a directory; callers that
    already hold profiles can use ``_effective_line_report_from_profiles``
    directly and avoid profiling the library a second time.
    """
    profiles = _profile_library(library_dir)
    return _effective_line_report_from_profiles(profiles)
341
342
343 def _effective_line_report_from_profiles(profiles: list[LibraryProfile]) -> dict[str, int]:
166 buckets = { 344 buckets = {
167 "total": 0, 345 "total": 0,
168 "zero_effective_lines": 0, 346 "zero_effective_lines": 0,
...@@ -170,10 +348,9 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]: ...@@ -170,10 +348,9 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
170 "four_to_five_effective_lines": 0, 348 "four_to_five_effective_lines": 0,
171 "six_plus_effective_lines": 0, 349 "six_plus_effective_lines": 0,
172 } 350 }
173 for path in iter_lyric_files(library_dir): 351 for profile in profiles:
174 buckets["total"] += 1 352 buckets["total"] += 1
175 normalized = normalize_lyrics(read_lyric_file(path)) 353 line_count = profile.line_count
176 line_count = len(normalized.primary_lines or normalized.unique_lines)
177 if line_count == 0: 354 if line_count == 0:
178 buckets["zero_effective_lines"] += 1 355 buckets["zero_effective_lines"] += 1
179 elif line_count <= 3: 356 elif line_count <= 3:
...@@ -185,5 +362,16 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]: ...@@ -185,5 +362,16 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
185 return buckets 362 return buckets
186 363
187 364
def _progress(message: str) -> None:
    """Print a ``[process-library]``-prefixed status line to stderr, flushed at once."""
    print(f"[process-library] {message}", file=sys.stderr, flush=True)
367
368
def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``label: current/total`` for the first, the last and every ``step``-th item.

    Args:
        label: Text placed in front of the counter.
        current: 1-based position of the item just processed.
        total: Number of items overall; nothing is reported when it is not positive.
        step: Reporting interval. A non-positive value disables the periodic
            reports (first and last are still reported) instead of failing on
            the modulo.
    """
    if total <= 0:
        return
    on_step = step > 0 and current % step == 0
    if current == 1 or current == total or on_step:
        _progress(f"{label}: {current}/{total}")
374
375
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
......
...@@ -308,9 +308,11 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None: ...@@ -308,9 +308,11 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None:
308 assert manifest["library_files"] == 12 308 assert manifest["library_files"] == 12
309 assert manifest["sample_size"] == 30 309 assert manifest["sample_size"] == 30
310 assert manifest["unique_source_records"] > 1 310 assert manifest["unique_source_records"] > 1
311 assert manifest["holdout_records"] > 1
312 assert (tmp_path / "generated" / "eval.csv.index.pkl").exists()
311 assert "positive_full_duplicate" in manifest["plan"] 313 assert "positive_full_duplicate" in manifest["plan"]
314 assert "negative_real_holdout_full_song" in negative_types
312 assert "negative_fragment" in negative_types 315 assert "negative_fragment" in negative_types
313 assert "negative_hard_candidate" in negative_types
314 assert all(row["expected"] == "不应去重" for row in rows if row["sample_type"].startswith("negative_")) 316 assert all(row["expected"] == "不应去重" for row in rows if row["sample_type"].startswith("negative_"))
315 317
316 318
......