添加测试集内部去重

沈秋雨
Commit ba39ce6a ... ba39ce6aa50b5bc45d24bc32bcbc7c51b870922a authored 2026-06-03 11:25:52 +0800 by 沈秋雨
Showing 7 changed files with 469 additions and 136 deletions
README.md
TEST_WORKFLOW.md
lyric_dedup/checker.py
lyric_dedup/cli.py
lyric_dedup/eval_dataset.py
scripts/process_library.py
tests/test_lyric_dedup.py
--- a/README.md
View file @ba39ce6
+++ b/README.md
View file @ba39ce6
@@ -80,6 +80,7 @@ python -m lyric_dedup.cli generate-eval-set \
  --lyrics-dir data/generated_eval/incoming \
  --csv data/generated_eval/eval_50000.csv \
  --index outputs/indexes/lyrics.pkl \
+  --eval-index data/generated_eval/eval_50000.csv.index.pkl \
  --size 50000 \
  --positive-ratio 0.3
 ```
@@ -88,10 +89,10 @@ python -m lyric_dedup.cli generate-eval-set \

 - 先扫描整个曲库，按有效歌词行数、语言类型、文件来源前缀做分层采样，不再按排序前缀取样。
 - `应去重` 样本只生成全曲歌词的样式变化，例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。
- `不应去重` 样本包含同主题新歌词、hard negative、片段歌词、重复副歌碰撞、仅翻译相似、短歌词/占位边界样本。
+- `不应去重` 样本以真实 holdout 完整歌词为主，也包含片段歌词、重复副歌碰撞、仅翻译相似、同主题新歌词、短歌词/占位边界样本。
 - 片段歌词即使命中已有歌曲的一部分，也不应该输出 `duplicate`；最多进入 `review`。
- 如果传入 `--index`，生成器会用现有索引构造更接近线上召回风险的 hard negative。
- 同时会生成 `*.manifest.json`，记录 seed、曲库规模、样本类型分布、语言/来源分桶和样本来源覆盖数。
+- 生成器会额外写出 `--eval-index`，这个索引排除了 holdout 歌，评估生成 CSV 时应使用它。
+- 同时会生成 `*.manifest.json`，记录 seed、曲库规模、holdout 数、样本类型分布、语言/来源分桶和样本来源覆盖数。

 先准备一个 CSV，例如 `data/eval/eval.csv`：

--- a/TEST_WORKFLOW.md
View file @ba39ce6
+++ b/TEST_WORKFLOW.md
View file @ba39ce6
@@ -103,6 +103,7 @@ python -m lyric_dedup.cli generate-eval-set \
  --lyrics-dir data/generated_eval/incoming \
  --csv data/generated_eval/eval_50000.csv \
  --index outputs/indexes/library_lyrics.pkl \
+  --eval-index data/generated_eval/eval_50000.csv.index.pkl \
  --size 50000 \
  --positive-ratio 0.3
 ```
@@ -120,24 +121,26 @@ python -m lyric_dedup.cli generate-eval-set \

 ```text
 positive_* = 应去重，全曲歌词样式变化
-negative_random_unrelated = 不应去重，同主题新歌词
-negative_hard_candidate = 不应去重，系统容易召回的短句/局部重合样本
+negative_real_holdout_full_song = 不应去重，完整真实歌词，已从评估索引中排除
 negative_fragment = 不应去重，单曲片段
 negative_shared_chorus = 不应去重，重复副歌碰撞
 negative_translation_only = 不应去重，仅翻译相似
+negative_same_theme_synthetic = 不应去重，同主题新歌词
 edge_short_or_placeholder = 不应去重，短歌词/占位边界样本
 ```

-生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。传入 `--index` 后会用现有索引生成 hard negative。每次还会输出：
+生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。它会分出一批 holdout 完整歌词作为真实新歌负样本，并生成一个排除 holdout 的评估索引。每次还会输出：

 ```text
 data/generated_eval/eval_50000.csv.manifest.json
+data/generated_eval/eval_50000.csv.index.pkl
 ```

 manifest 里重点看：

 ```text
 library_files          曲库歌词文件数
+holdout_records        从评估索引中排除、作为真实新歌负样本的数量
 sample_type_counts     各样本类型数量
 line_count_bucket_counts / language_bucket_counts / source_bucket_counts
 unique_source_records  本次评估覆盖了多少真实源文件
@@ -147,7 +150,7 @@ unique_source_records  本次评估覆盖了多少真实源文件

 ```bash
 python -m lyric_dedup.cli evaluate-csv \
-  --index outputs/indexes/library_lyrics.pkl \
+  --index data/generated_eval/eval_50000.csv.index.pkl \
  --csv data/generated_eval/eval_50000.csv \
  --base-dir data/generated_eval \
  --out outputs/results/library_eval_50000.csv
@@ -171,7 +174,7 @@ false_positive

 ```bash
 python -m lyric_dedup.cli evaluate-csv \
-  --index outputs/indexes/library_lyrics.pkl \
+  --index data/generated_eval/eval_50000.csv.index.pkl \
  --csv data/generated_eval/eval_50000.csv \
  --base-dir data/generated_eval \
  --positive-decisions duplicate,review \
--- a/lyric_dedup/checker.py
View file @ba39ce6
+++ b/lyric_dedup/checker.py
View file @ba39ce6
@@ -96,16 +96,24 @@ class DuplicateChecker:

    def add_record(self, record: LyricRecord) -> None:
        indexed = self._index(record)
-        self._records[record.record_id] = indexed
-        self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record.record_id)
+        self._add_indexed(record.record_id, indexed)
+
+    def add_normalized_record(self, record: LyricRecord, normalized: NormalizedLyrics) -> None:
+        """Add a record when normalized lyrics have already been computed."""
+        indexed = self._index_normalized(record, normalized)  # skips the normalize_lyrics call that the add_record/_index path performs
+        self._add_indexed(record.record_id, indexed)  # same index registration step that add_record uses
+
+    def _add_indexed(self, record_id: str, indexed: _IndexedRecord) -> None:
+        self._records[record_id] = indexed
+        self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record_id)
        for line in indexed.normalized.unique_lines:
            if len(line) >= 4:
-                self._line_to_ids.setdefault(line, set()).add(record.record_id)
+                self._line_to_ids.setdefault(line, set()).add(record_id)
        for token in indexed.tokens:
-            self._token_to_ids.setdefault(token, set()).add(record.record_id)
+            self._token_to_ids.setdefault(token, set()).add(record_id)
        for token in indexed.fallback_tokens:
-            self._token_to_ids.setdefault(token, set()).add(record.record_id)
-        self._lsh.add(record.record_id, indexed.signature)
+            self._token_to_ids.setdefault(token, set()).add(record_id)
+        self._lsh.add(record_id, indexed.signature)

    def save(self, path: str | Path) -> None:
        """Persist the in-memory index for later checks."""
@@ -187,6 +195,9 @@ class DuplicateChecker:

    def _index(self, record: LyricRecord) -> _IndexedRecord:
        normalized = normalize_lyrics(record.lyrics)
+        return self._index_normalized(record, normalized)
+
+    def _index_normalized(self, record: LyricRecord, normalized: NormalizedLyrics) -> _IndexedRecord:
        tokens = lyric_tokens(normalized)
        primary_tokens = lyric_tokens(normalized, lines=normalized.primary_lines)
        translation_tokens = lyric_tokens(normalized, lines=normalized.translation_lines)
--- a/lyric_dedup/cli.py
View file @ba39ce6
+++ b/lyric_dedup/cli.py
View file @ba39ce6
@@ -5,6 +5,7 @@ from __future__ import annotations
 import argparse
 import csv
 import json
+import sys
 from pathlib import Path

 from lyric_dedup.checker import DuplicateChecker
@@ -50,7 +51,8 @@ def main() -> None:
    generate.add_argument("--size", type=int, default=100)
    generate.add_argument("--positive-ratio", type=float, default=0.3)
    generate.add_argument("--seed", type=int, default=20260602)
-    generate.add_argument("--index", default="", help="optional existing index for hard-negative generation")
+    generate.add_argument("--index", default="", help="optional source index path recorded in the manifest")
+    generate.add_argument("--eval-index", default="", help="output index built from non-holdout records for this eval set")

    args = parser.parse_args()
    if args.command == "build-index":
@@ -77,6 +79,7 @@ def main() -> None:
            positive_ratio=args.positive_ratio,
            seed=args.seed,
            index_path=Path(args.index) if args.index else None,
+            eval_index_path=Path(args.eval_index) if args.eval_index else None,
        )
        print(json.dumps(summary, ensure_ascii=False))

@@ -155,52 +158,58 @@ def evaluate_csv(
    positive_decisions: set[str],
    max_candidates: int,
 ) -> None:
+    _progress(f"load index: {index_path}")
    checker = DuplicateChecker.load(index_path)
    rows: list[dict[str, object]] = []
+    total = _csv_data_row_count(csv_path)
+    _progress(f"evaluate csv: 0/{total}")
+    out_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open(encoding="utf-8-sig", newline="") as file:
        reader = csv.DictReader(file)
        if reader.fieldnames is None:
            raise ValueError("评估 CSV 需要表头")
-        for row_number, row in enumerate(reader, start=2):
-            sample_id = row.get("id") or row.get("sample_id") or str(row_number)
-            record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
-            expected_duplicate = _parse_expected(row.get("expected") or row.get("label") or row.get("target"))
-            result = checker.check_record(record, max_candidates=max_candidates)
-            predicted_duplicate = result.decision.value in positive_decisions
-            best = result.candidates[0] if result.candidates else None
-            rows.append(
-                {
-                    "id": sample_id,
-                    "source": source,
-                    "expected_duplicate": expected_duplicate,
-                    "decision": result.decision.value,
-                    "predicted_duplicate": predicted_duplicate,
-                    "correct": expected_duplicate == predicted_duplicate,
-                    "confidence": result.confidence,
-                    "reason": result.reason,
-                    "best_candidate_id": best.record_id if best else "",
-                    "best_candidate_decision": best.decision.value if best else "",
-                    "best_candidate_confidence": best.confidence if best else "",
-                    "best_candidate_jaccard": best.jaccard if best else "",
-                    "best_candidate_line_coverage": best.line_coverage if best else "",
-                    "best_candidate_primary_jaccard": best.primary_jaccard if best else "",
-                    "best_candidate_primary_line_coverage": best.primary_line_coverage if best else "",
-                    "best_candidate_translation_jaccard": best.translation_jaccard if best else "",
-                    "best_candidate_translation_line_coverage": best.translation_line_coverage if best else "",
-                    "best_candidate_reason": best.reason if best else "",
-                    "matched_unique_lines": " | ".join(best.matched_unique_lines) if best else "",
-                }
-            )
-
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    with out_path.open("w", encoding="utf-8", newline="") as file:
-        writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()) if rows else ["id"])
-        writer.writeheader()
-        writer.writerows(rows)
+        fieldnames = [
+            "id",
+            "source",
+            "expected_duplicate",
+            "decision",
+            "predicted_duplicate",
+            "correct",
+            "confidence",
+            "reason",
+            "best_candidate_id",
+            "best_candidate_decision",
+            "best_candidate_confidence",
+            "best_candidate_jaccard",
+            "best_candidate_line_coverage",
+            "best_candidate_primary_jaccard",
+            "best_candidate_primary_line_coverage",
+            "best_candidate_translation_jaccard",
+            "best_candidate_translation_line_coverage",
+            "best_candidate_reason",
+            "matched_unique_lines",
+        ]
+        with out_path.open("w", encoding="utf-8", newline="") as out_file:
+            writer = csv.DictWriter(out_file, fieldnames=fieldnames)
+            writer.writeheader()
+            for index, row in enumerate(reader, start=1):
+                row_out = _evaluate_row(
+                    row,
+                    row_number=index + 1,
+                    checker=checker,
+                    csv_path=csv_path,
+                    base_dir=base_dir,
+                    positive_decisions=positive_decisions,
+                    max_candidates=max_candidates,
+                )
+                rows.append(row_out)
+                writer.writerow(row_out)
+                _progress_count("evaluate csv", index, total, step=1000)

    summary = _evaluation_summary(rows, positive_decisions=positive_decisions, out_path=out_path)
    summary_path = out_path.with_suffix(out_path.suffix + ".summary.json")
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
+    _progress("evaluation complete")
    print(json.dumps(summary, ensure_ascii=False))


@@ -229,6 +238,45 @@ def _result_to_dict(result, *, source: str) -> dict[str, object]:
    }


+def _evaluate_row(  # Check one eval CSV row with the checker and return the corresponding result-CSV row.
+    row: dict[str, str],
+    *,
+    row_number: int,  # only used as the fallback sample id
+    checker: DuplicateChecker,
+    csv_path: Path,
+    base_dir: Path | None,
+    positive_decisions: set[str],  # decision values that count as "predicted duplicate"
+    max_candidates: int,  # forwarded unchanged to checker.check_record
+) -> dict[str, object]:
+    sample_id = row.get("id") or row.get("sample_id") or str(row_number)  # first non-empty of id / sample_id, else the row number
+    record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
+    expected_duplicate = _parse_expected(row.get("expected") or row.get("label") or row.get("target"))  # the label may live in any of these three columns
+    result = checker.check_record(record, max_candidates=max_candidates)
+    predicted_duplicate = result.decision.value in positive_decisions
+    best = result.candidates[0] if result.candidates else None  # first candidate, or None when the checker returned none
+    return {
+        "id": sample_id,
+        "source": source,
+        "expected_duplicate": expected_duplicate,
+        "decision": result.decision.value,
+        "predicted_duplicate": predicted_duplicate,
+        "correct": expected_duplicate == predicted_duplicate,
+        "confidence": result.confidence,
+        "reason": result.reason,
+        "best_candidate_id": best.record_id if best else "",  # every best_candidate_* column is "" when there is no candidate
+        "best_candidate_decision": best.decision.value if best else "",
+        "best_candidate_confidence": best.confidence if best else "",
+        "best_candidate_jaccard": best.jaccard if best else "",
+        "best_candidate_line_coverage": best.line_coverage if best else "",
+        "best_candidate_primary_jaccard": best.primary_jaccard if best else "",
+        "best_candidate_primary_line_coverage": best.primary_line_coverage if best else "",
+        "best_candidate_translation_jaccard": best.translation_jaccard if best else "",
+        "best_candidate_translation_line_coverage": best.translation_line_coverage if best else "",
+        "best_candidate_reason": best.reason if best else "",
+        "matched_unique_lines": " | ".join(best.matched_unique_lines) if best else "",  # matched lines flattened into one cell, separated by " | "
+    }
+
+
 def _lyrics_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[str, str]:
    lyrics = (row.get("lyrics") or "").strip()
    if lyrics:
@@ -322,5 +370,23 @@ def _evaluation_summary(
    }


+def _csv_data_row_count(csv_path: Path) -> int:  # Count the rows that follow the header row of a CSV file.
+    with csv_path.open(encoding="utf-8-sig", newline="") as file:
+        reader = csv.reader(file)
+        next(reader, None)  # skip the header row; the None default avoids StopIteration on an empty file
+        return sum(1 for _ in reader)  # counts every row csv.reader yields after the header
+
+
+def _progress(message: str) -> None:  # Write one "[eval]"-prefixed progress line to stderr.
+    print(f"[eval] {message}", file=sys.stderr, flush=True)  # stderr + flush: shown immediately and kept apart from anything printed on stdout
+
+
+def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:  # Throttled "label: current/total" progress report.
+    if total <= 0:  # no meaningful total to report against
+        return
+    if current == 1 or current == total or current % step == 0:  # report the first item, the last item, and every step-th item
+        _progress(f"{label}: {current}/{total}")
+
+
 if __name__ == "__main__":
    main()
--- a/lyric_dedup/eval_dataset.py
View file @ba39ce6
+++ b/lyric_dedup/eval_dataset.py
View file @ba39ce6
@@ -7,14 +7,14 @@ import hashlib
 import json
 import random
 import re
+import sys
 from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path

 from lyric_dedup.checker import DuplicateChecker
-from lyric_dedup.checker import DuplicateDecision
+from lyric_dedup.checker import LyricRecord
 from lyric_dedup.file_import import iter_lyric_files
-from lyric_dedup.file_import import read_lyric_file
 from lyric_dedup.file_import import record_from_file
 from lyric_dedup.normalization import NormalizedLyrics
 from lyric_dedup.normalization import fingerprint_text
@@ -23,19 +23,31 @@ from lyric_dedup.normalization import normalize_lyrics

 DEFAULT_SAMPLE_MIX = {
    "positive_full_duplicate": 0.30,
-    "negative_random_unrelated": 0.20,
-    "negative_hard_candidate": 0.25,
+    "negative_real_holdout_full_song": 0.40,
    "negative_fragment": 0.10,
    "negative_shared_chorus": 0.05,
    "negative_translation_only": 0.05,
+    "negative_same_theme_synthetic": 0.05,
    "edge_short_or_placeholder": 0.05,
 }


+def _progress(message: str) -> None:  # Write one "[eval-gen]"-prefixed progress line to stderr.
+    print(f"[eval-gen] {message}", file=sys.stderr, flush=True)  # stderr + flush: shown immediately and kept apart from anything printed on stdout
+
+
+def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:  # Throttled "label: current/total" progress report.
+    if total <= 0:  # no meaningful total to report against
+        return
+    if current == 1 or current == total or current % step == 0:  # report the first item, the last item, and every step-th item
+        _progress(f"{label}: {current}/{total}")
+
+
 @dataclass(frozen=True)
 class LyricProfile:
    path: Path
    record_id: str
+    raw_text: str
    title: str
    artist: str
    normalized: NormalizedLyrics
@@ -74,6 +86,7 @@ def generate_eval_set(
    positive_ratio: float = 0.30,
    seed: int = 20260602,
    index_path: Path | None = None,
+    eval_index_path: Path | None = None,
 ) -> dict[str, object]:
    """Generate a stratified production evaluation set.

@@ -83,6 +96,7 @@ def generate_eval_set(
    if size <= 0:
        raise ValueError("size must be positive")

+    _progress(f"start generation: size={size}, positive_ratio={positive_ratio}, seed={seed}")
    rng = random.Random(seed)
    profiles = profile_library(library_dir)
    if not profiles:
@@ -90,13 +104,25 @@ def generate_eval_set(

    output_dir.mkdir(parents=True, exist_ok=True)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
+    _progress(f"clean output dir: {output_dir}")
    _clean_generated_output_dir(output_dir)

-    checker = DuplicateChecker.load(index_path) if index_path else None
    plan = _sample_plan(size, positive_ratio=positive_ratio)
-    groups = _profile_groups(profiles)
+    _progress(f"sample plan: {plan}")
+    holdout_count = min(plan["negative_real_holdout_full_song"], max(1, len(profiles) // 2))
+    holdout_profiles = _stratified_unique_sample(
+        profiles,
+        holdout_count,
+        rng,
+    )
+    holdout_ids = {profile.record_id for profile in holdout_profiles}
+    indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
+    eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
+    _build_eval_index(indexed_profiles, eval_index_path)
+    groups = _profile_groups(indexed_profiles)
    samples: list[GeneratedSample] = []

+    _progress("build positive_full_duplicate samples")
    samples.extend(
        _build_positive_samples(
            _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
@@ -106,53 +132,62 @@ def generate_eval_set(
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build negative_real_holdout_full_song samples")
    samples.extend(
-        _build_random_unrelated_samples(
-            plan["negative_random_unrelated"],
+        _build_holdout_full_song_samples(
+            holdout_profiles,
            output_dir,
            csv_path.parent,
-            rng,
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build negative_fragment samples")
    samples.extend(
-        _build_hard_candidate_samples(
-            groups["normal"],
-            plan["negative_hard_candidate"],
+        _build_fragment_samples(
+            _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
            output_dir,
            csv_path.parent,
            rng,
-            checker=checker,
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build negative_shared_chorus samples")
    samples.extend(
-        _build_fragment_samples(
-            _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
+        _build_shared_chorus_samples(
+            _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
            output_dir,
            csv_path.parent,
            rng,
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build negative_translation_only samples")
    samples.extend(
-        _build_shared_chorus_samples(
-            _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
+        _build_translation_only_samples(
+            _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
            output_dir,
            csv_path.parent,
            rng,
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build negative_same_theme_synthetic samples")
    samples.extend(
-        _build_translation_only_samples(
-            _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
+        _build_same_theme_synthetic_samples(
+            plan["negative_same_theme_synthetic"],
            output_dir,
            csv_path.parent,
            rng,
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")
+    _progress("build edge_short_or_placeholder samples")
    samples.extend(
        _build_edge_samples(
            _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
@@ -162,10 +197,12 @@ def generate_eval_set(
            start_index=len(samples) + 1,
        )
    )
+    _progress(f"built samples: {len(samples)}/{size}")

    if len(samples) < size:
+        _progress(f"top up with negative_same_theme_synthetic samples: {size - len(samples)}")
        samples.extend(
-            _build_random_unrelated_samples(
+            _build_same_theme_synthetic_samples(
                size - len(samples),
                output_dir,
                csv_path.parent,
@@ -176,7 +213,9 @@ def generate_eval_set(
    samples = samples[:size]
    rng.shuffle(samples)

+    _progress(f"write csv: {csv_path}")
    _write_csv(samples, csv_path, seed=seed)
+    _progress("write manifest")
    manifest = _write_manifest(
        profiles=profiles,
        samples=samples,
@@ -185,15 +224,21 @@ def generate_eval_set(
        seed=seed,
        plan=plan,
        index_path=index_path,
+        eval_index_path=eval_index_path,
+        holdout_count=len(holdout_profiles),
    )
+    _progress("generation complete")
    return manifest


 def profile_library(library_dir: Path) -> list[LyricProfile]:
    profiles: list[LyricProfile] = []
-    for path in iter_lyric_files(library_dir):
+    paths = iter_lyric_files(library_dir)
+    _progress(f"profile library: 0/{len(paths)}")
+    for index, path in enumerate(paths, start=1):
        record = record_from_file(path, base_dir=library_dir)
-        normalized = normalize_lyrics(record.lyrics)
+        raw_text = record.lyrics
+        normalized = normalize_lyrics(raw_text)
        lines = normalized.primary_lines or normalized.unique_lines
        line_count = len(lines)
        normalized_text = fingerprint_text(normalized) or normalized.normalized_full_text
@@ -202,6 +247,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
            LyricProfile(
                path=path,
                record_id=record.record_id,
+                raw_text=raw_text,
                title=record.title or "",
                artist=record.artist or "",
                normalized=normalized,
@@ -214,6 +260,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
                has_translation=bool(normalized.translation_lines),
            )
        )
+        _progress_count("profile library", index, len(paths), step=5000)
    return profiles


@@ -283,6 +330,31 @@ def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Ran
    return selected


+def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:  # Guarded wrapper around _stratified_sample that never requests more than len(profiles).
+    if count <= 0 or not profiles:  # nothing requested, or nothing to draw from
+        return []
+    return _stratified_sample(profiles, min(count, len(profiles)), rng)  # NOTE(review): uniqueness relies on _stratified_sample not repeating a profile when count <= len(profiles) — confirm
+
+
+def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:  # Build a fresh DuplicateChecker from the given profiles and save it to index_path.
+    _progress(f"build eval index excluding holdout: {index_path}")
+    checker = DuplicateChecker()
+    total = len(profiles)
+    for index, profile in enumerate(profiles, start=1):
+        checker.add_normalized_record(  # reuse the normalization already stored on the profile instead of recomputing it
+            LyricRecord(
+                record_id=profile.record_id,
+                lyrics=profile.raw_text,
+                title=profile.title or None,  # an empty-string title is passed on as None
+                artist=profile.artist or None,  # same for an empty-string artist
+            ),
+            profile.normalized,
+        )
+        _progress_count("build eval index", index, total, step=5000)
+    index_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists before saving
+    checker.save(index_path)
+
+
 def _build_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
@@ -293,7 +365,7 @@ def _build_positive_samples(
 ) -> list[GeneratedSample]:
    samples: list[GeneratedSample] = []
    for offset, profile in enumerate(profiles):
-        raw = read_lyric_file(profile.path)
+        raw = profile.raw_text
        lines = _content_lines(raw)
        variants = [
            ("positive_exact_copy", raw),
@@ -308,80 +380,62 @@ def _build_positive_samples(
        index = start_index + offset
        path = _write_sample_file(output_dir, f"pos_{index:05d}_{sample_type}.txt", text)
        samples.append(_sample_from_profile(index, path, csv_base, "应去重", sample_type, profile))
+        _progress_count("positive_full_duplicate", len(samples), len(profiles))
    return samples


-def _build_random_unrelated_samples(
-    count: int,
+def _build_holdout_full_song_samples(
+    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
-    rng: random.Random,
    *,
    start_index: int,
 ) -> list[GeneratedSample]:
+    _progress("build negative_real_holdout_full_song samples")
    samples: list[GeneratedSample] = []
-    for offset in range(count):
+    for offset, profile in enumerate(profiles):
        index = start_index + offset
-        text = _same_theme_synthetic(index, rng)
-        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_random_unrelated.txt", text)
+        text = profile.raw_text
+        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_real_holdout_full_song.txt", text)
        samples.append(
-            GeneratedSample(
-                sample_id=f"sample-{index:05d}",
-                file=str(path.relative_to(csv_base)),
-                expected="不应去重",
-                sample_type="negative_random_unrelated",
-                source="synthetic",
-                notes="same-theme synthetic full lyric not copied from library",
+            _sample_from_profile(
+                index,
+                path,
+                csv_base,
+                "不应去重",
+                "negative_real_holdout_full_song",
+                profile,
+                notes="full real lyric held out from the generated eval index",
            )
        )
+        _progress_count("negative_real_holdout_full_song", len(samples), len(profiles))
    return samples


-def _build_hard_candidate_samples(
-    profiles: list[LyricProfile],
+def _build_same_theme_synthetic_samples(
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
-    checker: DuplicateChecker | None,
    start_index: int,
 ) -> list[GeneratedSample]:
-    if count <= 0:
-        return []
-    sources = _stratified_sample(profiles, count * 3, rng)
    samples: list[GeneratedSample] = []
-    for profile in sources:
-        if len(samples) >= count:
-            break
-        lines = list(profile.normalized.primary_lines or profile.normalized.unique_lines)
-        text = _short_shared_snippet(lines, rng)
-        candidate_id = ""
-        if checker is not None:
-            result = checker.check(text, max_candidates=5)
-            candidate = next(
-                (
-                    item
-                    for item in result.candidates
-                    if item.record_id != profile.record_id and item.decision != DuplicateDecision.NEW
-                ),
-                result.candidates[0] if result.candidates else None,
-            )
-            candidate_id = candidate.record_id if candidate else ""
-        index = start_index + len(samples)
-        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_hard_candidate.txt", text)
+    for offset in range(count):
+        index = start_index + offset
+        text = _same_theme_synthetic(index, rng)
+        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_same_theme_synthetic.txt", text)
        samples.append(
-            _sample_from_profile(
-                index,
-                path,
-                csv_base,
-                "不应去重",
-                "negative_hard_candidate",
-                profile,
-                candidate_record_id=candidate_id,
-                notes="shares a few real lines plus new filler; should not auto duplicate",
+            GeneratedSample(
+                sample_id=f"sample-{index:05d}",
+                file=str(path.relative_to(csv_base)),
+                expected="不应去重",
+                sample_type="negative_same_theme_synthetic",
+                source="synthetic",
+                notes="same-theme synthetic full lyric not copied from library",
            )
        )
+        _progress_count("negative_same_theme_synthetic", len(samples), count)
    return samples


@@ -410,6 +464,7 @@ def _build_fragment_samples(
                notes="partial lyric fragment only",
            )
        )
+        _progress_count("negative_fragment", len(samples), len(profiles))
    return samples


@@ -447,6 +502,7 @@ def _build_shared_chorus_samples(
                notes="shared repeated lines with new surrounding content",
            )
        )
+        _progress_count("negative_shared_chorus", len(samples), len(profiles))
    return samples


@@ -478,6 +534,7 @@ def _build_translation_only_samples(
                notes="translation-like text without matching original lyric",
            )
        )
+        _progress_count("negative_translation_only", len(samples), len(profiles))
    return samples


@@ -511,6 +568,7 @@ def _build_edge_samples(
                notes=notes,
            )
        )
+        _progress_count("edge_short_or_placeholder", len(samples), len(profiles))
    return samples


@@ -598,13 +656,17 @@ def _write_manifest(
    seed: int,
    plan: dict[str, int],
    index_path: Path | None,
+    eval_index_path: Path,
+    holdout_count: int,
 ) -> dict[str, object]:
    manifest = {
        "seed": seed,
        "library_files": len(profiles),
        "sample_size": len(samples),
        "plan": plan,
-        "index": str(index_path) if index_path else "",
+        "source_index": str(index_path) if index_path else "",
+        "eval_index": str(eval_index_path),
+        "holdout_records": holdout_count,
        "lyrics_dir": str(output_dir),
        "csv": str(csv_path),
        "manifest": str(csv_path.with_suffix(csv_path.suffix + ".manifest.json")),
--- a/scripts/process_library.py
View file @ba39ce6
+++ b/scripts/process_library.py
View file @ba39ce6
@@ -4,8 +4,9 @@ This script is intended for the recurring workflow after adding files to
 ``data/library``:

 1. Move pure-music placeholder lyric files out of the active library.
-2. Rebuild the duplicate-checking index.
-3. Optionally regenerate and evaluate a synthetic regression set.
+2. Move duplicate lyric files out of the active library.
+3. Rebuild the duplicate-checking index from retained files.
+4. Optionally regenerate and evaluate a production-style eval set.
 """

 from __future__ import annotations
@@ -15,6 +16,7 @@ import csv
 import json
 import shutil
 import sys
+from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path

@@ -23,11 +25,14 @@ if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

 from lyric_dedup.checker import DuplicateChecker
+from lyric_dedup.checker import DuplicateDecision
+from lyric_dedup.checker import LyricRecord
 from lyric_dedup.cli import evaluate_csv
 from lyric_dedup.eval_dataset import generate_eval_set
 from lyric_dedup.file_import import iter_lyric_files
 from lyric_dedup.file_import import read_lyric_file
-from lyric_dedup.file_import import records_from_dir
+from lyric_dedup.file_import import record_from_file
+from lyric_dedup.normalization import NormalizedLyrics
 from lyric_dedup.normalization import normalize_lyrics


@@ -37,13 +42,25 @@ PLACEHOLDER_MARKERS = (
 )


+@dataclass(frozen=True)
+class LibraryProfile:
+    """Immutable per-file snapshot of one lyric file in the active library.
+
+    Keeps the parsed record together with its normalized form and two size
+    measurements so index building, deduplication and reporting can share
+    one pass over the library files.
+    """
+
+    # Location of the lyric file this profile was built from.
+    path: Path
+    # Record parsed from ``path``.
+    record: LyricRecord
+    # Output of ``normalize_lyrics`` for the record's lyrics.
+    normalized: NormalizedLyrics
+    # Number of primary lines, or of unique lines when there are no primary lines.
+    line_count: int
+    # Length of the normalized full text.
+    char_count: int
+
+
 def main() -> None:
    parser = argparse.ArgumentParser(description="Process lyric library additions.")
    parser.add_argument("--library-dir", default="data/library")
    parser.add_argument("--index", default="outputs/indexes/library_lyrics.pkl")
    parser.add_argument("--quarantine-dir", default="data/quarantine/no_lyrics_placeholders")
+    parser.add_argument("--duplicate-quarantine-dir", default="data/quarantine/duplicates")
    parser.add_argument("--dry-run", action="store_true", help="Only report placeholder files; do not move or write outputs.")
    parser.add_argument("--delete-placeholders", action="store_true", help="Delete matched placeholder files instead of moving them.")
+    parser.add_argument("--delete-duplicates", action="store_true", help="Delete duplicate lyric files instead of moving them.")
+    parser.add_argument("--skip-library-dedup", action="store_true", help="Skip internal duplicate cleanup before rebuilding the index.")
    parser.add_argument("--eval-size", type=int, default=0, help="Generate and evaluate this many synthetic samples. 0 disables eval.")
    parser.add_argument("--positive-ratio", type=float, default=0.2)
    parser.add_argument("--eval-dir", default="data/generated_eval/incoming")
@@ -54,13 +71,18 @@ def main() -> None:

    library_dir = Path(args.library_dir)
    quarantine_dir = Path(args.quarantine_dir)
+    duplicate_quarantine_dir = Path(args.duplicate_quarantine_dir)
    report_path = Path(args.report)

    files_before = iter_lyric_files(library_dir)
    placeholders = _find_placeholder_files(library_dir)
-    short_effective = _effective_line_report(library_dir)
+    duplicate_report_path = report_path.with_suffix(".duplicates.csv")

    moved_or_deleted: list[str] = []
+    duplicate_actions: list[str] = []
+    duplicate_rows: list[dict[str, object]] = []
+    short_effective: dict[str, int]
+    retained_count = 0
    if not args.dry_run:
        moved_or_deleted = _handle_placeholders(
            placeholders,
@@ -68,9 +90,25 @@ def main() -> None:
            quarantine_dir=quarantine_dir,
            delete=args.delete_placeholders,
        )
-        _build_index(library_dir, Path(args.index))
+        if args.skip_library_dedup:
+            profiles = _profile_library(library_dir)
+            short_effective = _effective_line_report_from_profiles(profiles)
+            retained_count = _build_index_from_profiles(profiles, Path(args.index))
+        else:
+            profiles = _profile_library(library_dir)
+            short_effective = _effective_line_report_from_profiles(profiles)
+            retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
+                profiles,
+                library_dir=library_dir,
+                index_path=Path(args.index),
+                duplicate_quarantine_dir=duplicate_quarantine_dir,
+                delete=args.delete_duplicates,
+                dry_run=False,
+            )
+            _write_duplicate_report(duplicate_rows, duplicate_report_path)

        if args.eval_size > 0:
+            eval_index_path = Path(args.eval_csv).with_suffix(".index.pkl")
            generate_eval_set(
                library_dir=library_dir,
                output_dir=Path(args.eval_dir),
@@ -78,9 +116,10 @@ def main() -> None:
                size=args.eval_size,
                positive_ratio=args.positive_ratio,
                index_path=Path(args.index),
+                eval_index_path=eval_index_path,
            )
            evaluate_csv(
-                Path(args.index),
+                eval_index_path,
                Path(args.eval_csv),
                Path(args.eval_out),
                base_dir=Path(args.eval_csv).parent,
@@ -88,13 +127,27 @@ def main() -> None:
                max_candidates=5,
            )
            evaluate_csv(
-                Path(args.index),
+                eval_index_path,
                Path(args.eval_csv),
                Path(args.eval_out).with_name(Path(args.eval_out).stem + "_review_positive.csv"),
                base_dir=Path(args.eval_csv).parent,
                positive_decisions={"duplicate", "review"},
                max_candidates=5,
            )
+    else:
+        profiles = _profile_library(library_dir)
+        short_effective = _effective_line_report_from_profiles(profiles)
+        if not args.skip_library_dedup:
+            retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
+                profiles,
+                library_dir=library_dir,
+                index_path=Path(args.index),
+                duplicate_quarantine_dir=duplicate_quarantine_dir,
+                delete=args.delete_duplicates,
+                dry_run=True,
+            )
+        else:
+            retained_count = len(profiles)

    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
@@ -104,11 +157,18 @@ def main() -> None:
        "placeholder_matches": len(placeholders),
        "placeholder_files": [str(path) for path in placeholders],
        "handled_placeholder_files": moved_or_deleted,
+        "library_dedup_skipped": args.skip_library_dedup,
+        "duplicate_matches": len(duplicate_rows),
+        "duplicate_report": str(duplicate_report_path) if duplicate_rows else "",
+        "handled_duplicate_files": duplicate_actions[:1000],
+        "handled_duplicate_files_truncated": len(duplicate_actions) > 1000,
+        "retained_index_records": retained_count,
        "files_after": len(iter_lyric_files(library_dir)),
        "index": str(args.index),
        "eval_size": args.eval_size,
        "eval_csv": str(args.eval_csv) if args.eval_size > 0 else "",
        "eval_out": str(args.eval_out) if args.eval_size > 0 else "",
+        "eval_index": str(Path(args.eval_csv).with_suffix(".index.pkl")) if args.eval_size > 0 else "",
        "short_effective_line_counts": short_effective,
    }

@@ -154,15 +214,133 @@ def _handle_placeholders(
    return handled


-def _build_index(library_dir: Path, index_path: Path) -> None:
+def _profile_library(library_dir: Path) -> list[LibraryProfile]:
+    """Read and normalize every lyric file found under *library_dir*.
+
+    Each file is parsed into a record, normalized once, and wrapped in a
+    ``LibraryProfile`` together with its effective line count and the length
+    of its normalized text. Progress is reported on stderr.
+
+    Returns the profiles in the order ``iter_lyric_files`` yields the files.
+    """
+    lyric_paths = iter_lyric_files(library_dir)
+    total = len(lyric_paths)
+    _progress(f"profile active library: 0/{total}")
+
+    collected: list[LibraryProfile] = []
+    for position, lyric_path in enumerate(lyric_paths, start=1):
+        parsed = record_from_file(lyric_path, base_dir=library_dir)
+        normalized = normalize_lyrics(parsed.lyrics)
+        # Fall back to the unique lines when no primary lines were detected.
+        effective_lines = normalized.primary_lines or normalized.unique_lines
+        full_text = normalized.normalized_full_text
+        profile = LibraryProfile(
+            path=lyric_path,
+            record=parsed,
+            normalized=normalized,
+            line_count=len(effective_lines),
+            char_count=len(full_text),
+        )
+        collected.append(profile)
+        _progress_count("profile active library", position, total, step=5000)
+    return collected
+
+
+def _build_index_from_profiles(profiles: list[LibraryProfile], index_path: Path) -> int:
+    """Add every profile to a fresh checker, without duplicate filtering, and save it.
+
+    The parent directory of *index_path* is created when missing.
+
+    Returns the checker's record count after all profiles were added.
+    """
     checker = DuplicateChecker()
-    for record in records_from_dir(library_dir):
-        checker.add_record(record)
+    for index, profile in enumerate(profiles, start=1):
+        # Hand over the normalization already stored on the profile.
+        checker.add_normalized_record(profile.record, profile.normalized)
+        _progress_count("build index", index, len(profiles), step=5000)
     index_path.parent.mkdir(parents=True, exist_ok=True)
     checker.save(index_path)
+    return checker.record_count
+
+
+def _deduplicate_and_build_index(
+    profiles: list[LibraryProfile],
+    *,
+    library_dir: Path,
+    index_path: Path,
+    duplicate_quarantine_dir: Path,
+    delete: bool,
+    dry_run: bool,
+) -> tuple[int, list[dict[str, object]], list[str]]:
+    """Build the index incrementally while weeding out internal duplicates.
+
+    Profiles are visited in ``_profile_quality_key`` order. Each one is
+    checked against the records indexed so far: a ``DUPLICATE`` decision with
+    at least one candidate yields a report row (and, unless *dry_run*, the
+    file is moved or deleted); every other profile is added to the index.
+
+    With *dry_run* set, no file is touched and the index is not saved.
+
+    Returns ``(indexed record count, duplicate report rows, file actions)``.
+    """
+    checker = DuplicateChecker()
+    rows: list[dict[str, object]] = []
+    actions: list[str] = []
+    by_quality = sorted(profiles, key=_profile_quality_key)
+    total = len(by_quality)
+    _progress(f"deduplicate active library: 0/{total}")
+
+    for position, profile in enumerate(by_quality, start=1):
+        outcome = checker.check_record(profile.record, max_candidates=1)
+        top = outcome.candidates[0] if outcome.candidates else None
+        is_duplicate = outcome.decision == DuplicateDecision.DUPLICATE and top is not None
+        if not is_duplicate:
+            # Retained: later profiles are compared against this record.
+            checker.add_normalized_record(profile.record, profile.normalized)
+        else:
+            row: dict[str, object] = {
+                "duplicate_path": str(profile.path),
+                "duplicate_record_id": profile.record.record_id,
+                "kept_record_id": top.record_id,
+                "decision": outcome.decision.value,
+                "confidence": outcome.confidence,
+                "reason": outcome.reason,
+                "best_candidate_jaccard": top.jaccard,
+                "best_candidate_line_coverage": top.line_coverage,
+                "best_candidate_primary_jaccard": top.primary_jaccard,
+                "best_candidate_primary_line_coverage": top.primary_line_coverage,
+                "matched_unique_lines": " | ".join(top.matched_unique_lines),
+                "line_count": profile.line_count,
+                "char_count": profile.char_count,
+            }
+            rows.append(row)
+            if not dry_run:
+                action = _handle_duplicate_file(
+                    profile.path,
+                    library_dir=library_dir,
+                    duplicate_quarantine_dir=duplicate_quarantine_dir,
+                    delete=delete,
+                )
+                actions.append(action)
+        _progress_count("deduplicate active library", position, total, step=5000)
+
+    if not dry_run:
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        checker.save(index_path)
+    return checker.record_count, rows, actions
+
+
+def _handle_duplicate_file(
+    path: Path,
+    *,
+    library_dir: Path,
+    duplicate_quarantine_dir: Path,
+    delete: bool,
+) -> str:
+    """Remove one duplicate lyric file from the library.
+
+    With *delete* set the file is unlinked. Otherwise it is moved under
+    *duplicate_quarantine_dir*, keeping its path relative to *library_dir*;
+    if that destination already exists, a ``_YYYYmmddHHMMSS`` timestamp is
+    appended to the file stem.
+
+    Returns ``"deleted:<path>"`` or ``"moved:<path>-><destination>"``.
+    """
+    if not delete:
+        duplicate_quarantine_dir.mkdir(parents=True, exist_ok=True)
+        inside_library = path.resolve().relative_to(library_dir.resolve())
+        target = duplicate_quarantine_dir / inside_library
+        target.parent.mkdir(parents=True, exist_ok=True)
+        if target.exists():
+            # Keep the earlier quarantined copy; store this one under a stamped name.
+            stamp = datetime.now().strftime('%Y%m%d%H%M%S')
+            target = target.with_name(f"{target.stem}_{stamp}{target.suffix}")
+        shutil.move(str(path), str(target))
+        return f"moved:{path}->{target}"
+
+    path.unlink()
+    return f"deleted:{path}"
+
+
+def _profile_quality_key(profile: LibraryProfile) -> tuple[int, int, int, str]:
+    """Return an ascending sort key that ranks preferred profiles first.
+
+    Files whose name starts with ``None_`` rank after all others; within each
+    group, more lines and then more characters rank earlier, with the path
+    string as the final tie-breaker.
+    """
+    name_rank = int(profile.path.name.startswith("None_"))
+    # Sizes are negated so that larger values sort first under ascending order.
+    return name_rank, -profile.line_count, -profile.char_count, str(profile.path)
+
+
+def _write_duplicate_report(rows: list[dict[str, object]], report_path: Path) -> None:
+    """Write *rows* to *report_path* as a UTF-8 CSV file with a header row.
+
+    Column names and order come from the keys of the first row. Nothing is
+    written, and no directory is created, when *rows* is empty.
+    """
+    if rows:
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        with report_path.open("w", encoding="utf-8", newline="") as handle:
+            columns = list(rows[0])
+            writer = csv.DictWriter(handle, fieldnames=columns)
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(row)


 def _effective_line_report(library_dir: Path) -> dict[str, int]:
+    """Profile *library_dir* and return its effective-line-count buckets."""
+    return _effective_line_report_from_profiles(_profile_library(library_dir))
+
+
+def _effective_line_report_from_profiles(profiles: list[LibraryProfile]) -> dict[str, int]:
    buckets = {
        "total": 0,
        "zero_effective_lines": 0,
@@ -170,10 +348,9 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
        "four_to_five_effective_lines": 0,
        "six_plus_effective_lines": 0,
    }
-    for path in iter_lyric_files(library_dir):
+    for profile in profiles:
        buckets["total"] += 1
-        normalized = normalize_lyrics(read_lyric_file(path))
-        line_count = len(normalized.primary_lines or normalized.unique_lines)
+        line_count = profile.line_count
        if line_count == 0:
            buckets["zero_effective_lines"] += 1
        elif line_count <= 3:
@@ -185,5 +362,16 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
    return buckets


+def _progress(message: str) -> None:
+    """Print *message* to stderr with a ``[process-library]`` prefix and flush."""
+    line = f"[process-library] {message}"
+    print(line, file=sys.stderr, flush=True)
+
+
+def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
+    """Report ``label: current/total`` at the first, last and every *step*-th item.
+
+    Does nothing when *total* is zero or negative.
+    """
+    at_milestone = total > 0 and (current in (1, total) or current % step == 0)
+    if at_milestone:
+        _progress(f"{label}: {current}/{total}")
+
+
 if __name__ == "__main__":
    main()
--- a/tests/test_lyric_dedup.py
View file @ba39ce6
+++ b/tests/test_lyric_dedup.py
View file @ba39ce6
@@ -308,9 +308,11 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None:
    assert manifest["library_files"] == 12
    assert manifest["sample_size"] == 30
    assert manifest["unique_source_records"] > 1
+    assert manifest["holdout_records"] > 1
+    assert (tmp_path / "generated" / "eval.csv.index.pkl").exists()
    assert "positive_full_duplicate" in manifest["plan"]
+    assert "negative_real_holdout_full_song" in negative_types
    assert "negative_fragment" in negative_types
-    assert "negative_hard_candidate" in negative_types
    assert all(row["expected"] == "不应去重" for row in rows if row["sample_type"].startswith("negative_"))