Commit ba39ce6a ba39ce6aa50b5bc45d24bc32bcbc7c51b870922a by 沈秋雨

添加测试集内部去重

1 parent f8ad329c
......@@ -80,6 +80,7 @@ python -m lyric_dedup.cli generate-eval-set \
--lyrics-dir data/generated_eval/incoming \
--csv data/generated_eval/eval_50000.csv \
--index outputs/indexes/lyrics.pkl \
--eval-index data/generated_eval/eval_50000.csv.index.pkl \
--size 50000 \
--positive-ratio 0.3
```
......@@ -88,10 +89,10 @@ python -m lyric_dedup.cli generate-eval-set \
- 先扫描整个曲库,按有效歌词行数、语言类型、文件来源前缀做分层采样,不再按排序前缀取样。
- `应去重` 样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。
- `不应去重` 样本包含同主题新歌词、hard negative、片段歌词、重复副歌碰撞、仅翻译相似、短歌词/占位边界样本。
- `不应去重` 样本以真实 holdout 完整歌词为主,也包含片段歌词、重复副歌碰撞、仅翻译相似、同主题新歌词、短歌词/占位边界样本。
- 片段歌词即使命中已有歌曲的一部分,也不应该输出 `duplicate`;最多进入 `review`
- 如果传入 `--index`,生成器会用现有索引构造更接近线上召回风险的 hard negative
- 同时会生成 `*.manifest.json`,记录 seed、曲库规模、样本类型分布、语言/来源分桶和样本来源覆盖数。
- 生成器会额外写出 `--eval-index` 指定的索引文件;该索引排除了 holdout 歌曲,评估生成的 CSV 时应使用它。
- 同时会生成 `*.manifest.json`,记录 seed、曲库规模、holdout 数、样本类型分布、语言/来源分桶和样本来源覆盖数。
先准备一个 CSV,例如 `data/eval/eval.csv`
......
......@@ -103,6 +103,7 @@ python -m lyric_dedup.cli generate-eval-set \
--lyrics-dir data/generated_eval/incoming \
--csv data/generated_eval/eval_50000.csv \
--index outputs/indexes/library_lyrics.pkl \
--eval-index data/generated_eval/eval_50000.csv.index.pkl \
--size 50000 \
--positive-ratio 0.3
```
......@@ -120,24 +121,26 @@ python -m lyric_dedup.cli generate-eval-set \
```text
positive_* = 应去重,全曲歌词样式变化
negative_random_unrelated = 不应去重,同主题新歌词
negative_hard_candidate = 不应去重,系统容易召回的短句/局部重合样本
negative_real_holdout_full_song = 不应去重,完整真实歌词,已从评估索引中排除
negative_fragment = 不应去重,单曲片段
negative_shared_chorus = 不应去重,重复副歌碰撞
negative_translation_only = 不应去重,仅翻译相似
negative_same_theme_synthetic = 不应去重,同主题新歌词
edge_short_or_placeholder = 不应去重,短歌词/占位边界样本
```
生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。传入 `--index` 后会用现有索引生成 hard negative。每次还会输出:
生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。它会分出一批 holdout 完整歌词作为真实新歌负样本,并生成一个排除 holdout 的评估索引。每次还会输出:
```text
data/generated_eval/eval_50000.csv.manifest.json
data/generated_eval/eval_50000.csv.index.pkl
```
manifest 里重点看:
```text
library_files 曲库歌词文件数
holdout_records 从评估索引中排除、作为真实新歌负样本的数量
sample_type_counts 各样本类型数量
line_count_bucket_counts / language_bucket_counts / source_bucket_counts
unique_source_records 本次评估覆盖了多少真实源文件
......@@ -147,7 +150,7 @@ unique_source_records 本次评估覆盖了多少真实源文件
```bash
python -m lyric_dedup.cli evaluate-csv \
--index outputs/indexes/library_lyrics.pkl \
--index data/generated_eval/eval_50000.csv.index.pkl \
--csv data/generated_eval/eval_50000.csv \
--base-dir data/generated_eval \
--out outputs/results/library_eval_50000.csv
......@@ -171,7 +174,7 @@ false_positive
```bash
python -m lyric_dedup.cli evaluate-csv \
--index outputs/indexes/library_lyrics.pkl \
--index data/generated_eval/eval_50000.csv.index.pkl \
--csv data/generated_eval/eval_50000.csv \
--base-dir data/generated_eval \
--positive-decisions duplicate,review \
......
......@@ -96,16 +96,24 @@ class DuplicateChecker:
def add_record(self, record: LyricRecord) -> None:
indexed = self._index(record)
self._records[record.record_id] = indexed
self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record.record_id)
self._add_indexed(record.record_id, indexed)
def add_normalized_record(self, record: LyricRecord, normalized: NormalizedLyrics) -> None:
    """Index ``record`` from lyrics the caller has already normalized.

    Builds the index entry with ``_index_normalized`` and registers it
    through ``_add_indexed`` under ``record.record_id``.
    """
    self._add_indexed(record.record_id, self._index_normalized(record, normalized))
def _add_indexed(self, record_id: str, indexed: _IndexedRecord) -> None:
self._records[record_id] = indexed
self._exact_hash_to_ids.setdefault(indexed.exact_hash, set()).add(record_id)
for line in indexed.normalized.unique_lines:
if len(line) >= 4:
self._line_to_ids.setdefault(line, set()).add(record.record_id)
self._line_to_ids.setdefault(line, set()).add(record_id)
for token in indexed.tokens:
self._token_to_ids.setdefault(token, set()).add(record.record_id)
self._token_to_ids.setdefault(token, set()).add(record_id)
for token in indexed.fallback_tokens:
self._token_to_ids.setdefault(token, set()).add(record.record_id)
self._lsh.add(record.record_id, indexed.signature)
self._token_to_ids.setdefault(token, set()).add(record_id)
self._lsh.add(record_id, indexed.signature)
def save(self, path: str | Path) -> None:
"""Persist the in-memory index for later checks."""
......@@ -187,6 +195,9 @@ class DuplicateChecker:
def _index(self, record: LyricRecord) -> _IndexedRecord:
    """Normalize ``record.lyrics`` and build its entry via ``_index_normalized``."""
    return self._index_normalized(record, normalize_lyrics(record.lyrics))
def _index_normalized(self, record: LyricRecord, normalized: NormalizedLyrics) -> _IndexedRecord:
tokens = lyric_tokens(normalized)
primary_tokens = lyric_tokens(normalized, lines=normalized.primary_lines)
translation_tokens = lyric_tokens(normalized, lines=normalized.translation_lines)
......
......@@ -5,6 +5,7 @@ from __future__ import annotations
import argparse
import csv
import json
import sys
from pathlib import Path
from lyric_dedup.checker import DuplicateChecker
......@@ -50,7 +51,8 @@ def main() -> None:
generate.add_argument("--size", type=int, default=100)
generate.add_argument("--positive-ratio", type=float, default=0.3)
generate.add_argument("--seed", type=int, default=20260602)
generate.add_argument("--index", default="", help="optional existing index for hard-negative generation")
generate.add_argument("--index", default="", help="optional source index path recorded in the manifest")
generate.add_argument("--eval-index", default="", help="output index built from non-holdout records for this eval set")
args = parser.parse_args()
if args.command == "build-index":
......@@ -77,6 +79,7 @@ def main() -> None:
positive_ratio=args.positive_ratio,
seed=args.seed,
index_path=Path(args.index) if args.index else None,
eval_index_path=Path(args.eval_index) if args.eval_index else None,
)
print(json.dumps(summary, ensure_ascii=False))
......@@ -155,52 +158,58 @@ def evaluate_csv(
positive_decisions: set[str],
max_candidates: int,
) -> None:
_progress(f"load index: {index_path}")
checker = DuplicateChecker.load(index_path)
rows: list[dict[str, object]] = []
total = _csv_data_row_count(csv_path)
_progress(f"evaluate csv: 0/{total}")
out_path.parent.mkdir(parents=True, exist_ok=True)
with csv_path.open(encoding="utf-8-sig", newline="") as file:
reader = csv.DictReader(file)
if reader.fieldnames is None:
raise ValueError("评估 CSV 需要表头")
for row_number, row in enumerate(reader, start=2):
sample_id = row.get("id") or row.get("sample_id") or str(row_number)
record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
expected_duplicate = _parse_expected(row.get("expected") or row.get("label") or row.get("target"))
result = checker.check_record(record, max_candidates=max_candidates)
predicted_duplicate = result.decision.value in positive_decisions
best = result.candidates[0] if result.candidates else None
rows.append(
{
"id": sample_id,
"source": source,
"expected_duplicate": expected_duplicate,
"decision": result.decision.value,
"predicted_duplicate": predicted_duplicate,
"correct": expected_duplicate == predicted_duplicate,
"confidence": result.confidence,
"reason": result.reason,
"best_candidate_id": best.record_id if best else "",
"best_candidate_decision": best.decision.value if best else "",
"best_candidate_confidence": best.confidence if best else "",
"best_candidate_jaccard": best.jaccard if best else "",
"best_candidate_line_coverage": best.line_coverage if best else "",
"best_candidate_primary_jaccard": best.primary_jaccard if best else "",
"best_candidate_primary_line_coverage": best.primary_line_coverage if best else "",
"best_candidate_translation_jaccard": best.translation_jaccard if best else "",
"best_candidate_translation_line_coverage": best.translation_line_coverage if best else "",
"best_candidate_reason": best.reason if best else "",
"matched_unique_lines": " | ".join(best.matched_unique_lines) if best else "",
}
)
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8", newline="") as file:
writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()) if rows else ["id"])
writer.writeheader()
writer.writerows(rows)
fieldnames = [
"id",
"source",
"expected_duplicate",
"decision",
"predicted_duplicate",
"correct",
"confidence",
"reason",
"best_candidate_id",
"best_candidate_decision",
"best_candidate_confidence",
"best_candidate_jaccard",
"best_candidate_line_coverage",
"best_candidate_primary_jaccard",
"best_candidate_primary_line_coverage",
"best_candidate_translation_jaccard",
"best_candidate_translation_line_coverage",
"best_candidate_reason",
"matched_unique_lines",
]
with out_path.open("w", encoding="utf-8", newline="") as out_file:
writer = csv.DictWriter(out_file, fieldnames=fieldnames)
writer.writeheader()
for index, row in enumerate(reader, start=1):
row_out = _evaluate_row(
row,
row_number=index + 1,
checker=checker,
csv_path=csv_path,
base_dir=base_dir,
positive_decisions=positive_decisions,
max_candidates=max_candidates,
)
rows.append(row_out)
writer.writerow(row_out)
_progress_count("evaluate csv", index, total, step=1000)
summary = _evaluation_summary(rows, positive_decisions=positive_decisions, out_path=out_path)
summary_path = out_path.with_suffix(out_path.suffix + ".summary.json")
summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
_progress("evaluation complete")
print(json.dumps(summary, ensure_ascii=False))
......@@ -229,6 +238,45 @@ def _result_to_dict(result, *, source: str) -> dict[str, object]:
}
def _evaluate_row(
    row: dict[str, str],
    *,
    row_number: int,
    checker: DuplicateChecker,
    csv_path: Path,
    base_dir: Path | None,
    positive_decisions: set[str],
    max_candidates: int,
) -> dict[str, object]:
    """Run the duplicate check for one evaluation CSV row.

    The sample id is taken from the ``id`` or ``sample_id`` column and falls
    back to ``row_number``. The expected label is read from the ``expected``,
    ``label`` or ``target`` column. A row counts as predicted-duplicate when
    the checker's decision value is in ``positive_decisions``.

    Returns a flat dict holding the decision plus the metrics of the first
    candidate; every ``best_candidate_*`` column and ``matched_unique_lines``
    is an empty string when there is no candidate.
    """
    sample_id = row.get("id") or row.get("sample_id") or str(row_number)
    record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
    raw_label = row.get("expected") or row.get("label") or row.get("target")
    expected_duplicate = _parse_expected(raw_label)

    result = checker.check_record(record, max_candidates=max_candidates)
    decision = result.decision.value
    predicted_duplicate = decision in positive_decisions

    outcome: dict[str, object] = {
        "id": sample_id,
        "source": source,
        "expected_duplicate": expected_duplicate,
        "decision": decision,
        "predicted_duplicate": predicted_duplicate,
        "correct": expected_duplicate == predicted_duplicate,
        "confidence": result.confidence,
        "reason": result.reason,
    }

    candidate_columns = (
        "best_candidate_id",
        "best_candidate_decision",
        "best_candidate_confidence",
        "best_candidate_jaccard",
        "best_candidate_line_coverage",
        "best_candidate_primary_jaccard",
        "best_candidate_primary_line_coverage",
        "best_candidate_translation_jaccard",
        "best_candidate_translation_line_coverage",
        "best_candidate_reason",
        "matched_unique_lines",
    )
    top = result.candidates[0] if result.candidates else None
    if top:
        candidate_values = (
            top.record_id,
            top.decision.value,
            top.confidence,
            top.jaccard,
            top.line_coverage,
            top.primary_jaccard,
            top.primary_line_coverage,
            top.translation_jaccard,
            top.translation_line_coverage,
            top.reason,
            " | ".join(top.matched_unique_lines),
        )
    else:
        # No candidate: keep every candidate column present but blank.
        candidate_values = ("",) * len(candidate_columns)
    outcome.update(zip(candidate_columns, candidate_values))
    return outcome
def _lyrics_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[str, str]:
lyrics = (row.get("lyrics") or "").strip()
if lyrics:
......@@ -322,5 +370,23 @@ def _evaluation_summary(
}
def _csv_data_row_count(csv_path: Path) -> int:
with csv_path.open(encoding="utf-8-sig", newline="") as file:
reader = csv.reader(file)
next(reader, None)
return sum(1 for _ in reader)
def _progress(message: str) -> None:
print(f"[eval] {message}", file=sys.stderr, flush=True)
def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``label: current/total`` at the first, last and every ``step``-th item.

    Nothing is reported when ``total`` is zero or negative.
    """
    if total <= 0:
        return
    at_milestone = current in (1, total) or current % step == 0
    if not at_milestone:
        return
    _progress(f"{label}: {current}/{total}")
if __name__ == "__main__":
main()
......
......@@ -7,14 +7,14 @@ import hashlib
import json
import random
import re
import sys
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import DuplicateDecision
from lyric_dedup.checker import LyricRecord
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import NormalizedLyrics
from lyric_dedup.normalization import fingerprint_text
......@@ -23,19 +23,31 @@ from lyric_dedup.normalization import normalize_lyrics
DEFAULT_SAMPLE_MIX = {
"positive_full_duplicate": 0.30,
"negative_random_unrelated": 0.20,
"negative_hard_candidate": 0.25,
"negative_real_holdout_full_song": 0.40,
"negative_fragment": 0.10,
"negative_shared_chorus": 0.05,
"negative_translation_only": 0.05,
"negative_same_theme_synthetic": 0.05,
"edge_short_or_placeholder": 0.05,
}
def _progress(message: str) -> None:
print(f"[eval-gen] {message}", file=sys.stderr, flush=True)
def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``label: current/total`` at the first, last and every ``step``-th item.

    Nothing is reported when ``total`` is zero or negative.
    """
    if total <= 0:
        return
    at_milestone = current in (1, total) or current % step == 0
    if not at_milestone:
        return
    _progress(f"{label}: {current}/{total}")
@dataclass(frozen=True)
class LyricProfile:
path: Path
record_id: str
raw_text: str
title: str
artist: str
normalized: NormalizedLyrics
......@@ -74,6 +86,7 @@ def generate_eval_set(
positive_ratio: float = 0.30,
seed: int = 20260602,
index_path: Path | None = None,
eval_index_path: Path | None = None,
) -> dict[str, object]:
"""Generate a stratified production evaluation set.
......@@ -83,6 +96,7 @@ def generate_eval_set(
if size <= 0:
raise ValueError("size must be positive")
_progress(f"start generation: size={size}, positive_ratio={positive_ratio}, seed={seed}")
rng = random.Random(seed)
profiles = profile_library(library_dir)
if not profiles:
......@@ -90,13 +104,25 @@ def generate_eval_set(
output_dir.mkdir(parents=True, exist_ok=True)
csv_path.parent.mkdir(parents=True, exist_ok=True)
_progress(f"clean output dir: {output_dir}")
_clean_generated_output_dir(output_dir)
checker = DuplicateChecker.load(index_path) if index_path else None
plan = _sample_plan(size, positive_ratio=positive_ratio)
groups = _profile_groups(profiles)
_progress(f"sample plan: {plan}")
holdout_count = min(plan["negative_real_holdout_full_song"], max(1, len(profiles) // 2))
holdout_profiles = _stratified_unique_sample(
profiles,
holdout_count,
rng,
)
holdout_ids = {profile.record_id for profile in holdout_profiles}
indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
_build_eval_index(indexed_profiles, eval_index_path)
groups = _profile_groups(indexed_profiles)
samples: list[GeneratedSample] = []
_progress("build positive_full_duplicate samples")
samples.extend(
_build_positive_samples(
_stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
......@@ -106,53 +132,62 @@ def generate_eval_set(
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build negative_real_holdout_full_song samples")
samples.extend(
_build_random_unrelated_samples(
plan["negative_random_unrelated"],
_build_holdout_full_song_samples(
holdout_profiles,
output_dir,
csv_path.parent,
rng,
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build negative_fragment samples")
samples.extend(
_build_hard_candidate_samples(
groups["normal"],
plan["negative_hard_candidate"],
_build_fragment_samples(
_stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
output_dir,
csv_path.parent,
rng,
checker=checker,
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build negative_shared_chorus samples")
samples.extend(
_build_fragment_samples(
_stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
_build_shared_chorus_samples(
_stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
output_dir,
csv_path.parent,
rng,
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build negative_translation_only samples")
samples.extend(
_build_shared_chorus_samples(
_stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
_build_translation_only_samples(
_stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
output_dir,
csv_path.parent,
rng,
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build negative_same_theme_synthetic samples")
samples.extend(
_build_translation_only_samples(
_stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
_build_same_theme_synthetic_samples(
plan["negative_same_theme_synthetic"],
output_dir,
csv_path.parent,
rng,
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
_progress("build edge_short_or_placeholder samples")
samples.extend(
_build_edge_samples(
_stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
......@@ -162,10 +197,12 @@ def generate_eval_set(
start_index=len(samples) + 1,
)
)
_progress(f"built samples: {len(samples)}/{size}")
if len(samples) < size:
_progress(f"top up with negative_same_theme_synthetic samples: {size - len(samples)}")
samples.extend(
_build_random_unrelated_samples(
_build_same_theme_synthetic_samples(
size - len(samples),
output_dir,
csv_path.parent,
......@@ -176,7 +213,9 @@ def generate_eval_set(
samples = samples[:size]
rng.shuffle(samples)
_progress(f"write csv: {csv_path}")
_write_csv(samples, csv_path, seed=seed)
_progress("write manifest")
manifest = _write_manifest(
profiles=profiles,
samples=samples,
......@@ -185,15 +224,21 @@ def generate_eval_set(
seed=seed,
plan=plan,
index_path=index_path,
eval_index_path=eval_index_path,
holdout_count=len(holdout_profiles),
)
_progress("generation complete")
return manifest
def profile_library(library_dir: Path) -> list[LyricProfile]:
profiles: list[LyricProfile] = []
for path in iter_lyric_files(library_dir):
paths = iter_lyric_files(library_dir)
_progress(f"profile library: 0/{len(paths)}")
for index, path in enumerate(paths, start=1):
record = record_from_file(path, base_dir=library_dir)
normalized = normalize_lyrics(record.lyrics)
raw_text = record.lyrics
normalized = normalize_lyrics(raw_text)
lines = normalized.primary_lines or normalized.unique_lines
line_count = len(lines)
normalized_text = fingerprint_text(normalized) or normalized.normalized_full_text
......@@ -202,6 +247,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
LyricProfile(
path=path,
record_id=record.record_id,
raw_text=raw_text,
title=record.title or "",
artist=record.artist or "",
normalized=normalized,
......@@ -214,6 +260,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
has_translation=bool(normalized.translation_lines),
)
)
_progress_count("profile library", index, len(paths), step=5000)
return profiles
......@@ -283,6 +330,31 @@ def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Ran
return selected
def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Draw at most ``len(profiles)`` profiles through ``_stratified_sample``.

    Returns an empty list when ``profiles`` is empty or ``count`` is not
    positive. The request is capped at the population size before
    delegating.
    NOTE(review): presumably the cap is what keeps a profile from being
    picked twice; that depends on ``_stratified_sample`` - confirm.
    """
    if not profiles or count <= 0:
        return []
    capped = min(count, len(profiles))
    return _stratified_sample(profiles, capped, rng)
def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Build a fresh ``DuplicateChecker`` from ``profiles`` and save it to ``index_path``.

    Each profile's cached normalization is reused through
    ``add_normalized_record``. Empty titles and artists are stored as
    ``None``. The parent directory of ``index_path`` is created if missing.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    eval_checker = DuplicateChecker()
    profile_total = len(profiles)
    for position, item in enumerate(profiles, start=1):
        record = LyricRecord(
            record_id=item.record_id,
            lyrics=item.raw_text,
            title=item.title or None,
            artist=item.artist or None,
        )
        eval_checker.add_normalized_record(record, item.normalized)
        _progress_count("build eval index", position, profile_total, step=5000)
    target_dir = index_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    eval_checker.save(index_path)
def _build_positive_samples(
profiles: list[LyricProfile],
output_dir: Path,
......@@ -293,7 +365,7 @@ def _build_positive_samples(
) -> list[GeneratedSample]:
samples: list[GeneratedSample] = []
for offset, profile in enumerate(profiles):
raw = read_lyric_file(profile.path)
raw = profile.raw_text
lines = _content_lines(raw)
variants = [
("positive_exact_copy", raw),
......@@ -308,80 +380,62 @@ def _build_positive_samples(
index = start_index + offset
path = _write_sample_file(output_dir, f"pos_{index:05d}_{sample_type}.txt", text)
samples.append(_sample_from_profile(index, path, csv_base, "应去重", sample_type, profile))
_progress_count("positive_full_duplicate", len(samples), len(profiles))
return samples
def _build_random_unrelated_samples(
count: int,
def _build_holdout_full_song_samples(
profiles: list[LyricProfile],
output_dir: Path,
csv_base: Path,
rng: random.Random,
*,
start_index: int,
) -> list[GeneratedSample]:
_progress("build negative_real_holdout_full_song samples")
samples: list[GeneratedSample] = []
for offset in range(count):
for offset, profile in enumerate(profiles):
index = start_index + offset
text = _same_theme_synthetic(index, rng)
path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_random_unrelated.txt", text)
text = profile.raw_text
path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_real_holdout_full_song.txt", text)
samples.append(
GeneratedSample(
sample_id=f"sample-{index:05d}",
file=str(path.relative_to(csv_base)),
expected="不应去重",
sample_type="negative_random_unrelated",
source="synthetic",
notes="same-theme synthetic full lyric not copied from library",
_sample_from_profile(
index,
path,
csv_base,
"不应去重",
"negative_real_holdout_full_song",
profile,
notes="full real lyric held out from the generated eval index",
)
)
_progress_count("negative_real_holdout_full_song", len(samples), len(profiles))
return samples
def _build_hard_candidate_samples(
profiles: list[LyricProfile],
def _build_same_theme_synthetic_samples(
count: int,
output_dir: Path,
csv_base: Path,
rng: random.Random,
*,
checker: DuplicateChecker | None,
start_index: int,
) -> list[GeneratedSample]:
if count <= 0:
return []
sources = _stratified_sample(profiles, count * 3, rng)
samples: list[GeneratedSample] = []
for profile in sources:
if len(samples) >= count:
break
lines = list(profile.normalized.primary_lines or profile.normalized.unique_lines)
text = _short_shared_snippet(lines, rng)
candidate_id = ""
if checker is not None:
result = checker.check(text, max_candidates=5)
candidate = next(
(
item
for item in result.candidates
if item.record_id != profile.record_id and item.decision != DuplicateDecision.NEW
),
result.candidates[0] if result.candidates else None,
)
candidate_id = candidate.record_id if candidate else ""
index = start_index + len(samples)
path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_hard_candidate.txt", text)
for offset in range(count):
index = start_index + offset
text = _same_theme_synthetic(index, rng)
path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_same_theme_synthetic.txt", text)
samples.append(
_sample_from_profile(
index,
path,
csv_base,
"不应去重",
"negative_hard_candidate",
profile,
candidate_record_id=candidate_id,
notes="shares a few real lines plus new filler; should not auto duplicate",
GeneratedSample(
sample_id=f"sample-{index:05d}",
file=str(path.relative_to(csv_base)),
expected="不应去重",
sample_type="negative_same_theme_synthetic",
source="synthetic",
notes="same-theme synthetic full lyric not copied from library",
)
)
_progress_count("negative_same_theme_synthetic", len(samples), count)
return samples
......@@ -410,6 +464,7 @@ def _build_fragment_samples(
notes="partial lyric fragment only",
)
)
_progress_count("negative_fragment", len(samples), len(profiles))
return samples
......@@ -447,6 +502,7 @@ def _build_shared_chorus_samples(
notes="shared repeated lines with new surrounding content",
)
)
_progress_count("negative_shared_chorus", len(samples), len(profiles))
return samples
......@@ -478,6 +534,7 @@ def _build_translation_only_samples(
notes="translation-like text without matching original lyric",
)
)
_progress_count("negative_translation_only", len(samples), len(profiles))
return samples
......@@ -511,6 +568,7 @@ def _build_edge_samples(
notes=notes,
)
)
_progress_count("edge_short_or_placeholder", len(samples), len(profiles))
return samples
......@@ -598,13 +656,17 @@ def _write_manifest(
seed: int,
plan: dict[str, int],
index_path: Path | None,
eval_index_path: Path,
holdout_count: int,
) -> dict[str, object]:
manifest = {
"seed": seed,
"library_files": len(profiles),
"sample_size": len(samples),
"plan": plan,
"index": str(index_path) if index_path else "",
"source_index": str(index_path) if index_path else "",
"eval_index": str(eval_index_path),
"holdout_records": holdout_count,
"lyrics_dir": str(output_dir),
"csv": str(csv_path),
"manifest": str(csv_path.with_suffix(csv_path.suffix + ".manifest.json")),
......
......@@ -4,8 +4,9 @@ This script is intended for the recurring workflow after adding files to
``data/library``:
1. Move pure-music placeholder lyric files out of the active library.
2. Rebuild the duplicate-checking index.
3. Optionally regenerate and evaluate a synthetic regression set.
2. Move duplicate lyric files out of the active library.
3. Rebuild the duplicate-checking index from retained files.
4. Optionally regenerate and evaluate a production-style eval set.
"""
from __future__ import annotations
......@@ -15,6 +16,7 @@ import csv
import json
import shutil
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
......@@ -23,11 +25,14 @@ if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import DuplicateDecision
from lyric_dedup.checker import LyricRecord
from lyric_dedup.cli import evaluate_csv
from lyric_dedup.eval_dataset import generate_eval_set
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import records_from_dir
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import NormalizedLyrics
from lyric_dedup.normalization import normalize_lyrics
......@@ -37,13 +42,25 @@ PLACEHOLDER_MARKERS = (
)
@dataclass(frozen=True)
class LibraryProfile:
    """One active-library lyric file together with its precomputed normalization."""

    path: Path  # lyric file this profile was built from
    record: LyricRecord  # record returned by record_from_file for that path
    normalized: NormalizedLyrics  # normalize_lyrics(record.lyrics), reused when building the index
    line_count: int  # number of primary lines, or of unique lines when there are no primary lines
    char_count: int  # length of normalized.normalized_full_text
def main() -> None:
parser = argparse.ArgumentParser(description="Process lyric library additions.")
parser.add_argument("--library-dir", default="data/library")
parser.add_argument("--index", default="outputs/indexes/library_lyrics.pkl")
parser.add_argument("--quarantine-dir", default="data/quarantine/no_lyrics_placeholders")
parser.add_argument("--duplicate-quarantine-dir", default="data/quarantine/duplicates")
parser.add_argument("--dry-run", action="store_true", help="Only report placeholder files; do not move or write outputs.")
parser.add_argument("--delete-placeholders", action="store_true", help="Delete matched placeholder files instead of moving them.")
parser.add_argument("--delete-duplicates", action="store_true", help="Delete duplicate lyric files instead of moving them.")
parser.add_argument("--skip-library-dedup", action="store_true", help="Skip internal duplicate cleanup before rebuilding the index.")
parser.add_argument("--eval-size", type=int, default=0, help="Generate and evaluate this many synthetic samples. 0 disables eval.")
parser.add_argument("--positive-ratio", type=float, default=0.2)
parser.add_argument("--eval-dir", default="data/generated_eval/incoming")
......@@ -54,13 +71,18 @@ def main() -> None:
library_dir = Path(args.library_dir)
quarantine_dir = Path(args.quarantine_dir)
duplicate_quarantine_dir = Path(args.duplicate_quarantine_dir)
report_path = Path(args.report)
files_before = iter_lyric_files(library_dir)
placeholders = _find_placeholder_files(library_dir)
short_effective = _effective_line_report(library_dir)
duplicate_report_path = report_path.with_suffix(".duplicates.csv")
moved_or_deleted: list[str] = []
duplicate_actions: list[str] = []
duplicate_rows: list[dict[str, object]] = []
short_effective: dict[str, int]
retained_count = 0
if not args.dry_run:
moved_or_deleted = _handle_placeholders(
placeholders,
......@@ -68,9 +90,25 @@ def main() -> None:
quarantine_dir=quarantine_dir,
delete=args.delete_placeholders,
)
_build_index(library_dir, Path(args.index))
if args.skip_library_dedup:
profiles = _profile_library(library_dir)
short_effective = _effective_line_report_from_profiles(profiles)
retained_count = _build_index_from_profiles(profiles, Path(args.index))
else:
profiles = _profile_library(library_dir)
short_effective = _effective_line_report_from_profiles(profiles)
retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
profiles,
library_dir=library_dir,
index_path=Path(args.index),
duplicate_quarantine_dir=duplicate_quarantine_dir,
delete=args.delete_duplicates,
dry_run=False,
)
_write_duplicate_report(duplicate_rows, duplicate_report_path)
if args.eval_size > 0:
eval_index_path = Path(args.eval_csv).with_suffix(".index.pkl")
generate_eval_set(
library_dir=library_dir,
output_dir=Path(args.eval_dir),
......@@ -78,9 +116,10 @@ def main() -> None:
size=args.eval_size,
positive_ratio=args.positive_ratio,
index_path=Path(args.index),
eval_index_path=eval_index_path,
)
evaluate_csv(
Path(args.index),
eval_index_path,
Path(args.eval_csv),
Path(args.eval_out),
base_dir=Path(args.eval_csv).parent,
......@@ -88,13 +127,27 @@ def main() -> None:
max_candidates=5,
)
evaluate_csv(
Path(args.index),
eval_index_path,
Path(args.eval_csv),
Path(args.eval_out).with_name(Path(args.eval_out).stem + "_review_positive.csv"),
base_dir=Path(args.eval_csv).parent,
positive_decisions={"duplicate", "review"},
max_candidates=5,
)
else:
profiles = _profile_library(library_dir)
short_effective = _effective_line_report_from_profiles(profiles)
if not args.skip_library_dedup:
retained_count, duplicate_rows, duplicate_actions = _deduplicate_and_build_index(
profiles,
library_dir=library_dir,
index_path=Path(args.index),
duplicate_quarantine_dir=duplicate_quarantine_dir,
delete=args.delete_duplicates,
dry_run=True,
)
else:
retained_count = len(profiles)
report = {
"timestamp": datetime.now().isoformat(timespec="seconds"),
......@@ -104,11 +157,18 @@ def main() -> None:
"placeholder_matches": len(placeholders),
"placeholder_files": [str(path) for path in placeholders],
"handled_placeholder_files": moved_or_deleted,
"library_dedup_skipped": args.skip_library_dedup,
"duplicate_matches": len(duplicate_rows),
"duplicate_report": str(duplicate_report_path) if duplicate_rows else "",
"handled_duplicate_files": duplicate_actions[:1000],
"handled_duplicate_files_truncated": len(duplicate_actions) > 1000,
"retained_index_records": retained_count,
"files_after": len(iter_lyric_files(library_dir)),
"index": str(args.index),
"eval_size": args.eval_size,
"eval_csv": str(args.eval_csv) if args.eval_size > 0 else "",
"eval_out": str(args.eval_out) if args.eval_size > 0 else "",
"eval_index": str(Path(args.eval_csv).with_suffix(".index.pkl")) if args.eval_size > 0 else "",
"short_effective_line_counts": short_effective,
}
......@@ -154,15 +214,133 @@ def _handle_placeholders(
return handled
def _build_index(library_dir: Path, index_path: Path) -> None:
def _profile_library(library_dir: Path) -> list[LibraryProfile]:
    """Read and normalize every lyric file under ``library_dir`` once.

    For each file the record is loaded with ``record_from_file`` and its
    lyrics are normalized with ``normalize_lyrics``. The resulting
    ``LibraryProfile`` keeps the path, the record, the normalized form and
    two size figures:

    * ``line_count`` - number of ``primary_lines``, or of ``unique_lines``
      when ``primary_lines`` is empty.
    * ``char_count`` - length of ``normalized_full_text``.

    Progress messages are emitted through ``_progress`` and
    ``_progress_count`` (every 5000 files).

    Returns:
        One profile per file, in the order ``iter_lyric_files`` yields them.
    """
    lyric_files = iter_lyric_files(library_dir)
    total = len(lyric_files)
    _progress(f"profile active library: 0/{total}")

    collected: list[LibraryProfile] = []
    for position, lyric_path in enumerate(lyric_files, start=1):
        lyric_record = record_from_file(lyric_path, base_dir=library_dir)
        lyric_norm = normalize_lyrics(lyric_record.lyrics)
        # Prefer primary lines; fall back to unique lines when there are none.
        effective_lines = lyric_norm.primary_lines or lyric_norm.unique_lines
        full_text = lyric_norm.normalized_full_text
        entry = LibraryProfile(
            path=lyric_path,
            record=lyric_record,
            normalized=lyric_norm,
            line_count=len(effective_lines),
            char_count=len(full_text),
        )
        collected.append(entry)
        _progress_count("profile active library", position, total, step=5000)
    return collected
def _build_index_from_profiles(profiles: list[LibraryProfile], index_path: Path) -> int:
    """Build an index from already-normalized profiles and save it.

    Args:
        profiles: Profiles whose ``record`` and ``normalized`` attributes are
            added to a fresh ``DuplicateChecker``.
        index_path: Destination file for the saved index; its parent
            directory is created when missing.

    Returns:
        ``checker.record_count`` after every profile has been added.
    """
    checker = DuplicateChecker()
    # Index only the supplied profiles. This function receives no library
    # directory, so it must not rescan files from disk: doing so referenced an
    # undefined name and would have added records a second time.
    for index, profile in enumerate(profiles, start=1):
        checker.add_normalized_record(profile.record, profile.normalized)
        _progress_count("build index", index, len(profiles), step=5000)
    index_path.parent.mkdir(parents=True, exist_ok=True)
    checker.save(index_path)
    return checker.record_count
def _deduplicate_and_build_index(
    profiles: list[LibraryProfile],
    *,
    library_dir: Path,
    index_path: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
    dry_run: bool,
) -> tuple[int, list[dict[str, object]], list[str]]:
    """Deduplicate the library in one pass and index the records that stay.

    Profiles are visited in ``_profile_quality_key`` order. Each one is
    checked against the records accepted so far. When the decision is
    ``DuplicateDecision.DUPLICATE`` and a candidate exists, a report row is
    produced; otherwise the profile is added to the index.

    Args:
        profiles: Profiles of the active library.
        library_dir: Library root, forwarded to ``_handle_duplicate_file``.
        index_path: Where the index is saved (skipped on dry runs).
        duplicate_quarantine_dir: Forwarded to ``_handle_duplicate_file``.
        delete: Forwarded to ``_handle_duplicate_file``.
        dry_run: When true, duplicate files are not touched and the index is
            not saved; report rows are still collected.

    Returns:
        ``(checker.record_count, duplicate_rows, duplicate_actions)``.
    """
    checker = DuplicateChecker()
    report_rows: list[dict[str, object]] = []
    actions: list[str] = []

    queue = sorted(profiles, key=_profile_quality_key)
    total = len(queue)
    _progress(f"deduplicate active library: 0/{total}")

    for position, profile in enumerate(queue, start=1):
        verdict = checker.check_record(profile.record, max_candidates=1)
        top = verdict.candidates[0] if verdict.candidates else None
        is_duplicate = verdict.decision == DuplicateDecision.DUPLICATE and top is not None

        if not is_duplicate:
            # Kept records become the reference set for later profiles.
            checker.add_normalized_record(profile.record, profile.normalized)
        else:
            row: dict[str, object] = {
                "duplicate_path": str(profile.path),
                "duplicate_record_id": profile.record.record_id,
                "kept_record_id": top.record_id,
                "decision": verdict.decision.value,
                "confidence": verdict.confidence,
                "reason": verdict.reason,
                "best_candidate_jaccard": top.jaccard,
                "best_candidate_line_coverage": top.line_coverage,
                "best_candidate_primary_jaccard": top.primary_jaccard,
                "best_candidate_primary_line_coverage": top.primary_line_coverage,
                "matched_unique_lines": " | ".join(top.matched_unique_lines),
                "line_count": profile.line_count,
                "char_count": profile.char_count,
            }
            report_rows.append(row)
            if not dry_run:
                outcome = _handle_duplicate_file(
                    profile.path,
                    library_dir=library_dir,
                    duplicate_quarantine_dir=duplicate_quarantine_dir,
                    delete=delete,
                )
                actions.append(outcome)

        _progress_count("deduplicate active library", position, total, step=5000)

    if not dry_run:
        index_path.parent.mkdir(parents=True, exist_ok=True)
        checker.save(index_path)
    return checker.record_count, report_rows, actions
def _handle_duplicate_file(
    path: Path,
    *,
    library_dir: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
) -> str:
    """Delete a duplicate lyric file or move it into the quarantine tree.

    Args:
        path: The duplicate file to dispose of.
        library_dir: Library root; the file's path relative to it is
            reproduced under the quarantine directory.
        duplicate_quarantine_dir: Root of the quarantine tree (created on
            demand).
        delete: When true the file is unlinked instead of moved.

    Returns:
        ``"deleted:<path>"`` or ``"moved:<path>-><destination>"``.

    Raises:
        ValueError: If ``path`` does not resolve to a location inside
            ``library_dir`` (raised by ``Path.relative_to``).
    """
    if delete:
        path.unlink()
        return f"deleted:{path}"

    duplicate_quarantine_dir.mkdir(parents=True, exist_ok=True)
    relative = path.resolve().relative_to(library_dir.resolve())
    destination = duplicate_quarantine_dir / relative
    destination.parent.mkdir(parents=True, exist_ok=True)

    if destination.exists():
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        candidate = destination.with_name(f"{destination.stem}_{stamp}{destination.suffix}")
        # The timestamp only has one-second resolution. If that name is taken
        # as well, keep counting: shutil.move may silently replace an existing
        # file, which would destroy a previously quarantined duplicate.
        attempt = 0
        while candidate.exists():
            attempt += 1
            candidate = destination.with_name(
                f"{destination.stem}_{stamp}_{attempt}{destination.suffix}"
            )
        destination = candidate

    shutil.move(str(path), str(destination))
    return f"moved:{path}->{destination}"
def _profile_quality_key(profile: LibraryProfile) -> tuple[int, int, int, str]:
    """Return an ascending sort key that places preferred profiles first.

    Ordering: files whose name does not start with ``"None_"`` come before
    those that do; then more lines, then more characters (both negated so
    that larger values sort earlier); the path string breaks remaining ties.
    """
    name_penalty = int(profile.path.name.startswith("None_"))
    fewer_lines_last = -profile.line_count
    fewer_chars_last = -profile.char_count
    return name_penalty, fewer_lines_last, fewer_chars_last, str(profile.path)
def _write_duplicate_report(rows: list[dict[str, object]], report_path: Path) -> None:
    """Write ``rows`` to ``report_path`` as a UTF-8 CSV file.

    Nothing is written (and no directory is created) when ``rows`` is empty.
    The column order is taken from the keys of the first row.
    """
    if not rows:
        return

    columns = list(rows[0])
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with open(report_path, "w", encoding="utf-8", newline="") as handle:
        report = csv.DictWriter(handle, fieldnames=columns)
        report.writeheader()
        report.writerows(rows)
def _effective_line_report(library_dir: Path) -> dict[str, int]:
    """Profile ``library_dir`` and return its effective-line bucket counts.

    Convenience wrapper: builds the profiles with ``_profile_library`` and
    passes them to ``_effective_line_report_from_profiles``.
    """
    library_profiles = _profile_library(library_dir)
    return _effective_line_report_from_profiles(library_profiles)
def _effective_line_report_from_profiles(profiles: list[LibraryProfile]) -> dict[str, int]:
buckets = {
"total": 0,
"zero_effective_lines": 0,
......@@ -170,10 +348,9 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
"four_to_five_effective_lines": 0,
"six_plus_effective_lines": 0,
}
for path in iter_lyric_files(library_dir):
for profile in profiles:
buckets["total"] += 1
normalized = normalize_lyrics(read_lyric_file(path))
line_count = len(normalized.primary_lines or normalized.unique_lines)
line_count = profile.line_count
if line_count == 0:
buckets["zero_effective_lines"] += 1
elif line_count <= 3:
......@@ -185,5 +362,16 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
return buckets
def _progress(message: str) -> None:
    """Print ``message`` to stderr with a ``[process-library]`` prefix.

    The stream is flushed on every call so progress is visible immediately.
    """
    tagged = f"[process-library] {message}"
    print(tagged, file=sys.stderr, flush=True)
def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``label: current/total`` at the first, last and every ``step``-th item.

    Nothing is reported when ``total`` is zero or negative.
    """
    if total <= 0:
        return
    at_boundary = current == 1 or current == total
    if at_boundary or current % step == 0:
        _progress(f"{label}: {current}/{total}")
# Script entry point: run main() only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
......
......@@ -308,9 +308,11 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None:
assert manifest["library_files"] == 12
assert manifest["sample_size"] == 30
assert manifest["unique_source_records"] > 1
assert manifest["holdout_records"] > 1
assert (tmp_path / "generated" / "eval.csv.index.pkl").exists()
assert "positive_full_duplicate" in manifest["plan"]
assert "negative_real_holdout_full_song" in negative_types
assert "negative_fragment" in negative_types
assert "negative_hard_candidate" in negative_types
assert all(row["expected"] == "不应去重" for row in rows if row["sample_type"].startswith("negative_"))
......