# eval_dataset.py

"""Generate production-style labeled evaluation samples from a lyric library."""

from __future__ import annotations

import csv
import hashlib
import json
import random
import re
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import DuplicateDecision
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import NormalizedLyrics
from lyric_dedup.normalization import fingerprint_text
from lyric_dedup.normalization import normalize_lyrics


# Default share of each sample category in a generated evaluation set.
# `_sample_plan` replaces the "positive_full_duplicate" share with the caller's
# positive ratio and rescales the remaining categories proportionally.
DEFAULT_SAMPLE_MIX = {
    "positive_full_duplicate": 0.30,
    "negative_random_unrelated": 0.20,
    "negative_hard_candidate": 0.25,
    "negative_fragment": 0.10,
    "negative_shared_chorus": 0.05,
    "negative_translation_only": 0.05,
    "edge_short_or_placeholder": 0.05,
}


@dataclass(frozen=True)
class LyricProfile:
    """Immutable per-file summary of one library lyric, built by ``profile_library``."""

    path: Path  # lyric file the profile was built from
    record_id: str  # record id of the imported record
    title: str  # record title, "" when the record has none
    artist: str  # record artist, "" when the record has none
    normalized: NormalizedLyrics  # result of normalize_lyrics() on the record's lyrics
    line_count: int  # number of primary lines (unique lines when there are no primary lines)
    char_count: int  # length of the normalized text used for hashing
    line_count_bucket: str  # "zero" / "short" / "normal" / "long"
    language_bucket: str  # "zh" / "latin" / "mixed" / "jp_kr" / "other"
    source_bucket: str  # code derived from the file-name suffix, "unknown" when absent
    normalized_hash: str  # SHA-256 hex digest of the normalized text
    has_translation: bool  # True when normalization produced translation lines


@dataclass(frozen=True)
class GeneratedSample:
    """One labeled evaluation sample; each instance becomes one row of the CSV."""

    sample_id: str  # "sample-NNNNN"; written to the CSV "id" column
    file: str  # sample file path relative to the CSV's directory
    expected: str  # expected label: "应去重" for positives, "不应去重" otherwise
    sample_type: str  # category or variant name, e.g. "negative_fragment"
    source: str  # path of the source library file, or "synthetic"
    source_record_id: str = ""  # record id of the source profile, when there is one
    candidate_record_id: str = ""  # index candidate id recorded for hard-candidate negatives
    line_count_bucket: str = ""  # copied from the source profile
    language_bucket: str = ""  # copied from the source profile
    source_bucket: str = ""  # copied from the source profile
    title: str = ""  # copied from the source profile
    artist: str = ""  # copied from the source profile
    notes: str = ""  # free-text explanation of how the sample was built


def generate_eval_set(
    *,
    library_dir: Path,
    output_dir: Path,
    csv_path: Path,
    size: int = 100,
    positive_ratio: float = 0.30,
    seed: int = 20260602,
    index_path: Path | None = None,
) -> dict[str, object]:
    """Generate a stratified production evaluation set.

    Sample files are written into ``output_dir``, one CSV row per sample is
    written to ``csv_path``, and a JSON manifest is written next to the CSV.

    ``positive_ratio`` is kept for CLI compatibility. It overrides the default
    positive quota while keeping the remaining negative categories proportional.

    Args:
        library_dir: Directory scanned for library lyric files.
        output_dir: Directory that receives the generated sample files. Existing
            ``.txt``/``.lrc`` files directly inside it are deleted first.
        csv_path: Destination of the labeled CSV; sample paths in the CSV are
            relative to its parent directory.
        size: Total number of samples to generate; must be positive.
        positive_ratio: Share of positive (duplicate) samples.
        seed: Seed of the random generator used for every random choice.
        index_path: Optional duplicate index; when given it is loaded and used
            to record a candidate id for hard-candidate negatives.

    Returns:
        The manifest dictionary that was also written to disk.

    Raises:
        ValueError: If ``size`` is not positive, the library has no lyric
            files, ``output_dir`` is not located under the CSV directory, or
            ``output_dir`` directly contains library lyric files.
    """
    if size <= 0:
        raise ValueError("size must be positive")

    rng = random.Random(seed)
    profiles = profile_library(library_dir)
    if not profiles:
        raise ValueError(f"{library_dir} 下没有 .lrc/.txt 歌词文件")

    csv_base = csv_path.parent
    # Sample paths are stored relative to the CSV directory. Check this before
    # anything is deleted or written instead of failing halfway through.
    try:
        output_dir.relative_to(csv_base)
    except ValueError as exc:
        raise ValueError(
            f"output_dir ({output_dir}) must be located under the CSV directory ({csv_base})"
        ) from exc

    # Cleaning removes every .txt/.lrc file directly inside output_dir; refuse
    # to run when that would delete the library files we are sampling from.
    resolved_output = output_dir.resolve()
    if any(profile.path.resolve().parent == resolved_output for profile in profiles):
        raise ValueError(
            f"output_dir ({output_dir}) contains library lyric files; "
            "choose a separate directory for generated samples"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    csv_base.mkdir(parents=True, exist_ok=True)
    _clean_generated_output_dir(output_dir)

    checker = DuplicateChecker.load(index_path) if index_path else None
    plan = _sample_plan(size, positive_ratio=positive_ratio)
    groups = _profile_groups(profiles)
    samples: list[GeneratedSample] = []

    def next_index() -> int:
        # Sample numbering is 1-based and continues across categories.
        return len(samples) + 1

    # The category order below is fixed: every builder consumes the shared rng,
    # so reordering would change the output produced for a given seed.
    samples.extend(
        _build_positive_samples(
            _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_random_unrelated_samples(
            plan["negative_random_unrelated"],
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_hard_candidate_samples(
            groups["normal"],
            plan["negative_hard_candidate"],
            output_dir,
            csv_base,
            rng,
            checker=checker,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_fragment_samples(
            _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_shared_chorus_samples(
            _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_translation_only_samples(
            _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    samples.extend(
        _build_edge_samples(
            _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )

    # Top up with synthetic negatives if any category produced fewer samples
    # than planned, then cap at the requested size.
    if len(samples) < size:
        samples.extend(
            _build_random_unrelated_samples(
                size - len(samples),
                output_dir,
                csv_base,
                rng,
                start_index=next_index(),
            )
        )
    samples = samples[:size]
    rng.shuffle(samples)

    _write_csv(samples, csv_path, seed=seed)
    return _write_manifest(
        profiles=profiles,
        samples=samples,
        csv_path=csv_path,
        output_dir=output_dir,
        seed=seed,
        plan=plan,
        index_path=index_path,
    )


def profile_library(library_dir: Path) -> list[LyricProfile]:
    """Build one ``LyricProfile`` for every lyric file found under ``library_dir``."""

    def build(lyric_path: Path) -> LyricProfile:
        # Import the file as a record and normalize its lyrics once.
        record = record_from_file(lyric_path, base_dir=library_dir)
        normalized = normalize_lyrics(record.lyrics)
        # Primary lines are preferred; unique lines are the fallback.
        effective_lines = normalized.primary_lines or normalized.unique_lines
        effective_count = len(effective_lines)
        # The fingerprint text is preferred for hashing; fall back to the full text.
        hash_text = fingerprint_text(normalized) or normalized.normalized_full_text
        bucket_from_name = _source_bucket(lyric_path)
        digest = hashlib.sha256(hash_text.encode("utf-8")).hexdigest()
        return LyricProfile(
            path=lyric_path,
            record_id=record.record_id,
            title=record.title or "",
            artist=record.artist or "",
            normalized=normalized,
            line_count=effective_count,
            char_count=len(hash_text),
            line_count_bucket=_line_count_bucket(effective_count),
            language_bucket=_language_bucket(effective_lines),
            source_bucket=bucket_from_name,
            normalized_hash=digest,
            has_translation=bool(normalized.translation_lines),
        )

    return [build(lyric_path) for lyric_path in iter_lyric_files(library_dir)]


def _sample_plan(
    size: int,
    *,
    positive_ratio: float,
    mix: dict[str, float] | None = None,
) -> dict[str, int]:
    """Turn category weights into integer per-category sample quotas.

    The positive category receives ``positive_ratio`` (clamped to [0, 1]); the
    remaining share is split across the other categories in proportion to
    their weights in the mix.

    Quotas are rounded with the largest-remainder method, using a small
    tolerance so that floating-point noise (e.g. 19.999999999999996) is not
    floored to the wrong integer. Ties are broken in favour of the heavier
    category, then by mix order.

    Args:
        size: Total number of samples to distribute.
        positive_ratio: Share of ``"positive_full_duplicate"`` samples.
        mix: Optional category weights; defaults to ``DEFAULT_SAMPLE_MIX``.

    Returns:
        Mapping of category name to sample count. The counts sum to ``size``
        whenever the weights sum to 1.
    """
    positive_key = "positive_full_duplicate"
    positive_ratio = max(0.0, min(1.0, positive_ratio))
    base_mix = dict(DEFAULT_SAMPLE_MIX if mix is None else mix)
    negative_total = sum(value for key, value in base_mix.items() if key != positive_key)

    weights: dict[str, float] = {}
    for key, value in base_mix.items():
        if key == positive_key:
            weights[key] = positive_ratio
        elif negative_total > 0:
            weights[key] = (1.0 - positive_ratio) * (value / negative_total)
        else:
            # A mix without negative weight cannot be rescaled.
            weights[key] = 0.0
    weights.setdefault(positive_key, positive_ratio)

    quotas = {key: size * weight for key, weight in weights.items()}
    # The tolerance keeps values a hair below an integer from losing a whole sample.
    plan = {key: int(quota + 1e-9) for key, quota in quotas.items()}

    remainder = size - sum(plan.values())
    if remainder > 0:
        # Fractions are rounded so that float noise cannot decide the ranking.
        ranked = sorted(
            weights,
            key=lambda key: (round(quotas[key] - plan[key], 6), weights[key]),
            reverse=True,
        )
        for key in ranked[:remainder]:
            plan[key] += 1
    return plan


def _profile_groups(profiles: list[LyricProfile]) -> dict[str, list[LyricProfile]]:
    """Split profiles into the pools each sample category draws from.

    Every pool falls back to the "normal" profiles, and then to all profiles,
    when its own filter matches nothing.
    """
    foreign_buckets = {"latin", "mixed", "jp_kr"}
    normal: list[LyricProfile] = []
    edge: list[LyricProfile] = []
    fragmentable: list[LyricProfile] = []
    foreign: list[LyricProfile] = []

    for candidate in profiles:
        lines = candidate.line_count
        if lines >= 6:
            normal.append(candidate)
        if lines <= 5:
            edge.append(candidate)
        if lines >= 12:
            fragmentable.append(candidate)
        if candidate.language_bucket in foreign_buckets and lines >= 4:
            foreign.append(candidate)

    groups: dict[str, list[LyricProfile]] = {}
    groups["normal"] = normal or profiles
    groups["fragmentable"] = fragmentable or normal or profiles
    groups["foreign"] = foreign or normal or profiles
    groups["edge"] = edge or normal or profiles
    return groups


def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Pick ``count`` profiles, spreading picks across strata round-robin.

    A stratum is the (line-count bucket, language bucket, source bucket)
    triple. Strata are visited in a shuffled order, one profile per stratum per
    round, until ``count`` is reached or every stratum is exhausted. Any
    shortfall is filled by drawing from ``profiles`` with replacement.
    """
    if not profiles or count <= 0:
        return []

    strata: dict[tuple[str, str, str], list[LyricProfile]] = {}
    for member in profiles:
        stratum = (member.line_count_bucket, member.language_bucket, member.source_bucket)
        if stratum in strata:
            strata[stratum].append(member)
        else:
            strata[stratum] = [member]

    # rng usage order matters for reproducibility: first the visiting order,
    # then one shuffled copy per stratum in insertion order.
    visit_order = list(strata)
    rng.shuffle(visit_order)
    shuffled = {stratum: rng.sample(members, len(members)) for stratum, members in strata.items()}

    picked: list[LyricProfile] = []
    deepest = max(len(members) for members in shuffled.values())
    # Round ``depth`` takes the depth-th element from the end of each stratum
    # that still has that many members.
    for depth in range(1, deepest + 1):
        if len(picked) >= count:
            break
        for stratum in visit_order:
            if len(picked) >= count:
                break
            members = shuffled[stratum]
            if len(members) >= depth:
                picked.append(members[-depth])

    # Not enough distinct profiles: pad with random repeats.
    picked.extend(rng.choice(profiles) for _ in range(count - len(picked)))
    return picked


def _build_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one "should dedupe" variant of each profile's lyric file.

    The variant type cycles through the list below by position, so consecutive
    profiles receive different kinds of noise.
    """
    built: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        raw_text = read_lyric_file(profile.path)
        content = _content_lines(raw_text)
        # All variants are built on every iteration; the punctuation variant
        # draws from rng, which keeps the random stream position stable.
        candidates = (
            ("positive_exact_copy", raw_text),
            ("positive_timestamped", _add_timestamps(content)),
            ("positive_punctuation_noise", _add_punctuation_noise(content, rng)),
            ("positive_platform_noise", _with_platform_noise(content)),
            ("positive_blank_line_noise", _add_blank_line_noise(content)),
            ("positive_chorus_count_changed", _change_repeated_line_counts(content)),
            ("positive_translation_added", _translation_added(content)),
        )
        variant_name, variant_text = candidates[(index - start_index) % len(candidates)]
        sample_path = _write_sample_file(output_dir, f"pos_{index:05d}_{variant_name}.txt", variant_text)
        built.append(
            _sample_from_profile(index, sample_path, csv_base, "应去重", variant_name, profile)
        )
    return built


def _build_random_unrelated_samples(
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write ``count`` fully synthetic lyrics labeled as "should not dedupe"."""
    built: list[GeneratedSample] = []
    for index in range(start_index, start_index + count):
        lyric = _same_theme_synthetic(index, rng)
        file_name = f"neg_{index:05d}_negative_random_unrelated.txt"
        sample_path = _write_sample_file(output_dir, file_name, lyric)
        sample = GeneratedSample(
            sample_id=f"sample-{index:05d}",
            file=str(sample_path.relative_to(csv_base)),
            expected="不应去重",
            sample_type="negative_random_unrelated",
            source="synthetic",
            notes="same-theme synthetic full lyric not copied from library",
        )
        built.append(sample)
    return built


def _build_hard_candidate_samples(
    profiles: list[LyricProfile],
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    checker: DuplicateChecker | None,
    start_index: int,
) -> list[GeneratedSample]:
    """Write up to ``count`` negatives that share a couple of real lines.

    When a ``checker`` is supplied, the generated text is run through it and a
    candidate record id is stored on the sample: the first candidate that is
    not the source record and is not decided as NEW, otherwise the top
    candidate, otherwise "".
    """
    if count <= 0:
        return []

    built: list[GeneratedSample] = []
    # Three times the quota is drawn, but the loop stops once the quota is met.
    for profile in _stratified_sample(profiles, count * 3, rng):
        if len(built) >= count:
            break
        normalized = profile.normalized
        source_lines = list(normalized.primary_lines or normalized.unique_lines)
        text = _short_shared_snippet(source_lines, rng)

        candidate_id = ""
        if checker is not None:
            result = checker.check(text, max_candidates=5)
            # Default to the top candidate, then prefer a non-source, non-NEW one.
            chosen = result.candidates[0] if result.candidates else None
            for item in result.candidates:
                if item.record_id != profile.record_id and item.decision != DuplicateDecision.NEW:
                    chosen = item
                    break
            candidate_id = chosen.record_id if chosen else ""

        index = start_index + len(built)
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_hard_candidate.txt", text)
        built.append(
            _sample_from_profile(
                index,
                sample_path,
                csv_base,
                "不应去重",
                "negative_hard_candidate",
                profile,
                candidate_record_id=candidate_id,
                notes="shares a few real lines plus new filler; should not auto duplicate",
            )
        )
    return built


def _build_fragment_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one partial-lyric negative per profile."""
    built: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        normalized = profile.normalized
        source_lines = list(normalized.primary_lines or normalized.unique_lines)
        fragment = _single_song_fragment(source_lines, rng)
        file_name = f"neg_{index:05d}_negative_fragment.txt"
        sample_path = _write_sample_file(output_dir, file_name, fragment)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "negative_fragment",
            profile,
            notes="partial lyric fragment only",
        )
        built.append(sample)
    return built


def _build_shared_chorus_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one negative per profile that reuses only its repeated lines.

    Each sample wraps up to two repeated (or, failing that, sampled) lines of
    the source lyric, emitted twice, in otherwise new synthetic content.

    Args:
        profiles: Source profiles, one sample is produced for each.
        output_dir: Directory that receives the sample files.
        csv_base: Directory the recorded file paths are made relative to.
        rng: Random generator used to choose the shared lines.
        start_index: Sample number assigned to the first profile.

    Returns:
        The generated samples, in the order of ``profiles``.
    """
    samples: list[GeneratedSample] = []
    for offset, profile in enumerate(profiles):
        repeated = _repeated_or_sampled_lines(profile.normalized, rng)
        text = "\n".join(
            [
                "清晨的光落在新的街口",
                "我把故事重新写给以后",
                # The shared lines appear twice to imitate a repeated chorus.
                *repeated,
                *repeated,
                "所有答案都从这里开始",
            ]
        )
        index = start_index + offset
        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_shared_chorus.txt", text)
        samples.append(
            _sample_from_profile(
                index,
                path,
                csv_base,
                "不应去重",
                "negative_shared_chorus",
                profile,
                notes="shared repeated lines with new surrounding content",
            )
        )
    return samples


def _build_translation_only_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one negative per profile that contains translation text only.

    The profile's own translation lines are used when it has any; otherwise
    between four and eight canned pseudo-translations stand in. The lines are
    shuffled and at most eight are kept.
    """
    built: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        translated = list(profile.normalized.translation_lines)
        if not translated:
            fallback_len = min(8, max(profile.line_count, 4))
            translated = [_pseudo_translation(number) for number in range(1, fallback_len + 1)]
        rng.shuffle(translated)
        body = "\n".join(translated[:8])
        file_name = f"neg_{index:05d}_negative_translation_only.txt"
        sample_path = _write_sample_file(output_dir, file_name, body)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "negative_translation_only",
            profile,
            notes="translation-like text without matching original lyric",
        )
        built.append(sample)
    return built


def _build_edge_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one edge-case negative per (typically very short) profile.

    Profiles with at most one effective line get a fully synthetic lyric;
    longer ones get a synthetic lyric that embeds up to two of their lines.
    """
    built: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        if profile.line_count > 1:
            normalized = profile.normalized
            source_lines = list(normalized.primary_lines or normalized.unique_lines)
            body = _short_shared_snippet(source_lines, rng)
            remark = "short lyric edge case with limited overlap"
        else:
            body = _same_theme_synthetic(index, rng)
            remark = "zero or one effective line; use synthetic edge negative"
        file_name = f"neg_{index:05d}_edge_short_or_placeholder.txt"
        sample_path = _write_sample_file(output_dir, file_name, body)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "edge_short_or_placeholder",
            profile,
            notes=remark,
        )
        built.append(sample)
    return built


def _sample_from_profile(
    index: int,
    path: Path,
    csv_base: Path,
    expected: str,
    sample_type: str,
    profile: LyricProfile,
    *,
    candidate_record_id: str = "",
    notes: str = "",
) -> GeneratedSample:
    """Create the CSV record for a sample file derived from ``profile``.

    The stored file path is ``path`` relative to ``csv_base``; the bucket,
    title and artist fields are copied from the profile.
    """
    identifier = f"sample-{index:05d}"
    relative_file = str(path.relative_to(csv_base))
    # Fields inherited unchanged from the source profile.
    inherited = {
        "source_record_id": profile.record_id,
        "line_count_bucket": profile.line_count_bucket,
        "language_bucket": profile.language_bucket,
        "source_bucket": profile.source_bucket,
        "title": profile.title,
        "artist": profile.artist,
    }
    return GeneratedSample(
        sample_id=identifier,
        file=relative_file,
        expected=expected,
        sample_type=sample_type,
        source=str(profile.path),
        candidate_record_id=candidate_record_id,
        notes=notes,
        **inherited,
    )


def _write_sample_file(output_dir: Path, name: str, text: str) -> Path:
    """Write ``text`` (stripped, with one trailing newline) to ``output_dir / name``.

    Returns the path of the written file.
    """
    target = output_dir.joinpath(name)
    payload = f"{text.strip()}\n"
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)
    return target


def _write_csv(samples: list[GeneratedSample], csv_path: Path, *, seed: int) -> None:
    """Write the samples to ``csv_path`` as UTF-8 CSV with a header row.

    Every row also carries the generation ``seed``.
    """
    header = [
        "id",
        "file",
        "expected",
        "sample_type",
        "source",
        "source_record_id",
        "candidate_record_id",
        "line_count_bucket",
        "language_bucket",
        "source_bucket",
        "title",
        "artist",
        "seed",
        "notes",
    ]
    # Columns whose name differs from the sample attribute they are read from.
    attribute_for = {"id": "sample_id"}

    def as_row(sample: GeneratedSample) -> dict[str, object]:
        row: dict[str, object] = {}
        for column in header:
            if column == "seed":
                row[column] = seed
            else:
                row[column] = getattr(sample, attribute_for.get(column, column))
        return row

    with csv_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=header)
        writer.writeheader()
        writer.writerows(as_row(sample) for sample in samples)


def _write_manifest(
    *,
    profiles: list[LyricProfile],
    samples: list[GeneratedSample],
    csv_path: Path,
    output_dir: Path,
    seed: int,
    plan: dict[str, int],
    index_path: Path | None,
) -> dict[str, object]:
    """Summarize the run, save it as JSON next to the CSV, and return it.

    The manifest file name is the CSV file name with ``.manifest.json``
    appended. Bucket counts describe the whole library; type and label counts
    describe the generated samples.
    """
    manifest_path = csv_path.with_suffix(csv_path.suffix + ".manifest.json")

    def tally(values) -> dict:
        return dict(Counter(values))

    source_buckets = Counter(profile.source_bucket for profile in profiles)
    distinct_sources = {sample.source_record_id for sample in samples if sample.source_record_id}

    manifest: dict[str, object] = {}
    manifest["seed"] = seed
    manifest["library_files"] = len(profiles)
    manifest["sample_size"] = len(samples)
    manifest["plan"] = plan
    manifest["index"] = str(index_path) if index_path else ""
    manifest["lyrics_dir"] = str(output_dir)
    manifest["csv"] = str(csv_path)
    manifest["manifest"] = str(manifest_path)
    manifest["sample_type_counts"] = tally(sample.sample_type for sample in samples)
    manifest["expected_counts"] = tally(sample.expected for sample in samples)
    manifest["line_count_bucket_counts"] = tally(profile.line_count_bucket for profile in profiles)
    manifest["language_bucket_counts"] = tally(profile.language_bucket for profile in profiles)
    # Only the 50 most common source buckets are reported.
    manifest["source_bucket_counts"] = dict(source_buckets.most_common(50))
    manifest["unique_source_records"] = len(distinct_sources)

    serialized = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(serialized, encoding="utf-8")
    return manifest


def _content_lines(text: str) -> list[str]:
    """Return the stripped, non-blank lines of ``text``.

    When there are none, a single-element list holding the stripped text is
    returned, so the result is never empty.
    """
    kept: list[str] = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    if kept:
        return kept
    return [text.strip()]


def _clean_generated_output_dir(output_dir: Path) -> None:
    """Delete the ``.txt``/``.lrc`` files directly inside ``output_dir``.

    The suffix match is case-insensitive; sub-directories and other file types
    are left untouched.
    """
    removable_suffixes = {".txt", ".lrc"}
    for entry in output_dir.iterdir():
        if entry.suffix.lower() not in removable_suffixes:
            continue
        if not entry.is_file():
            continue
        entry.unlink()


def _line_count_bucket(line_count: int) -> str:
    """Classify a line count as "zero", "short" (<= 5), "normal" (<= 40) or "long"."""
    if line_count == 0:
        return "zero"
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((5, "short"), (40, "normal")):
        if line_count <= upper_bound:
            return label
    return "long"


def _language_bucket(lines: tuple[str, ...]) -> str:
    """Classify lyric lines by the scripts they contain.

    Any kana or hangul gives "jp_kr"; CJK ideographs together with Latin
    letters give "mixed"; ideographs alone give "zh"; Latin letters alone give
    "latin"; anything else gives "other".
    """
    joined = "\n".join(lines)

    def contains(pattern: str) -> bool:
        return re.search(pattern, joined) is not None

    if contains(r"[\u3040-\u30ff]") or contains(r"[\uac00-\ud7af]"):
        return "jp_kr"

    has_cjk = contains(r"[\u4e00-\u9fff]")
    has_latin = contains(r"[A-Za-z]")
    if has_cjk:
        return "mixed" if has_latin else "zh"
    return "latin" if has_latin else "other"


def _source_bucket(path: Path) -> str:
    """Derive a source code from the file name.

    The code is the text after the last underscore of the file stem with any
    trailing digits removed; "unknown" when there is no underscore or nothing
    is left.
    """
    _, separator, suffix = path.stem.rpartition("_")
    if not separator:
        return "unknown"
    code = re.sub(r"\d+$", "", suffix)
    if code:
        return code
    return "unknown"


def _add_timestamps(lines: list[str]) -> str:
    """Prefix each line with an LRC-style ``[mm:ss.00]`` timestamp.

    Lines are stamped one second apart, starting at ``[00:01.00]``. Seconds
    roll over into minutes, so the timestamps stay strictly increasing for
    lyrics of 60 lines or more instead of wrapping back to ``[00:00.00]``.

    Args:
        lines: Lyric lines to stamp.

    Returns:
        The stamped lines joined with newlines ("" for an empty list).
    """
    stamped = []
    for position, line in enumerate(lines, start=1):
        minutes, seconds = divmod(position, 60)
        stamped.append(f"[{minutes:02d}:{seconds:02d}.00]{line}")
    return "\n".join(stamped)


def _add_punctuation_noise(lines: list[str], rng: random.Random) -> str:
    """Append one randomly chosen punctuation mark to every line."""
    marks = ["！", "?", "...", "，", "。"]
    noisy: list[str] = []
    for line in lines:
        # One rng draw per line, in line order.
        noisy.append(line + rng.choice(marks))
    return "\n".join(noisy)


def _with_platform_noise(lines: list[str]) -> str:
    """Wrap the lines in platform-style credit and copyright boilerplate."""
    framed = ["歌词来自QQ音乐", "作词：测试"]
    framed.extend(lines)
    framed.append("未经著作权人许可 不得翻唱")
    return "\n".join(framed)


def _add_blank_line_noise(lines: list[str]) -> str:
    """Insert an empty line after every fourth line."""
    spaced: list[str] = []
    for position, line in enumerate(lines):
        spaced.append(line)
        # Zero-based positions 3, 7, 11, ... are the 4th, 8th, 12th, ... lines.
        if position % 4 == 3:
            spaced.append("")
    return "\n".join(spaced)


def _change_repeated_line_counts(lines: list[str]) -> str:
    """Keep only the first occurrence of every line, preserving order."""
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    first_occurrences = dict.fromkeys(lines)
    return "\n".join(first_occurrences)


def _translation_added(lines: list[str]) -> str:
    """Follow each foreign-looking line with a canned translation line.

    Only the first 24 lines are eligible for a translation.
    """
    augmented: list[str] = []
    for position, line in enumerate(lines):
        augmented.append(line)
        number = position + 1
        if number <= 24 and _looks_foreign(line):
            augmented.append(_pseudo_translation(number))
    return "\n".join(augmented)


def _single_song_fragment(lines: list[str], rng: random.Random) -> str:
    """Cut a short contiguous fragment out of one song.

    Songs of four lines or fewer yield their first half (at least one line).
    Longer songs yield a randomly placed window of two to eight lines, sized
    as a third, quarter or fifth of the song.
    """
    total = len(lines)
    if total <= 4:
        keep = max(1, total // 2)
        return "\n".join(lines[:keep])

    divisor = rng.choice([3, 4, 5])
    window = min(8, total // divisor)
    if window < 2:
        window = 2
    # Exclusive upper bound for the start so the window stays inside the song.
    start_limit = max(1, total - window + 1)
    first = rng.randrange(0, start_limit)
    return "\n".join(lines[first : first + window])


def _short_shared_snippet(lines: list[str], rng: random.Random) -> str:
    """Embed up to two randomly sampled real lines in fixed synthetic filler."""
    shared: list[str] = []
    if lines:
        shared = rng.sample(lines, k=min(2, len(lines)))

    body = ["清晨的风吹过新的街口", "我把昨天放进安静的口袋"]
    body += shared
    body += ["故事从这里重新开始", "灯光落下我继续往前走"]
    return "\n".join(body)


def _repeated_or_sampled_lines(normalized: NormalizedLyrics, rng: random.Random) -> list[str]:
    """Pick up to two lines that can stand in for a song's chorus.

    Lines occurring at least twice are preferred; when there are none, the
    pick is made from the primary lines (unique lines as a fallback).
    """
    pool: list[str] = []
    for line, occurrences in normalized.line_counts.items():
        if occurrences >= 2:
            pool.append(line)
    if not pool:
        pool = list(normalized.primary_lines or normalized.unique_lines)
    if not pool:
        return []
    return rng.sample(pool, k=min(2, len(pool)))


def _same_theme_synthetic(index: int, rng: random.Random) -> str:
    """Compose a four-line synthetic lyric that is unique to ``index``.

    One opening, one middle and one closing line are drawn at random, followed
    by a line that embeds the sample index.
    """
    openings = ["我在夜里想起远方的你", "城市灯火陪我走过雨季", "风把旧名字吹向清晨"]
    bridges = ["那些没说完的话留在风里", "新的路口慢慢亮起", "时间把答案交给下一站"]
    closings = ["明天醒来我们各自继续", "我会把今天写成新的旋律", "故事从这里重新开始"]

    # Draw order is opening, middle, closing, one rng call each.
    composed = [rng.choice(options) for options in (openings, bridges, closings)]
    composed.append(f"这是第 {index} 个全新测试样本")
    return "\n".join(composed)


def _pseudo_translation(index: int) -> str:
    """Return the canned translation line for a 1-based ``index``, cycling through eight."""
    canned = (
        "今晚我仍然想念你",
        "风会带走所有疲惫",
        "黑暗里也会有光",
        "别让昨天困住自己",
        "我们终会继续向前",
        "雨停以后天空会亮",
        "把遗憾留在旧时光",
        "你已经足够好了",
    )
    slot = (index - 1) % len(canned)
    return canned[slot]


def _looks_foreign(line: str) -> bool:
    """Return True when the line has Latin letters and no CJK ideographs."""
    if re.search(r"[A-Za-z]", line) is None:
        return False
    return re.search(r"[\u4e00-\u9fff]", line) is None