eval_dataset.py 41.1 KB

Raw Blame History Permalink

"""Generate production-style labeled evaluation samples from a lyric library."""

from __future__ import annotations

import csv
import hashlib
import json
import random
import re
import sys
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

from lyric_dedup.checker import LyricRecord
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import NormalizedLyrics
from lyric_dedup.normalization import fingerprint_text
from lyric_dedup.normalization import normalize_lyrics


# Target share of each sample type for the "standard" profile. The fractions
# sum to 1.0; ``_sample_plan`` converts them into integer per-type counts and
# rescales the negative entries when a different positive ratio is requested.
STANDARD_SAMPLE_MIX = {
    "positive_full_duplicate": 0.30,
    "negative_real_holdout_full_song": 0.40,
    "negative_fragment": 0.10,
    "negative_shared_chorus": 0.05,
    "negative_translation_only": 0.05,
    "negative_same_theme_synthetic": 0.05,
    "edge_short_or_placeholder": 0.05,
}
# Alias bound to the very same dict object as STANDARD_SAMPLE_MIX (not a copy).
DEFAULT_SAMPLE_MIX = STANDARD_SAMPLE_MIX

# Target share of each sample type for the "hard" profile; fractions sum to 1.0.
# Its positive key is "positive_realistic_variant" and it adds near-neighbor
# holdouts, long fragments and catalog mashups as negative categories.
HARD_SAMPLE_MIX = {
    "positive_realistic_variant": 0.30,
    "negative_real_holdout_full_song": 0.20,
    "negative_near_neighbor_holdout_full_song": 0.20,
    "negative_long_fragment": 0.15,
    "negative_shared_chorus": 0.05,
    "negative_translation_only": 0.04,
    "negative_catalog_mashup": 0.04,
    "edge_short_or_placeholder": 0.02,
}


def _progress(message: str) -> None:
    print(f"[eval-gen] {message}", file=sys.stderr, flush=True)


def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``label: current/total`` for the first, last and every ``step``-th item.

    Nothing is reported when ``total`` is zero or negative.
    """
    if total <= 0:
        return
    # Short-circuit order matters: the modulo is only reached when the item is
    # neither the first nor the last one.
    at_milestone = current in (1, total) or current % step == 0
    if at_milestone:
        _progress(f"{label}: {current}/{total}")


@dataclass(frozen=True)
class LyricProfile:
    """Immutable summary of one library lyric file, as built by ``profile_library``."""

    # Location of the lyric file that was profiled.
    path: Path
    # Record id taken from the imported record.
    record_id: str
    # Lyrics text of the imported record, before normalization.
    raw_text: str
    # Title / artist of the record; empty string when the record has none.
    title: str
    artist: str
    # Result of ``normalize_lyrics(raw_text)``.
    normalized: NormalizedLyrics
    # Number of primary lines, or of unique lines when there are no primary lines.
    line_count: int
    # Length of the fingerprint text (or of the normalized full text when the
    # fingerprint is empty).
    char_count: int
    # Stratification keys: line-count class, script/language class, and a
    # source code derived from the file name.
    line_count_bucket: str
    language_bucket: str
    source_bucket: str
    # SHA-256 hex digest of the same text that ``char_count`` measures.
    normalized_hash: str
    # True when normalization produced any translation lines.
    has_translation: bool


@dataclass(frozen=True)
class GeneratedSample:
    """Immutable description of one generated evaluation sample (one CSV row)."""

    # Identifier of the form ``sample-00001``.
    sample_id: str
    # Path of the written sample file, relative to the CSV's directory.
    file: str
    # Expected label; the builders in this module use "应去重" for positives
    # and "不应去重" for negatives.
    expected: str
    # Category or variant name of the sample (e.g. "negative_fragment").
    sample_type: str
    # Where the text came from: a library file path, several paths joined
    # with " | " for mashups, or the literal "synthetic".
    source: str
    # Record id of the source profile; empty when the sample has no single
    # source profile.
    source_record_id: str = ""
    # NOTE(review): none of the builders visible in this module pass a value
    # here, so it stays empty — confirm the intended use with callers.
    candidate_record_id: str = ""
    # Stratification buckets, title and artist copied from the source profile.
    line_count_bucket: str = ""
    language_bucket: str = ""
    source_bucket: str = ""
    title: str = ""
    artist: str = ""
    # Free-text explanation of how the sample was constructed.
    notes: str = ""


def generate_eval_set(
    *,
    library_dir: Path,
    output_dir: Path,
    csv_path: Path,
    size: int = 100,
    positive_ratio: float = 0.30,
    seed: int = 20260602,
    index_path: Path | None = None,
    eval_index_path: Path | None = None,
    profile: str = "standard",
) -> dict[str, object]:
    """Generate a stratified production evaluation set.

    ``positive_ratio`` is kept for CLI compatibility. It overrides the default
    positive quota while keeping the remaining negative categories proportional.

    The library is profiled, a stratified subset is held out as "real full
    song" negatives, and the remaining profiles feed every other sample type.
    Sample files are written into ``output_dir``; a CSV and a JSON manifest
    describe them. Any shortfall against ``size`` is topped up with synthetic
    same-theme negatives, and the final list is shuffled.

    Args:
        library_dir: Directory scanned for lyric files.
        output_dir: Directory receiving the sample files. Existing ``.txt`` /
            ``.lrc`` files directly inside it are deleted first. It must be
            located under ``csv_path.parent`` because sample paths are stored
            relative to that directory.
        csv_path: Destination of the label CSV; the manifest is written next
            to it with ``.manifest.json`` appended to the file name.
        size: Number of samples to generate; must be positive.
        positive_ratio: Share of positive samples (clamped to ``[0, 1]``).
        seed: Seed of the private random generator driving all choices.
        index_path: Only recorded in the manifest.
        eval_index_path: Only recorded in the manifest.
        profile: ``"standard"`` or ``"hard"`` sample mix.

    Returns:
        The manifest dictionary that was also written to disk.

    Raises:
        ValueError: If ``size`` is not positive, ``profile`` is unknown,
            ``output_dir`` is the library directory itself, or the library
            contains no lyric files.
    """
    if size <= 0:
        raise ValueError("size must be positive")

    if profile not in {"standard", "hard"}:
        raise ValueError("profile must be 'standard' or 'hard'")

    # The output directory is wiped of .txt/.lrc files further down; running
    # with the library itself as output would delete the source lyrics.
    if output_dir.resolve() == library_dir.resolve():
        raise ValueError("output_dir must not be the same directory as library_dir")

    _progress(f"start generation: profile={profile}, size={size}, positive_ratio={positive_ratio}, seed={seed}")
    rng = random.Random(seed)
    profiles = profile_library(library_dir)
    if not profiles:
        raise ValueError(f"{library_dir} 下没有 .lrc/.txt 歌词文件")

    output_dir.mkdir(parents=True, exist_ok=True)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    _progress(f"clean output dir: {output_dir}")
    _clean_generated_output_dir(output_dir)

    plan = _sample_plan(size, positive_ratio=positive_ratio, profile=profile)
    _progress(f"sample plan: {plan}")
    # Never hold out more than half of the library (but at least one record
    # when the plan asks for holdouts at all).
    holdout_count = min(_holdout_plan_count(plan), max(1, len(profiles) // 2))
    holdout_profiles = _stratified_unique_sample(
        profiles,
        holdout_count,
        rng,
    )
    holdout_ids = {item.record_id for item in holdout_profiles}
    # Fall back to the whole library when holding out would leave nothing.
    indexed_profiles = [item for item in profiles if item.record_id not in holdout_ids] or profiles
    groups = _profile_groups(indexed_profiles)
    samples: list[GeneratedSample] = []

    if profile == "hard":
        samples.extend(
            _build_hard_samples(
                plan,
                groups=groups,
                holdout_profiles=holdout_profiles,
                indexed_profiles=indexed_profiles,
                output_dir=output_dir,
                csv_base=csv_path.parent,
                rng=rng,
                start_index=len(samples) + 1,
            )
        )
    else:
        _progress("build positive_full_duplicate samples")
        samples.extend(
            _build_positive_samples(
                _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build negative_real_holdout_full_song samples")
        samples.extend(
            _build_holdout_full_song_samples(
                holdout_profiles[: plan["negative_real_holdout_full_song"]],
                output_dir,
                csv_path.parent,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build negative_fragment samples")
        samples.extend(
            _build_fragment_samples(
                _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build negative_shared_chorus samples")
        samples.extend(
            _build_shared_chorus_samples(
                _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build negative_translation_only samples")
        samples.extend(
            _build_translation_only_samples(
                _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build negative_same_theme_synthetic samples")
        samples.extend(
            _build_same_theme_synthetic_samples(
                plan["negative_same_theme_synthetic"],
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
        _progress(f"built samples: {len(samples)}/{size}")
        _progress("build edge_short_or_placeholder samples")
        samples.extend(
            _build_edge_samples(
                _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
    _progress(f"built samples: {len(samples)}/{size}")

    # Builders may return fewer samples than planned (e.g. a short holdout
    # list); fill the gap with synthetic negatives so the set reaches ``size``.
    if len(samples) < size:
        _progress(f"top up with negative_same_theme_synthetic samples: {size - len(samples)}")
        samples.extend(
            _build_same_theme_synthetic_samples(
                size - len(samples),
                output_dir,
                csv_path.parent,
                rng,
                start_index=len(samples) + 1,
            )
        )
    samples = samples[:size]
    rng.shuffle(samples)

    _progress(f"write csv: {csv_path}")
    _write_csv(samples, csv_path, seed=seed)
    _progress("write manifest")
    manifest = _write_manifest(
        profiles=profiles,
        samples=samples,
        csv_path=csv_path,
        output_dir=output_dir,
        seed=seed,
        plan=plan,
        index_path=index_path,
        eval_index_path=eval_index_path,
        holdout_count=len(holdout_profiles),
        profile=profile,
    )
    _progress("generation complete")
    return manifest


def profile_library(library_dir: Path) -> list[LyricProfile]:
    """Profile every lyric file found under ``library_dir``.

    Each file is imported, normalized and summarised into a ``LyricProfile``
    (line/character counts, stratification buckets and a SHA-256 hash of the
    normalized text). Progress is reported on stderr every 5000 files.
    """
    lyric_paths = iter_lyric_files(library_dir)
    total = len(lyric_paths)
    _progress(f"profile library: 0/{total}")

    collected: list[LyricProfile] = []
    for position, lyric_path in enumerate(lyric_paths, start=1):
        record = record_from_file(lyric_path, base_dir=library_dir)
        lyrics_text = record.lyrics
        normalized = normalize_lyrics(lyrics_text)
        # Primary lines are preferred; unique lines are the fallback.
        effective_lines = normalized.primary_lines or normalized.unique_lines
        effective_count = len(effective_lines)
        # The fingerprint text may be empty; fall back to the full normalized text.
        comparable_text = fingerprint_text(normalized) or normalized.normalized_full_text
        digest = hashlib.sha256(comparable_text.encode("utf-8")).hexdigest()
        entry = LyricProfile(
            path=lyric_path,
            record_id=record.record_id,
            raw_text=lyrics_text,
            title=record.title or "",
            artist=record.artist or "",
            normalized=normalized,
            line_count=effective_count,
            char_count=len(comparable_text),
            line_count_bucket=_line_count_bucket(effective_count),
            language_bucket=_language_bucket(effective_lines),
            source_bucket=_source_bucket(lyric_path),
            normalized_hash=digest,
            has_translation=bool(normalized.translation_lines),
        )
        collected.append(entry)
        _progress_count("profile library", position, total, step=5000)
    return collected


def _sample_plan(size: int, *, positive_ratio: float, profile: str) -> dict[str, int]:
    """Turn the profile's sample mix into integer per-type counts summing to ``size``.

    ``positive_ratio`` is clamped to ``[0, 1]`` and replaces the positive
    share; the negative shares keep their relative proportions inside the
    remaining ``1 - positive_ratio``. Counts are floored, and the leftover is
    handed out one sample at a time to the types with the largest shares.
    """
    is_hard = profile == "hard"
    base_mix = HARD_SAMPLE_MIX if is_hard else STANDARD_SAMPLE_MIX
    positive_key = "positive_realistic_variant" if is_hard else "positive_full_duplicate"

    ratio = max(0.0, min(1.0, positive_ratio))
    negative_total = sum(share for name, share in base_mix.items() if name != positive_key)

    shares: dict[str, float] = {}
    for name, share in base_mix.items():
        if name == positive_key:
            shares[name] = ratio
        else:
            shares[name] = (1.0 - ratio) * (share / negative_total)

    plan = {name: int(size * share) for name, share in shares.items()}
    shortfall = size - sum(plan.values())
    largest_first = sorted(shares, key=shares.get, reverse=True)
    for name in largest_first[: max(shortfall, 0)]:
        plan[name] += 1
    return plan


def _holdout_plan_count(plan: dict[str, int]) -> int:
    return plan.get("negative_real_holdout_full_song", 0) + plan.get("negative_near_neighbor_holdout_full_song", 0)


def _profile_groups(profiles: list[LyricProfile]) -> dict[str, list[LyricProfile]]:
    normal = [profile for profile in profiles if profile.line_count >= 6]
    edge = [profile for profile in profiles if profile.line_count <= 5]
    return {
        "normal": normal or profiles,
        "fragmentable": [profile for profile in profiles if profile.line_count >= 12] or normal or profiles,
        "foreign": [
            profile
            for profile in profiles
            if profile.language_bucket in {"latin", "mixed", "jp_kr"} and profile.line_count >= 4
        ]
        or normal
        or profiles,
        "edge": edge or normal or profiles,
    }


def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    if count <= 0 or not profiles:
        return []
    buckets: dict[tuple[str, str, str], list[LyricProfile]] = {}
    for profile in profiles:
        key = (profile.line_count_bucket, profile.language_bucket, profile.source_bucket)
        buckets.setdefault(key, []).append(profile)

    selected: list[LyricProfile] = []
    bucket_keys = list(buckets)
    rng.shuffle(bucket_keys)
    cursors = {key: rng.sample(items, len(items)) for key, items in buckets.items()}
    while len(selected) < count and bucket_keys:
        progressed = False
        for key in list(bucket_keys):
            if len(selected) >= count:
                break
            items = cursors[key]
            if not items:
                bucket_keys.remove(key)
                continue
            selected.append(items.pop())
            progressed = True
        if not progressed:
            break
    while len(selected) < count:
        selected.append(rng.choice(profiles))
    return selected


def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Stratified pick capped at the number of available profiles.

    Because the request never exceeds ``len(profiles)``, the random fill-up of
    ``_stratified_sample`` is never reached and no profile is repeated.
    """
    if not profiles or count <= 0:
        return []
    capped_count = min(count, len(profiles))
    return _stratified_sample(profiles, capped_count, rng)


def _build_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one "应去重" duplicate variant per profile (standard profile).

    The variant kind cycles through eight noise types by position. All eight
    variant texts are computed for every profile, although only the selected
    one is written to ``pos_<index>_<variant>.txt``.
    """
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, source_profile in enumerate(profiles, start=start_index):
        original_text = source_profile.raw_text
        body_lines = _content_lines(original_text)
        candidates = [
            ("positive_exact_copy", original_text),
            ("positive_timestamped", _add_timestamps(body_lines)),
            ("positive_punctuation_noise", _add_punctuation_noise(body_lines, rng)),
            ("positive_platform_noise", _with_platform_noise(body_lines)),
            ("positive_blank_line_noise", _add_blank_line_noise(body_lines)),
            ("positive_chorus_count_changed", _change_repeated_line_counts(body_lines)),
            ("positive_translation_added", _translation_added(body_lines)),
            ("positive_typo_noise", _add_typo_noise(body_lines, rng)),
        ]
        # Round-robin over the variant kinds by position within this batch.
        variant_name, variant_text = candidates[(index - start_index) % len(candidates)]
        sample_path = _write_sample_file(output_dir, f"pos_{index:05d}_{variant_name}.txt", variant_text)
        built.append(_sample_from_profile(index, sample_path, csv_base, "应去重", variant_name, source_profile))
        _progress_count("positive_full_duplicate", len(built), total)
    return built


def _build_hard_samples(
    plan: dict[str, int],
    *,
    groups: dict[str, list[LyricProfile]],
    holdout_profiles: list[LyricProfile],
    indexed_profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    start_index: int,
) -> list[GeneratedSample]:
    """Build every sample category of the "hard" profile, in a fixed order.

    Order: realistic positives, plain holdouts, near-neighbor holdouts (chosen
    from the holdouts left after the plain ones), long fragments, shared
    chorus, translation-only, catalog mashups and edge cases. Sample indices
    continue from ``start_index`` across categories.
    """
    built: list[GeneratedSample] = []

    def quota(sample_type: str) -> int:
        # Planned count for a category; missing categories count as zero.
        return plan.get(sample_type, 0)

    def next_index() -> int:
        return start_index + len(built)

    def report() -> None:
        _progress(f"built samples: {len(built)}")

    _progress("build positive_realistic_variant samples")
    positive_sources = _stratified_sample(groups["normal"], plan["positive_realistic_variant"], rng)
    built.extend(
        _build_realistic_positive_samples(
            positive_sources,
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    report()

    plain_holdout_quota = quota("negative_real_holdout_full_song")
    _progress("build negative_real_holdout_full_song samples")
    built.extend(
        _build_holdout_full_song_samples(
            holdout_profiles[:plain_holdout_quota],
            output_dir,
            csv_base,
            start_index=next_index(),
        )
    )
    report()

    near_quota = quota("negative_near_neighbor_holdout_full_song")
    _progress("build negative_near_neighbor_holdout_full_song samples")
    near_sources = _near_neighbor_holdouts(
        holdout_profiles[plain_holdout_quota:],
        indexed_profiles,
        near_quota,
    )
    built.extend(
        _build_holdout_full_song_samples(
            near_sources,
            output_dir,
            csv_base,
            start_index=next_index(),
            sample_type="negative_near_neighbor_holdout_full_song",
            notes="full real holdout lyric selected for catalog line overlap with indexed songs",
        )
    )
    report()

    _progress("build negative_long_fragment samples")
    fragment_sources = _stratified_sample(groups["fragmentable"], quota("negative_long_fragment"), rng)
    built.extend(
        _build_fragment_samples(
            fragment_sources,
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
            sample_type="negative_long_fragment",
            long_fragment=True,
            notes="realistic long partial lyric upload, not a full-song duplicate",
        )
    )
    report()

    _progress("build negative_shared_chorus samples")
    chorus_sources = _stratified_sample(groups["normal"], quota("negative_shared_chorus"), rng)
    built.extend(
        _build_shared_chorus_samples(
            chorus_sources,
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    report()

    _progress("build negative_translation_only samples")
    translation_sources = _stratified_sample(groups["foreign"], quota("negative_translation_only"), rng)
    built.extend(
        _build_translation_only_samples(
            translation_sources,
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    report()

    _progress("build negative_catalog_mashup samples")
    # Each mashup combines up to three songs, so three times the quota is drawn.
    mashup_pool = _stratified_sample(groups["normal"], quota("negative_catalog_mashup") * 3, rng)
    built.extend(
        _build_catalog_mashup_samples(
            mashup_pool,
            quota("negative_catalog_mashup"),
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    report()

    _progress("build edge_short_or_placeholder samples")
    edge_sources = _stratified_sample(groups["edge"], quota("edge_short_or_placeholder"), rng)
    built.extend(
        _build_edge_samples(
            edge_sources,
            output_dir,
            csv_base,
            rng,
            start_index=next_index(),
        )
    )
    return built


def _build_realistic_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one "应去重" realistic variant per profile (hard profile).

    The variant kind cycles through six noise types by position. All six
    variant texts are computed for every profile, although only the selected
    one is written to ``pos_<index>_<variant>.txt``.
    """
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, source_profile in enumerate(profiles, start=start_index):
        raw_lines = _content_lines(source_profile.raw_text)
        normalized = source_profile.normalized
        # Normalized lines when available, otherwise the raw content lines.
        main_lines = list(normalized.primary_lines or normalized.unique_lines) or raw_lines
        candidates = [
            ("positive_platform_mixed_noise", _platform_mixed_noise(raw_lines, rng)),
            ("positive_near_full_missing_section", _near_full_missing_section(main_lines, rng)),
            ("positive_block_translation_added", _block_translation_added(main_lines)),
            ("positive_typo_and_punctuation_noise", _stronger_typo_and_punctuation_noise(raw_lines, rng)),
            ("positive_timestamped_platform_variant", _timestamped_platform_variant(raw_lines)),
            ("positive_chorus_count_changed", _change_repeated_line_counts(raw_lines)),
        ]
        variant_name, variant_text = candidates[(index - start_index) % len(candidates)]
        sample_path = _write_sample_file(output_dir, f"pos_{index:05d}_{variant_name}.txt", variant_text)
        built.append(_sample_from_profile(index, sample_path, csv_base, "应去重", variant_name, source_profile))
        _progress_count("positive_realistic_variant", len(built), total)
    return built


def _build_holdout_full_song_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    *,
    start_index: int,
    sample_type: str = "negative_real_holdout_full_song",
    notes: str = "full real lyric held out from the generated eval index",
) -> list[GeneratedSample]:
    """Write each given profile's raw lyric unchanged as a "不应去重" sample."""
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, held_out in enumerate(profiles, start=start_index):
        file_name = f"neg_{index:05d}_{sample_type}.txt"
        sample_path = _write_sample_file(output_dir, file_name, held_out.raw_text)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            sample_type,
            held_out,
            notes=notes,
        )
        built.append(sample)
        _progress_count(sample_type, len(built), total)
    return built


def _build_same_theme_synthetic_samples(
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write ``count`` synthetic "不应去重" lyrics that have no library source."""
    built: list[GeneratedSample] = []
    for index in range(start_index, start_index + count):
        lyric_text = _same_theme_synthetic(index, rng)
        file_name = f"neg_{index:05d}_negative_same_theme_synthetic.txt"
        sample_path = _write_sample_file(output_dir, file_name, lyric_text)
        sample = GeneratedSample(
            sample_id=f"sample-{index:05d}",
            file=str(sample_path.relative_to(csv_base)),
            expected="不应去重",
            sample_type="negative_same_theme_synthetic",
            source="synthetic",
            notes="same-theme synthetic full lyric not copied from library",
        )
        built.append(sample)
        _progress_count("negative_same_theme_synthetic", len(built), count)
    return built


def _build_fragment_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
    sample_type: str = "negative_fragment",
    long_fragment: bool = False,
    notes: str = "partial lyric fragment only",
) -> list[GeneratedSample]:
    """Write a partial excerpt of each profile's lyric as a "不应去重" sample.

    ``long_fragment`` selects the long-fragment generator instead of the
    default single-song fragment generator.
    """
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, source_profile in enumerate(profiles, start=start_index):
        normalized = source_profile.normalized
        song_lines = list(normalized.primary_lines or normalized.unique_lines)
        if long_fragment:
            fragment_text = _long_song_fragment(song_lines, rng)
        else:
            fragment_text = _single_song_fragment(song_lines, rng)
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_{sample_type}.txt", fragment_text)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            sample_type,
            source_profile,
            notes=notes,
        )
        built.append(sample)
        _progress_count(sample_type, len(built), total)
    return built


def _build_catalog_mashup_samples(
    profiles: list[LyricProfile],
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write ``count`` "不应去重" medleys, each assembled from up to three profiles.

    Returns an empty list when ``count`` is not positive or no profiles are
    available. The ``source`` field lists the chosen paths joined with " | ".
    """
    built: list[GeneratedSample] = []
    if not profiles or count <= 0:
        return built

    songs_per_mashup = min(3, len(profiles))
    for index in range(start_index, start_index + count):
        chosen = rng.sample(profiles, k=songs_per_mashup)
        mashup_text = _catalog_mashup_text(chosen, rng)
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_catalog_mashup.txt", mashup_text)
        source_paths = " | ".join(str(member.path) for member in chosen)
        sample = GeneratedSample(
            sample_id=f"sample-{index:05d}",
            file=str(sample_path.relative_to(csv_base)),
            expected="不应去重",
            sample_type="negative_catalog_mashup",
            source=source_paths,
            notes="medley-style partial lyric assembled from multiple catalog songs",
        )
        built.append(sample)
        _progress_count("negative_catalog_mashup", len(built), count)
    return built


def _build_shared_chorus_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write "不应去重" samples that reuse a song's repeated lines in new content.

    For each profile, lines returned by ``_repeated_or_sampled_lines`` are
    placed twice between fixed, newly written opening and closing lines, so
    the sample shares a chorus-like block with the source song but is not a
    copy of it.

    Args:
        profiles: Source songs, one sample is produced per entry.
        output_dir: Directory receiving the sample files.
        csv_base: Directory that stored sample paths are made relative to.
        rng: Random generator handed to the line-selection helper.
        start_index: Index assigned to the first sample of this batch.

    Returns:
        One ``GeneratedSample`` per profile, in input order.
    """
    samples: list[GeneratedSample] = []
    for offset, profile in enumerate(profiles):
        repeated = _repeated_or_sampled_lines(profile.normalized, rng)
        text = "\n".join(
            [
                "清晨的光落在新的街口",
                "我把故事重新写给以后",
                *repeated,
                *repeated,
                "所有答案都从这里开始",
            ]
        )
        index = start_index + offset
        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_shared_chorus.txt", text)
        samples.append(
            _sample_from_profile(
                index,
                path,
                csv_base,
                "不应去重",
                "negative_shared_chorus",
                profile,
                notes="shared repeated lines with new surrounding content",
            )
        )
        _progress_count("negative_shared_chorus", len(samples), len(profiles))
    return samples


def _build_translation_only_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write "不应去重" samples made only of translation-like lines.

    A profile's own translation lines are used when it has any; otherwise 4 to
    8 placeholder lines from ``_pseudo_translation`` are generated. The lines
    are shuffled and at most eight are written.
    """
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, source_profile in enumerate(profiles, start=start_index):
        candidate_lines = list(source_profile.normalized.translation_lines)
        if not candidate_lines:
            placeholder_count = min(8, max(source_profile.line_count, 4))
            candidate_lines = [_pseudo_translation(number) for number in range(1, placeholder_count + 1)]
        rng.shuffle(candidate_lines)
        sample_text = "\n".join(candidate_lines[:8])
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_translation_only.txt", sample_text)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "negative_translation_only",
            source_profile,
            notes="translation-like text without matching original lyric",
        )
        built.append(sample)
        _progress_count("negative_translation_only", len(built), total)
    return built


def _build_edge_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write "不应去重" edge-case samples from very short lyrics.

    Profiles with at most one effective line get a synthetic lyric instead;
    all others get a short snippet taken from their own lines.
    """
    total = len(profiles)
    built: list[GeneratedSample] = []
    for index, source_profile in enumerate(profiles, start=start_index):
        normalized = source_profile.normalized
        song_lines = list(normalized.primary_lines or normalized.unique_lines)
        if source_profile.line_count > 1:
            sample_text = _short_shared_snippet(song_lines, rng)
            sample_notes = "short lyric edge case with limited overlap"
        else:
            sample_text = _same_theme_synthetic(index, rng)
            sample_notes = "zero or one effective line; use synthetic edge negative"
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_edge_short_or_placeholder.txt", sample_text)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "edge_short_or_placeholder",
            source_profile,
            notes=sample_notes,
        )
        built.append(sample)
        _progress_count("edge_short_or_placeholder", len(built), total)
    return built


def _sample_from_profile(
    index: int,
    path: Path,
    csv_base: Path,
    expected: str,
    sample_type: str,
    profile: LyricProfile,
    *,
    candidate_record_id: str = "",
    notes: str = "",
) -> GeneratedSample:
    """Create the ``GeneratedSample`` row for a sample derived from ``profile``.

    The sample file is stored relative to ``csv_base``; buckets, title and
    artist are copied from the source profile.
    """
    relative_file = str(path.relative_to(csv_base))
    fields = {
        "sample_id": f"sample-{index:05d}",
        "file": relative_file,
        "expected": expected,
        "sample_type": sample_type,
        "source": str(profile.path),
        "source_record_id": profile.record_id,
        "candidate_record_id": candidate_record_id,
        "line_count_bucket": profile.line_count_bucket,
        "language_bucket": profile.language_bucket,
        "source_bucket": profile.source_bucket,
        "title": profile.title,
        "artist": profile.artist,
        "notes": notes,
    }
    return GeneratedSample(**fields)


def _write_sample_file(output_dir: Path, name: str, text: str) -> Path:
    path = output_dir / name
    path.write_text(text.strip() + "\n", encoding="utf-8")
    return path


def _write_csv(samples: list[GeneratedSample], csv_path: Path, *, seed: int) -> None:
    fieldnames = [
        "id",
        "file",
        "expected",
        "sample_type",
        "source",
        "source_record_id",
        "candidate_record_id",
        "line_count_bucket",
        "language_bucket",
        "source_bucket",
        "title",
        "artist",
        "seed",
        "notes",
    ]
    with csv_path.open("w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for sample in samples:
            writer.writerow(
                {
                    "id": sample.sample_id,
                    "file": sample.file,
                    "expected": sample.expected,
                    "sample_type": sample.sample_type,
                    "source": sample.source,
                    "source_record_id": sample.source_record_id,
                    "candidate_record_id": sample.candidate_record_id,
                    "line_count_bucket": sample.line_count_bucket,
                    "language_bucket": sample.language_bucket,
                    "source_bucket": sample.source_bucket,
                    "title": sample.title,
                    "artist": sample.artist,
                    "seed": seed,
                    "notes": sample.notes,
                }
            )


def _write_manifest(
    *,
    profiles: list[LyricProfile],
    samples: list[GeneratedSample],
    csv_path: Path,
    output_dir: Path,
    seed: int,
    plan: dict[str, int],
    index_path: Path | None,
    eval_index_path: Path,
    holdout_count: int,
    profile: str,
) -> dict[str, object]:
    manifest = {
        "profile": profile,
        "seed": seed,
        "library_files": len(profiles),
        "sample_size": len(samples),
        "plan": plan,
        "source_index": str(index_path) if index_path else "",
        "eval_index": str(eval_index_path) if eval_index_path else "",
        "holdout_records": holdout_count,
        "lyrics_dir": str(output_dir),
        "csv": str(csv_path),
        "manifest": str(csv_path.with_suffix(csv_path.suffix + ".manifest.json")),
        "sample_type_counts": dict(Counter(sample.sample_type for sample in samples)),
        "expected_counts": dict(Counter(sample.expected for sample in samples)),
        "line_count_bucket_counts": dict(Counter(profile.line_count_bucket for profile in profiles)),
        "language_bucket_counts": dict(Counter(profile.language_bucket for profile in profiles)),
        "source_bucket_counts": dict(Counter(profile.source_bucket for profile in profiles).most_common(50)),
        "unique_source_records": len({sample.source_record_id for sample in samples if sample.source_record_id}),
    }
    csv_path.with_suffix(csv_path.suffix + ".manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return manifest


def _near_neighbor_holdouts(
    holdout_profiles: list[LyricProfile],
    indexed_profiles: list[LyricProfile],
    count: int,
) -> list[LyricProfile]:
    if count <= 0 or not holdout_profiles:
        return []
    if not indexed_profiles:
        return holdout_profiles[:count]

    line_to_indexed_count: Counter[str] = Counter()
    for profile in indexed_profiles:
        for line in set(profile.normalized.primary_lines or profile.normalized.unique_lines):
            if len(line) >= 4:
                line_to_indexed_count[line] += 1

    scored: list[tuple[float, LyricProfile]] = []
    for profile in holdout_profiles:
        lines = set(profile.normalized.primary_lines or profile.normalized.unique_lines)
        useful_lines = {line for line in lines if len(line) >= 4}
        if not useful_lines:
            score = 0.0
        else:
            shared = sum(1 for line in useful_lines if line_to_indexed_count[line] > 0)
            common_weight = sum(min(line_to_indexed_count[line], 5) for line in useful_lines)
            score = (shared / len(useful_lines)) + (common_weight / (len(useful_lines) * 20))
        scored.append((score, profile))

    scored.sort(key=lambda item: item[0], reverse=True)
    return [profile for _, profile in scored[:count]]


def _content_lines(text: str) -> list[str]:
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return lines or [text.strip()]


def _clean_generated_output_dir(output_dir: Path) -> None:
    """Delete the ``.txt`` and ``.lrc`` files directly inside ``output_dir``.

    The suffix match is case-insensitive. Only direct children that are files
    are removed; subdirectories and files with other suffixes are left alone.
    """
    removable_suffixes = {".txt", ".lrc"}
    stale_files = (
        entry
        for entry in output_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in removable_suffixes
    )
    for entry in stale_files:
        entry.unlink()


def _line_count_bucket(line_count: int) -> str:
    if line_count == 0:
        return "zero"
    if line_count <= 5:
        return "short"
    if line_count <= 40:
        return "normal"
    return "long"


def _language_bucket(lines: tuple[str, ...]) -> str:
    text = "\n".join(lines)
    cjk = len(re.findall(r"[\u4e00-\u9fff]", text))
    latin = len(re.findall(r"[A-Za-z]", text))
    kana = len(re.findall(r"[\u3040-\u30ff]", text))
    hangul = len(re.findall(r"[\uac00-\ud7af]", text))
    if kana or hangul:
        return "jp_kr"
    if cjk and latin:
        return "mixed"
    if cjk:
        return "zh"
    if latin:
        return "latin"
    return "other"


def _source_bucket(path: Path) -> str:
    stem = path.stem
    parts = stem.split("_")
    if len(parts) >= 2:
        code = re.sub(r"\d+$", "", parts[-1])
        return code or "unknown"
    return "unknown"


def _add_timestamps(lines: list[str]) -> str:
    return "\n".join(f"[00:{idx % 60:02d}.00]{line}" for idx, line in enumerate(lines, start=1))


def _platform_mixed_noise(lines: list[str], rng: random.Random) -> str:
    """Wrap ``lines`` in platform boilerplate after blank-line and punctuation noise.

    Blank-line noise is applied first and punctuation noise second, so the
    inserted blank separator lines also receive a punctuation mark. The result
    is framed by two fixed header lines and one fixed footer line.
    """
    body = _add_blank_line_noise(lines).splitlines()
    if body:
        body = _add_punctuation_noise(body, rng).splitlines()
    header = ["作词：未知", "歌词来自平台同步"]
    footer = ["未经著作权人许可 不得商业使用"]
    return "\n".join(header + body + footer)


def _timestamped_platform_variant(lines: list[str]) -> str:
    """Return ``lines`` with timestamps, preceded by a stamped contributor line."""
    contributor_header = "[00:00.00]歌词贡献者：用户上传"
    stamped_body = _add_timestamps(lines).splitlines()
    return "\n".join([contributor_header] + stamped_body)


def _add_punctuation_noise(lines: list[str], rng: random.Random) -> str:
    marks = ["！", "?", "...", "，", "。"]
    return "\n".join(f"{line}{rng.choice(marks)}" for line in lines)


def _with_platform_noise(lines: list[str]) -> str:
    return "\n".join(["歌词来自QQ音乐", "作词：测试", *lines, "未经著作权人许可 不得翻唱"])


def _add_blank_line_noise(lines: list[str]) -> str:
    result: list[str] = []
    for idx, line in enumerate(lines, start=1):
        result.append(line)
        if idx % 4 == 0:
            result.append("")
    return "\n".join(result)


def _change_repeated_line_counts(lines: list[str]) -> str:
    seen: set[str] = set()
    result: list[str] = []
    for line in lines:
        if line in seen:
            continue
        seen.add(line)
        result.append(line)
    return "\n".join(result or lines)


def _translation_added(lines: list[str]) -> str:
    """Interleave a canned translation after foreign-looking lines.

    Only lines among the first 24 are eligible; each eligible line that
    ``_looks_foreign`` accepts is followed by ``_pseudo_translation`` for its
    1-based position.
    """
    max_translated_position = 24
    merged: list[str] = []
    for position, original in enumerate(lines, start=1):
        merged.append(original)
        if position > max_translated_position:
            continue
        if _looks_foreign(original):
            merged.append(_pseudo_translation(position))
    return "\n".join(merged)


def _block_translation_added(lines: list[str]) -> str:
    """Append a block of canned translations after the whole lyric body.

    The block holds a quarter as many lines as the body, clamped to between
    4 and 8, and is separated from the body by one empty line.
    """
    translation_count = max(4, len(lines) // 4)
    if translation_count > 8:
        translation_count = 8
    appended = [_pseudo_translation(number) for number in range(1, translation_count + 1)]
    original_body = "\n".join(lines)
    return "\n".join([original_body, ""] + appended)


def _near_full_missing_section(lines: list[str], rng: random.Random) -> str:
    if len(lines) <= 8:
        return "\n".join(lines)
    drop_count = max(1, min(max(1, len(lines) // 5), 8))
    start = rng.randrange(0, max(1, len(lines) - drop_count + 1))
    kept = lines[:start] + lines[start + drop_count :]
    return "\n".join(kept or lines)


def _add_typo_noise(lines: list[str], rng: random.Random) -> str:
    """Apply a typo to a few randomly chosen lines and join the result.

    Only lines accepted by ``_can_typo_line`` are candidates. One candidate
    per eight is edited, with at least 1 and at most 4 edits. The input list
    is not modified; when there is no candidate the lines are joined as-is.
    """
    if not lines:
        return ""
    edited = list(lines)
    candidates = [position for position, text in enumerate(edited) if _can_typo_line(text)]
    if candidates:
        budget = min(4, max(1, len(candidates) // 8))
        chosen = rng.sample(candidates, k=min(budget, len(candidates)))
        for position in chosen:
            edited[position] = _typo_line(edited[position], rng)
    return "\n".join(edited)


def _stronger_typo_and_punctuation_noise(lines: list[str], rng: random.Random) -> str:
    """Add punctuation noise to every line, then typos to a sample of lines.

    After punctuation noise, one typo-eligible line per six is edited, with at
    least 1 and at most 8 edits (never more than the number of eligible lines).
    """
    if not lines:
        return ""
    punctuated = _add_punctuation_noise(lines, rng).splitlines()
    candidates = [position for position, text in enumerate(punctuated) if _can_typo_line(text)]
    budget = min(8, max(1, len(candidates) // 6))
    chosen = rng.sample(candidates, k=min(budget, len(candidates)))
    for position in chosen:
        punctuated[position] = _typo_line(punctuated[position], rng)
    return "\n".join(punctuated)


def _can_typo_line(line: str) -> bool:
    return bool(re.search(r"[A-Za-z]{4,}|[\u4e00-\u9fff]{4,}", line))


def _typo_line(line: str, rng: random.Random) -> str:
    """Introduce a single typo into ``line``.

    If the line has ASCII words of four or more letters, one of them is
    corrupted with probability 0.65. Otherwise one CJK ideograph at a random
    position is passed through ``_typo_cjk_char``. The line is returned
    unchanged when neither path applies.
    """
    english_words = list(re.finditer(r"[A-Za-z]{4,}", line))
    # The random draw only happens when there is at least one English word.
    if english_words and rng.random() < 0.65:
        target = rng.choice(english_words)
        begin, end = target.span()
        corrupted = _typo_english_word(target.group(0), rng)
        return "".join((line[:begin], corrupted, line[end:]))

    ideograph_positions = [
        position for position, char in enumerate(line) if "\u4e00" <= char <= "\u9fff"
    ]
    if not ideograph_positions:
        return line
    position = rng.choice(ideograph_positions)
    return line[:position] + _typo_cjk_char(line[position]) + line[position + 1 :]


def _typo_english_word(word: str, rng: random.Random) -> str:
    if len(word) <= 4 or rng.random() < 0.55:
        remove_at = rng.randrange(1, max(2, len(word) - 1))
        return word[:remove_at] + word[remove_at + 1 :]
    swap_at = rng.randrange(1, max(2, len(word) - 2))
    chars = list(word)
    chars[swap_at], chars[swap_at + 1] = chars[swap_at + 1], chars[swap_at]
    return "".join(chars)


def _typo_cjk_char(char: str) -> str:
    replacements = {
        "你": "妳",
        "爱": "爰",
        "夜": "液",
        "里": "裏",
        "风": "凤",
        "雨": "兩",
        "听": "昕",
        "说": "説",
        "想": "相",
        "梦": "夣",
        "心": "芯",
        "光": "先",
        "城": "诚",
        "远": "迩",
        "回": "囬",
        "走": "赱",
        "海": "毎",
        "天": "夭",
    }
    return replacements.get(char, char)


def _single_song_fragment(lines: list[str], rng: random.Random) -> str:
    if len(lines) <= 4:
        return "\n".join(lines[: max(1, len(lines) // 2)])
    fragment_len = max(2, min(8, len(lines) // rng.choice([3, 4, 5])))
    start = rng.randrange(0, max(1, len(lines) - fragment_len + 1))
    return "\n".join(lines[start : start + fragment_len])


def _long_song_fragment(lines: list[str], rng: random.Random) -> str:
    if len(lines) <= 8:
        return _single_song_fragment(lines, rng)
    fragment_len = max(6, min(len(lines) - 1, int(len(lines) * rng.uniform(0.35, 0.60))))
    start = rng.randrange(0, max(1, len(lines) - fragment_len + 1))
    return "\n".join(lines[start : start + fragment_len])


def _catalog_mashup_text(profiles: list[LyricProfile], rng: random.Random) -> str:
    sections: list[str] = []
    for profile in profiles:
        lines = list(profile.normalized.primary_lines or profile.normalized.unique_lines)
        if not lines:
            continue
        section_len = min(max(2, len(lines) // 8), 5)
        start = rng.randrange(0, max(1, len(lines) - section_len + 1))
        sections.extend(lines[start : start + section_len])
    if not sections:
        return _same_theme_synthetic(0, rng)
    return "\n".join(sections)


def _short_shared_snippet(lines: list[str], rng: random.Random) -> str:
    snippet = rng.sample(lines, k=min(2, len(lines))) if lines else []
    synthetic = [
        "清晨的风吹过新的街口",
        "我把昨天放进安静的口袋",
        *snippet,
        "故事从这里重新开始",
        "灯光落下我继续往前走",
    ]
    return "\n".join(synthetic)


def _repeated_or_sampled_lines(normalized: NormalizedLyrics, rng: random.Random) -> list[str]:
    repeated = [line for line, count in normalized.line_counts.items() if count >= 2]
    if repeated:
        return rng.sample(repeated, k=min(2, len(repeated)))
    lines = list(normalized.primary_lines or normalized.unique_lines)
    return rng.sample(lines, k=min(2, len(lines))) if lines else []


def _same_theme_synthetic(index: int, rng: random.Random) -> str:
    starts = ["我在夜里想起远方的你", "城市灯火陪我走过雨季", "风把旧名字吹向清晨"]
    middles = ["那些没说完的话留在风里", "新的路口慢慢亮起", "时间把答案交给下一站"]
    ends = ["明天醒来我们各自继续", "我会把今天写成新的旋律", "故事从这里重新开始"]
    return "\n".join(
        [
            rng.choice(starts),
            rng.choice(middles),
            rng.choice(ends),
            f"这是第 {index} 个全新测试样本",
        ]
    )


def _pseudo_translation(index: int) -> str:
    translations = [
        "今晚我仍然想念你",
        "风会带走所有疲惫",
        "黑暗里也会有光",
        "别让昨天困住自己",
        "我们终会继续向前",
        "雨停以后天空会亮",
        "把遗憾留在旧时光",
        "你已经足够好了",
    ]
    return translations[(index - 1) % len(translations)]


def _looks_foreign(line: str) -> bool:
    latin = len(re.findall(r"[A-Za-z]", line))
    cjk = len(re.findall(r"[\u4e00-\u9fff]", line))
    return latin > 0 and cjk == 0