# eval_dataset.py 27.8 KB
#
# Raw Blame History Permalink

"""Generate production-style labeled evaluation samples from a lyric library."""

from __future__ import annotations

import csv
import hashlib
import json
import random
import re
import sys
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import LyricRecord
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import NormalizedLyrics
from lyric_dedup.normalization import fingerprint_text
from lyric_dedup.normalization import normalize_lyrics


# Default share of each sample category in a generated evaluation set.
# Each value is a fraction of the requested set size and the values sum to 1.0.
# `_sample_plan` converts them into integer quotas and rescales every category
# other than "positive_full_duplicate" when the caller overrides the positive
# ratio.
DEFAULT_SAMPLE_MIX = {
    "positive_full_duplicate": 0.30,
    "negative_real_holdout_full_song": 0.40,
    "negative_fragment": 0.10,
    "negative_shared_chorus": 0.05,
    "negative_translation_only": 0.05,
    "negative_same_theme_synthetic": 0.05,
    "edge_short_or_placeholder": 0.05,
}


def _progress(message: str) -> None:
    """Write one ``[eval-gen]``-tagged progress line to stderr and flush it."""
    sys.stderr.write(f"[eval-gen] {message}\n")
    sys.stderr.flush()


def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Log ``label: current/total`` on the first item, the last item and every ``step`` items.

    Nothing is logged when ``total`` is zero or negative.
    """
    if total <= 0:
        return
    at_endpoint = current in (1, total)
    if at_endpoint or current % step == 0:
        _progress(f"{label}: {current}/{total}")


@dataclass(frozen=True)
class LyricProfile:
    """Immutable per-file summary of one library lyric, as built by ``profile_library``."""

    path: Path  # lyric file the profile was built from
    record_id: str  # record id reported by ``record_from_file``
    raw_text: str  # lyric text exactly as loaded (``record.lyrics``)
    title: str  # record title, "" when the record has none
    artist: str  # record artist, "" when the record has none
    normalized: NormalizedLyrics  # result of ``normalize_lyrics(raw_text)``
    line_count: int  # number of primary lines (unique lines when there are no primary lines)
    char_count: int  # length of the fingerprint text (or the normalized full text as fallback)
    line_count_bucket: str  # "zero" / "short" / "normal" / "long", see ``_line_count_bucket``
    language_bucket: str  # "jp_kr" / "mixed" / "zh" / "latin" / "other", see ``_language_bucket``
    source_bucket: str  # code derived from the file stem, see ``_source_bucket``
    normalized_hash: str  # SHA-256 hex digest of the same text ``char_count`` measures
    has_translation: bool  # True when the normalized lyric carries translation lines


@dataclass(frozen=True)
class GeneratedSample:
    """One row of the generated evaluation CSV, describing a single sample file."""

    sample_id: str  # "sample-NNNNN", numbered in generation order
    file: str  # sample file path relative to the CSV's directory
    expected: str  # expected verdict label ("应去重" or "不应去重")
    sample_type: str  # category / variant name of the sample
    source: str  # path of the source lyric, or "synthetic" for generated text
    source_record_id: str = ""  # record id of the source lyric, "" for synthetic samples
    candidate_record_id: str = ""  # optional id passed through by ``_sample_from_profile``
    line_count_bucket: str = ""  # copied from the source profile
    language_bucket: str = ""  # copied from the source profile
    source_bucket: str = ""  # copied from the source profile
    title: str = ""  # copied from the source profile
    artist: str = ""  # copied from the source profile
    notes: str = ""  # free-text explanation of how the sample was made


def generate_eval_set(
    *,
    library_dir: Path,
    output_dir: Path,
    csv_path: Path,
    size: int = 100,
    positive_ratio: float = 0.30,
    seed: int = 20260602,
    index_path: Path | None = None,
    eval_index_path: Path | None = None,
) -> dict[str, object]:
    """Generate a stratified production evaluation set.

    ``positive_ratio`` is kept for CLI compatibility. It overrides the default
    positive quota while keeping the remaining negative categories proportional.

    Args:
        library_dir: Directory holding the lyric library to sample from.
        output_dir: Directory that receives the generated sample files. Existing
            ``.txt``/``.lrc`` files directly inside it are deleted first.
        csv_path: Destination of the label CSV; the manifest is written next to it.
        size: Number of samples to generate.
        positive_ratio: Share of positive (duplicate) samples.
        seed: Seed for the random generator that drives every sampling decision.
        index_path: Recorded in the manifest as the source index; not read here.
        eval_index_path: Where the holdout-free evaluation index is saved.
            Defaults to ``<csv_path>.index.pkl``.

    Returns:
        The manifest dictionary that was also written to disk.

    Raises:
        ValueError: If ``size`` is not positive, ``output_dir`` is the library
            directory itself, or the library contains no lyric files.
    """
    if size <= 0:
        raise ValueError("size must be positive")
    # The cleanup step below unlinks every .txt/.lrc file directly inside
    # output_dir, so pointing it at the library itself would destroy the library.
    if output_dir.resolve() == library_dir.resolve():
        raise ValueError("output_dir must not be the library directory: cleanup would delete library lyrics")

    _progress(f"start generation: size={size}, positive_ratio={positive_ratio}, seed={seed}")
    rng = random.Random(seed)
    profiles = profile_library(library_dir)
    if not profiles:
        raise ValueError(f"{library_dir} 下没有 .lrc/.txt 歌词文件")

    csv_base = csv_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    csv_base.mkdir(parents=True, exist_ok=True)
    _progress(f"clean output dir: {output_dir}")
    _clean_generated_output_dir(output_dir)

    plan = _sample_plan(size, positive_ratio=positive_ratio)
    _progress(f"sample plan: {plan}")
    # Hold out at most half of the library. A one-file library therefore holds
    # out nothing: holding out the only record would force it back into the
    # index and mislabel an exact index hit as a negative sample.
    holdout_count = min(plan["negative_real_holdout_full_song"], len(profiles) // 2)
    holdout_profiles = _stratified_unique_sample(profiles, holdout_count, rng)
    holdout_ids = {profile.record_id for profile in holdout_profiles}
    # Fallback retained for libraries whose record ids collide so badly that
    # excluding the holdout ids would leave nothing to index.
    indexed_profiles = [profile for profile in profiles if profile.record_id not in holdout_ids] or profiles
    eval_index_path = eval_index_path or csv_path.with_suffix(csv_path.suffix + ".index.pkl")
    _build_eval_index(indexed_profiles, eval_index_path)
    groups = _profile_groups(indexed_profiles)
    samples: list[GeneratedSample] = []

    def _collect(batch: list[GeneratedSample]) -> None:
        # Append one category's samples and report the running total.
        samples.extend(batch)
        _progress(f"built samples: {len(samples)}/{size}")

    # The categories are built in a fixed order because they all draw from the
    # same RNG; reordering them would change the output for a given seed.
    _progress("build positive_full_duplicate samples")
    _collect(
        _build_positive_samples(
            _stratified_sample(groups["normal"], plan["positive_full_duplicate"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )
    # The holdout builder logs its own "build ..." line, so none is emitted here.
    _collect(
        _build_holdout_full_song_samples(
            holdout_profiles,
            output_dir,
            csv_base,
            start_index=len(samples) + 1,
        )
    )
    _progress("build negative_fragment samples")
    _collect(
        _build_fragment_samples(
            _stratified_sample(groups["fragmentable"], plan["negative_fragment"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )
    _progress("build negative_shared_chorus samples")
    _collect(
        _build_shared_chorus_samples(
            _stratified_sample(groups["normal"], plan["negative_shared_chorus"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )
    _progress("build negative_translation_only samples")
    _collect(
        _build_translation_only_samples(
            _stratified_sample(groups["foreign"], plan["negative_translation_only"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )
    _progress("build negative_same_theme_synthetic samples")
    _collect(
        _build_same_theme_synthetic_samples(
            plan["negative_same_theme_synthetic"],
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )
    _progress("build edge_short_or_placeholder samples")
    _collect(
        _build_edge_samples(
            _stratified_sample(groups["edge"], plan["edge_short_or_placeholder"], rng),
            output_dir,
            csv_base,
            rng,
            start_index=len(samples) + 1,
        )
    )

    # The holdout category can fall short of its quota (small libraries); fill
    # the gap with synthetic negatives so the set always reaches ``size``.
    if len(samples) < size:
        _progress(f"top up with negative_same_theme_synthetic samples: {size - len(samples)}")
        samples.extend(
            _build_same_theme_synthetic_samples(
                size - len(samples),
                output_dir,
                csv_base,
                rng,
                start_index=len(samples) + 1,
            )
        )
    samples = samples[:size]
    rng.shuffle(samples)

    _progress(f"write csv: {csv_path}")
    _write_csv(samples, csv_path, seed=seed)
    _progress("write manifest")
    manifest = _write_manifest(
        profiles=profiles,
        samples=samples,
        csv_path=csv_path,
        output_dir=output_dir,
        seed=seed,
        plan=plan,
        index_path=index_path,
        eval_index_path=eval_index_path,
        holdout_count=len(holdout_profiles),
    )
    _progress("generation complete")
    return manifest


def profile_library(library_dir: Path) -> list[LyricProfile]:
    """Load every lyric file under ``library_dir`` and summarize it as a ``LyricProfile``.

    Each file is read with ``record_from_file``, normalized, and bucketed by
    line count, language and source so later sampling can stratify on those keys.

    Args:
        library_dir: Root directory of the lyric library.

    Returns:
        One profile per file, in the order ``iter_lyric_files`` yields them.
    """
    profiles: list[LyricProfile] = []
    # Materialize once: the total is needed up front for progress output, and
    # the helper may return a lazy iterable (TODO confirm) on which len() fails.
    paths = list(iter_lyric_files(library_dir))
    total = len(paths)
    _progress(f"profile library: 0/{total}")
    for index, path in enumerate(paths, start=1):
        record = record_from_file(path, base_dir=library_dir)
        raw_text = record.lyrics
        normalized = normalize_lyrics(raw_text)
        lines = normalized.primary_lines or normalized.unique_lines
        line_count = len(lines)
        # Text used for both the size metric and the content hash.
        normalized_text = fingerprint_text(normalized) or normalized.normalized_full_text
        profiles.append(
            LyricProfile(
                path=path,
                record_id=record.record_id,
                raw_text=raw_text,
                title=record.title or "",
                artist=record.artist or "",
                normalized=normalized,
                line_count=line_count,
                char_count=len(normalized_text),
                line_count_bucket=_line_count_bucket(line_count),
                language_bucket=_language_bucket(lines),
                source_bucket=_source_bucket(path),
                normalized_hash=hashlib.sha256(normalized_text.encode("utf-8")).hexdigest(),
                has_translation=bool(normalized.translation_lines),
            )
        )
        _progress_count("profile library", index, total, step=5000)
    return profiles


def _sample_plan(size: int, *, positive_ratio: float) -> dict[str, int]:
    """Turn the sample mix into integer per-category quotas that sum to ``size``.

    ``positive_ratio`` is clamped to ``[0, 1]`` and replaces the default positive
    weight; the other categories share the remainder in their default proportions.

    Args:
        size: Total number of samples to plan for.
        positive_ratio: Requested share of ``positive_full_duplicate`` samples.

    Returns:
        Mapping of category name to sample count, in ``DEFAULT_SAMPLE_MIX`` order.
    """
    positive_key = "positive_full_duplicate"
    positive_ratio = max(0.0, min(1.0, positive_ratio))
    mix = dict(DEFAULT_SAMPLE_MIX)
    negative_total = sum(weight for key, weight in mix.items() if key != positive_key)
    mix[positive_key] = positive_ratio
    for key in list(mix):
        if key != positive_key:
            mix[key] = (1.0 - positive_ratio) * (DEFAULT_SAMPLE_MIX[key] / negative_total)

    # Round before truncating: binary floats turn an exact quota such as 40 into
    # 39.99999999999998, and a bare int() would silently drop that sample and
    # hand it to a different category through the remainder loop below.
    plan = {key: int(round(size * weight, 9)) for key, weight in mix.items()}
    remainder = size - sum(plan.values())
    # Leftover samples go to the heaviest categories first, one each.
    for key in sorted(mix, key=mix.get, reverse=True)[: max(remainder, 0)]:
        plan[key] += 1
    return plan


def _profile_groups(profiles: list[LyricProfile]) -> dict[str, list[LyricProfile]]:
    """Split profiles into the candidate pools used by the sample builders.

    Pools: ``normal`` (6+ lines), ``fragmentable`` (12+ lines), ``foreign``
    (latin/mixed/jp_kr with 4+ lines) and ``edge`` (5 lines or fewer). An empty
    pool falls back to ``normal`` and then to the full profile list.
    """
    foreign_languages = {"latin", "mixed", "jp_kr"}
    normal: list[LyricProfile] = []
    edge: list[LyricProfile] = []
    fragmentable: list[LyricProfile] = []
    foreign: list[LyricProfile] = []

    # Single pass; each list keeps the input order.
    for candidate in profiles:
        length = candidate.line_count
        if length >= 6:
            normal.append(candidate)
        if length <= 5:
            edge.append(candidate)
        if length >= 12:
            fragmentable.append(candidate)
        if candidate.language_bucket in foreign_languages and length >= 4:
            foreign.append(candidate)

    fallback = normal or profiles
    return {
        "normal": fallback,
        "fragmentable": fragmentable or fallback,
        "foreign": foreign or fallback,
        "edge": edge or fallback,
    }


def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Pick ``count`` profiles, cycling round-robin over (line count, language, source) strata.

    Strata are visited in a shuffled order and each stratum hands out its
    members in a shuffled order without repeats. Once every stratum is
    exhausted, the remainder is filled by drawing from ``profiles`` with
    replacement, so the result always has ``count`` items when ``profiles`` is
    not empty.
    """
    if count <= 0 or not profiles:
        return []

    strata: dict[tuple[str, str, str], list[LyricProfile]] = {}
    for profile in profiles:
        stratum = (profile.line_count_bucket, profile.language_bucket, profile.source_bucket)
        if stratum in strata:
            strata[stratum].append(profile)
        else:
            strata[stratum] = [profile]

    # RNG order matters for reproducibility: shuffle the strata first, then
    # shuffle the members of each stratum in insertion order.
    visit_order = list(strata)
    rng.shuffle(visit_order)
    pools = {stratum: rng.sample(members, len(members)) for stratum, members in strata.items()}

    picked: list[LyricProfile] = []
    while len(picked) < count and visit_order:
        took_any = False
        for stratum in tuple(visit_order):
            if len(picked) >= count:
                break
            pool = pools[stratum]
            if pool:
                picked.append(pool.pop())
                took_any = True
            else:
                visit_order.remove(stratum)
        if not took_any:
            break

    # Every stratum is empty: top up with repeats.
    shortfall = count - len(picked)
    picked.extend(rng.choice(profiles) for _ in range(shortfall))
    return picked


def _stratified_unique_sample(profiles: list[LyricProfile], count: int, rng: random.Random) -> list[LyricProfile]:
    """Stratified sample capped at the number of available profiles.

    Capping the request keeps ``_stratified_sample`` from reaching its
    with-replacement top-up, so no profile is returned twice.
    """
    if count <= 0 or not profiles:
        return []
    available = len(profiles)
    wanted = count if count < available else available
    return _stratified_sample(profiles, wanted, rng)


def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Build a ``DuplicateChecker`` index from ``profiles`` and save it to ``index_path``.

    The already-normalized lyrics stored on each profile are reused rather than
    normalizing again. Missing parent directories of ``index_path`` are created.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    checker = DuplicateChecker()
    profile_total = len(profiles)
    for position, profile in enumerate(profiles, start=1):
        record = LyricRecord(
            record_id=profile.record_id,
            lyrics=profile.raw_text,
            # Profiles store "" for a missing title/artist; records use None.
            title=profile.title or None,
            artist=profile.artist or None,
        )
        checker.add_normalized_record(record, profile.normalized)
        _progress_count("build eval index", position, profile_total, step=5000)

    index_path.parent.mkdir(parents=True, exist_ok=True)
    checker.save(index_path)


def _build_positive_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write one "should deduplicate" sample per profile, rotating through noise variants.

    Profile ``k`` (0-based) gets variant ``k % 7`` of: exact copy, timestamps,
    punctuation noise, platform boilerplate, blank lines, collapsed repeats,
    added translation lines.

    Args:
        profiles: Source lyrics; each yields exactly one sample.
        output_dir: Directory the sample files are written to.
        csv_base: Directory the CSV lives in; sample paths are stored relative to it.
        rng: Random generator shared across the whole generation run.
        start_index: Sample number assigned to the first profile.

    Returns:
        The generated samples, in profile order.
    """
    # Each entry receives (raw text, content lines, pre-drawn punctuation noise)
    # and only the selected entry is evaluated for a given profile.
    variants = (
        ("positive_exact_copy", lambda raw, lines, noisy: raw),
        ("positive_timestamped", lambda raw, lines, noisy: _add_timestamps(lines)),
        ("positive_punctuation_noise", lambda raw, lines, noisy: noisy),
        ("positive_platform_noise", lambda raw, lines, noisy: _with_platform_noise(lines)),
        ("positive_blank_line_noise", lambda raw, lines, noisy: _add_blank_line_noise(lines)),
        ("positive_chorus_count_changed", lambda raw, lines, noisy: _change_repeated_line_counts(lines)),
        ("positive_translation_added", lambda raw, lines, noisy: _translation_added(lines)),
    )
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for offset, profile in enumerate(profiles):
        raw = profile.raw_text
        lines = _content_lines(raw)
        # The punctuation noise is the only variant that consumes random
        # numbers. It is drawn for every profile, used or not, so the RNG
        # stream (and every sample generated afterwards) stays identical for a
        # given seed.
        noisy = _add_punctuation_noise(lines, rng)
        sample_type, build = variants[offset % len(variants)]
        text = build(raw, lines, noisy)
        index = start_index + offset
        path = _write_sample_file(output_dir, f"pos_{index:05d}_{sample_type}.txt", text)
        samples.append(_sample_from_profile(index, path, csv_base, "应去重", sample_type, profile))
        _progress_count("positive_full_duplicate", len(samples), total)
    return samples


def _build_holdout_full_song_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write each held-out lyric unchanged as a "should not deduplicate" sample.

    The caller is expected to pass profiles that were left out of the
    evaluation index; their full raw text is written as-is.
    """
    _progress("build negative_real_holdout_full_song samples")
    sample_type = "negative_real_holdout_full_song"
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        sample_path = _write_sample_file(
            output_dir,
            f"neg_{index:05d}_negative_real_holdout_full_song.txt",
            profile.raw_text,
        )
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            sample_type,
            profile,
            notes="full real lyric held out from the generated eval index",
        )
        samples.append(sample)
        _progress_count("negative_real_holdout_full_song", len(samples), total)
    return samples


def _build_same_theme_synthetic_samples(
    count: int,
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write ``count`` synthetic lyrics as "should not deduplicate" samples.

    Args:
        count: Number of samples to generate.
        output_dir: Directory the sample files are written to.
        csv_base: Directory the CSV lives in; sample paths are stored relative to it.
        rng: Random generator used to assemble each synthetic lyric.
        start_index: Sample number assigned to the first sample.

    Returns:
        The generated samples, in creation order.
    """
    import os  # local import keeps this fix self-contained

    samples: list[GeneratedSample] = []
    for index in range(start_index, start_index + count):
        text = _same_theme_synthetic(index, rng)
        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_same_theme_synthetic.txt", text)
        try:
            relative_file = str(path.relative_to(csv_base))
        except ValueError:
            # The output directory is not below the CSV directory. Record a
            # "../"-style path instead of aborting after files were written.
            try:
                relative_file = os.path.relpath(path, csv_base)
            except ValueError:
                # No relative path exists (e.g. different Windows drives).
                relative_file = str(path)
        samples.append(
            GeneratedSample(
                sample_id=f"sample-{index:05d}",
                file=relative_file,
                expected="不应去重",
                sample_type="negative_same_theme_synthetic",
                source="synthetic",
                notes="same-theme synthetic full lyric not copied from library",
            )
        )
        _progress_count("negative_same_theme_synthetic", len(samples), count)
    return samples


def _build_fragment_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write a short contiguous excerpt of each profile as a "should not deduplicate" sample.

    The excerpt is cut from the normalized primary lines (or the unique lines
    when there are no primary lines) by ``_single_song_fragment``.
    """
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        normalized = profile.normalized
        source_lines = list(normalized.primary_lines or normalized.unique_lines)
        excerpt = _single_song_fragment(source_lines, rng)
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_fragment.txt", excerpt)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "negative_fragment",
            profile,
            notes="partial lyric fragment only",
        )
        samples.append(sample)
        _progress_count("negative_fragment", len(samples), total)
    return samples


def _build_shared_chorus_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write samples that reuse a song's repeated lines inside otherwise new text.

    Up to two repeated (or, failing that, sampled) lines of each profile are
    emitted twice between fixed synthetic opening and closing lines. The result
    is labelled "should not deduplicate".

    Args:
        profiles: Source lyrics; each yields exactly one sample.
        output_dir: Directory the sample files are written to.
        csv_base: Directory the CSV lives in; sample paths are stored relative to it.
        rng: Random generator used to pick the borrowed lines.
        start_index: Sample number assigned to the first profile.

    Returns:
        The generated samples, in profile order.
    """
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for offset, profile in enumerate(profiles):
        # Only the borrowed lines are needed; the full line list is not used.
        repeated = _repeated_or_sampled_lines(profile.normalized, rng)
        text = "\n".join(
            [
                "清晨的光落在新的街口",
                "我把故事重新写给以后",
                *repeated,
                *repeated,
                "所有答案都从这里开始",
            ]
        )
        index = start_index + offset
        path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_shared_chorus.txt", text)
        samples.append(
            _sample_from_profile(
                index,
                path,
                csv_base,
                "不应去重",
                "negative_shared_chorus",
                profile,
                notes="shared repeated lines with new surrounding content",
            )
        )
        _progress_count("negative_shared_chorus", len(samples), total)
    return samples


def _build_translation_only_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write samples made only of translation lines, labelled "should not deduplicate".

    A profile's own translation lines are used when it has any; otherwise 4 to
    8 canned pseudo-translations stand in. The lines are shuffled and at most
    eight are kept.
    """
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        translated = list(profile.normalized.translation_lines)
        if not translated:
            placeholder_count = min(8, max(profile.line_count, 4))
            translated = [_pseudo_translation(number) for number in range(1, placeholder_count + 1)]
        rng.shuffle(translated)
        body = "\n".join(translated[:8])
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_negative_translation_only.txt", body)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "negative_translation_only",
            profile,
            notes="translation-like text without matching original lyric",
        )
        samples.append(sample)
        _progress_count("negative_translation_only", len(samples), total)
    return samples


def _build_edge_samples(
    profiles: list[LyricProfile],
    output_dir: Path,
    csv_base: Path,
    rng: random.Random,
    *,
    start_index: int,
) -> list[GeneratedSample]:
    """Write short edge-case samples, all labelled "should not deduplicate".

    Profiles with at most one effective line get a fully synthetic lyric; the
    others get up to two of their own lines embedded in synthetic text.
    """
    total = len(profiles)
    samples: list[GeneratedSample] = []
    for index, profile in enumerate(profiles, start=start_index):
        if profile.line_count <= 1:
            body = _same_theme_synthetic(index, rng)
            notes = "zero or one effective line; use synthetic edge negative"
        else:
            normalized = profile.normalized
            source_lines = list(normalized.primary_lines or normalized.unique_lines)
            body = _short_shared_snippet(source_lines, rng)
            notes = "short lyric edge case with limited overlap"
        sample_path = _write_sample_file(output_dir, f"neg_{index:05d}_edge_short_or_placeholder.txt", body)
        sample = _sample_from_profile(
            index,
            sample_path,
            csv_base,
            "不应去重",
            "edge_short_or_placeholder",
            profile,
            notes=notes,
        )
        samples.append(sample)
        _progress_count("edge_short_or_placeholder", len(samples), total)
    return samples


def _sample_from_profile(
    index: int,
    path: Path,
    csv_base: Path,
    expected: str,
    sample_type: str,
    profile: LyricProfile,
    *,
    candidate_record_id: str = "",
    notes: str = "",
) -> GeneratedSample:
    """Build the CSV row for a sample file derived from a library profile.

    Args:
        index: Sample number; becomes ``sample-NNNNN``.
        path: Location of the written sample file.
        csv_base: Directory the CSV lives in; ``path`` is stored relative to it.
        expected: Expected verdict label for the sample.
        sample_type: Category / variant name of the sample.
        profile: Source lyric whose metadata is copied onto the row.
        candidate_record_id: Optional id stored in the ``candidate_record_id`` column.
        notes: Free-text description of how the sample was made.

    Returns:
        The populated ``GeneratedSample``.
    """
    import os  # local import keeps this fix self-contained

    try:
        relative_file = str(path.relative_to(csv_base))
    except ValueError:
        # The sample directory is not below the CSV directory. Record a
        # "../"-style path instead of aborting after files were written.
        try:
            relative_file = os.path.relpath(path, csv_base)
        except ValueError:
            # No relative path exists (e.g. different Windows drives).
            relative_file = str(path)

    return GeneratedSample(
        sample_id=f"sample-{index:05d}",
        file=relative_file,
        expected=expected,
        sample_type=sample_type,
        source=str(profile.path),
        source_record_id=profile.record_id,
        candidate_record_id=candidate_record_id,
        line_count_bucket=profile.line_count_bucket,
        language_bucket=profile.language_bucket,
        source_bucket=profile.source_bucket,
        title=profile.title,
        artist=profile.artist,
        notes=notes,
    )


def _write_sample_file(output_dir: Path, name: str, text: str) -> Path:
    """Write ``text`` to ``output_dir / name`` as UTF-8 and return that path.

    Surrounding whitespace is stripped and exactly one trailing newline is added.
    """
    target = output_dir / name
    content = f"{text.strip()}\n"
    target.write_text(content, encoding="utf-8")
    return target


def _write_csv(samples: list[GeneratedSample], csv_path: Path, *, seed: int) -> None:
    """Write the label CSV: a header row plus one row per sample.

    Every row also carries the generation ``seed`` in its own column. The file
    is written as UTF-8 and overwrites any existing file.
    """
    # Column name -> sample attribute; "seed" is filled in separately because
    # it is not stored on the sample.
    attribute_for = {
        "id": "sample_id",
        "file": "file",
        "expected": "expected",
        "sample_type": "sample_type",
        "source": "source",
        "source_record_id": "source_record_id",
        "candidate_record_id": "candidate_record_id",
        "line_count_bucket": "line_count_bucket",
        "language_bucket": "language_bucket",
        "source_bucket": "source_bucket",
        "title": "title",
        "artist": "artist",
        "notes": "notes",
    }
    fieldnames = [
        "id",
        "file",
        "expected",
        "sample_type",
        "source",
        "source_record_id",
        "candidate_record_id",
        "line_count_bucket",
        "language_bucket",
        "source_bucket",
        "title",
        "artist",
        "seed",
        "notes",
    ]
    rows = (
        {
            column: seed if column == "seed" else getattr(sample, attribute_for[column])
            for column in fieldnames
        }
        for sample in samples
    )
    with csv_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def _write_manifest(
    *,
    profiles: list[LyricProfile],
    samples: list[GeneratedSample],
    csv_path: Path,
    output_dir: Path,
    seed: int,
    plan: dict[str, int],
    index_path: Path | None,
    eval_index_path: Path,
    holdout_count: int,
) -> dict[str, object]:
    """Summarize a generation run, save it as ``<csv_path>.manifest.json`` and return it.

    The manifest records the inputs (seed, plan, paths), sample counts per type
    and expected label, library-wide bucket counts (source buckets limited to
    the 50 most common) and the number of distinct source records used.
    """
    manifest_path = csv_path.with_suffix(csv_path.suffix + ".manifest.json")

    type_counts: Counter = Counter()
    expected_counts: Counter = Counter()
    source_ids: set[str] = set()
    for sample in samples:
        type_counts[sample.sample_type] += 1
        expected_counts[sample.expected] += 1
        if sample.source_record_id:
            source_ids.add(sample.source_record_id)

    line_buckets: Counter = Counter()
    language_buckets: Counter = Counter()
    source_buckets: Counter = Counter()
    for profile in profiles:
        line_buckets[profile.line_count_bucket] += 1
        language_buckets[profile.language_bucket] += 1
        source_buckets[profile.source_bucket] += 1

    manifest = {
        "seed": seed,
        "library_files": len(profiles),
        "sample_size": len(samples),
        "plan": plan,
        "source_index": str(index_path) if index_path else "",
        "eval_index": str(eval_index_path),
        "holdout_records": holdout_count,
        "lyrics_dir": str(output_dir),
        "csv": str(csv_path),
        "manifest": str(manifest_path),
        "sample_type_counts": dict(type_counts),
        "expected_counts": dict(expected_counts),
        "line_count_bucket_counts": dict(line_buckets),
        "language_bucket_counts": dict(language_buckets),
        "source_bucket_counts": dict(source_buckets.most_common(50)),
        "unique_source_records": len(source_ids),
    }
    serialized = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(serialized, encoding="utf-8")
    return manifest


def _content_lines(text: str) -> list[str]:
    """Return the non-blank lines of ``text``, each stripped of surrounding whitespace.

    When there are no such lines, a one-element list holding the stripped text
    (possibly the empty string) is returned, so the result is never empty.
    """
    kept: list[str] = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    if kept:
        return kept
    return [text.strip()]


def _clean_generated_output_dir(output_dir: Path) -> None:
    """Delete every ``.txt``/``.lrc`` file directly inside ``output_dir``.

    The suffix match is case-insensitive. Sub-directories and files with other
    suffixes are left alone.
    """
    removable_suffixes = (".txt", ".lrc")
    for entry in output_dir.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in removable_suffixes:
            continue
        entry.unlink()


def _line_count_bucket(line_count: int) -> str:
    """Classify a line count as ``zero`` (0), ``short`` (<=5), ``normal`` (<=40) or ``long``."""
    if line_count == 0:
        return "zero"
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((5, "short"), (40, "normal")):
        if line_count <= upper_bound:
            return label
    return "long"


def _language_bucket(lines: tuple[str, ...]) -> str:
    """Classify lyric lines by the scripts they contain.

    Any kana or hangul gives ``jp_kr``; CJK ideographs together with Latin
    letters give ``mixed``; ideographs alone ``zh``; Latin letters alone
    ``latin``; anything else ``other``.
    """
    joined = "\n".join(lines)

    def contains(pattern: str) -> bool:
        return re.search(pattern, joined) is not None

    if contains(r"[\u3040-\u30ff]") or contains(r"[\uac00-\ud7af]"):
        return "jp_kr"

    has_cjk = contains(r"[\u4e00-\u9fff]")
    has_latin = contains(r"[A-Za-z]")
    if has_cjk:
        return "mixed" if has_latin else "zh"
    return "latin" if has_latin else "other"


def _source_bucket(path: Path) -> str:
    """Derive a source code from the last ``_``-separated part of the file stem.

    Trailing digits are removed from that part. ``unknown`` is returned when
    the stem has no underscore or nothing is left after removing the digits.
    """
    segments = path.stem.split("_")
    if len(segments) < 2:
        return "unknown"
    code = re.sub(r"\d+$", "", segments[-1])
    return code if code else "unknown"


def _add_timestamps(lines: list[str]) -> str:
    """Prefix each line with an LRC-style ``[mm:ss.00]`` tag, one second apart.

    Line ``k`` (1-based) is stamped at ``k`` seconds. Minutes are carried so the
    tags keep increasing past the 59th line instead of wrapping back to
    ``00:00``; the first 59 lines are unchanged by that.

    Args:
        lines: Lyric lines to stamp.

    Returns:
        The stamped lines joined with newlines (empty string for no lines).
    """
    stamped = []
    for idx, line in enumerate(lines, start=1):
        minutes, seconds = divmod(idx, 60)
        stamped.append(f"[{minutes:02d}:{seconds:02d}.00]{line}")
    return "\n".join(stamped)


def _add_punctuation_noise(lines: list[str], rng: random.Random) -> str:
    """Append one randomly chosen punctuation mark to every line and join with newlines."""
    marks = ["！", "?", "...", "，", "。"]
    noisy: list[str] = []
    for line in lines:
        # One draw per line, in line order.
        noisy.append(line + rng.choice(marks))
    return "\n".join(noisy)


def _with_platform_noise(lines: list[str]) -> str:
    """Wrap the lyric in platform boilerplate: two credit lines before, a copyright notice after."""
    header = ["歌词来自QQ音乐", "作词：测试"]
    footer = ["未经著作权人许可 不得翻唱"]
    return "\n".join(header + list(lines) + footer)


def _add_blank_line_noise(lines: list[str]) -> str:
    """Insert an empty line after every fourth lyric line and join with newlines."""
    padded: list[str] = []
    for start in range(0, len(lines), 4):
        group = lines[start : start + 4]
        padded.extend(group)
        # Only a complete group of four is followed by a blank line.
        if len(group) == 4:
            padded.append("")
    return "\n".join(padded)


def _change_repeated_line_counts(lines: list[str]) -> str:
    """Keep only the first occurrence of each line, preserving order, and join with newlines."""
    # dict.fromkeys keeps first-seen order while dropping later duplicates.
    first_occurrences = list(dict.fromkeys(lines))
    return "\n".join(first_occurrences or lines)


def _translation_added(lines: list[str]) -> str:
    """Follow each foreign-looking line among the first 24 with a canned translation line.

    All original lines are kept in order; lines after the 24th never get a
    translation added.
    """
    merged: list[str] = []
    for position, line in enumerate(lines, start=1):
        merged.append(line)
        if position > 24:
            continue
        if _looks_foreign(line):
            merged.append(_pseudo_translation(position))
    return "\n".join(merged)


def _single_song_fragment(lines: list[str], rng: random.Random) -> str:
    """Cut a short contiguous excerpt out of a song.

    Songs of four lines or fewer yield their first half (at least one line).
    Longer songs yield 2 to 8 consecutive lines, sized as a third, quarter or
    fifth of the song, starting at a random position.
    """
    total = len(lines)
    if total <= 4:
        keep = max(1, total // 2)
        return "\n".join(lines[:keep])

    divisor = rng.choice([3, 4, 5])
    fragment_len = min(8, total // divisor)
    if fragment_len < 2:
        fragment_len = 2
    start_limit = total - fragment_len + 1
    if start_limit < 1:
        start_limit = 1
    start = rng.randrange(0, start_limit)
    return "\n".join(lines[start : start + fragment_len])


def _short_shared_snippet(lines: list[str], rng: random.Random) -> str:
    """Embed up to two randomly chosen source lines between fixed synthetic lines."""
    borrowed = rng.sample(lines, k=min(2, len(lines))) if lines else []
    opening = [
        "清晨的风吹过新的街口",
        "我把昨天放进安静的口袋",
    ]
    closing = [
        "故事从这里重新开始",
        "灯光落下我继续往前走",
    ]
    return "\n".join(opening + borrowed + closing)


def _repeated_or_sampled_lines(normalized: NormalizedLyrics, rng: random.Random) -> list[str]:
    """Pick up to two lines from a lyric, preferring lines that occur at least twice.

    When no line repeats, the pick is made from the primary lines (or the
    unique lines when there are no primary lines). An empty list is returned
    when the lyric has no lines at all.
    """
    candidates = [line for line, occurrences in normalized.line_counts.items() if occurrences >= 2]
    if not candidates:
        candidates = list(normalized.primary_lines or normalized.unique_lines)
        if not candidates:
            return []
    return rng.sample(candidates, k=min(2, len(candidates)))


def _same_theme_synthetic(index: int, rng: random.Random) -> str:
    """Assemble a four-line synthetic lyric: random opening, middle and ending plus a numbered tag line.

    The tag line embeds ``index``, so two samples with different indexes never
    share identical text.
    """
    starts = ["我在夜里想起远方的你", "城市灯火陪我走过雨季", "风把旧名字吹向清晨"]
    middles = ["那些没说完的话留在风里", "新的路口慢慢亮起", "时间把答案交给下一站"]
    ends = ["明天醒来我们各自继续", "我会把今天写成新的旋律", "故事从这里重新开始"]
    # One draw per pool, in opening / middle / ending order.
    chosen = [rng.choice(pool) for pool in (starts, middles, ends)]
    chosen.append(f"这是第 {index} 个全新测试样本")
    return "\n".join(chosen)


def _pseudo_translation(index: int) -> str:
    """Return the canned translation line for a 1-based ``index``, cycling through eight lines."""
    canned = (
        "今晚我仍然想念你",
        "风会带走所有疲惫",
        "黑暗里也会有光",
        "别让昨天困住自己",
        "我们终会继续向前",
        "雨停以后天空会亮",
        "把遗憾留在旧时光",
        "你已经足够好了",
    )
    slot = (index - 1) % len(canned)
    return canned[slot]


def _looks_foreign(line: str) -> bool:
    """Return True when the line has at least one Latin letter and no CJK ideograph."""
    has_latin = re.search(r"[A-Za-z]", line) is not None
    has_cjk = re.search(r"[\u4e00-\u9fff]", line) is not None
    return has_latin and not has_cjk