# process_library.py
"""Process newly added lyric library files.

This script is intended for the recurring workflow after adding files to
``data/library``:

1. Move pure-music placeholder lyric files out of the active library.
2. Rebuild the duplicate-checking index.
3. Optionally regenerate and evaluate a synthetic regression set.
"""

from __future__ import annotations

import argparse
import csv
import json
import shutil
import sys
from datetime import datetime
from pathlib import Path

# PROJECT_ROOT is the parent of the directory that contains this file.
# It is put at the front of sys.path (once) before the project imports
# below; presumably so that ``lyric_dedup`` resolves when this file is run
# directly as a script -- confirm against the repository layout.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.cli import evaluate_csv
from lyric_dedup.eval_dataset import generate_eval_set
from lyric_dedup.file_import import iter_lyric_files
from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import records_from_dir
from lyric_dedup.normalization import normalize_lyrics


# Substrings that mark a lyric file as a placeholder; a file containing any
# one of them is treated as a placeholder by ``_find_placeholder_files``.
# Rough English glosses: "[for library use only]" and "this song is pure
# music with no lyrics written for it".
PLACEHOLDER_MARKERS = (
    "【曲库专用】",
    "此歌曲为没有填词的纯音乐",
)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the library processing workflow."""
    cli = argparse.ArgumentParser(description="Process lyric library additions.")
    cli.add_argument("--library-dir", default="data/library")
    cli.add_argument("--index", default="outputs/indexes/library_lyrics.pkl")
    cli.add_argument(
        "--quarantine-dir",
        default="data/quarantine/no_lyrics_placeholders",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Only report placeholder files; do not move or write outputs.",
    )
    cli.add_argument(
        "--delete-placeholders",
        action="store_true",
        help="Delete matched placeholder files instead of moving them.",
    )
    cli.add_argument(
        "--eval-size",
        type=int,
        default=0,
        help="Generate and evaluate this many synthetic samples. 0 disables eval.",
    )
    cli.add_argument("--positive-ratio", type=float, default=0.2)
    cli.add_argument("--eval-dir", default="data/generated_eval/incoming")
    cli.add_argument("--eval-csv", default="data/generated_eval/eval.csv")
    cli.add_argument("--eval-out", default="outputs/results/library_eval.csv")
    cli.add_argument("--report", default="outputs/results/library_process_report.json")
    return cli


def main() -> None:
    """Run the library processing workflow from the command line.

    The library is scanned first (file list, placeholder matches and the
    effective-line statistics). Unless ``--dry-run`` is given, the matched
    placeholder files are then moved or deleted, the index is rebuilt and,
    when ``--eval-size`` is positive, a synthetic evaluation set is generated
    and evaluated twice: once counting only ``duplicate`` decisions as
    positive and once counting ``duplicate`` and ``review``.

    A JSON report is always printed; it is written to ``--report`` only when
    the run is not a dry run.
    """
    args = _build_arg_parser().parse_args()

    library_dir = Path(args.library_dir)
    quarantine_dir = Path(args.quarantine_dir)
    report_path = Path(args.report)
    index_path = Path(args.index)
    wants_eval = args.eval_size > 0

    # Snapshot of the library before anything is moved or deleted.
    files_before = iter_lyric_files(library_dir)
    placeholders = _find_placeholder_files(library_dir)
    short_effective = _effective_line_report(library_dir)

    handled_entries: list[str] = []
    if not args.dry_run:
        handled_entries = _handle_placeholders(
            placeholders,
            library_dir=library_dir,
            quarantine_dir=quarantine_dir,
            delete=args.delete_placeholders,
        )
        _build_index(library_dir, index_path)

        if wants_eval:
            eval_csv = Path(args.eval_csv)
            eval_out = Path(args.eval_out)
            generate_eval_set(
                library_dir=library_dir,
                output_dir=Path(args.eval_dir),
                csv_path=eval_csv,
                size=args.eval_size,
                positive_ratio=args.positive_ratio,
                index_path=index_path,
            )
            # Two passes over the same CSV: strict ("duplicate" only), then
            # lenient ("duplicate" or "review"), written to a sibling file.
            eval_passes = (
                (eval_out, {"duplicate"}),
                (
                    eval_out.with_name(eval_out.stem + "_review_positive.csv"),
                    {"duplicate", "review"},
                ),
            )
            for result_path, positives in eval_passes:
                evaluate_csv(
                    index_path,
                    eval_csv,
                    result_path,
                    base_dir=eval_csv.parent,
                    positive_decisions=positives,
                    max_candidates=5,
                )

    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "dry_run": args.dry_run,
        "library_dir": str(library_dir),
        "files_before": len(files_before),
        "placeholder_matches": len(placeholders),
        "placeholder_files": [str(match) for match in placeholders],
        "handled_placeholder_files": handled_entries,
        "files_after": len(iter_lyric_files(library_dir)),
        "index": str(args.index),
        "eval_size": args.eval_size,
        "eval_csv": str(args.eval_csv) if wants_eval else "",
        "eval_out": str(args.eval_out) if wants_eval else "",
        "short_effective_line_counts": short_effective,
    }

    # Serialise once; the same text goes to stdout and to the report file.
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    print(rendered)
    if args.dry_run:
        return
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(rendered, encoding="utf-8")


def _find_placeholder_files(library_dir: Path) -> list[Path]:
    """Return the library files whose text contains a placeholder marker.

    Every path produced by ``iter_lyric_files(library_dir)`` is read with
    ``read_lyric_file``; a file matches when any entry of
    ``PLACEHOLDER_MARKERS`` occurs as a substring of its text. Matches are
    returned in the order ``iter_lyric_files`` produced them.
    """

    def contains_marker(lyric_path: Path) -> bool:
        content = read_lyric_file(lyric_path)
        return any(marker in content for marker in PLACEHOLDER_MARKERS)

    return [
        lyric_path
        for lyric_path in iter_lyric_files(library_dir)
        if contains_marker(lyric_path)
    ]


def _handle_placeholders(
    placeholders: list[Path],
    *,
    library_dir: Path,
    quarantine_dir: Path,
    delete: bool,
) -> list[str]:
    """Delete placeholder files or move them into the quarantine directory.

    A moved file keeps its path relative to ``library_dir`` underneath
    ``quarantine_dir``. If that destination already exists, a
    ``_YYYYmmddHHMMSS`` timestamp is appended to the stem; if the timestamped
    name is taken as well, a numeric counter is appended after the timestamp
    until a free name is found, so an earlier quarantined file is never
    overwritten.

    Args:
        placeholders: Files to remove from the active library.
        library_dir: Root directory the placeholder paths live under.
        quarantine_dir: Root directory that receives moved files; it is
            created on demand and only when files are moved.
        delete: When true, unlink the files instead of moving them.

    Returns:
        One entry per handled file, either ``"deleted:<path>"`` or
        ``"moved:<path>-><destination>"``, in input order.

    Raises:
        ValueError: If a placeholder does not resolve to a location inside
            ``library_dir`` (move mode only).
        OSError: If a file cannot be unlinked or moved.
    """
    handled: list[str] = []
    if not placeholders:
        return handled

    if delete:
        for path in placeholders:
            path.unlink()
            handled.append(f"deleted:{path}")
        return handled

    quarantine_dir.mkdir(parents=True, exist_ok=True)
    library_root = library_dir.resolve()  # loop-invariant, resolve once
    for path in placeholders:
        relative = path.resolve().relative_to(library_root)
        destination = quarantine_dir / relative
        destination.parent.mkdir(parents=True, exist_ok=True)
        if destination.exists():
            stem, suffix = destination.stem, destination.suffix
            stamp = datetime.now().strftime("%Y%m%d%H%M%S")
            destination = destination.with_name(f"{stem}_{stamp}{suffix}")
            # The timestamp has one-second resolution, so the stamped name
            # can itself be taken (e.g. two runs within the same second).
            # Moving onto an existing file would replace it, so keep probing.
            counter = 1
            while destination.exists():
                destination = destination.with_name(f"{stem}_{stamp}_{counter}{suffix}")
                counter += 1
        shutil.move(str(path), str(destination))
        handled.append(f"moved:{path}->{destination}")
    return handled


def _build_index(library_dir: Path, index_path: Path) -> None:
    """Build a fresh duplicate-checking index from the library and save it.

    A new ``DuplicateChecker`` receives every record produced by
    ``records_from_dir(library_dir)``. The parent directory of ``index_path``
    is created if missing, then the checker is saved to ``index_path``.
    """
    index = DuplicateChecker()
    for entry in records_from_dir(library_dir):
        index.add_record(entry)

    # Create the output directory only after all records loaded successfully.
    target_dir = index_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    index.save(index_path)


def _effective_line_report(library_dir: Path) -> dict[str, int]:
    """Count library files by how many effective lyric lines they contain.

    Each file from ``iter_lyric_files(library_dir)`` is read and passed
    through ``normalize_lyrics``. Its effective line count is the length of
    ``primary_lines`` or, when that is empty, of ``unique_lines``.

    Returns:
        A dict with the number of files seen under ``"total"`` and one
        counter per size bucket (0, 1-3, 4-5, 6 or more lines). Every key is
        present even when its count is zero.
    """
    # Inclusive upper bound of each bucket, smallest first; anything larger
    # than the last bound lands in the "six plus" bucket.
    bounded_buckets = (
        (0, "zero_effective_lines"),
        (3, "one_to_three_effective_lines"),
        (5, "four_to_five_effective_lines"),
    )
    overflow_bucket = "six_plus_effective_lines"

    counts = {"total": 0}
    counts.update((label, 0) for _, label in bounded_buckets)
    counts[overflow_bucket] = 0

    for lyric_path in iter_lyric_files(library_dir):
        counts["total"] += 1
        parsed = normalize_lyrics(read_lyric_file(lyric_path))
        effective = len(parsed.primary_lines or parsed.unique_lines)
        label = next(
            (name for limit, name in bounded_buckets if effective <= limit),
            overflow_bucket,
        )
        counts[label] += 1
    return counts


# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()