# cli.py 3.18 KB
"""PostgreSQL-backed command line tools for lyric duplicate checking."""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from lyric_dedup.eval_dataset import generate_eval_set
from lyric_dedup.file_import import record_from_file


def _build_parser() -> argparse.ArgumentParser:
    """Build the ``lyric-dedup`` parser with its two subcommands."""
    parser = argparse.ArgumentParser(prog="lyric-dedup")
    # required=True makes argparse reject an invocation without a subcommand.
    subparsers = parser.add_subparsers(dest="command", required=True)

    check = subparsers.add_parser("check-file", help="check one .lrc/.txt file using PostgreSQL recall")
    check.add_argument("--dsn", default="postgresql:///lyric_dedup")
    check.add_argument("--file", required=True)
    check.add_argument("--max-candidates", type=int, default=5)
    check.add_argument("--recall-limit", type=int, default=100)
    check.add_argument("--enable-trgm", action="store_true")
    check.add_argument("--trgm-threshold", type=float, default=0.3)
    check.add_argument("--statement-timeout-ms", type=int, default=5000)

    generate = subparsers.add_parser("generate-eval-set", help="generate labeled eval samples from a lyric library")
    generate.add_argument("--library-dir", required=True)
    generate.add_argument("--lyrics-dir", required=True)
    generate.add_argument("--csv", required=True)
    generate.add_argument("--size", type=int, default=100)
    generate.add_argument("--positive-ratio", type=float, default=0.3)
    generate.add_argument("--seed", type=int, default=20260602)
    generate.add_argument(
        "--profile",
        choices=("standard", "hard"),
        default="standard",
        help="evaluation sample profile: standard production mix or harder business-realistic edge mix",
    )
    return parser


def main(argv: list[str] | None = None) -> None:
    """Parse command line arguments and run the selected subcommand.

    ``check-file`` is delegated to ``check_file_pg``; ``generate-eval-set``
    calls ``generate_eval_set`` and prints its summary as one JSON line.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so existing ``main()`` callers behave exactly as before.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (code 0) or for
            missing/invalid arguments (code 2).
    """
    args = _build_parser().parse_args(argv)
    if args.command == "check-file":
        check_file_pg(args)
    elif args.command == "generate-eval-set":
        summary = generate_eval_set(
            library_dir=Path(args.library_dir),
            # The CLI calls this option --lyrics-dir; the callee calls it output_dir.
            output_dir=Path(args.lyrics_dir),
            csv_path=Path(args.csv),
            size=args.size,
            positive_ratio=args.positive_ratio,
            seed=args.seed,
            profile=args.profile,
        )
        print(json.dumps(summary, ensure_ascii=False))


def check_file_pg(args: argparse.Namespace) -> None:
    """Check a single lyric file for duplicates and print the verdict as JSON.

    The file at ``args.file`` is loaded with ``record_from_file``, a
    ``DedupService`` is built from the remaining CLI options, and the
    outcome of ``service.check`` is written to stdout as an indented JSON
    object.

    Args:
        args: Parsed ``check-file`` options; reads ``file``, ``dsn``,
            ``max_candidates``, ``recall_limit``, ``enable_trgm``,
            ``trgm_threshold`` and ``statement_timeout_ms``.
    """
    # Imported inside the function rather than at module level -- presumably
    # so the other subcommand works without the server package installed
    # (TODO confirm).
    from lyric_dedup_server.config import ServerConfig
    from lyric_dedup_server.service import DedupService

    parsed = record_from_file(Path(args.file))

    settings = {
        "dsn": args.dsn,
        "max_candidates": args.max_candidates,
        "recall_limit": args.recall_limit,
        "enable_trgm": args.enable_trgm,
        "trgm_threshold": args.trgm_threshold,
        "statement_timeout_ms": args.statement_timeout_ms,
    }
    checker = DedupService(config=ServerConfig(**settings))
    outcome = checker.check(parsed.lyrics, title=parsed.title, artist=parsed.artist)

    # "source" echoes the path exactly as given on the command line.
    report = {
        "source": args.file,
        "decision": outcome.decision,
        "duplicate": outcome.duplicate,
        "confidence": outcome.confidence,
        "reason": outcome.reason,
        "candidate_count": outcome.candidate_count,
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()