evaluate_postgres.py 17.1 KB

Raw Blame History Permalink

"""Evaluate lyric duplicate checking with PostgreSQL-backed candidate recall."""

from __future__ import annotations

import argparse
import csv
import hashlib
import json
import sys
import time
from pathlib import Path
from typing import Any


# Put the directory one level above this script's folder at the front of
# sys.path before the project imports below run, presumably so that the
# `lyric_dedup` / `lyric_dedup_server` packages resolve when this file is
# executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from lyric_dedup.checker import DuplicateChecker
from lyric_dedup.checker import LyricRecord
from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import fingerprint_text
from lyric_dedup.normalization import normalize_lyrics
from lyric_dedup_server.config import ServerConfig


def main() -> None:
    """Command-line entry point: evaluate every row of a labelled CSV.

    Flags:
        --csv            input CSV with a header row (required).
        --out            per-row result CSV to write (required).
        --base-dir       directory used to resolve relative lyric file paths;
                         when empty, the CSV's own directory is used.
        --profile-every  emit averaged timing stats every N rows (0 disables).

    Besides the per-row CSV, a JSON summary is written next to it as
    ``<out>.summary.json`` and also printed to stdout. Progress messages go to
    stderr through ``_progress``.

    Raises:
        ValueError: if the input CSV has no header row, or a row cannot be
            parsed by the row-level helpers.
    """
    cli = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.")
    cli.add_argument("--csv", required=True)
    cli.add_argument("--out", required=True)
    cli.add_argument("--base-dir", default="")
    cli.add_argument("--profile-every", type=int, default=100)
    args = cli.parse_args()

    psycopg = _import_psycopg()
    config = ServerConfig()
    csv_path = Path(args.csv)
    out_path = Path(args.out)
    base_dir = Path(args.base_dir) if args.base_dir else None
    profile_every = args.profile_every
    # Only a "duplicate" decision counts as a positive prediction.
    positive_decisions = {"duplicate"}

    total = _csv_data_row_count(csv_path)
    evaluated: list[dict[str, object]] = []
    profile_stats = _new_profile_stats()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    _progress(f"evaluate postgres csv: 0/{total}")

    with psycopg.connect(config.dsn) as conn:
        # Apply the session-level settings (is_local = false) once up front.
        with conn.cursor() as cursor:
            cursor.execute("select set_config('statement_timeout', %s, false)", (str(config.statement_timeout_ms),))
            cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(config.trgm_threshold),))

        with csv_path.open(encoding="utf-8-sig", newline="") as in_file:
            with out_path.open("w", encoding="utf-8", newline="") as out_file:
                reader = csv.DictReader(in_file)
                if reader.fieldnames is None:
                    raise ValueError("评估 CSV 需要表头")
                writer = csv.DictWriter(out_file, fieldnames=_fieldnames())
                writer.writeheader()

                for done, row in enumerate(reader, start=1):
                    # +1 accounts for the header line when numbering rows.
                    result_row = _evaluate_row(
                        conn,
                        row,
                        row_number=done + 1,
                        csv_path=csv_path,
                        base_dir=base_dir,
                        positive_decisions=positive_decisions,
                        config=config,
                    )
                    evaluated.append(result_row)
                    writer.writerow(result_row)
                    _progress_count("evaluate postgres csv", done, total, step=10)
                    _update_profile_stats(profile_stats, result_row)
                    if profile_every > 0 and done % profile_every == 0:
                        _progress(_format_profile_stats(profile_stats, done))

    summary = _evaluation_summary(evaluated, positive_decisions=positive_decisions, out_path=out_path)
    summary_file = out_path.with_suffix(out_path.suffix + ".summary.json")
    summary_text = json.dumps(summary, ensure_ascii=False, indent=2)
    summary_file.write_text(summary_text, encoding="utf-8")
    _progress("postgres evaluation complete")
    print(json.dumps(summary, ensure_ascii=False))


def _evaluate_row(
    conn: Any,
    row: dict[str, str],
    *,
    row_number: int,
    csv_path: Path,
    base_dir: Path | None,
    positive_decisions: set[str],
    config: ServerConfig,
) -> dict[str, object]:
    """Evaluate one CSV row and return its result columns.

    The row is turned into a query record, candidates are recalled from
    PostgreSQL, and the checker ranks them. The returned dict carries the
    expected and predicted labels, per-phase timings in milliseconds, and the
    details of the top-ranked candidate (empty strings when there is none).

    Args:
        conn: open database connection handed to ``_recall_candidates``.
        row: one ``csv.DictReader`` row.
        row_number: fallback sample id when the row has no ``id``/``sample_id``.
        csv_path: path of the evaluation CSV, used to resolve relative files.
        base_dir: optional override for resolving relative lyric file paths.
        positive_decisions: decision values counted as "predicted duplicate".
        config: server configuration providing recall and checker settings.

    Raises:
        ValueError: propagated from row parsing when the row lacks lyrics or
            a file path, or has a missing or unrecognised expected label.
    """
    now = time.perf_counter

    # Parse phase: build the query record and read the expected label.
    parse_t0 = now()
    sample_id = row.get("id") or row.get("sample_id") or str(row_number)
    record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)
    raw_label = row.get("expected") or row.get("label") or row.get("target")
    expected_duplicate = _parse_expected(raw_label)
    parse_ms = round((now() - parse_t0) * 1000, 2)

    # Recall phase: timings are measured per query inside _recall_candidates.
    candidates, timings = _recall_candidates(
        conn,
        record,
        recall_limit=config.recall_limit,
        enable_trgm=config.enable_trgm,
        exclude_record_ids=_exclude_record_ids_for_eval_row(row),
    )

    # Rank phase: score the recalled candidates against the query record.
    rank_t0 = now()
    result = _check_against_candidates(record, candidates, config=config)
    rank_ms = round((now() - rank_t0) * 1000, 2)

    recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2)
    decision = result.decision.value
    predicted_duplicate = decision in positive_decisions
    ranked = result.candidates
    best = ranked[0] if ranked else None

    return {
        "id": sample_id,
        "source": source,
        "expected_duplicate": expected_duplicate,
        "decision": decision,
        "predicted_duplicate": predicted_duplicate,
        "correct": expected_duplicate == predicted_duplicate,
        "confidence": result.confidence,
        "reason": result.reason,
        "candidate_count": len(ranked),
        "parse_ms": parse_ms,
        "recall_ms": recall_ms,
        "exact_ms": timings["exact_ms"],
        "trgm_ms": timings["trgm_ms"],
        "line_ms": timings["line_ms"],
        "rank_ms": rank_ms,
        "best_candidate_id": "" if not best else best.record_id,
        "best_candidate_decision": "" if not best else best.decision.value,
        "best_candidate_confidence": "" if not best else best.confidence,
        "best_candidate_jaccard": "" if not best else best.jaccard,
        "best_candidate_line_coverage": "" if not best else best.line_coverage,
        "best_candidate_primary_jaccard": "" if not best else best.primary_jaccard,
        "best_candidate_primary_line_coverage": "" if not best else best.primary_line_coverage,
        "best_candidate_translation_jaccard": "" if not best else best.translation_jaccard,
        "best_candidate_translation_line_coverage": "" if not best else best.translation_line_coverage,
        "best_candidate_reason": "" if not best else best.reason,
        "matched_unique_lines": "" if not best else " | ".join(best.matched_unique_lines),
    }


def _recall_candidates(
    conn: Any,
    record: LyricRecord,
    *,
    recall_limit: int,
    enable_trgm: bool,
    exclude_record_ids: list[str],
) -> tuple[list[LyricRecord], dict[str, float]]:
    """Recall candidate records for ``record`` from PostgreSQL.

    Up to three queries are run on one cursor, each capped at ``recall_limit``
    rows and each skipping soft-deleted rows and ``exclude_record_ids``:

    1. exact match on the SHA-256 of the normalized fingerprint text;
    2. ``%`` similarity match on ``primary_text`` (only when ``enable_trgm``
       is true and the query has primary text), ordered by ``similarity``;
    3. lyrics sharing at least one primary-line hash with the query, ordered
       by the number of matching ``lyric_lines`` rows.

    Results are merged by ``record_id`` (the first occurrence is kept, see
    ``_add_rows``), so the combined list can exceed ``recall_limit``.

    Returns:
        ``(candidates, timings)`` where ``timings`` has the keys ``exact_ms``,
        ``trgm_ms`` and ``line_ms``; a stage that is skipped stays at ``0.0``.
    """
    # Strip NUL characters before normalizing the query lyrics.
    query_lyrics = _pg_text(record.lyrics) or ""
    normalized = normalize_lyrics(query_lyrics)
    exact_text = fingerprint_text(normalized)
    exact_hash = hashlib.sha256(exact_text.encode("utf-8")).hexdigest()
    primary_text = "\n".join(normalized.primary_lines)
    # One hash per non-empty primary line; duplicates are not removed here.
    line_hashes = [hashlib.sha256(line.encode("utf-8")).hexdigest() for line in normalized.primary_lines if line]
    # Keyed by record_id so a record recalled by several stages appears once.
    candidates: dict[str, LyricRecord] = {}
    timings = {"exact_ms": 0.0, "trgm_ms": 0.0, "line_ms": 0.0}
    with conn.cursor() as cursor:
        # Stage 1: exact fingerprint-hash match.
        started = time.perf_counter()
        cursor.execute(
            """
            select record_id, raw_text, title, artist
            from lyrics
            where deleted_at is null
              and exact_hash = %s
              and not (record_id = any(%s))
            limit %s
            """,
            (exact_hash, exclude_record_ids, recall_limit),
        )
        _add_rows(candidates, cursor.fetchall())
        timings["exact_ms"] = round((time.perf_counter() - started) * 1000, 2)

        # Stage 2: similarity match on the joined primary text. `%%` is the
        # escaped form of a literal `%` operator in a parameterized query.
        if enable_trgm and primary_text:
            started = time.perf_counter()
            cursor.execute(
                """
                select record_id, raw_text, title, artist
                from lyrics
                where deleted_at is null
                  and not (record_id = any(%s))
                  and primary_text %% %s
                order by similarity(primary_text, %s) desc
                limit %s
                """,
                (exclude_record_ids, primary_text, primary_text, recall_limit),
            )
            _add_rows(candidates, cursor.fetchall())
            timings["trgm_ms"] = round((time.perf_counter() - started) * 1000, 2)

        # Stage 3: lyrics that share primary-line hashes with the query.
        # NOTE(review): selecting l.* columns while grouping by l.id relies on
        # l.id being the primary key of `lyrics` — confirm against the schema.
        if line_hashes:
            started = time.perf_counter()
            cursor.execute(
                """
                select l.record_id, l.raw_text, l.title, l.artist
                from lyric_lines ll
                join lyrics l on l.id = ll.lyric_id
                where l.deleted_at is null
                  and not (l.record_id = any(%s))
                  and ll.role = 'primary'
                  and ll.line_hash = any(%s)
                group by l.id
                order by count(*) desc
                limit %s
                """,
                (exclude_record_ids, line_hashes, recall_limit),
            )
            _add_rows(candidates, cursor.fetchall())
            timings["line_ms"] = round((time.perf_counter() - started) * 1000, 2)
    return list(candidates.values()), timings


def _exclude_record_ids_for_eval_row(row: dict[str, str]) -> list[str]:
    """Return the record ids that recall must skip for this evaluation row.

    Only the two holdout sample types exclude anything, and only when the row
    carries a non-empty ``source_record_id``; every other row yields ``[]``.
    """
    is_holdout = row.get("sample_type") in {
        "negative_real_holdout_full_song",
        "negative_near_neighbor_holdout_full_song",
    }
    if not is_holdout or not row.get("source_record_id"):
        return []
    return [row["source_record_id"]]


def _add_rows(candidates: dict[str, LyricRecord], rows: list[tuple[object, ...]]) -> None:
    """Merge fetched ``(record_id, raw_text, title, artist)`` rows into ``candidates``.

    ``candidates`` is updated in place and keyed by the stringified record id.
    A record id that is already present keeps its existing entry. ``title``
    and ``artist`` stay ``None`` when the database value is NULL.
    """
    for record_id, raw_text, title, artist in rows:
        key = str(record_id)
        fetched = LyricRecord(
            record_id=key,
            lyrics=str(raw_text),
            title=None if title is None else str(title),
            artist=None if artist is None else str(artist),
        )
        candidates.setdefault(key, fetched)


def _check_against_candidates(
    record: LyricRecord,
    candidates: list[LyricRecord],
    *,
    config: ServerConfig,
):
    """Rank ``candidates`` against ``record`` with a checker built from ``config``.

    Every threshold and weight is copied from the same-named ``config``
    attribute into the ``DuplicateChecker`` constructor. The result of
    ``check_record_against_candidates`` is returned, with ``max_candidates``
    also taken from ``config``.
    """
    # Constructor keyword names equal the config attribute names one-to-one.
    setting_names = (
        "duplicate_jaccard_threshold",
        "duplicate_line_coverage_threshold",
        "duplicate_high_coverage_jaccard_threshold",
        "duplicate_high_coverage_line_coverage_threshold",
        "review_jaccard_threshold",
        "review_line_coverage_threshold",
        "review_query_coverage_threshold",
        "fragment_query_coverage_threshold",
        "fragment_max_line_ratio",
        "fragment_min_matched_lines",
        "chorus_short_line_count_threshold",
        "chorus_material_overlap_threshold",
        "chorus_material_query_coverage_threshold",
        "confidence_jaccard_weight",
        "confidence_line_coverage_weight",
    )
    checker_settings = {name: getattr(config, name) for name in setting_names}
    checker = DuplicateChecker(**checker_settings)
    return checker.check_record_against_candidates(record, candidates, max_candidates=config.max_candidates)


def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
    """Build the query record for one evaluation row.

    Inline lyrics in the ``lyrics`` column win; literal ``\\n`` sequences in
    them are turned into real newlines and the source is reported as
    ``"inline"``. Otherwise the lyrics are loaded from the file named by the
    ``file``, ``path`` or ``source`` column. Relative paths are resolved
    against ``base_dir`` or, when that is not given, the CSV's directory.
    Non-empty ``title``/``artist`` columns override the values read from the
    file. NUL characters are stripped from all text fields.

    Returns:
        ``(record, source)`` where ``source`` is ``"inline"`` or the resolved
        file path as a string.

    Raises:
        ValueError: if the row has neither inline lyrics nor a file path.
    """
    row_title = row.get("title")
    row_artist = row.get("artist")

    inline_lyrics = (row.get("lyrics") or "").strip()
    if inline_lyrics:
        inline_record = LyricRecord(
            record_id=row.get("id") or row.get("sample_id") or "__eval__",
            lyrics=_pg_text(inline_lyrics.replace("\\n", "\n")) or "",
            title=_pg_text(row_title or None),
            artist=_pg_text(row_artist or None),
        )
        return inline_record, "inline"

    file_value = (row.get("file") or row.get("path") or row.get("source") or "").strip()
    if not file_value:
        raise ValueError("评估 CSV 每行需要 lyrics，或 file/path/source 文件路径")

    file_path = Path(file_value)
    if not file_path.is_absolute():
        file_path = (base_dir or csv_path.parent) / file_path

    loaded = record_from_file(file_path)
    # A non-empty CSV column takes precedence over the value from the file.
    file_record = LyricRecord(
        record_id=loaded.record_id,
        lyrics=_pg_text(loaded.lyrics) or "",
        title=_pg_text(row_title or loaded.title),
        artist=_pg_text(row_artist or loaded.artist),
    )
    return file_record, str(file_path)


def _parse_expected(value: str | None) -> bool:
    """Translate an expected-label cell into a boolean.

    Matching ignores surrounding whitespace and letter case. Returns ``True``
    for the "duplicate" spellings and ``False`` for the "not duplicate" ones.

    Raises:
        ValueError: if ``value`` is ``None`` (no label column present) or is
            not one of the recognised spellings.
    """
    if value is None:
        raise ValueError("评估 CSV 每行需要 expected/label/target 列")

    verdict_by_token = {
        **dict.fromkeys(
            ("1", "true", "yes", "y", "duplicate", "dup", "重复", "应去重", "去重", "是"),
            True,
        ),
        **dict.fromkeys(
            ("0", "false", "no", "n", "new", "not_duplicate", "non_duplicate", "不重复", "不应去重", "新歌", "否"),
            False,
        ),
    }
    verdict = verdict_by_token.get(value.strip().lower())
    if verdict is None:
        raise ValueError(f"无法识别 expected 值: {value!r}")
    return verdict


def _evaluation_summary(
    rows: list[dict[str, object]],
    *,
    positive_decisions: set[str],
    out_path: Path,
) -> dict[str, object]:
    """Aggregate per-row results into confusion counts and metrics.

    Rows are counted only when ``expected_duplicate`` and
    ``predicted_duplicate`` are the actual ``True``/``False`` objects.
    Accuracy, precision, recall and F1 are rounded to four decimals and fall
    back to ``0.0`` when their denominator is zero. The summary also records
    how many rows received each decision, plus the result and summary paths.
    """

    def confusion(expected: bool, predicted: bool) -> int:
        # Identity comparison: only real booleans take part in the matrix.
        return sum(
            1
            for row in rows
            if row["expected_duplicate"] is expected and row["predicted_duplicate"] is predicted
        )

    def decided(label: str) -> int:
        return sum(1 for row in rows if row["decision"] == label)

    tp = confusion(True, True)
    fp = confusion(False, True)
    tn = confusion(False, False)
    fn = confusion(True, False)
    total = len(rows)

    precision = 0.0
    if tp + fp:
        precision = tp / (tp + fp)
    recall = 0.0
    if tp + fn:
        recall = tp / (tp + fn)
    accuracy = 0.0
    if total:
        accuracy = (tp + tn) / total
    f1 = 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)

    summary_path = out_path.with_suffix(out_path.suffix + ".summary.json")
    return {
        "total": total,
        "positive_decisions": sorted(positive_decisions),
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1": round(f1, 4),
        "true_positive": tp,
        "false_positive": fp,
        "true_negative": tn,
        "false_negative": fn,
        "duplicate": decided("duplicate"),
        "review": decided("review"),
        "new": decided("new"),
        "out": str(out_path),
        "summary": str(summary_path),
    }


def _fieldnames() -> list[str]:
    """Return the column order of the per-row result CSV.

    The columns come in three groups: the verdict for the sample, the timing
    measurements in milliseconds, and the details of the best candidate.
    """
    verdict_columns = [
        "id",
        "source",
        "expected_duplicate",
        "decision",
        "predicted_duplicate",
        "correct",
        "confidence",
        "reason",
        "candidate_count",
    ]
    timing_columns = [
        "parse_ms",
        "recall_ms",
        "exact_ms",
        "trgm_ms",
        "line_ms",
        "rank_ms",
    ]
    best_candidate_columns = [
        "best_candidate_id",
        "best_candidate_decision",
        "best_candidate_confidence",
        "best_candidate_jaccard",
        "best_candidate_line_coverage",
        "best_candidate_primary_jaccard",
        "best_candidate_primary_line_coverage",
        "best_candidate_translation_jaccard",
        "best_candidate_translation_line_coverage",
        "best_candidate_reason",
        "matched_unique_lines",
    ]
    return [*verdict_columns, *timing_columns, *best_candidate_columns]


def _csv_data_row_count(csv_path: Path) -> int:
    """Count the data rows that ``csv.DictReader`` will yield for ``csv_path``.

    The first row is treated as the header and is not counted. Completely
    blank lines are skipped too: ``csv.DictReader`` (which ``main`` uses to
    iterate the same file) silently drops them, so counting them would make
    the progress total larger than the number of rows actually evaluated.

    Returns:
        The number of non-empty rows after the header; ``0`` for an empty or
        header-only file.
    """
    with csv_path.open(encoding="utf-8-sig", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # discard the header row (no-op for an empty file)
        # csv.reader yields [] for a blank line; those are not data rows.
        return sum(1 for row in reader if row)


def _progress(message: str) -> None:
    """Write one ``[pg-eval]``-prefixed progress line to stderr and flush it."""
    line = f"[pg-eval] {message}"
    print(line, file=sys.stderr, flush=True)


def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``current/total`` progress at selected milestones only.

    A message is emitted for the first item, the last item, and every
    ``step``-th item. Nothing is emitted when ``total`` is not positive.
    """
    if total <= 0:
        return
    # Skip everything that is neither the first, the last, nor a step multiple.
    if current != 1 and current != total and current % step != 0:
        return
    _progress(f"{label}: {current}/{total}")


def _new_profile_stats() -> dict[str, float]:
    """Return a fresh accumulator with every profiled column set to ``0.0``."""
    profiled_columns = (
        "parse_ms",
        "exact_ms",
        "trgm_ms",
        "line_ms",
        "rank_ms",
        "recall_ms",
        "candidate_count",
    )
    return dict.fromkeys(profiled_columns, 0.0)


def _update_profile_stats(stats: dict[str, float], row: dict[str, object]) -> None:
    """Add the row's value for every tracked column onto ``stats`` in place.

    Missing or falsy values count as zero. Values that cannot be converted to
    ``float`` are deliberately ignored so that profiling never aborts a run.
    """
    for column in stats:
        try:
            increment = float(row.get(column) or 0)
            stats[column] += increment
        except (TypeError, ValueError):
            continue


def _format_profile_stats(stats: dict[str, float], count: int) -> str:
    """Render the per-row averages of the accumulated profile stats.

    Returns ``"profile: no rows"`` when ``count`` is not positive. Otherwise
    returns one line with the average milliseconds of each phase (two
    decimals) followed by the average candidate count (one decimal).
    """
    if count <= 0:
        return "profile: no rows"

    # (label, stats key) pairs in the order they appear in the output line.
    timed_phases = (
        ("parse", "parse_ms"),
        ("exact", "exact_ms"),
        ("line", "line_ms"),
        ("trgm", "trgm_ms"),
        ("rank", "rank_ms"),
        ("recall", "recall_ms"),
    )
    parts = ["profile avg"]
    for label, key in timed_phases:
        parts.append(f"{label}={stats[key] / count:.2f}ms")
    parts.append(f"candidates={stats['candidate_count'] / count:.1f}")
    return " ".join(parts)


def _pg_text(value: str | None) -> str | None:
    """Return ``value`` with every NUL character removed; ``None`` stays ``None``."""
    return None if value is None else value.replace("\x00", "")


def _import_psycopg():
    """Import and return the ``psycopg`` module.

    The import is done lazily inside this function. When the module cannot be
    found, an install hint is printed to stderr and the process exits with
    status 1.

    Raises:
        SystemExit: with code ``1`` when the import raises
            ``ModuleNotFoundError``.
    """
    try:
        import psycopg
    except ModuleNotFoundError:
        print(
            "Missing dependency: psycopg. Install it with:\n"
            "  python -m pip install 'psycopg[binary]'",
            file=sys.stderr,
        )
        raise SystemExit(1)
    return psycopg


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()