config.py 5.33 KB

Raw Blame History Permalink

"""Server configuration loaded from environment variables."""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path


def _load_env_file(env_path: Path | None = None) -> None:
    """Load ``KEY=VALUE`` pairs from a .env file into ``os.environ``.

    Variables that already exist in the process environment are never
    overridden. Blank lines, ``#`` comment lines, lines without ``=`` and
    lines whose key is empty are skipped. A leading ``export`` keyword is
    dropped from the key, and one pair of matching surrounding quotes is
    removed from the value.

    Args:
        env_path: File to read. Defaults to ``.env`` in the parent of the
            directory containing this module.
    """
    if env_path is None:
        env_path = Path(__file__).resolve().parent.parent / ".env"
    # is_file() instead of exists(): a directory named ".env" must not crash import.
    if not env_path.is_file():
        return
    # utf-8-sig drops a leading BOM that would otherwise become part of the first key.
    with env_path.open(encoding="utf-8-sig") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            key, separator, value = line.partition("=")
            if not separator:
                continue
            key = key.strip()
            # Accept shell-style "export KEY=value" lines; a bare "export=..." stays a key.
            words = key.split(None, 1)
            if len(words) == 2 and words[0] == "export":
                key = words[1]
            # An empty variable name is rejected by the OS and would abort the import.
            if not key:
                continue
            value = value.strip()
            # Remove only a matching pair of quotes so quote characters inside the value survive.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ.setdefault(key, value)


# Run at import time, before ServerConfig is defined, so that the environment
# lookups in its field defaults below can see values from the root .env file.
_load_env_file()


# Values (compared case-insensitively) that switch a boolean setting on.
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_raw(name: str) -> str:
    """Return the whitespace-trimmed value of *name*, or "" when unset or blank."""
    return os.environ.get(name, "").strip()


def _env_number(name: str, default, cast: type):
    """Parse *name* with *cast*, returning *default* when the variable is unset or blank.

    Raises:
        ValueError: If the variable is set but cannot be parsed; the message
            names the variable so a bad deployment setting is easy to find.
    """
    raw = _env_raw(name)
    if not raw:
        return default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(f"Invalid {cast.__name__} value for {name}: {raw!r}") from err


def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment."""
    return _env_number(name, default, int)


def _env_float(name: str, default: float) -> float:
    """Read a float setting from the environment."""
    return _env_number(name, default, float)


def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean setting; any value outside the truthy set counts as False."""
    raw = _env_raw(name).lower()
    if not raw:
        return default
    return raw in _TRUTHY_ENV_VALUES


@dataclass
class ServerConfig:
    """Tunable settings for the lyric dedup server.

    Field defaults are resolved from ``LYRIC_DEDUP_*`` environment variables
    once, when this class is defined (i.e. at module import). Unset or blank
    numeric/boolean variables fall back to the built-in defaults; a numeric
    variable that cannot be parsed raises ``ValueError`` naming the variable.
    Any field can still be overridden per instance via keyword arguments.
    """

    # PostgreSQL DSN used by the dedup service.
    dsn: str = os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup")

    # Maximum ranked candidates returned in the final API result.
    max_candidates: int = _env_int("LYRIC_DEDUP_MAX_CANDIDATES", 5)

    # Maximum candidates recalled from each PostgreSQL recall tier.
    recall_limit: int = _env_int("LYRIC_DEDUP_RECALL_LIMIT", 100)

    # Whether to use pg_trgm similarity recall in addition to exact hash and line hash recall.
    # Enabled by "1", "true", "yes" or "on" (case-insensitive); anything else disables it.
    enable_trgm: bool = _env_bool("LYRIC_DEDUP_ENABLE_TRGM", False)

    # PostgreSQL pg_trgm recall threshold; lower values recall more candidates and cost more.
    trgm_threshold: float = _env_float("LYRIC_DEDUP_TRGM_THRESHOLD", 0.3)

    # PostgreSQL statement timeout for one dedup check, in milliseconds.
    statement_timeout_ms: int = _env_int("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", 5000)

    # HTTP download timeout for fetching lyric URLs, in seconds.
    download_timeout: int = _env_int("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", 10)

    # Minimum primary n-gram Jaccard similarity required for automatic duplicate.
    # Raising this makes automatic duplicate stricter; lowering it may increase false positives.
    duplicate_jaccard_threshold: float = _env_float("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", 0.78)

    # Minimum line coverage required for automatic duplicate.
    # This is the main guard against treating partial lyric fragments as full duplicates.
    duplicate_line_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", 0.72
    )

    # Alternate automatic duplicate path: lower/normal Jaccard can still duplicate when line coverage is very high.
    # Keep this aligned with duplicate_jaccard_threshold to avoid an unintended duplicate backdoor.
    duplicate_high_coverage_jaccard_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", 0.78
    )

    # Line coverage required by the alternate high-coverage duplicate path.
    # Raising this makes the alternate duplicate path stricter for near-complete variants.
    duplicate_high_coverage_line_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", 0.90
    )

    # Minimum primary/full n-gram Jaccard similarity that can send a candidate to review.
    # Raising this reduces review volume; lowering it catches weaker suspicious overlaps.
    review_jaccard_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", 0.45)

    # Minimum line coverage that can send a candidate to review when query coverage is also material.
    # Raising this reduces fragment/short-overlap reviews; lowering it increases suspicious recall.
    review_line_coverage_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", 0.35)

    # Minimum share of query lines that must match before line coverage alone can trigger review.
    # Raising this makes partial-fragment review stricter.
    review_query_coverage_threshold: float = _env_float("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", 0.40)

    # Very short query lyric line count that can force repeated-chorus overlap into review.
    # Raising this catches more short chorus-like inputs; lowering it reduces review volume.
    chorus_short_line_count_threshold: int = _env_int("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", 6)

    # Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
    # Raising this makes chorus-only review stricter.
    chorus_material_overlap_threshold: float = _env_float("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", 0.20)

    # Minimum query-side coverage for repeated-chorus overlap to be considered material.
    # Raising this reduces review decisions caused by small shared chorus fragments.
    chorus_material_query_coverage_threshold: float = _env_float(
        "LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", 0.40
    )

    # Weight assigned to primary n-gram Jaccard when computing confidence.
    # This affects the reported confidence score, not the duplicate/review threshold checks directly.
    confidence_jaccard_weight: float = _env_float("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", 0.58)

    # Weight assigned to primary line coverage when computing confidence.
    # Keep this coordinated with confidence_jaccard_weight; defaults sum to 1.0.
    confidence_line_coverage_weight: float = _env_float("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", 0.42)