config.py
6.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Server configuration loaded from environment variables."""
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
def _load_env_file() -> None:
    """Load root .env values without overriding real environment variables.

    The file is looked up as ``.env`` in the parent of the directory that
    contains this module. If it does not exist, nothing happens.

    Each line is parsed as ``KEY=VALUE``:

    * blank lines, lines starting with ``#`` and lines without ``=`` are skipped;
    * lines whose key is empty (for example a stray ``=value``) are skipped;
    * whitespace around the key and value is trimmed, then any leading/trailing
      double quotes and afterwards single quotes are stripped from the value.

    Values are applied with ``os.environ.setdefault``, so a variable that is
    already present in the process environment always wins over the file.
    """
    env_path = Path(__file__).resolve().parent.parent / ".env"
    if not env_path.exists():
        return

    with env_path.open(encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            # partition() splits on the first "=" only, so values may contain "=".
            key, separator, value = line.partition("=")
            if not separator:
                continue

            key = key.strip()
            if not key:
                # os.environ refuses an empty variable name and raises; a
                # malformed line should be ignored rather than break loading.
                continue

            os.environ.setdefault(key, value.strip().strip('"').strip("'"))
# Apply .env values at import time, before the ServerConfig class body below
# evaluates its os.getenv() defaults, so those defaults can see the file's values.
_load_env_file()
@dataclass
class ServerConfig:
    """Settings for the lyric dedup server.

    Each field's default comes from a ``LYRIC_DEDUP_*`` environment variable,
    read once when this class body is executed; the literal next to the
    variable name is used when the variable is not set. Any field can still
    be overridden by passing it to the constructor.
    """

    # ---- PostgreSQL access and candidate recall ----

    # DSN the dedup service uses to connect to PostgreSQL.
    dsn: str = os.environ.get("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup")

    # Cap on the ranked candidates included in the final API result.
    max_candidates: int = int(
        os.environ.get("LYRIC_DEDUP_MAX_CANDIDATES", "5")
    )

    # Cap on the candidates recalled from each PostgreSQL recall tier.
    recall_limit: int = int(
        os.environ.get("LYRIC_DEDUP_RECALL_LIMIT", "100")
    )

    # Turns on pg_trgm similarity recall on top of exact hash and line hash
    # recall. Only the value "true" (any letter case) enables it.
    enable_trgm: bool = (
        os.environ.get("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true"
    )

    # pg_trgm recall threshold; lower values recall more candidates and cost more.
    trgm_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3")
    )

    # PostgreSQL statement timeout for a single dedup check, in milliseconds.
    statement_timeout_ms: int = int(
        os.environ.get("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000")
    )

    # HTTP timeout when downloading lyric URLs, in seconds.
    download_timeout: int = int(
        os.environ.get("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10")
    )

    # ---- Automatic duplicate decision ----

    # Primary n-gram Jaccard similarity needed for an automatic duplicate.
    # Higher is stricter; lower may increase false positives.
    duplicate_jaccard_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_JACCARD_THRESHOLD", "0.78")
    )

    # Line coverage needed for an automatic duplicate. This is the main guard
    # against treating partial lyric fragments as full duplicates.
    duplicate_line_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_DUPLICATE_LINE_COVERAGE_THRESHOLD", "0.72")
    )

    # Alternate duplicate path: lower/normal Jaccard can still count as a
    # duplicate when line coverage is very high. Keep aligned with
    # duplicate_jaccard_threshold to avoid an unintended duplicate backdoor.
    duplicate_high_coverage_jaccard_threshold: float = float(
        os.environ.get(
            "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_JACCARD_THRESHOLD", "0.78"
        )
    )

    # Line coverage demanded by the alternate high-coverage duplicate path.
    # Higher makes that path stricter for near-complete variants.
    duplicate_high_coverage_line_coverage_threshold: float = float(
        os.environ.get(
            "LYRIC_DEDUP_DUPLICATE_HIGH_COVERAGE_LINE_COVERAGE_THRESHOLD", "0.90"
        )
    )

    # ---- Manual review decision ----

    # Primary/full n-gram Jaccard similarity from which a candidate can go to
    # review. Higher reduces review volume; lower catches weaker overlaps.
    review_jaccard_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_JACCARD_THRESHOLD", "0.45")
    )

    # Line coverage from which a candidate can go to review, provided query
    # coverage is also material. Higher reduces fragment/short-overlap reviews;
    # lower increases suspicious recall.
    review_line_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_LINE_COVERAGE_THRESHOLD", "0.35")
    )

    # Share of query lines that must match before line coverage alone can
    # trigger review. Higher makes partial-fragment review stricter.
    review_query_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40")
    )

    # ---- Plain fragment guard ----

    # Query-side match ratio needed to treat the input as a lyric fragment.
    # Met together with fragment_max_line_ratio, the result is new instead of
    # review/duplicate.
    fragment_query_coverage_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_FRAGMENT_QUERY_COVERAGE_THRESHOLD", "0.80")
    )

    # Largest query/candidate line-count ratio still considered a fragment.
    # Lower protects only shorter fragments; higher treats longer partial
    # uploads as new.
    fragment_max_line_ratio: float = float(
        os.environ.get("LYRIC_DEDUP_FRAGMENT_MAX_LINE_RATIO", "0.75")
    )

    # Matched unique lyric lines required before fragment protection can
    # apply, so tiny common phrases are not classified as meaningful fragments.
    fragment_min_matched_lines: int = int(
        os.environ.get("LYRIC_DEDUP_FRAGMENT_MIN_MATCHED_LINES", "3")
    )

    # ---- Repeated-chorus fragment protection ----

    # Line count of a very short query lyric that can force repeated-chorus
    # overlap into fragment protection. Matches protected this way return new
    # instead of duplicate/review.
    chorus_short_line_count_threshold: int = int(
        os.environ.get("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6")
    )

    # Similarity/coverage signal from which repeated-chorus overlap counts as
    # material. Higher makes chorus-only fragment protection stricter.
    chorus_material_overlap_threshold: float = float(
        os.environ.get("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20")
    )

    # Query-side coverage from which repeated-chorus overlap counts as
    # material. Higher reduces fragment protection caused by small shared
    # chorus fragments.
    chorus_material_query_coverage_threshold: float = float(
        os.environ.get(
            "LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40"
        )
    )

    # ---- Confidence score ----

    # Weight of primary n-gram Jaccard in the confidence score. It shapes the
    # reported confidence, not the duplicate/review threshold checks directly.
    confidence_jaccard_weight: float = float(
        os.environ.get("LYRIC_DEDUP_CONFIDENCE_JACCARD_WEIGHT", "0.58")
    )

    # Weight of primary line coverage in the confidence score. Keep it
    # coordinated with confidence_jaccard_weight; the defaults sum to 1.0.
    confidence_line_coverage_weight: float = float(
        os.environ.get("LYRIC_DEDUP_CONFIDENCE_LINE_COVERAGE_WEIGHT", "0.42")
    )