Commit cdfa3a58 cdfa3a581738a209e66f43513055a41c70831af1 by 沈秋雨

更新测试脚本

1 parent fec2556e
......@@ -23,3 +23,5 @@ text-dedup-main/
venv/
.idea/
.vscode/
test_api
......
......@@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \
```text
decision duplicate / review / new
duplicate duplicate 或 review 时为 true,new 时为 false
duplicate 仅 decision=duplicate 时为 true,review/new 时为 false
confidence 当前判定置信度
reason 中文判定原因
candidate_count 参与最终排序的候选数
......
......@@ -75,6 +75,9 @@ class DuplicateChecker:
review_jaccard_threshold: float = 0.45,
review_line_coverage_threshold: float = 0.35,
review_query_coverage_threshold: float = 0.40,
fragment_query_coverage_threshold: float = 0.80,
fragment_max_line_ratio: float = 0.75,
fragment_min_matched_lines: int = 3,
chorus_short_line_count_threshold: int = 6,
chorus_material_overlap_threshold: float = 0.20,
chorus_material_query_coverage_threshold: float = 0.40,
......@@ -88,6 +91,9 @@ class DuplicateChecker:
self.review_jaccard_threshold = review_jaccard_threshold
self.review_line_coverage_threshold = review_line_coverage_threshold
self.review_query_coverage_threshold = review_query_coverage_threshold
self.fragment_query_coverage_threshold = fragment_query_coverage_threshold
self.fragment_max_line_ratio = fragment_max_line_ratio
self.fragment_min_matched_lines = fragment_min_matched_lines
self.chorus_short_line_count_threshold = chorus_short_line_count_threshold
self.chorus_material_overlap_threshold = chorus_material_overlap_threshold
self.chorus_material_query_coverage_threshold = chorus_material_query_coverage_threshold
......@@ -237,6 +243,14 @@ class DuplicateChecker:
query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low"
)
query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines)
is_plain_fragment = _is_plain_fragment(
query.normalized.primary_lines,
candidate.normalized.primary_lines,
primary_matched_lines,
min_query_coverage=self.fragment_query_coverage_threshold,
max_line_ratio=self.fragment_max_line_ratio,
min_matched_lines=self.fragment_min_matched_lines,
)
has_review_level_overlap = (
primary_jaccard >= self.review_jaccard_threshold
or jaccard >= self.review_jaccard_threshold
......@@ -275,7 +289,10 @@ class DuplicateChecker:
+ (self.confidence_line_coverage_weight * primary_coverage),
4,
)
if (
if is_plain_fragment:
decision = DuplicateDecision.NEW
reason = "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
elif (
(
primary_jaccard >= self.duplicate_jaccard_threshold
or (
......@@ -293,17 +310,17 @@ class DuplicateChecker:
reason = "原文歌词高度一致,翻译行未参与自动判重"
else:
reason = "原文 n-gram 字面相似度高,且行级覆盖范围广"
elif has_material_chorus_overlap:
decision = DuplicateDecision.NEW
reason = "重合内容主要集中在重复副歌行,按片段歌词处理"
elif (
has_material_chorus_overlap
or translation_only
translation_only
or has_low_confidence_split_overlap
or has_review_level_overlap
):
decision = DuplicateDecision.REVIEW
reason = "候选相似度达到复核阈值,需要人工确认"
if has_material_chorus_overlap:
reason = "重合内容主要集中在重复副歌行,不自动判重"
elif translation_only:
if translation_only:
reason = "仅翻译行相似,原文字面重合不足,不自动判重"
elif has_low_confidence_split_overlap:
reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核"
......@@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[
return len(set(matched_lines)) / len(query_unique_lines)
def _is_plain_fragment(
query_lines: tuple[str, ...],
candidate_lines: tuple[str, ...],
matched_lines: list[str],
*,
min_query_coverage: float,
max_line_ratio: float,
min_matched_lines: int,
) -> bool:
query_unique_lines = set(query_lines)
candidate_unique_lines = set(candidate_lines)
matched_unique_lines = set(matched_lines)
if not query_unique_lines or not candidate_unique_lines:
return False
if len(matched_unique_lines) < min_matched_lines:
return False
line_ratio = len(query_unique_lines) / len(candidate_unique_lines)
query_coverage = len(matched_unique_lines) / len(query_unique_lines)
return line_ratio <= max_line_ratio and query_coverage >= min_query_coverage
def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool:
if not matched_lines:
return False
......
......@@ -50,6 +50,7 @@ class CheckResponse(BaseModel):
decision: str | None = None
confidence: float | None = None
reason: str | None = None
record_ids: list[str] = []
class HealthResponse(BaseModel):
......@@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any:
decision=result.decision,
confidence=result.confidence,
reason=result.reason,
record_ids=result.record_ids,
)
......
......@@ -81,16 +81,28 @@ class ServerConfig:
# Raising this makes partial-fragment review stricter.
review_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD", "0.40"))
# Very short query lyric line count that can force repeated-chorus overlap into review.
# Raising this catches more short chorus-like inputs; lowering it reduces review volume.
# Plain fragment guard: query-side match ratio required to treat the input as a lyric fragment.
# When this is met together with fragment_max_line_ratio, the result is new instead of review/duplicate.
fragment_query_coverage_threshold: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_QUERY_COVERAGE_THRESHOLD", "0.80"))
# Plain fragment guard: maximum query/candidate line-count ratio still considered a fragment.
# Lower values protect only shorter fragments; higher values treat longer partial uploads as new.
fragment_max_line_ratio: float = float(os.getenv("LYRIC_DEDUP_FRAGMENT_MAX_LINE_RATIO", "0.75"))
# Plain fragment guard: minimum matched unique lyric lines before fragment protection can apply.
# This avoids classifying tiny common phrases as meaningful fragments.
fragment_min_matched_lines: int = int(os.getenv("LYRIC_DEDUP_FRAGMENT_MIN_MATCHED_LINES", "3"))
# Very short query lyric line count that can force repeated-chorus overlap into fragment protection.
# Matches protected by this path return new instead of duplicate/review.
chorus_short_line_count_threshold: int = int(os.getenv("LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD", "6"))
# Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
# Raising this makes chorus-only review stricter.
# Raising this makes chorus-only fragment protection stricter.
chorus_material_overlap_threshold: float = float(os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD", "0.20"))
# Minimum query-side coverage for repeated-chorus overlap to be considered material.
# Raising this reduces review decisions caused by small shared chorus fragments.
# Raising this reduces fragment protection caused by small shared chorus fragments.
chorus_material_query_coverage_threshold: float = float(
os.getenv("LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD", "0.40")
)
......
......@@ -27,6 +27,7 @@ class CheckResult:
confidence: float = 0.0
reason: str = ""
candidate_count: int = 0
record_ids: list[str] = field(default_factory=list)
@dataclass
......@@ -197,6 +198,9 @@ class DedupService:
review_jaccard_threshold=self.config.review_jaccard_threshold,
review_line_coverage_threshold=self.config.review_line_coverage_threshold,
review_query_coverage_threshold=self.config.review_query_coverage_threshold,
fragment_query_coverage_threshold=self.config.fragment_query_coverage_threshold,
fragment_max_line_ratio=self.config.fragment_max_line_ratio,
fragment_min_matched_lines=self.config.fragment_min_matched_lines,
chorus_short_line_count_threshold=self.config.chorus_short_line_count_threshold,
chorus_material_overlap_threshold=self.config.chorus_material_overlap_threshold,
chorus_material_query_coverage_threshold=self.config.chorus_material_query_coverage_threshold,
......@@ -208,12 +212,18 @@ class DedupService:
candidates,
max_candidates=self.config.max_candidates,
)
# 收集 duplicate/review 决策下的候选 record_id
matched_ids = [
c.record_id for c in result.candidates
if c.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW)
]
return CheckResult(
duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
duplicate=result.decision == DuplicateDecision.DUPLICATE,
decision=result.decision.value,
confidence=result.confidence,
reason=result.reason,
candidate_count=len(result.candidates),
record_ids=matched_ids,
)
......
......@@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import fingerprint_text
from lyric_dedup.normalization import normalize_lyrics
from lyric_dedup_server.config import ServerConfig
def main() -> None:
parser = argparse.ArgumentParser(description="Evaluate duplicate checking using PostgreSQL recall.")
parser.add_argument("--dsn", required=True)
parser.add_argument("--csv", required=True)
parser.add_argument("--out", required=True)
parser.add_argument("--base-dir", default="")
parser.add_argument("--positive-decisions", default="duplicate")
parser.add_argument("--max-candidates", type=int, default=5)
parser.add_argument("--recall-limit", type=int, default=100)
parser.add_argument("--enable-trgm", action="store_true", help="Enable pg_trgm full-text recall. Slower; exact + line recall is used by default.")
parser.add_argument("--trgm-threshold", type=float, default=0.3)
parser.add_argument("--statement-timeout-ms", type=int, default=5000)
parser.add_argument("--profile-every", type=int, default=100)
args = parser.parse_args()
psycopg = _import_psycopg()
config = ServerConfig()
csv_path = Path(args.csv)
out_path = Path(args.out)
base_dir = Path(args.base_dir) if args.base_dir else None
positive_decisions = {item.strip() for item in args.positive_decisions.split(",") if item.strip()}
positive_decisions = {"duplicate"}
total = _csv_data_row_count(csv_path)
rows: list[dict[str, object]] = []
profile_stats = _new_profile_stats()
out_path.parent.mkdir(parents=True, exist_ok=True)
_progress(f"evaluate postgres csv: 0/{total}")
with psycopg.connect(args.dsn) as conn:
with psycopg.connect(config.dsn) as conn:
with conn.cursor() as cursor:
cursor.execute("select set_config('statement_timeout', %s, false)", (str(args.statement_timeout_ms),))
cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(args.trgm_threshold),))
cursor.execute("select set_config('statement_timeout', %s, false)", (str(config.statement_timeout_ms),))
cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(config.trgm_threshold),))
with csv_path.open(encoding="utf-8-sig", newline="") as in_file, out_path.open(
"w", encoding="utf-8", newline=""
) as out_file:
......@@ -70,9 +65,7 @@ def main() -> None:
csv_path=csv_path,
base_dir=base_dir,
positive_decisions=positive_decisions,
max_candidates=args.max_candidates,
recall_limit=args.recall_limit,
enable_trgm=args.enable_trgm,
config=config,
)
rows.append(row_out)
writer.writerow(row_out)
......@@ -96,9 +89,7 @@ def _evaluate_row(
csv_path: Path,
base_dir: Path | None,
positive_decisions: set[str],
max_candidates: int,
recall_limit: int,
enable_trgm: bool,
config: ServerConfig,
) -> dict[str, object]:
parse_started = time.perf_counter()
sample_id = row.get("id") or row.get("sample_id") or str(row_number)
......@@ -108,12 +99,12 @@ def _evaluate_row(
candidates, timings = _recall_candidates(
conn,
record,
recall_limit=recall_limit,
enable_trgm=enable_trgm,
recall_limit=config.recall_limit,
enable_trgm=config.enable_trgm,
exclude_record_ids=_exclude_record_ids_for_eval_row(row),
)
rank_started = time.perf_counter()
result = _check_against_candidates(record, candidates, max_candidates=max_candidates)
result = _check_against_candidates(record, candidates, config=config)
rank_ms = round((time.perf_counter() - rank_started) * 1000, 2)
recall_ms = round(timings["exact_ms"] + timings["trgm_ms"] + timings["line_ms"], 2)
predicted_duplicate = result.decision.value in positive_decisions
......@@ -127,7 +118,7 @@ def _evaluate_row(
"correct": expected_duplicate == predicted_duplicate,
"confidence": result.confidence,
"reason": result.reason,
"candidate_count": len(candidates),
"candidate_count": len(result.candidates),
"parse_ms": parse_ms,
"recall_ms": recall_ms,
"exact_ms": timings["exact_ms"],
......@@ -246,10 +237,26 @@ def _check_against_candidates(
record: LyricRecord,
candidates: list[LyricRecord],
*,
max_candidates: int,
config: ServerConfig,
):
checker = DuplicateChecker()
return checker.check_record_against_candidates(record, candidates, max_candidates=max_candidates)
checker = DuplicateChecker(
duplicate_jaccard_threshold=config.duplicate_jaccard_threshold,
duplicate_line_coverage_threshold=config.duplicate_line_coverage_threshold,
duplicate_high_coverage_jaccard_threshold=config.duplicate_high_coverage_jaccard_threshold,
duplicate_high_coverage_line_coverage_threshold=config.duplicate_high_coverage_line_coverage_threshold,
review_jaccard_threshold=config.review_jaccard_threshold,
review_line_coverage_threshold=config.review_line_coverage_threshold,
review_query_coverage_threshold=config.review_query_coverage_threshold,
fragment_query_coverage_threshold=config.fragment_query_coverage_threshold,
fragment_max_line_ratio=config.fragment_max_line_ratio,
fragment_min_matched_lines=config.fragment_min_matched_lines,
chorus_short_line_count_threshold=config.chorus_short_line_count_threshold,
chorus_material_overlap_threshold=config.chorus_material_overlap_threshold,
chorus_material_query_coverage_threshold=config.chorus_material_query_coverage_threshold,
confidence_jaccard_weight=config.confidence_jaccard_weight,
confidence_line_coverage_weight=config.confidence_line_coverage_weight,
)
return checker.check_record_against_candidates(record, candidates, max_candidates=config.max_candidates)
def _record_from_eval_row(row: dict[str, str], *, csv_path: Path, base_dir: Path | None) -> tuple[LyricRecord, str]:
......
......@@ -110,6 +110,7 @@ def main():
print(f" decision: {result.get('decision', 'N/A')}")
print(f" confidence: {result.get('confidence', 'N/A')}")
print(f" reason: {result.get('reason', 'N/A')}")
print(f" record_ids: {result.get('record_ids', [])}")
if __name__ == "__main__":
......
## 消失的波段
### 【主歌 1】 — *(压抑、低沉的叙事)*
霓虹灯……在车窗外退后,
霓虹——和夜色融为一体。
收音机里,只剩沙沙的电流……
(像你在旧地址留下的呼吸……)
有些习惯……总是很难去修正,
比如——在人群中,辨认你的背影。
### 【主歌 2】 — *(情绪渐进,带有一丝无奈)*
朋友圈里……你更新了风景,
坐标是——没听过的、陌、生、城、市。
我们从无话不说……退回到【静音】,
像两条失去交集的——平行线。
那些没有寄出的长信……
最后都变成,草稿箱里的——灰、尘。
### 【副歌】 —— *(情感爆发,高亢而撕裂)*
我们成了彼此消 逝 的 波 段 !!
在同一个频段……却再也无法呼喊!
那些同频共振的夜晚……
最终被淹没在——嘈杂的市中心!!
我调整着微弱的接收信号……
却只听到——时光断裂的声音!!!
### 【桥段】 —— *(节奏加快,连续的内心追问)*
是不是所有的连接……都有保质期?!
到期后……就自动切断了所有联系?!
我们在各自的轨道里——加!速!运!行!
再也找不到……那天傍晚的引力。
### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)*
我们成了彼此消 逝 的 波 段 ——!!
在同一个频段……却再也无法呼喊!
那些同频共振的夜晚……
最终被淹没在——嘈杂的市中心!!
我调整着微弱的接收信号……
却只听到……(时光断裂的声音)……
### 【尾奏】 —— *(情绪下沉,最终归于死寂)*
【信号中断……请勿追赶。】
城市入睡……灯光渐暗……
一个人的波段。
(查……无……此……人……)
【 挂 断 。】
### 副歌
我们成了彼此消失的波段
在同一个频段却再也无法呼喊
那些同频共振的夜晚
最终被淹没在嘈杂的市中心
我调整着微弱的接收信号
却只听到时光断裂的声音
### 桥段
是不是所有的连接都有保质期
到期后就自动切断了所有联系
我们在各自的轨道里加速运行
再也找不到那天傍晚的引力
### 副歌
我们成了彼此消失的波段
在同一个频段却再也无法呼喊
那些同频共振的夜晚
最终被淹没在嘈杂的市中心
我调整着微弱的接收信号
却只听到时光断裂的声音
### 尾奏
信号中断,请勿追赶
城市入睡,灯光渐暗
一个人的波段
查无此人
挂断
\ No newline at end of file
歌曲题目:《星空大冒险》
【主歌 2】
小兔子,在划船,
它的浆是胡萝卜。
小熊坐在树枝上,
正把蜂蜜涂面包。
风儿吹过坏脾气,
在这儿变成甜泡泡。
没有作业和烦恼,
大家都在哈哈笑。
【副歌 2】
飞呀飞,飞向大月亮,
月亮像个大香蕉,挂在夜空上。
摇呀摇,摇到银河旁,
捞起一颗小星星,放在手心里亮。
【桥段(Bridge)】
(节奏放慢,变温柔)
天上的城堡亮晶晶,
那是梦里的风景。
玩累的小孩要睡了,
听一听,风的呼吸。
呼——噜——呼——噜——
做个好梦到天明。
【副歌 3】
(节奏恢复,渐弱结束)
飞呀飞,飞向大月亮,
月亮像个大香蕉,挂在夜空上。
摇呀摇,摇到银河旁,
捞起一颗小星星,
抱在怀里……睡着啦。
\ No newline at end of file
......
......@@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord
from lyric_dedup.eval_dataset import generate_eval_set
from lyric_dedup.file_import import record_from_file
from lyric_dedup.normalization import normalize_lyrics
from lyric_dedup_server.config import ServerConfig
from lyric_dedup_server.service import DedupService
BASE_LYRIC = """
......@@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c
assert result.candidates[0].record_id == "song-1"
def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None:
def test_short_shared_repeated_chorus_is_new_fragment() -> None:
result = check_against(
[
LyricRecord(
......@@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None:
"""
)
assert result.decision == DuplicateDecision.REVIEW
assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,不自动判重"
assert result.decision == DuplicateDecision.NEW
assert result.candidates[0].reason == "重合内容主要集中在重复副歌行,按片段歌词处理"
def test_service_short_chorus_fragment_result_is_new() -> None:
# Service-level check: the query and the single candidate below share only one
# line, which appears twice in each text; every other line differs.
# The service result for that input is expected to be reported as "new".
# Built from ServerConfig() with no explicit overrides passed in.
service = DedupService(config=ServerConfig())
# Calls the private helper directly with an in-memory candidate list.
# NOTE(review): presumably this bypasses any database recall step - confirm.
result = service._check_against_candidates(
LyricRecord(
"__query__",
"""
山谷的雨落在清晨
我把名字交给星辰
啦啦啦 我们不分离
啦啦啦 我们不分离
世界安静等一个人
""",
),
[
LyricRecord(
"song-1",
"""
海边的风吹过旧信
你说夏天不会远去
啦啦啦 我们不分离
啦啦啦 我们不分离
转身以后各自旅行
""",
)
],
)
# The decision is compared against the enum's string value, the boolean
# duplicate flag must be off, and no matched record ids may be returned.
assert result.decision == DuplicateDecision.NEW.value
assert result.duplicate is False
assert result.record_ids == []
def test_substantial_line_overlap_is_duplicate_after_pg_recall() -> None:
......@@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None:
"""
)
assert result.decision != DuplicateDecision.DUPLICATE
assert result.decision == DuplicateDecision.NEW
assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
assert result.candidates[0].primary_line_coverage < 0.72
def test_long_plain_fragment_of_full_song_is_new_not_review() -> None:
# A ten-line catalog song is checked against a query made of six of its lines
# (the second through the seventh), copied verbatim and in order.
# The expected outcome is NEW with the plain-fragment reason string.
full_song = """
第一行写给凌晨的风
第二行写给远处的灯
第三行写给没有寄出的信
第四行写给还醒着的人
第五行写给旧车站
第六行写给长街尽头
第七行写给明天的太阳
第八行写给重新出发
第九行写给路过的雨
第十行写给沉默的月光
"""
# NOTE(review): check_against is defined elsewhere in this test module;
# presumably it runs the checker with the catalog as candidates - confirm.
result = check_against(
[LyricRecord("song-1", full_song)],
"""
第二行写给远处的灯
第三行写给没有寄出的信
第四行写给还醒着的人
第五行写给旧车站
第六行写给长街尽头
第七行写给明天的太阳
""",
)
# Both the overall decision and the top-ranked candidate's reason are pinned.
assert result.decision == DuplicateDecision.NEW
assert result.candidates[0].reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
def test_catalog_mashup_fragments_are_new_not_review() -> None:
result = check_against(
[
......