Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
lyric_rhyme
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
cdfa3a58
...
cdfa3a581738a209e66f43513055a41c70831af1
authored
2026-06-04 16:53:48 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
更新测试脚本
1 parent
fec2556e
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
211 additions
and
124 deletions
.gitignore
README.md
lyric_dedup/checker.py
lyric_dedup_server/app.py
lyric_dedup_server/config.py
lyric_dedup_server/service.py
scripts/evaluate_postgres.py
test_api/test_dedup_api.py
test_api/test_lyric.txt
tests/test_lyric_dedup.py
.gitignore
View file @
cdfa3a5
...
...
@@ -23,3 +23,5 @@ text-dedup-main/
venv/
.idea/
.vscode/
test_api
...
...
README.md
View file @
cdfa3a5
...
...
@@ -105,7 +105,7 @@ python -m lyric_dedup.cli check-file \
```
text
decision duplicate / review / new
duplicate
duplicate 或 review 时为 true,
new 时为 false
duplicate
仅 decision=duplicate 时为 true,review/
new 时为 false
confidence 当前判定置信度
reason 中文判定原因
candidate_count 参与最终排序的候选数
...
...
lyric_dedup/checker.py
View file @
cdfa3a5
...
...
@@ -75,6 +75,9 @@ class DuplicateChecker:
review_jaccard_threshold
:
float
=
0.45
,
review_line_coverage_threshold
:
float
=
0.35
,
review_query_coverage_threshold
:
float
=
0.40
,
fragment_query_coverage_threshold
:
float
=
0.80
,
fragment_max_line_ratio
:
float
=
0.75
,
fragment_min_matched_lines
:
int
=
3
,
chorus_short_line_count_threshold
:
int
=
6
,
chorus_material_overlap_threshold
:
float
=
0.20
,
chorus_material_query_coverage_threshold
:
float
=
0.40
,
...
...
@@ -88,6 +91,9 @@ class DuplicateChecker:
self
.
review_jaccard_threshold
=
review_jaccard_threshold
self
.
review_line_coverage_threshold
=
review_line_coverage_threshold
self
.
review_query_coverage_threshold
=
review_query_coverage_threshold
self
.
fragment_query_coverage_threshold
=
fragment_query_coverage_threshold
self
.
fragment_max_line_ratio
=
fragment_max_line_ratio
self
.
fragment_min_matched_lines
=
fragment_min_matched_lines
self
.
chorus_short_line_count_threshold
=
chorus_short_line_count_threshold
self
.
chorus_material_overlap_threshold
=
chorus_material_overlap_threshold
self
.
chorus_material_query_coverage_threshold
=
chorus_material_query_coverage_threshold
...
...
@@ -237,6 +243,14 @@ class DuplicateChecker:
query
.
normalized
.
split_confidence
==
"low"
or
candidate
.
normalized
.
split_confidence
==
"low"
)
query_coverage
=
_matched_query_line_ratio
(
query
.
normalized
.
unique_lines
,
matched_lines
)
is_plain_fragment
=
_is_plain_fragment
(
query
.
normalized
.
primary_lines
,
candidate
.
normalized
.
primary_lines
,
primary_matched_lines
,
min_query_coverage
=
self
.
fragment_query_coverage_threshold
,
max_line_ratio
=
self
.
fragment_max_line_ratio
,
min_matched_lines
=
self
.
fragment_min_matched_lines
,
)
has_review_level_overlap
=
(
primary_jaccard
>=
self
.
review_jaccard_threshold
or
jaccard
>=
self
.
review_jaccard_threshold
...
...
@@ -275,7 +289,10 @@ class DuplicateChecker:
+
(
self
.
confidence_line_coverage_weight
*
primary_coverage
),
4
,
)
if
(
if
is_plain_fragment
:
decision
=
DuplicateDecision
.
NEW
reason
=
"歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
elif
(
(
primary_jaccard
>=
self
.
duplicate_jaccard_threshold
or
(
...
...
@@ -293,17 +310,17 @@ class DuplicateChecker:
reason
=
"原文歌词高度一致,翻译行未参与自动判重"
else
:
reason
=
"原文 n-gram 字面相似度高,且行级覆盖范围广"
elif
has_material_chorus_overlap
:
decision
=
DuplicateDecision
.
NEW
reason
=
"重合内容主要集中在重复副歌行,按片段歌词处理"
elif
(
has_material_chorus_overlap
or
translation_only
translation_only
or
has_low_confidence_split_overlap
or
has_review_level_overlap
):
decision
=
DuplicateDecision
.
REVIEW
reason
=
"候选相似度达到复核阈值,需要人工确认"
if
has_material_chorus_overlap
:
reason
=
"重合内容主要集中在重复副歌行,不自动判重"
elif
translation_only
:
if
translation_only
:
reason
=
"仅翻译行相似,原文字面重合不足,不自动判重"
elif
has_low_confidence_split_overlap
:
reason
=
"疑似整段翻译结构但拆分置信度较低,需要人工复核"
...
...
@@ -430,6 +447,27 @@ def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[
return
len
(
set
(
matched_lines
))
/
len
(
query_unique_lines
)
def
_is_plain_fragment
(
query_lines
:
tuple
[
str
,
...
],
candidate_lines
:
tuple
[
str
,
...
],
matched_lines
:
list
[
str
],
*
,
min_query_coverage
:
float
,
max_line_ratio
:
float
,
min_matched_lines
:
int
,
)
->
bool
:
query_unique_lines
=
set
(
query_lines
)
candidate_unique_lines
=
set
(
candidate_lines
)
matched_unique_lines
=
set
(
matched_lines
)
if
not
query_unique_lines
or
not
candidate_unique_lines
:
return
False
if
len
(
matched_unique_lines
)
<
min_matched_lines
:
return
False
line_ratio
=
len
(
query_unique_lines
)
/
len
(
candidate_unique_lines
)
query_coverage
=
len
(
matched_unique_lines
)
/
len
(
query_unique_lines
)
return
line_ratio
<=
max_line_ratio
and
query_coverage
>=
min_query_coverage
def
_is_chorus_only_match
(
left
:
NormalizedLyrics
,
right
:
NormalizedLyrics
,
matched_lines
:
list
[
str
])
->
bool
:
if
not
matched_lines
:
return
False
...
...
lyric_dedup_server/app.py
View file @
cdfa3a5
...
...
@@ -50,6 +50,7 @@ class CheckResponse(BaseModel):
decision
:
str
|
None
=
None
confidence
:
float
|
None
=
None
reason
:
str
|
None
=
None
record_ids
:
list
[
str
]
=
[]
class
HealthResponse
(
BaseModel
):
...
...
@@ -108,6 +109,7 @@ def check_lyric(req: CheckRequest) -> Any:
decision
=
result
.
decision
,
confidence
=
result
.
confidence
,
reason
=
result
.
reason
,
record_ids
=
result
.
record_ids
,
)
...
...
lyric_dedup_server/config.py
View file @
cdfa3a5
...
...
@@ -81,16 +81,28 @@ class ServerConfig:
# Raising this makes partial-fragment review stricter.
review_query_coverage_threshold
:
float
=
float
(
os
.
getenv
(
"LYRIC_DEDUP_REVIEW_QUERY_COVERAGE_THRESHOLD"
,
"0.40"
))
# Very short query lyric line count that can force repeated-chorus overlap into review.
# Raising this catches more short chorus-like inputs; lowering it reduces review volume.
# Plain fragment guard: query-side match ratio required to treat the input as a lyric fragment.
# When this is met together with fragment_max_line_ratio, the result is new instead of review/duplicate.
fragment_query_coverage_threshold
:
float
=
float
(
os
.
getenv
(
"LYRIC_DEDUP_FRAGMENT_QUERY_COVERAGE_THRESHOLD"
,
"0.80"
))
# Plain fragment guard: maximum query/candidate line-count ratio still considered a fragment.
# Lower values protect only shorter fragments; higher values treat longer partial uploads as new.
fragment_max_line_ratio
:
float
=
float
(
os
.
getenv
(
"LYRIC_DEDUP_FRAGMENT_MAX_LINE_RATIO"
,
"0.75"
))
# Plain fragment guard: minimum matched unique lyric lines before fragment protection can apply.
# This avoids classifying tiny common phrases as meaningful fragments.
fragment_min_matched_lines
:
int
=
int
(
os
.
getenv
(
"LYRIC_DEDUP_FRAGMENT_MIN_MATCHED_LINES"
,
"3"
))
# Very short query lyric line count that can force repeated-chorus overlap into fragment protection.
# Matches protected by this path return new instead of duplicate/review.
chorus_short_line_count_threshold
:
int
=
int
(
os
.
getenv
(
"LYRIC_DEDUP_CHORUS_SHORT_LINE_COUNT_THRESHOLD"
,
"6"
))
# Minimum similarity/coverage signal for repeated-chorus overlap to be considered material.
# Raising this makes chorus-only
review
stricter.
# Raising this makes chorus-only
fragment protection
stricter.
chorus_material_overlap_threshold
:
float
=
float
(
os
.
getenv
(
"LYRIC_DEDUP_CHORUS_MATERIAL_OVERLAP_THRESHOLD"
,
"0.20"
))
# Minimum query-side coverage for repeated-chorus overlap to be considered material.
# Raising this reduces
review decisions
caused by small shared chorus fragments.
# Raising this reduces
fragment protection
caused by small shared chorus fragments.
chorus_material_query_coverage_threshold
:
float
=
float
(
os
.
getenv
(
"LYRIC_DEDUP_CHORUS_MATERIAL_QUERY_COVERAGE_THRESHOLD"
,
"0.40"
)
)
...
...
lyric_dedup_server/service.py
View file @
cdfa3a5
...
...
@@ -27,6 +27,7 @@ class CheckResult:
confidence
:
float
=
0.0
reason
:
str
=
""
candidate_count
:
int
=
0
record_ids
:
list
[
str
]
=
field
(
default_factory
=
list
)
@dataclass
...
...
@@ -197,6 +198,9 @@ class DedupService:
review_jaccard_threshold
=
self
.
config
.
review_jaccard_threshold
,
review_line_coverage_threshold
=
self
.
config
.
review_line_coverage_threshold
,
review_query_coverage_threshold
=
self
.
config
.
review_query_coverage_threshold
,
fragment_query_coverage_threshold
=
self
.
config
.
fragment_query_coverage_threshold
,
fragment_max_line_ratio
=
self
.
config
.
fragment_max_line_ratio
,
fragment_min_matched_lines
=
self
.
config
.
fragment_min_matched_lines
,
chorus_short_line_count_threshold
=
self
.
config
.
chorus_short_line_count_threshold
,
chorus_material_overlap_threshold
=
self
.
config
.
chorus_material_overlap_threshold
,
chorus_material_query_coverage_threshold
=
self
.
config
.
chorus_material_query_coverage_threshold
,
...
...
@@ -208,12 +212,18 @@ class DedupService:
candidates
,
max_candidates
=
self
.
config
.
max_candidates
,
)
# 收集 duplicate/review 决策下的候选 record_id
matched_ids
=
[
c
.
record_id
for
c
in
result
.
candidates
if
c
.
decision
in
(
DuplicateDecision
.
DUPLICATE
,
DuplicateDecision
.
REVIEW
)
]
return
CheckResult
(
duplicate
=
result
.
decision
in
(
DuplicateDecision
.
DUPLICATE
,
DuplicateDecision
.
REVIEW
)
,
duplicate
=
result
.
decision
==
DuplicateDecision
.
DUPLICATE
,
decision
=
result
.
decision
.
value
,
confidence
=
result
.
confidence
,
reason
=
result
.
reason
,
candidate_count
=
len
(
result
.
candidates
),
record_ids
=
matched_ids
,
)
...
...
scripts/evaluate_postgres.py
View file @
cdfa3a5
...
...
@@ -22,38 +22,33 @@ from lyric_dedup.file_import import read_lyric_file
from
lyric_dedup.file_import
import
record_from_file
from
lyric_dedup.normalization
import
fingerprint_text
from
lyric_dedup.normalization
import
normalize_lyrics
from
lyric_dedup_server.config
import
ServerConfig
def
main
()
->
None
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Evaluate duplicate checking using PostgreSQL recall."
)
parser
.
add_argument
(
"--dsn"
,
required
=
True
)
parser
.
add_argument
(
"--csv"
,
required
=
True
)
parser
.
add_argument
(
"--out"
,
required
=
True
)
parser
.
add_argument
(
"--base-dir"
,
default
=
""
)
parser
.
add_argument
(
"--positive-decisions"
,
default
=
"duplicate"
)
parser
.
add_argument
(
"--max-candidates"
,
type
=
int
,
default
=
5
)
parser
.
add_argument
(
"--recall-limit"
,
type
=
int
,
default
=
100
)
parser
.
add_argument
(
"--enable-trgm"
,
action
=
"store_true"
,
help
=
"Enable pg_trgm full-text recall. Slower; exact + line recall is used by default."
)
parser
.
add_argument
(
"--trgm-threshold"
,
type
=
float
,
default
=
0.3
)
parser
.
add_argument
(
"--statement-timeout-ms"
,
type
=
int
,
default
=
5000
)
parser
.
add_argument
(
"--profile-every"
,
type
=
int
,
default
=
100
)
args
=
parser
.
parse_args
()
psycopg
=
_import_psycopg
()
config
=
ServerConfig
()
csv_path
=
Path
(
args
.
csv
)
out_path
=
Path
(
args
.
out
)
base_dir
=
Path
(
args
.
base_dir
)
if
args
.
base_dir
else
None
positive_decisions
=
{
item
.
strip
()
for
item
in
args
.
positive_decisions
.
split
(
","
)
if
item
.
strip
()
}
positive_decisions
=
{
"duplicate"
}
total
=
_csv_data_row_count
(
csv_path
)
rows
:
list
[
dict
[
str
,
object
]]
=
[]
profile_stats
=
_new_profile_stats
()
out_path
.
parent
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
_progress
(
f
"evaluate postgres csv: 0/{total}"
)
with
psycopg
.
connect
(
args
.
dsn
)
as
conn
:
with
psycopg
.
connect
(
config
.
dsn
)
as
conn
:
with
conn
.
cursor
()
as
cursor
:
cursor
.
execute
(
"select set_config('statement_timeout',
%
s, false)"
,
(
str
(
args
.
statement_timeout_ms
),))
cursor
.
execute
(
"select set_config('pg_trgm.similarity_threshold',
%
s, false)"
,
(
str
(
args
.
trgm_threshold
),))
cursor
.
execute
(
"select set_config('statement_timeout',
%
s, false)"
,
(
str
(
config
.
statement_timeout_ms
),))
cursor
.
execute
(
"select set_config('pg_trgm.similarity_threshold',
%
s, false)"
,
(
str
(
config
.
trgm_threshold
),))
with
csv_path
.
open
(
encoding
=
"utf-8-sig"
,
newline
=
""
)
as
in_file
,
out_path
.
open
(
"w"
,
encoding
=
"utf-8"
,
newline
=
""
)
as
out_file
:
...
...
@@ -70,9 +65,7 @@ def main() -> None:
csv_path
=
csv_path
,
base_dir
=
base_dir
,
positive_decisions
=
positive_decisions
,
max_candidates
=
args
.
max_candidates
,
recall_limit
=
args
.
recall_limit
,
enable_trgm
=
args
.
enable_trgm
,
config
=
config
,
)
rows
.
append
(
row_out
)
writer
.
writerow
(
row_out
)
...
...
@@ -96,9 +89,7 @@ def _evaluate_row(
csv_path
:
Path
,
base_dir
:
Path
|
None
,
positive_decisions
:
set
[
str
],
max_candidates
:
int
,
recall_limit
:
int
,
enable_trgm
:
bool
,
config
:
ServerConfig
,
)
->
dict
[
str
,
object
]:
parse_started
=
time
.
perf_counter
()
sample_id
=
row
.
get
(
"id"
)
or
row
.
get
(
"sample_id"
)
or
str
(
row_number
)
...
...
@@ -108,12 +99,12 @@ def _evaluate_row(
candidates
,
timings
=
_recall_candidates
(
conn
,
record
,
recall_limit
=
recall_limit
,
enable_trgm
=
enable_trgm
,
recall_limit
=
config
.
recall_limit
,
enable_trgm
=
config
.
enable_trgm
,
exclude_record_ids
=
_exclude_record_ids_for_eval_row
(
row
),
)
rank_started
=
time
.
perf_counter
()
result
=
_check_against_candidates
(
record
,
candidates
,
max_candidates
=
max_candidates
)
result
=
_check_against_candidates
(
record
,
candidates
,
config
=
config
)
rank_ms
=
round
((
time
.
perf_counter
()
-
rank_started
)
*
1000
,
2
)
recall_ms
=
round
(
timings
[
"exact_ms"
]
+
timings
[
"trgm_ms"
]
+
timings
[
"line_ms"
],
2
)
predicted_duplicate
=
result
.
decision
.
value
in
positive_decisions
...
...
@@ -127,7 +118,7 @@ def _evaluate_row(
"correct"
:
expected_duplicate
==
predicted_duplicate
,
"confidence"
:
result
.
confidence
,
"reason"
:
result
.
reason
,
"candidate_count"
:
len
(
candidates
),
"candidate_count"
:
len
(
result
.
candidates
),
"parse_ms"
:
parse_ms
,
"recall_ms"
:
recall_ms
,
"exact_ms"
:
timings
[
"exact_ms"
],
...
...
@@ -246,10 +237,26 @@ def _check_against_candidates(
record
:
LyricRecord
,
candidates
:
list
[
LyricRecord
],
*
,
max_candidates
:
int
,
config
:
ServerConfig
,
):
checker
=
DuplicateChecker
()
return
checker
.
check_record_against_candidates
(
record
,
candidates
,
max_candidates
=
max_candidates
)
checker
=
DuplicateChecker
(
duplicate_jaccard_threshold
=
config
.
duplicate_jaccard_threshold
,
duplicate_line_coverage_threshold
=
config
.
duplicate_line_coverage_threshold
,
duplicate_high_coverage_jaccard_threshold
=
config
.
duplicate_high_coverage_jaccard_threshold
,
duplicate_high_coverage_line_coverage_threshold
=
config
.
duplicate_high_coverage_line_coverage_threshold
,
review_jaccard_threshold
=
config
.
review_jaccard_threshold
,
review_line_coverage_threshold
=
config
.
review_line_coverage_threshold
,
review_query_coverage_threshold
=
config
.
review_query_coverage_threshold
,
fragment_query_coverage_threshold
=
config
.
fragment_query_coverage_threshold
,
fragment_max_line_ratio
=
config
.
fragment_max_line_ratio
,
fragment_min_matched_lines
=
config
.
fragment_min_matched_lines
,
chorus_short_line_count_threshold
=
config
.
chorus_short_line_count_threshold
,
chorus_material_overlap_threshold
=
config
.
chorus_material_overlap_threshold
,
chorus_material_query_coverage_threshold
=
config
.
chorus_material_query_coverage_threshold
,
confidence_jaccard_weight
=
config
.
confidence_jaccard_weight
,
confidence_line_coverage_weight
=
config
.
confidence_line_coverage_weight
,
)
return
checker
.
check_record_against_candidates
(
record
,
candidates
,
max_candidates
=
config
.
max_candidates
)
def
_record_from_eval_row
(
row
:
dict
[
str
,
str
],
*
,
csv_path
:
Path
,
base_dir
:
Path
|
None
)
->
tuple
[
LyricRecord
,
str
]:
...
...
test_api/test_dedup_api.py
View file @
cdfa3a5
...
...
@@ -110,6 +110,7 @@ def main():
print
(
f
" decision: {result.get('decision', 'N/A')}"
)
print
(
f
" confidence: {result.get('confidence', 'N/A')}"
)
print
(
f
" reason: {result.get('reason', 'N/A')}"
)
print
(
f
" record_ids: {result.get('record_ids', [])}"
)
if
__name__
==
"__main__"
:
...
...
test_api/test_lyric.txt
View file @
cdfa3a5
## 消失的波段
### 【主歌 1】 — *(压抑、低沉的叙事)*
霓虹灯……在车窗外退后,
霓虹——和夜色融为一体。
收音机里,只剩沙沙的电流……
(像你在旧地址留下的呼吸……)
有些习惯……总是很难去修正,
比如——在人群中,辨认你的背影。
### 【主歌 2】 — *(情绪渐进,带有一丝无奈)*
朋友圈里……你更新了风景,
坐标是——没听过的、陌、生、城、市。
我们从无话不说……退回到【静音】,
像两条失去交集的——平行线。
那些没有寄出的长信……
最后都变成,草稿箱里的——灰、尘。
### 【副歌】 —— *(情感爆发,高亢而撕裂)*
我们成了彼此消 逝 的 波 段 !!
在同一个频段……却再也无法呼喊!
那些同频共振的夜晚……
最终被淹没在——嘈杂的市中心!!
我调整着微弱的接收信号……
却只听到——时光断裂的声音!!!
### 【桥段】 —— *(节奏加快,连续的内心追问)*
是不是所有的连接……都有保质期?!
到期后……就自动切断了所有联系?!
我们在各自的轨道里——加!速!运!行!
再也找不到……那天傍晚的引力。
### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)*
我们成了彼此消 逝 的 波 段 ——!!
在同一个频段……却再也无法呼喊!
那些同频共振的夜晚……
最终被淹没在——嘈杂的市中心!!
我调整着微弱的接收信号……
却只听到……(时光断裂的声音)……
### 【尾奏】 —— *(情绪下沉,最终归于死寂)*
【信号中断……请勿追赶。】
城市入睡……灯光渐暗……
一个人的波段。
(查……无……此……人……)
【 挂 断 。】
### 副歌
我们成了彼此消失的波段
在同一个频段却再也无法呼喊
那些同频共振的夜晚
最终被淹没在嘈杂的市中心
我调整着微弱的接收信号
却只听到时光断裂的声音
### 桥段
是不是所有的连接都有保质期
到期后就自动切断了所有联系
我们在各自的轨道里加速运行
再也找不到那天傍晚的引力
### 副歌
我们成了彼此消失的波段
在同一个频段却再也无法呼喊
那些同频共振的夜晚
最终被淹没在嘈杂的市中心
我调整着微弱的接收信号
却只听到时光断裂的声音
### 尾奏
信号中断,请勿追赶
城市入睡,灯光渐暗
一个人的波段
查无此人
挂断
\ No newline at end of file
歌曲题目:《星空大冒险》
【主歌 2】
小兔子,在划船,
它的浆是胡萝卜。
小熊坐在树枝上,
正把蜂蜜涂面包。
风儿吹过坏脾气,
在这儿变成甜泡泡。
没有作业和烦恼,
大家都在哈哈笑。
【副歌 2】
飞呀飞,飞向大月亮,
月亮像个大香蕉,挂在夜空上。
摇呀摇,摇到银河旁,
捞起一颗小星星,放在手心里亮。
【桥段(Bridge)】
(节奏放慢,变温柔)
天上的城堡亮晶晶,
那是梦里的风景。
玩累的小孩要睡了,
听一听,风的呼吸。
呼——噜——呼——噜——
做个好梦到天明。
【副歌 3】
(节奏恢复,渐弱结束)
飞呀飞,飞向大月亮,
月亮像个大香蕉,挂在夜空上。
摇呀摇,摇到银河旁,
捞起一颗小星星,
抱在怀里……睡着啦。
\ No newline at end of file
...
...
tests/test_lyric_dedup.py
View file @
cdfa3a5
...
...
@@ -7,6 +7,8 @@ from lyric_dedup import LyricRecord
from
lyric_dedup.eval_dataset
import
generate_eval_set
from
lyric_dedup.file_import
import
record_from_file
from
lyric_dedup.normalization
import
normalize_lyrics
from
lyric_dedup_server.config
import
ServerConfig
from
lyric_dedup_server.service
import
DedupService
BASE_LYRIC
=
"""
...
...
@@ -55,7 +57,7 @@ def test_exact_duplicate_handles_timestamps_punctuation_traditional_and_chorus_c
assert
result
.
candidates
[
0
]
.
record_id
==
"song-1"
def
test_short_shared_repeated_chorus_is_
review_not_duplicate
()
->
None
:
def
test_short_shared_repeated_chorus_is_
new_fragment
()
->
None
:
result
=
check_against
(
[
LyricRecord
(
...
...
@@ -78,8 +80,41 @@ def test_short_shared_repeated_chorus_is_review_not_duplicate() -> None:
"""
)
assert
result
.
decision
==
DuplicateDecision
.
REVIEW
assert
result
.
candidates
[
0
]
.
reason
==
"重合内容主要集中在重复副歌行,不自动判重"
assert
result
.
decision
==
DuplicateDecision
.
NEW
assert
result
.
candidates
[
0
]
.
reason
==
"重合内容主要集中在重复副歌行,按片段歌词处理"
def test_service_short_chorus_fragment_result_is_new() -> None:
    """Service-level check: a short lyric sharing only a repeated line is new.

    The query and the single candidate have one line in common, which each
    of them repeats twice. The service result must report the ``new``
    decision value, ``duplicate`` as ``False`` and no matched record ids.
    """
    query_record = LyricRecord(
        "__query__",
        """
山谷的雨落在清晨
我把名字交给星辰
啦啦啦 我们不分离
啦啦啦 我们不分离
世界安静等一个人
""",
    )
    stored_record = LyricRecord(
        "song-1",
        """
海边的风吹过旧信
你说夏天不会远去
啦啦啦 我们不分离
啦啦啦 我们不分离
转身以后各自旅行
""",
    )
    dedup = DedupService(config=ServerConfig())

    result = dedup._check_against_candidates(query_record, [stored_record])

    assert result.decision == DuplicateDecision.NEW.value
    assert result.duplicate is False
    assert result.record_ids == []
def
test_substantial_line_overlap_is_duplicate_after_pg_recall
()
->
None
:
...
...
@@ -110,10 +145,40 @@ def test_fragment_of_full_song_is_not_duplicate() -> None:
"""
)
assert
result
.
decision
!=
DuplicateDecision
.
DUPLICATE
assert
result
.
decision
==
DuplicateDecision
.
NEW
assert
result
.
candidates
[
0
]
.
reason
==
"歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
assert
result
.
candidates
[
0
]
.
primary_line_coverage
<
0.72
def test_long_plain_fragment_of_full_song_is_new_not_review() -> None:
    """A six-line excerpt of a ten-line stored lyric must be classified as new.

    Lines two to seven of the stored lyric are submitted as the query. The
    decision must be ``NEW`` and the top candidate must carry the
    plain-fragment reason text.
    """
    stored_lyric = """
第一行写给凌晨的风
第二行写给远处的灯
第三行写给没有寄出的信
第四行写给还醒着的人
第五行写给旧车站
第六行写给长街尽头
第七行写给明天的太阳
第八行写给重新出发
第九行写给路过的雨
第十行写给沉默的月光
"""
    excerpt = """
第二行写给远处的灯
第三行写给没有寄出的信
第四行写给还醒着的人
第五行写给旧车站
第六行写给长街尽头
第七行写给明天的太阳
"""
    catalog = [LyricRecord("song-1", stored_lyric)]

    result = check_against(catalog, excerpt)

    assert result.decision == DuplicateDecision.NEW
    top_candidate = result.candidates[0]
    assert top_candidate.reason == "歌词片段只覆盖候选完整歌词的一部分,按新歌词处理"
def
test_catalog_mashup_fragments_are_new_not_review
()
->
None
:
result
=
check_against
(
[
...
...
Please
register
or
sign in
to post a comment