Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
lyric_rhyme
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
ba39ce6a
...
ba39ce6aa50b5bc45d24bc32bcbc7c51b870922a
authored
2026-06-03 11:25:52 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
添加测试集内部去重
1 parent
f8ad329c
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
469 additions
and
136 deletions
README.md
TEST_WORKFLOW.md
lyric_dedup/checker.py
lyric_dedup/cli.py
lyric_dedup/eval_dataset.py
scripts/process_library.py
tests/test_lyric_dedup.py
README.md
View file @
ba39ce6
...
...
@@ -80,6 +80,7 @@ python -m lyric_dedup.cli generate-eval-set \
--lyrics-dir data/generated_eval/incoming
\
--csv data/generated_eval/eval_50000.csv
\
--index outputs/indexes/lyrics.pkl
\
--eval-index data/generated_eval/eval_50000.csv.index.pkl
\
--size 50000
\
--positive-ratio 0.3
```
...
...
@@ -88,10 +89,10 @@ python -m lyric_dedup.cli generate-eval-set \
-
先扫描整个曲库,按有效歌词行数、语言类型、文件来源前缀做分层采样,不再按排序前缀取样。
-
`应去重`
样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。
-
`不应去重`
样本
包含同主题新歌词、hard negative、片段歌词、重复副歌碰撞、仅翻译相似
、短歌词/占位边界样本。
-
`不应去重`
样本
以真实 holdout 完整歌词为主,也包含片段歌词、重复副歌碰撞、仅翻译相似、同主题新歌词
、短歌词/占位边界样本。
-
片段歌词即使命中已有歌曲的一部分,也不应该输出
`duplicate`
;最多进入
`review`
。
-
如果传入
`--index`
,生成器会用现有索引构造更接近线上召回风险的 hard negative
。
-
同时会生成
`*.manifest.json`
,记录 seed、曲库规模、样本类型分布、语言/来源分桶和样本来源覆盖数。
-
生成器会额外写出
`--eval-index`
,这个索引排除了 holdout 歌,评估生成 CSV 时应使用它
。
-
同时会生成
`*.manifest.json`
,记录 seed、曲库规模、
holdout 数、
样本类型分布、语言/来源分桶和样本来源覆盖数。
先准备一个 CSV,例如
`data/eval/eval.csv`
:
...
...
TEST_WORKFLOW.md
View file @
ba39ce6
...
...
@@ -103,6 +103,7 @@ python -m lyric_dedup.cli generate-eval-set \
--lyrics-dir data/generated_eval/incoming
\
--csv data/generated_eval/eval_50000.csv
\
--index outputs/indexes/library_lyrics.pkl
\
--eval-index data/generated_eval/eval_50000.csv.index.pkl
\
--size 50000
\
--positive-ratio 0.3
```
...
...
@@ -120,24 +121,26 @@ python -m lyric_dedup.cli generate-eval-set \
```
text
positive_* = 应去重,全曲歌词样式变化
negative_random_unrelated = 不应去重,同主题新歌词
negative_hard_candidate = 不应去重,系统容易召回的短句/局部重合样本
negative_real_holdout_full_song = 不应去重,完整真实歌词,已从评估索引中排除
negative_fragment = 不应去重,单曲片段
negative_shared_chorus = 不应去重,重复副歌碰撞
negative_translation_only = 不应去重,仅翻译相似
negative_same_theme_synthetic = 不应去重,同主题新歌词
edge_short_or_placeholder = 不应去重,短歌词/占位边界样本
```
生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。
传入
`--index`
后会用现有索引生成 hard negative
。每次还会输出:
生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。
它会分出一批 holdout 完整歌词作为真实新歌负样本,并生成一个排除 holdout 的评估索引
。每次还会输出:
```
text
data/generated_eval/eval_50000.csv.manifest.json
data/generated_eval/eval_50000.csv.index.pkl
```
manifest 里重点看:
```
text
library_files 曲库歌词文件数
holdout_records 从评估索引中排除、作为真实新歌负样本的数量
sample_type_counts 各样本类型数量
line_count_bucket_counts / language_bucket_counts / source_bucket_counts
unique_source_records 本次评估覆盖了多少真实源文件
...
...
@@ -147,7 +150,7 @@ unique_source_records 本次评估覆盖了多少真实源文件
```
bash
python -m lyric_dedup.cli evaluate-csv
\
--index
outputs/indexes/library_lyrics
.pkl
\
--index
data/generated_eval/eval_50000.csv.index
.pkl
\
--csv data/generated_eval/eval_50000.csv
\
--base-dir data/generated_eval
\
--out outputs/results/library_eval_50000.csv
...
...
@@ -171,7 +174,7 @@ false_positive
```
bash
python -m lyric_dedup.cli evaluate-csv
\
--index
outputs/indexes/library_lyrics
.pkl
\
--index
data/generated_eval/eval_50000.csv.index
.pkl
\
--csv data/generated_eval/eval_50000.csv
\
--base-dir data/generated_eval
\
--positive-decisions duplicate,review
\
...
...
lyric_dedup/checker.py
View file @
ba39ce6
...
...
@@ -96,16 +96,24 @@ class DuplicateChecker:
def
add_record
(
self
,
record
:
LyricRecord
)
->
None
:
indexed
=
self
.
_index
(
record
)
self
.
_records
[
record
.
record_id
]
=
indexed
self
.
_exact_hash_to_ids
.
setdefault
(
indexed
.
exact_hash
,
set
())
.
add
(
record
.
record_id
)
self
.
_add_indexed
(
record
.
record_id
,
indexed
)
def add_normalized_record(self, record: LyricRecord, normalized: NormalizedLyrics) -> None:
    """Index ``record`` from lyrics the caller has already normalized.

    The indexed entry is built by ``_index_normalized`` from the supplied
    ``normalized`` value and then registered under ``record.record_id``
    through ``_add_indexed``.
    """
    entry = self._index_normalized(record, normalized)
    record_key = record.record_id
    self._add_indexed(record_key, entry)
def
_add_indexed
(
self
,
record_id
:
str
,
indexed
:
_IndexedRecord
)
->
None
:
self
.
_records
[
record_id
]
=
indexed
self
.
_exact_hash_to_ids
.
setdefault
(
indexed
.
exact_hash
,
set
())
.
add
(
record_id
)
for
line
in
indexed
.
normalized
.
unique_lines
:
if
len
(
line
)
>=
4
:
self
.
_line_to_ids
.
setdefault
(
line
,
set
())
.
add
(
record
.
record
_id
)
self
.
_line_to_ids
.
setdefault
(
line
,
set
())
.
add
(
record_id
)
for
token
in
indexed
.
tokens
:
self
.
_token_to_ids
.
setdefault
(
token
,
set
())
.
add
(
record
.
record
_id
)
self
.
_token_to_ids
.
setdefault
(
token
,
set
())
.
add
(
record_id
)
for
token
in
indexed
.
fallback_tokens
:
self
.
_token_to_ids
.
setdefault
(
token
,
set
())
.
add
(
record
.
record
_id
)
self
.
_lsh
.
add
(
record
.
record
_id
,
indexed
.
signature
)
self
.
_token_to_ids
.
setdefault
(
token
,
set
())
.
add
(
record_id
)
self
.
_lsh
.
add
(
record_id
,
indexed
.
signature
)
def
save
(
self
,
path
:
str
|
Path
)
->
None
:
"""Persist the in-memory index for later checks."""
...
...
@@ -187,6 +195,9 @@ class DuplicateChecker:
def _index(self, record: LyricRecord) -> _IndexedRecord:
    """Normalize ``record.lyrics`` and build its indexed entry.

    Thin wrapper: the normalized lyrics are handed straight to
    ``_index_normalized`` together with the record.
    """
    return self._index_normalized(record, normalize_lyrics(record.lyrics))
def
_index_normalized
(
self
,
record
:
LyricRecord
,
normalized
:
NormalizedLyrics
)
->
_IndexedRecord
:
tokens
=
lyric_tokens
(
normalized
)
primary_tokens
=
lyric_tokens
(
normalized
,
lines
=
normalized
.
primary_lines
)
translation_tokens
=
lyric_tokens
(
normalized
,
lines
=
normalized
.
translation_lines
)
...
...
lyric_dedup/cli.py
View file @
ba39ce6
...
...
@@ -5,6 +5,7 @@ from __future__ import annotations
import
argparse
import
csv
import
json
import
sys
from
pathlib
import
Path
from
lyric_dedup.checker
import
DuplicateChecker
...
...
@@ -50,7 +51,8 @@ def main() -> None:
generate
.
add_argument
(
"--size"
,
type
=
int
,
default
=
100
)
generate
.
add_argument
(
"--positive-ratio"
,
type
=
float
,
default
=
0.3
)
generate
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
20260602
)
generate
.
add_argument
(
"--index"
,
default
=
""
,
help
=
"optional existing index for hard-negative generation"
)
generate
.
add_argument
(
"--index"
,
default
=
""
,
help
=
"optional source index path recorded in the manifest"
)
generate
.
add_argument
(
"--eval-index"
,
default
=
""
,
help
=
"output index built from non-holdout records for this eval set"
)
args
=
parser
.
parse_args
()
if
args
.
command
==
"build-index"
:
...
...
@@ -77,6 +79,7 @@ def main() -> None:
positive_ratio
=
args
.
positive_ratio
,
seed
=
args
.
seed
,
index_path
=
Path
(
args
.
index
)
if
args
.
index
else
None
,
eval_index_path
=
Path
(
args
.
eval_index
)
if
args
.
eval_index
else
None
,
)
print
(
json
.
dumps
(
summary
,
ensure_ascii
=
False
))
...
...
@@ -155,52 +158,58 @@ def evaluate_csv(
positive_decisions
:
set
[
str
],
max_candidates
:
int
,
)
->
None
:
_progress
(
f
"load index: {index_path}"
)
checker
=
DuplicateChecker
.
load
(
index_path
)
rows
:
list
[
dict
[
str
,
object
]]
=
[]
total
=
_csv_data_row_count
(
csv_path
)
_progress
(
f
"evaluate csv: 0/{total}"
)
out_path
.
parent
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
with
csv_path
.
open
(
encoding
=
"utf-8-sig"
,
newline
=
""
)
as
file
:
reader
=
csv
.
DictReader
(
file
)
if
reader
.
fieldnames
is
None
:
raise
ValueError
(
"评估 CSV 需要表头"
)
for
row_number
,
row
in
enumerate
(
reader
,
start
=
2
):
sample_id
=
row
.
get
(
"id"
)
or
row
.
get
(
"sample_id"
)
or
str
(
row_number
)
record
,
source
=
_record_from_eval_row
(
row
,
csv_path
=
csv_path
,
base_dir
=
base_dir
)
expected_duplicate
=
_parse_expected
(
row
.
get
(
"expected"
)
or
row
.
get
(
"label"
)
or
row
.
get
(
"target"
))
result
=
checker
.
check_record
(
record
,
max_candidates
=
max_candidates
)
predicted_duplicate
=
result
.
decision
.
value
in
positive_decisions
best
=
result
.
candidates
[
0
]
if
result
.
candidates
else
None
rows
.
append
(
{
"id"
:
sample_id
,
"source"
:
source
,
"expected_duplicate"
:
expected_duplicate
,
"decision"
:
result
.
decision
.
value
,
"predicted_duplicate"
:
predicted_duplicate
,
"correct"
:
expected_duplicate
==
predicted_duplicate
,
"confidence"
:
result
.
confidence
,
"reason"
:
result
.
reason
,
"best_candidate_id"
:
best
.
record_id
if
best
else
""
,
"best_candidate_decision"
:
best
.
decision
.
value
if
best
else
""
,
"best_candidate_confidence"
:
best
.
confidence
if
best
else
""
,
"best_candidate_jaccard"
:
best
.
jaccard
if
best
else
""
,
"best_candidate_line_coverage"
:
best
.
line_coverage
if
best
else
""
,
"best_candidate_primary_jaccard"
:
best
.
primary_jaccard
if
best
else
""
,
"best_candidate_primary_line_coverage"
:
best
.
primary_line_coverage
if
best
else
""
,
"best_candidate_translation_jaccard"
:
best
.
translation_jaccard
if
best
else
""
,
"best_candidate_translation_line_coverage"
:
best
.
translation_line_coverage
if
best
else
""
,
"best_candidate_reason"
:
best
.
reason
if
best
else
""
,
"matched_unique_lines"
:
" | "
.
join
(
best
.
matched_unique_lines
)
if
best
else
""
,
}
)
out_path
.
parent
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
with
out_path
.
open
(
"w"
,
encoding
=
"utf-8"
,
newline
=
""
)
as
file
:
writer
=
csv
.
DictWriter
(
file
,
fieldnames
=
list
(
rows
[
0
]
.
keys
())
if
rows
else
[
"id"
])
writer
.
writeheader
()
writer
.
writerows
(
rows
)
fieldnames
=
[
"id"
,
"source"
,
"expected_duplicate"
,
"decision"
,
"predicted_duplicate"
,
"correct"
,
"confidence"
,
"reason"
,
"best_candidate_id"
,
"best_candidate_decision"
,
"best_candidate_confidence"
,
"best_candidate_jaccard"
,
"best_candidate_line_coverage"
,
"best_candidate_primary_jaccard"
,
"best_candidate_primary_line_coverage"
,
"best_candidate_translation_jaccard"
,
"best_candidate_translation_line_coverage"
,
"best_candidate_reason"
,
"matched_unique_lines"
,
]
with
out_path
.
open
(
"w"
,
encoding
=
"utf-8"
,
newline
=
""
)
as
out_file
:
writer
=
csv
.
DictWriter
(
out_file
,
fieldnames
=
fieldnames
)
writer
.
writeheader
()
for
index
,
row
in
enumerate
(
reader
,
start
=
1
):
row_out
=
_evaluate_row
(
row
,
row_number
=
index
+
1
,
checker
=
checker
,
csv_path
=
csv_path
,
base_dir
=
base_dir
,
positive_decisions
=
positive_decisions
,
max_candidates
=
max_candidates
,
)
rows
.
append
(
row_out
)
writer
.
writerow
(
row_out
)
_progress_count
(
"evaluate csv"
,
index
,
total
,
step
=
1000
)
summary
=
_evaluation_summary
(
rows
,
positive_decisions
=
positive_decisions
,
out_path
=
out_path
)
summary_path
=
out_path
.
with_suffix
(
out_path
.
suffix
+
".summary.json"
)
summary_path
.
write_text
(
json
.
dumps
(
summary
,
ensure_ascii
=
False
,
indent
=
2
),
encoding
=
"utf-8"
)
_progress
(
"evaluation complete"
)
print
(
json
.
dumps
(
summary
,
ensure_ascii
=
False
))
...
...
@@ -229,6 +238,45 @@ def _result_to_dict(result, *, source: str) -> dict[str, object]:
}
def _evaluate_row(
    row: dict[str, str],
    *,
    row_number: int,
    checker: DuplicateChecker,
    csv_path: Path,
    base_dir: Path | None,
    positive_decisions: set[str],
    max_candidates: int,
) -> dict[str, object]:
    """Run the duplicate checker on one evaluation CSV row.

    Args:
        row: One CSV row keyed by column name.
        row_number: Fallback sample id when neither ``id`` nor
            ``sample_id`` holds a non-empty value.
        checker: Checker whose ``check_record`` is applied to the row.
        csv_path: Forwarded to ``_record_from_eval_row``.
        base_dir: Forwarded to ``_record_from_eval_row``.
        positive_decisions: Decision values counted as "predicted duplicate".
        max_candidates: Forwarded to ``checker.check_record``.

    Returns:
        A flat dict with the sample id, source, expected and predicted
        labels, the overall decision, and the metrics of the first
        candidate. Candidate columns are empty strings when there is no
        candidate.
    """
    row_id = row.get("id") or row.get("sample_id") or str(row_number)
    record, source = _record_from_eval_row(row, csv_path=csv_path, base_dir=base_dir)

    # The expected label may live in any of three columns; first non-empty wins.
    raw_expected = row.get("expected") or row.get("label") or row.get("target")
    expected_duplicate = _parse_expected(raw_expected)

    result = checker.check_record(record, max_candidates=max_candidates)
    predicted_duplicate = result.decision.value in positive_decisions
    best = result.candidates[0] if result.candidates else None

    outcome = {
        "id": row_id,
        "source": source,
        "expected_duplicate": expected_duplicate,
        "decision": result.decision.value,
        "predicted_duplicate": predicted_duplicate,
        "correct": expected_duplicate == predicted_duplicate,
        "confidence": result.confidence,
        "reason": result.reason,
    }

    if best:
        candidate_columns = {
            "best_candidate_id": best.record_id,
            "best_candidate_decision": best.decision.value,
            "best_candidate_confidence": best.confidence,
            "best_candidate_jaccard": best.jaccard,
            "best_candidate_line_coverage": best.line_coverage,
            "best_candidate_primary_jaccard": best.primary_jaccard,
            "best_candidate_primary_line_coverage": best.primary_line_coverage,
            "best_candidate_translation_jaccard": best.translation_jaccard,
            "best_candidate_translation_line_coverage": best.translation_line_coverage,
            "best_candidate_reason": best.reason,
            "matched_unique_lines": " | ".join(best.matched_unique_lines),
        }
    else:
        # No candidate: keep every column present so CSV rows stay uniform.
        candidate_columns = {
            "best_candidate_id": "",
            "best_candidate_decision": "",
            "best_candidate_confidence": "",
            "best_candidate_jaccard": "",
            "best_candidate_line_coverage": "",
            "best_candidate_primary_jaccard": "",
            "best_candidate_primary_line_coverage": "",
            "best_candidate_translation_jaccard": "",
            "best_candidate_translation_line_coverage": "",
            "best_candidate_reason": "",
            "matched_unique_lines": "",
        }

    outcome.update(candidate_columns)
    return outcome
def
_lyrics_from_eval_row
(
row
:
dict
[
str
,
str
],
*
,
csv_path
:
Path
,
base_dir
:
Path
|
None
)
->
tuple
[
str
,
str
]:
lyrics
=
(
row
.
get
(
"lyrics"
)
or
""
)
.
strip
()
if
lyrics
:
...
...
@@ -322,5 +370,23 @@ def _evaluation_summary(
}
def
_csv_data_row_count
(
csv_path
:
Path
)
->
int
:
with
csv_path
.
open
(
encoding
=
"utf-8-sig"
,
newline
=
""
)
as
file
:
reader
=
csv
.
reader
(
file
)
next
(
reader
,
None
)
return
sum
(
1
for
_
in
reader
)
def
_progress
(
message
:
str
)
->
None
:
print
(
f
"[eval] {message}"
,
file
=
sys
.
stderr
,
flush
=
True
)
def
_progress_count
(
label
:
str
,
current
:
int
,
total
:
int
,
*
,
step
:
int
=
1000
)
->
None
:
if
total
<=
0
:
return
if
current
==
1
or
current
==
total
or
current
%
step
==
0
:
_progress
(
f
"{label}: {current}/{total}"
)
if
__name__
==
"__main__"
:
main
()
...
...
lyric_dedup/eval_dataset.py
View file @
ba39ce6
...
...
@@ -7,14 +7,14 @@ import hashlib
import
json
import
random
import
re
import
sys
from
collections
import
Counter
from
dataclasses
import
dataclass
from
pathlib
import
Path
from
lyric_dedup.checker
import
DuplicateChecker
from
lyric_dedup.checker
import
DuplicateDecision
from
lyric_dedup.checker
import
LyricRecord
from
lyric_dedup.file_import
import
iter_lyric_files
from
lyric_dedup.file_import
import
read_lyric_file
from
lyric_dedup.file_import
import
record_from_file
from
lyric_dedup.normalization
import
NormalizedLyrics
from
lyric_dedup.normalization
import
fingerprint_text
...
...
@@ -23,19 +23,31 @@ from lyric_dedup.normalization import normalize_lyrics
DEFAULT_SAMPLE_MIX
=
{
"positive_full_duplicate"
:
0.30
,
"negative_random_unrelated"
:
0.20
,
"negative_hard_candidate"
:
0.25
,
"negative_real_holdout_full_song"
:
0.40
,
"negative_fragment"
:
0.10
,
"negative_shared_chorus"
:
0.05
,
"negative_translation_only"
:
0.05
,
"negative_same_theme_synthetic"
:
0.05
,
"edge_short_or_placeholder"
:
0.05
,
}
def
_progress
(
message
:
str
)
->
None
:
print
(
f
"[eval-gen] {message}"
,
file
=
sys
.
stderr
,
flush
=
True
)
def
_progress_count
(
label
:
str
,
current
:
int
,
total
:
int
,
*
,
step
:
int
=
1000
)
->
None
:
if
total
<=
0
:
return
if
current
==
1
or
current
==
total
or
current
%
step
==
0
:
_progress
(
f
"{label}: {current}/{total}"
)
@dataclass
(
frozen
=
True
)
class
LyricProfile
:
path
:
Path
record_id
:
str
raw_text
:
str
title
:
str
artist
:
str
normalized
:
NormalizedLyrics
...
...
@@ -74,6 +86,7 @@ def generate_eval_set(
positive_ratio
:
float
=
0.30
,
seed
:
int
=
20260602
,
index_path
:
Path
|
None
=
None
,
eval_index_path
:
Path
|
None
=
None
,
)
->
dict
[
str
,
object
]:
"""Generate a stratified production evaluation set.
...
...
@@ -83,6 +96,7 @@ def generate_eval_set(
if
size
<=
0
:
raise
ValueError
(
"size must be positive"
)
_progress
(
f
"start generation: size={size}, positive_ratio={positive_ratio}, seed={seed}"
)
rng
=
random
.
Random
(
seed
)
profiles
=
profile_library
(
library_dir
)
if
not
profiles
:
...
...
@@ -90,13 +104,25 @@ def generate_eval_set(
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
csv_path
.
parent
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
_progress
(
f
"clean output dir: {output_dir}"
)
_clean_generated_output_dir
(
output_dir
)
checker
=
DuplicateChecker
.
load
(
index_path
)
if
index_path
else
None
plan
=
_sample_plan
(
size
,
positive_ratio
=
positive_ratio
)
groups
=
_profile_groups
(
profiles
)
_progress
(
f
"sample plan: {plan}"
)
holdout_count
=
min
(
plan
[
"negative_real_holdout_full_song"
],
max
(
1
,
len
(
profiles
)
//
2
))
holdout_profiles
=
_stratified_unique_sample
(
profiles
,
holdout_count
,
rng
,
)
holdout_ids
=
{
profile
.
record_id
for
profile
in
holdout_profiles
}
indexed_profiles
=
[
profile
for
profile
in
profiles
if
profile
.
record_id
not
in
holdout_ids
]
or
profiles
eval_index_path
=
eval_index_path
or
csv_path
.
with_suffix
(
csv_path
.
suffix
+
".index.pkl"
)
_build_eval_index
(
indexed_profiles
,
eval_index_path
)
groups
=
_profile_groups
(
indexed_profiles
)
samples
:
list
[
GeneratedSample
]
=
[]
_progress
(
"build positive_full_duplicate samples"
)
samples
.
extend
(
_build_positive_samples
(
_stratified_sample
(
groups
[
"normal"
],
plan
[
"positive_full_duplicate"
],
rng
),
...
...
@@ -106,53 +132,62 @@ def generate_eval_set(
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build negative_real_holdout_full_song samples"
)
samples
.
extend
(
_build_
random_unrelated
_samples
(
plan
[
"negative_random_unrelated"
]
,
_build_
holdout_full_song
_samples
(
holdout_profiles
,
output_dir
,
csv_path
.
parent
,
rng
,
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build negative_fragment samples"
)
samples
.
extend
(
_build_hard_candidate_samples
(
groups
[
"normal"
],
plan
[
"negative_hard_candidate"
],
_build_fragment_samples
(
_stratified_sample
(
groups
[
"fragmentable"
],
plan
[
"negative_fragment"
],
rng
),
output_dir
,
csv_path
.
parent
,
rng
,
checker
=
checker
,
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build negative_shared_chorus samples"
)
samples
.
extend
(
_build_
fragment
_samples
(
_stratified_sample
(
groups
[
"
fragmentable"
],
plan
[
"negative_fragment
"
],
rng
),
_build_
shared_chorus
_samples
(
_stratified_sample
(
groups
[
"
normal"
],
plan
[
"negative_shared_chorus
"
],
rng
),
output_dir
,
csv_path
.
parent
,
rng
,
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build negative_translation_only samples"
)
samples
.
extend
(
_build_
shared_chorus
_samples
(
_stratified_sample
(
groups
[
"
normal"
],
plan
[
"negative_shared_chorus
"
],
rng
),
_build_
translation_only
_samples
(
_stratified_sample
(
groups
[
"
foreign"
],
plan
[
"negative_translation_only
"
],
rng
),
output_dir
,
csv_path
.
parent
,
rng
,
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build negative_same_theme_synthetic samples"
)
samples
.
extend
(
_build_
translation_only
_samples
(
_stratified_sample
(
groups
[
"foreign"
],
plan
[
"negative_translation_only"
],
rng
)
,
_build_
same_theme_synthetic
_samples
(
plan
[
"negative_same_theme_synthetic"
]
,
output_dir
,
csv_path
.
parent
,
rng
,
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
_progress
(
"build edge_short_or_placeholder samples"
)
samples
.
extend
(
_build_edge_samples
(
_stratified_sample
(
groups
[
"edge"
],
plan
[
"edge_short_or_placeholder"
],
rng
),
...
...
@@ -162,10 +197,12 @@ def generate_eval_set(
start_index
=
len
(
samples
)
+
1
,
)
)
_progress
(
f
"built samples: {len(samples)}/{size}"
)
if
len
(
samples
)
<
size
:
_progress
(
f
"top up with negative_same_theme_synthetic samples: {size - len(samples)}"
)
samples
.
extend
(
_build_
random_unrelated
_samples
(
_build_
same_theme_synthetic
_samples
(
size
-
len
(
samples
),
output_dir
,
csv_path
.
parent
,
...
...
@@ -176,7 +213,9 @@ def generate_eval_set(
samples
=
samples
[:
size
]
rng
.
shuffle
(
samples
)
_progress
(
f
"write csv: {csv_path}"
)
_write_csv
(
samples
,
csv_path
,
seed
=
seed
)
_progress
(
"write manifest"
)
manifest
=
_write_manifest
(
profiles
=
profiles
,
samples
=
samples
,
...
...
@@ -185,15 +224,21 @@ def generate_eval_set(
seed
=
seed
,
plan
=
plan
,
index_path
=
index_path
,
eval_index_path
=
eval_index_path
,
holdout_count
=
len
(
holdout_profiles
),
)
_progress
(
"generation complete"
)
return
manifest
def
profile_library
(
library_dir
:
Path
)
->
list
[
LyricProfile
]:
profiles
:
list
[
LyricProfile
]
=
[]
for
path
in
iter_lyric_files
(
library_dir
):
paths
=
iter_lyric_files
(
library_dir
)
_progress
(
f
"profile library: 0/{len(paths)}"
)
for
index
,
path
in
enumerate
(
paths
,
start
=
1
):
record
=
record_from_file
(
path
,
base_dir
=
library_dir
)
normalized
=
normalize_lyrics
(
record
.
lyrics
)
raw_text
=
record
.
lyrics
normalized
=
normalize_lyrics
(
raw_text
)
lines
=
normalized
.
primary_lines
or
normalized
.
unique_lines
line_count
=
len
(
lines
)
normalized_text
=
fingerprint_text
(
normalized
)
or
normalized
.
normalized_full_text
...
...
@@ -202,6 +247,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
LyricProfile
(
path
=
path
,
record_id
=
record
.
record_id
,
raw_text
=
raw_text
,
title
=
record
.
title
or
""
,
artist
=
record
.
artist
or
""
,
normalized
=
normalized
,
...
...
@@ -214,6 +260,7 @@ def profile_library(library_dir: Path) -> list[LyricProfile]:
has_translation
=
bool
(
normalized
.
translation_lines
),
)
)
_progress_count
(
"profile library"
,
index
,
len
(
paths
),
step
=
5000
)
return
profiles
...
...
@@ -283,6 +330,31 @@ def _stratified_sample(profiles: list[LyricProfile], count: int, rng: random.Ran
return
selected
def
_stratified_unique_sample
(
profiles
:
list
[
LyricProfile
],
count
:
int
,
rng
:
random
.
Random
)
->
list
[
LyricProfile
]:
if
count
<=
0
or
not
profiles
:
return
[]
return
_stratified_sample
(
profiles
,
min
(
count
,
len
(
profiles
)),
rng
)
def _build_eval_index(profiles: list[LyricProfile], index_path: Path) -> None:
    """Build a fresh ``DuplicateChecker`` from ``profiles`` and save it.

    Each profile is added through ``add_normalized_record`` with the
    ``normalized`` value it already carries. Empty ``title`` / ``artist``
    strings are stored as ``None``. The parent directory of ``index_path``
    is created if needed before saving.
    """
    _progress(f"build eval index excluding holdout: {index_path}")
    eval_checker = DuplicateChecker()
    profile_total = len(profiles)
    position = 0
    for profile in profiles:
        position += 1
        lyric_record = LyricRecord(
            record_id=profile.record_id,
            lyrics=profile.raw_text,
            title=profile.title or None,
            artist=profile.artist or None,
        )
        eval_checker.add_normalized_record(lyric_record, profile.normalized)
        _progress_count("build eval index", position, profile_total, step=5000)
    target_dir = index_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    eval_checker.save(index_path)
def
_build_positive_samples
(
profiles
:
list
[
LyricProfile
],
output_dir
:
Path
,
...
...
@@ -293,7 +365,7 @@ def _build_positive_samples(
)
->
list
[
GeneratedSample
]:
samples
:
list
[
GeneratedSample
]
=
[]
for
offset
,
profile
in
enumerate
(
profiles
):
raw
=
read_lyric_file
(
profile
.
path
)
raw
=
profile
.
raw_text
lines
=
_content_lines
(
raw
)
variants
=
[
(
"positive_exact_copy"
,
raw
),
...
...
@@ -308,80 +380,62 @@ def _build_positive_samples(
index
=
start_index
+
offset
path
=
_write_sample_file
(
output_dir
,
f
"pos_{index:05d}_{sample_type}.txt"
,
text
)
samples
.
append
(
_sample_from_profile
(
index
,
path
,
csv_base
,
"应去重"
,
sample_type
,
profile
))
_progress_count
(
"positive_full_duplicate"
,
len
(
samples
),
len
(
profiles
))
return
samples
def
_build_
random_unrelated
_samples
(
count
:
int
,
def
_build_
holdout_full_song
_samples
(
profiles
:
list
[
LyricProfile
]
,
output_dir
:
Path
,
csv_base
:
Path
,
rng
:
random
.
Random
,
*
,
start_index
:
int
,
)
->
list
[
GeneratedSample
]:
_progress
(
"build negative_real_holdout_full_song samples"
)
samples
:
list
[
GeneratedSample
]
=
[]
for
offset
in
range
(
count
):
for
offset
,
profile
in
enumerate
(
profiles
):
index
=
start_index
+
offset
text
=
_same_theme_synthetic
(
index
,
rng
)
path
=
_write_sample_file
(
output_dir
,
f
"neg_{index:05d}_negative_r
andom_unrelated
.txt"
,
text
)
text
=
profile
.
raw_text
path
=
_write_sample_file
(
output_dir
,
f
"neg_{index:05d}_negative_r
eal_holdout_full_song
.txt"
,
text
)
samples
.
append
(
GeneratedSample
(
sample_id
=
f
"sample-{index:05d}"
,
file
=
str
(
path
.
relative_to
(
csv_base
)),
expected
=
"不应去重"
,
sample_type
=
"negative_random_unrelated"
,
source
=
"synthetic"
,
notes
=
"same-theme synthetic full lyric not copied from library"
,
_sample_from_profile
(
index
,
path
,
csv_base
,
"不应去重"
,
"negative_real_holdout_full_song"
,
profile
,
notes
=
"full real lyric held out from the generated eval index"
,
)
)
_progress_count
(
"negative_real_holdout_full_song"
,
len
(
samples
),
len
(
profiles
))
return
samples
def
_build_hard_candidate_samples
(
profiles
:
list
[
LyricProfile
],
def
_build_same_theme_synthetic_samples
(
count
:
int
,
output_dir
:
Path
,
csv_base
:
Path
,
rng
:
random
.
Random
,
*
,
checker
:
DuplicateChecker
|
None
,
start_index
:
int
,
)
->
list
[
GeneratedSample
]:
if
count
<=
0
:
return
[]
sources
=
_stratified_sample
(
profiles
,
count
*
3
,
rng
)
samples
:
list
[
GeneratedSample
]
=
[]
for
profile
in
sources
:
if
len
(
samples
)
>=
count
:
break
lines
=
list
(
profile
.
normalized
.
primary_lines
or
profile
.
normalized
.
unique_lines
)
text
=
_short_shared_snippet
(
lines
,
rng
)
candidate_id
=
""
if
checker
is
not
None
:
result
=
checker
.
check
(
text
,
max_candidates
=
5
)
candidate
=
next
(
(
item
for
item
in
result
.
candidates
if
item
.
record_id
!=
profile
.
record_id
and
item
.
decision
!=
DuplicateDecision
.
NEW
),
result
.
candidates
[
0
]
if
result
.
candidates
else
None
,
)
candidate_id
=
candidate
.
record_id
if
candidate
else
""
index
=
start_index
+
len
(
samples
)
path
=
_write_sample_file
(
output_dir
,
f
"neg_{index:05d}_negative_hard_candidate.txt"
,
text
)
for
offset
in
range
(
count
):
index
=
start_index
+
offset
text
=
_same_theme_synthetic
(
index
,
rng
)
path
=
_write_sample_file
(
output_dir
,
f
"neg_{index:05d}_negative_same_theme_synthetic.txt"
,
text
)
samples
.
append
(
_sample_from_profile
(
index
,
path
,
csv_base
,
"不应去重"
,
"negative_hard_candidate"
,
profile
,
candidate_record_id
=
candidate_id
,
notes
=
"shares a few real lines plus new filler; should not auto duplicate"
,
GeneratedSample
(
sample_id
=
f
"sample-{index:05d}"
,
file
=
str
(
path
.
relative_to
(
csv_base
)),
expected
=
"不应去重"
,
sample_type
=
"negative_same_theme_synthetic"
,
source
=
"synthetic"
,
notes
=
"same-theme synthetic full lyric not copied from library"
,
)
)
_progress_count
(
"negative_same_theme_synthetic"
,
len
(
samples
),
count
)
return
samples
...
...
@@ -410,6 +464,7 @@ def _build_fragment_samples(
notes
=
"partial lyric fragment only"
,
)
)
_progress_count
(
"negative_fragment"
,
len
(
samples
),
len
(
profiles
))
return
samples
...
...
@@ -447,6 +502,7 @@ def _build_shared_chorus_samples(
notes
=
"shared repeated lines with new surrounding content"
,
)
)
_progress_count
(
"negative_shared_chorus"
,
len
(
samples
),
len
(
profiles
))
return
samples
...
...
@@ -478,6 +534,7 @@ def _build_translation_only_samples(
notes
=
"translation-like text without matching original lyric"
,
)
)
_progress_count
(
"negative_translation_only"
,
len
(
samples
),
len
(
profiles
))
return
samples
...
...
@@ -511,6 +568,7 @@ def _build_edge_samples(
notes
=
notes
,
)
)
_progress_count
(
"edge_short_or_placeholder"
,
len
(
samples
),
len
(
profiles
))
return
samples
...
...
@@ -598,13 +656,17 @@ def _write_manifest(
seed
:
int
,
plan
:
dict
[
str
,
int
],
index_path
:
Path
|
None
,
eval_index_path
:
Path
,
holdout_count
:
int
,
)
->
dict
[
str
,
object
]:
manifest
=
{
"seed"
:
seed
,
"library_files"
:
len
(
profiles
),
"sample_size"
:
len
(
samples
),
"plan"
:
plan
,
"index"
:
str
(
index_path
)
if
index_path
else
""
,
"source_index"
:
str
(
index_path
)
if
index_path
else
""
,
"eval_index"
:
str
(
eval_index_path
),
"holdout_records"
:
holdout_count
,
"lyrics_dir"
:
str
(
output_dir
),
"csv"
:
str
(
csv_path
),
"manifest"
:
str
(
csv_path
.
with_suffix
(
csv_path
.
suffix
+
".manifest.json"
)),
...
...
scripts/process_library.py
View file @
ba39ce6
...
...
@@ -4,8 +4,9 @@ This script is intended for the recurring workflow after adding files to
``data/library``:
1. Move pure-music placeholder lyric files out of the active library.
2. Rebuild the duplicate-checking index.
3. Optionally regenerate and evaluate a synthetic regression set.
2. Move duplicate lyric files out of the active library.
3. Rebuild the duplicate-checking index from retained files.
4. Optionally regenerate and evaluate a production-style eval set.
"""
from
__future__
import
annotations
...
...
@@ -15,6 +16,7 @@ import csv
import
json
import
shutil
import
sys
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
pathlib
import
Path
...
...
@@ -23,11 +25,14 @@ if str(PROJECT_ROOT) not in sys.path:
sys
.
path
.
insert
(
0
,
str
(
PROJECT_ROOT
))
from
lyric_dedup.checker
import
DuplicateChecker
from
lyric_dedup.checker
import
DuplicateDecision
from
lyric_dedup.checker
import
LyricRecord
from
lyric_dedup.cli
import
evaluate_csv
from
lyric_dedup.eval_dataset
import
generate_eval_set
from
lyric_dedup.file_import
import
iter_lyric_files
from
lyric_dedup.file_import
import
read_lyric_file
from
lyric_dedup.file_import
import
records_from_dir
from
lyric_dedup.file_import
import
record_from_file
from
lyric_dedup.normalization
import
NormalizedLyrics
from
lyric_dedup.normalization
import
normalize_lyrics
...
...
@@ -37,13 +42,25 @@ PLACEHOLDER_MARKERS = (
)
@dataclass(frozen=True)
class LibraryProfile:
    """Immutable per-file record pairing a lyric file with its parsed data.

    NOTE(review): presumably produced by ``_profile_library`` and consumed by
    the index-building / dedup helpers in this script -- confirm against
    those functions, which are not fully visible here.
    """

    # Path of the lyric file this profile describes.
    path: Path
    # LyricRecord for the file.
    record: LyricRecord
    # NormalizedLyrics for the record (assumed to derive from record.lyrics -- TODO confirm).
    normalized: NormalizedLyrics
    # Line count for the file; exact counting rule is not visible here.
    line_count: int
    # Character count for the file; exact counting rule is not visible here.
    char_count: int
def
main
()
->
None
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Process lyric library additions."
)
parser
.
add_argument
(
"--library-dir"
,
default
=
"data/library"
)
parser
.
add_argument
(
"--index"
,
default
=
"outputs/indexes/library_lyrics.pkl"
)
parser
.
add_argument
(
"--quarantine-dir"
,
default
=
"data/quarantine/no_lyrics_placeholders"
)
parser
.
add_argument
(
"--duplicate-quarantine-dir"
,
default
=
"data/quarantine/duplicates"
)
parser
.
add_argument
(
"--dry-run"
,
action
=
"store_true"
,
help
=
"Only report placeholder files; do not move or write outputs."
)
parser
.
add_argument
(
"--delete-placeholders"
,
action
=
"store_true"
,
help
=
"Delete matched placeholder files instead of moving them."
)
parser
.
add_argument
(
"--delete-duplicates"
,
action
=
"store_true"
,
help
=
"Delete duplicate lyric files instead of moving them."
)
parser
.
add_argument
(
"--skip-library-dedup"
,
action
=
"store_true"
,
help
=
"Skip internal duplicate cleanup before rebuilding the index."
)
parser
.
add_argument
(
"--eval-size"
,
type
=
int
,
default
=
0
,
help
=
"Generate and evaluate this many synthetic samples. 0 disables eval."
)
parser
.
add_argument
(
"--positive-ratio"
,
type
=
float
,
default
=
0.2
)
parser
.
add_argument
(
"--eval-dir"
,
default
=
"data/generated_eval/incoming"
)
...
...
@@ -54,13 +71,18 @@ def main() -> None:
library_dir
=
Path
(
args
.
library_dir
)
quarantine_dir
=
Path
(
args
.
quarantine_dir
)
duplicate_quarantine_dir
=
Path
(
args
.
duplicate_quarantine_dir
)
report_path
=
Path
(
args
.
report
)
files_before
=
iter_lyric_files
(
library_dir
)
placeholders
=
_find_placeholder_files
(
library_dir
)
short_effective
=
_effective_line_report
(
library_dir
)
duplicate_report_path
=
report_path
.
with_suffix
(
".duplicates.csv"
)
moved_or_deleted
:
list
[
str
]
=
[]
duplicate_actions
:
list
[
str
]
=
[]
duplicate_rows
:
list
[
dict
[
str
,
object
]]
=
[]
short_effective
:
dict
[
str
,
int
]
retained_count
=
0
if
not
args
.
dry_run
:
moved_or_deleted
=
_handle_placeholders
(
placeholders
,
...
...
@@ -68,9 +90,25 @@ def main() -> None:
quarantine_dir
=
quarantine_dir
,
delete
=
args
.
delete_placeholders
,
)
_build_index
(
library_dir
,
Path
(
args
.
index
))
if
args
.
skip_library_dedup
:
profiles
=
_profile_library
(
library_dir
)
short_effective
=
_effective_line_report_from_profiles
(
profiles
)
retained_count
=
_build_index_from_profiles
(
profiles
,
Path
(
args
.
index
))
else
:
profiles
=
_profile_library
(
library_dir
)
short_effective
=
_effective_line_report_from_profiles
(
profiles
)
retained_count
,
duplicate_rows
,
duplicate_actions
=
_deduplicate_and_build_index
(
profiles
,
library_dir
=
library_dir
,
index_path
=
Path
(
args
.
index
),
duplicate_quarantine_dir
=
duplicate_quarantine_dir
,
delete
=
args
.
delete_duplicates
,
dry_run
=
False
,
)
_write_duplicate_report
(
duplicate_rows
,
duplicate_report_path
)
if
args
.
eval_size
>
0
:
eval_index_path
=
Path
(
args
.
eval_csv
)
.
with_suffix
(
".index.pkl"
)
generate_eval_set
(
library_dir
=
library_dir
,
output_dir
=
Path
(
args
.
eval_dir
),
...
...
@@ -78,9 +116,10 @@ def main() -> None:
size
=
args
.
eval_size
,
positive_ratio
=
args
.
positive_ratio
,
index_path
=
Path
(
args
.
index
),
eval_index_path
=
eval_index_path
,
)
evaluate_csv
(
Path
(
args
.
index
)
,
eval_index_path
,
Path
(
args
.
eval_csv
),
Path
(
args
.
eval_out
),
base_dir
=
Path
(
args
.
eval_csv
)
.
parent
,
...
...
@@ -88,13 +127,27 @@ def main() -> None:
max_candidates
=
5
,
)
evaluate_csv
(
Path
(
args
.
index
)
,
eval_index_path
,
Path
(
args
.
eval_csv
),
Path
(
args
.
eval_out
)
.
with_name
(
Path
(
args
.
eval_out
)
.
stem
+
"_review_positive.csv"
),
base_dir
=
Path
(
args
.
eval_csv
)
.
parent
,
positive_decisions
=
{
"duplicate"
,
"review"
},
max_candidates
=
5
,
)
else
:
profiles
=
_profile_library
(
library_dir
)
short_effective
=
_effective_line_report_from_profiles
(
profiles
)
if
not
args
.
skip_library_dedup
:
retained_count
,
duplicate_rows
,
duplicate_actions
=
_deduplicate_and_build_index
(
profiles
,
library_dir
=
library_dir
,
index_path
=
Path
(
args
.
index
),
duplicate_quarantine_dir
=
duplicate_quarantine_dir
,
delete
=
args
.
delete_duplicates
,
dry_run
=
True
,
)
else
:
retained_count
=
len
(
profiles
)
report
=
{
"timestamp"
:
datetime
.
now
()
.
isoformat
(
timespec
=
"seconds"
),
...
...
@@ -104,11 +157,18 @@ def main() -> None:
"placeholder_matches"
:
len
(
placeholders
),
"placeholder_files"
:
[
str
(
path
)
for
path
in
placeholders
],
"handled_placeholder_files"
:
moved_or_deleted
,
"library_dedup_skipped"
:
args
.
skip_library_dedup
,
"duplicate_matches"
:
len
(
duplicate_rows
),
"duplicate_report"
:
str
(
duplicate_report_path
)
if
duplicate_rows
else
""
,
"handled_duplicate_files"
:
duplicate_actions
[:
1000
],
"handled_duplicate_files_truncated"
:
len
(
duplicate_actions
)
>
1000
,
"retained_index_records"
:
retained_count
,
"files_after"
:
len
(
iter_lyric_files
(
library_dir
)),
"index"
:
str
(
args
.
index
),
"eval_size"
:
args
.
eval_size
,
"eval_csv"
:
str
(
args
.
eval_csv
)
if
args
.
eval_size
>
0
else
""
,
"eval_out"
:
str
(
args
.
eval_out
)
if
args
.
eval_size
>
0
else
""
,
"eval_index"
:
str
(
Path
(
args
.
eval_csv
)
.
with_suffix
(
".index.pkl"
))
if
args
.
eval_size
>
0
else
""
,
"short_effective_line_counts"
:
short_effective
,
}
...
...
@@ -154,15 +214,133 @@ def _handle_placeholders(
return
handled
def
_build_index
(
library_dir
:
Path
,
index_path
:
Path
)
->
None
:
def _profile_library(library_dir: Path) -> list[LibraryProfile]:
    """Read and normalize every lyric file under ``library_dir``.

    Each file is loaded once with ``record_from_file`` and normalized once
    with ``normalize_lyrics``; the results are bundled into a
    ``LibraryProfile`` so later stages can reuse them.

    Args:
        library_dir: Root directory passed to ``iter_lyric_files`` and used
            as ``base_dir`` when building each record.

    Returns:
        One ``LibraryProfile`` per file, in ``iter_lyric_files`` order.
        ``line_count`` is the number of primary lines, falling back to the
        unique lines when there are no primary lines; ``char_count`` is the
        length of the normalized full text.
    """
    lyric_files = iter_lyric_files(library_dir)
    total = len(lyric_files)
    _progress(f"profile active library: 0/{total}")

    collected: list[LibraryProfile] = []
    for position, lyric_path in enumerate(lyric_files, start=1):
        song_record = record_from_file(lyric_path, base_dir=library_dir)
        normalized_lyrics = normalize_lyrics(song_record.lyrics)
        effective_lines = (
            normalized_lyrics.primary_lines or normalized_lyrics.unique_lines
        )
        collected.append(
            LibraryProfile(
                path=lyric_path,
                record=song_record,
                normalized=normalized_lyrics,
                line_count=len(effective_lines),
                char_count=len(normalized_lyrics.normalized_full_text),
            )
        )
        _progress_count("profile active library", position, total, step=5000)
    return collected
def _build_index_from_profiles(profiles: list[LibraryProfile], index_path: Path) -> int:
    """Build a duplicate-check index from profiled records and save it.

    No deduplication is performed here: every profile is added to the index.

    Args:
        profiles: Profiled library records; each supplies the record and its
            already-computed normalization.
        index_path: File the index is saved to. Missing parent directories
            are created.

    Returns:
        The ``record_count`` reported by the checker after all profiles were
        added.
    """
    checker = DuplicateChecker()
    total = len(profiles)
    # Only the profiles are indexed. The former directory scan
    # (records_from_dir(library_dir)) is gone: this function has no
    # library_dir, and re-adding the same records would duplicate them.
    for index, profile in enumerate(profiles, start=1):
        # Reuse the normalization computed during profiling.
        checker.add_normalized_record(profile.record, profile.normalized)
        _progress_count("build index", index, total, step=5000)
    index_path.parent.mkdir(parents=True, exist_ok=True)
    checker.save(index_path)
    return checker.record_count
def _deduplicate_and_build_index(
    profiles: list[LibraryProfile],
    *,
    library_dir: Path,
    index_path: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
    dry_run: bool,
) -> tuple[int, list[dict[str, object]], list[str]]:
    """Find duplicates inside the library and index only the retained records.

    Profiles are visited in ``_profile_quality_key`` order. Each one is
    checked against the records retained so far: a ``DUPLICATE`` decision
    with at least one candidate produces a report row (and, unless
    ``dry_run``, a move/delete of the file); anything else is added to the
    index.

    Args:
        profiles: Profiled library records to examine.
        library_dir: Library root, forwarded to ``_handle_duplicate_file``.
        index_path: Where the index is saved when not in dry-run mode.
        duplicate_quarantine_dir: Destination root for moved duplicates.
        delete: Forwarded to ``_handle_duplicate_file``.
        dry_run: When true, no file is touched and the index is not saved.

    Returns:
        ``(retained_record_count, duplicate_rows, duplicate_actions)``.
    """
    checker = DuplicateChecker()
    rows: list[dict[str, object]] = []
    actions: list[str] = []

    ranked = sorted(profiles, key=_profile_quality_key)
    total = len(ranked)
    _progress(f"deduplicate active library: 0/{total}")

    for position, candidate in enumerate(ranked, start=1):
        outcome = checker.check_record(candidate.record, max_candidates=1)
        top_match = outcome.candidates[0] if outcome.candidates else None
        is_duplicate = (
            outcome.decision == DuplicateDecision.DUPLICATE
            and top_match is not None
        )

        if not is_duplicate:
            # Retained: later profiles are compared against this record.
            checker.add_normalized_record(candidate.record, candidate.normalized)
        else:
            # Key order defines the CSV column order of the duplicate report.
            row: dict[str, object] = {
                "duplicate_path": str(candidate.path),
                "duplicate_record_id": candidate.record.record_id,
                "kept_record_id": top_match.record_id,
                "decision": outcome.decision.value,
                "confidence": outcome.confidence,
                "reason": outcome.reason,
                "best_candidate_jaccard": top_match.jaccard,
                "best_candidate_line_coverage": top_match.line_coverage,
                "best_candidate_primary_jaccard": top_match.primary_jaccard,
                "best_candidate_primary_line_coverage": top_match.primary_line_coverage,
                "matched_unique_lines": " | ".join(top_match.matched_unique_lines),
                "line_count": candidate.line_count,
                "char_count": candidate.char_count,
            }
            rows.append(row)
            if not dry_run:
                action = _handle_duplicate_file(
                    candidate.path,
                    library_dir=library_dir,
                    duplicate_quarantine_dir=duplicate_quarantine_dir,
                    delete=delete,
                )
                actions.append(action)

        _progress_count("deduplicate active library", position, total, step=5000)

    if not dry_run:
        index_path.parent.mkdir(parents=True, exist_ok=True)
        checker.save(index_path)

    return checker.record_count, rows, actions
def _handle_duplicate_file(
    path: Path,
    *,
    library_dir: Path,
    duplicate_quarantine_dir: Path,
    delete: bool,
) -> str:
    """Delete a duplicate lyric file or move it into the quarantine tree.

    When moving, the file keeps its path relative to ``library_dir`` below
    ``duplicate_quarantine_dir``. If that destination already exists, a
    ``_YYYYmmddHHMMSS`` suffix is added to the stem; if that name is taken
    as well, ``_1``, ``_2``, ... is appended until a free name is found, so
    an already quarantined file is never overwritten.

    Args:
        path: The duplicate file to handle.
        library_dir: Library root used to compute the relative path.
        duplicate_quarantine_dir: Root directory that receives moved files.
        delete: Delete the file instead of moving it.

    Returns:
        ``"deleted:<path>"`` or ``"moved:<path>-><destination>"``.

    Raises:
        ValueError: If ``path`` does not resolve to a location inside
            ``library_dir`` (raised by ``Path.relative_to``).
    """
    if delete:
        path.unlink()
        return f"deleted:{path}"

    duplicate_quarantine_dir.mkdir(parents=True, exist_ok=True)
    relative = path.resolve().relative_to(library_dir.resolve())
    destination = duplicate_quarantine_dir / relative
    destination.parent.mkdir(parents=True, exist_ok=True)

    if destination.exists():
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        stem, suffix = destination.stem, destination.suffix
        candidate = destination.with_name(f"{stem}_{stamp}{suffix}")
        attempt = 0
        # shutil.move can silently replace an existing file, so keep probing
        # until the name is free (matters for repeated collisions within the
        # same second).
        while candidate.exists():
            attempt += 1
            candidate = destination.with_name(f"{stem}_{stamp}_{attempt}{suffix}")
        destination = candidate

    shutil.move(str(path), str(destination))
    return f"moved:{path}->{destination}"
def _profile_quality_key(profile: LibraryProfile) -> tuple[int, int, int, str]:
    """Return the sort key that orders profiles from most to least preferred.

    The key is meant for an ascending sort: files whose name starts with
    ``"None_"`` rank after all others, then more lines and more characters
    rank earlier (their counts are negated), and the path string breaks any
    remaining tie.
    """
    name_penalty = 1 if profile.path.name.startswith("None_") else 0
    return (
        name_penalty,
        -profile.line_count,
        -profile.char_count,
        str(profile.path),
    )
def _write_duplicate_report(rows: list[dict[str, object]], report_path: Path) -> None:
    """Write the duplicate rows to ``report_path`` as a UTF-8 CSV file.

    Nothing is written (and no directory is created) when ``rows`` is empty.
    The header is taken from the keys of the first row, in their order.

    Args:
        rows: One dict per detected duplicate.
        report_path: Output CSV path; missing parent directories are created.
    """
    if not rows:
        return

    report_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with report_path.open("w", encoding="utf-8", newline="") as handle:
        columns = list(rows[0])
        report = csv.DictWriter(handle, fieldnames=columns)
        report.writeheader()
        report.writerows(rows)
def _effective_line_report(library_dir: Path) -> dict[str, int]:
    """Profile the library under ``library_dir`` and bucket files by line count.

    Convenience wrapper: profiles the directory with ``_profile_library``
    and passes the result to ``_effective_line_report_from_profiles``.
    """
    library_profiles = _profile_library(library_dir)
    return _effective_line_report_from_profiles(library_profiles)
def
_effective_line_report_from_profiles
(
profiles
:
list
[
LibraryProfile
])
->
dict
[
str
,
int
]:
buckets
=
{
"total"
:
0
,
"zero_effective_lines"
:
0
,
...
...
@@ -170,10 +348,9 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
"four_to_five_effective_lines"
:
0
,
"six_plus_effective_lines"
:
0
,
}
for
p
ath
in
iter_lyric_files
(
library_dir
)
:
for
p
rofile
in
profiles
:
buckets
[
"total"
]
+=
1
normalized
=
normalize_lyrics
(
read_lyric_file
(
path
))
line_count
=
len
(
normalized
.
primary_lines
or
normalized
.
unique_lines
)
line_count
=
profile
.
line_count
if
line_count
==
0
:
buckets
[
"zero_effective_lines"
]
+=
1
elif
line_count
<=
3
:
...
...
@@ -185,5 +362,16 @@ def _effective_line_report(library_dir: Path) -> dict[str, int]:
return
buckets
def _progress(message: str) -> None:
    """Print a ``[process-library]``-prefixed status line to stderr, flushed."""
    status_line = f"[process-library] {message}"
    print(status_line, file=sys.stderr, flush=True)
def _progress_count(label: str, current: int, total: int, *, step: int = 1000) -> None:
    """Report ``current/total`` progress at milestones only.

    A line is emitted for the first item, the last item, and every
    ``step``-th item. Nothing is emitted when ``total`` is zero or negative.

    Args:
        label: Text placed before the ``current/total`` counter.
        current: 1-based position of the item just processed.
        total: Total number of items.
        step: Interval between intermediate reports.
    """
    if total <= 0:
        return
    at_milestone = current in (1, total) or current % step == 0
    if at_milestone:
        _progress(f"{label}: {current}/{total}")
# Script entry point: call main() only when this module is run directly.
if
__name__
==
"__main__"
:
main
()
...
...
tests/test_lyric_dedup.py
View file @
ba39ce6
...
...
@@ -308,9 +308,11 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None:
assert
manifest
[
"library_files"
]
==
12
assert
manifest
[
"sample_size"
]
==
30
assert
manifest
[
"unique_source_records"
]
>
1
assert
manifest
[
"holdout_records"
]
>
1
assert
(
tmp_path
/
"generated"
/
"eval.csv.index.pkl"
)
.
exists
()
assert
"positive_full_duplicate"
in
manifest
[
"plan"
]
assert
"negative_real_holdout_full_song"
in
negative_types
assert
"negative_fragment"
in
negative_types
assert
"negative_hard_candidate"
in
negative_types
assert
all
(
row
[
"expected"
]
==
"不应去重"
for
row
in
rows
if
row
[
"sample_type"
]
.
startswith
(
"negative_"
))
...
...
Please
register
or
sign in
to post a comment