Commit 49008962 4900896283e6b52190437fd467f52ab75caf2530 by 沈秋雨

新增 PostgreSQL 去重检索链路与 hard 评估集支持

- 新增 PostgreSQL 导入脚本、评估脚本和 schema 定义,支持基于 exact_hash、pg_trgm 和行级 hash 的三层召回策略
- 评估 CLI 新增 hard profile,覆盖错别字、OCR 错误、整段翻译、medley 片段等更贴近业务边界的场景
- 调整 checker.py 复核阈值与匹配理由文案,优化翻译行相似与仅副歌重复场景的判定逻辑
- 同步更新 README、TEST_WORKFLOW 和单元测试

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent ba39ce6a
......@@ -85,15 +85,33 @@ python -m lyric_dedup.cli generate-eval-set \
--positive-ratio 0.3
```
生成器的业务口径:
默认 `--profile standard` 生成常规生产评估集。也可以生成更贴近业务边界的 hard 集:
```bash
python -m lyric_dedup.cli generate-eval-set \
--profile hard \
--library-dir data/library \
--lyrics-dir data/generated_eval/hard_incoming \
--csv data/generated_eval/eval_hard_5000.csv \
--eval-index data/generated_eval/eval_hard_5000.csv.index.pkl \
--size 5000 \
--positive-ratio 0.3
```
standard 业务口径:
- 先扫描整个曲库,按有效歌词行数、语言类型、文件来源前缀做分层采样,不再按排序前缀取样。
- `应去重` 样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译。
- `应去重` 样本只生成全曲歌词的样式变化,例如时间戳、标点、平台噪声、空行、重复副歌次数变化、附加中文翻译、少量错别字/英文拼写错误。
- `不应去重` 样本以真实 holdout 完整歌词为主,也包含片段歌词、重复副歌碰撞、仅翻译相似、同主题新歌词、短歌词/占位边界样本。
- 片段歌词即使命中已有歌曲的一部分,也不应该输出 `duplicate`;最多进入 `review`
- 生成器会额外写出 `--eval-index`,这个索引排除了 holdout 歌,评估生成 CSV 时应使用它。
- 同时会生成 `*.manifest.json`,记录 seed、曲库规模、holdout 数、样本类型分布、语言/来源分桶和样本来源覆盖数。
hard 业务口径不故意制造反常输入,主要覆盖上线后更容易踩到边界的情况:
- `应去重`: 同曲平台版本噪声、较完整歌词缺少一段、整段中文翻译附加、较真实的录入/OCR 错别字、时间戳和平台元信息混合。
- `不应去重`: 真实 holdout 新歌、从 holdout 中优先挑选和曲库有行重合的近邻新歌、较长但不完整的单曲片段、多曲 medley/串烧式片段、重复副歌碰撞、仅翻译相似、短歌词边界。
先准备一个 CSV,例如 `data/eval/eval.csv`
```csv
......
......@@ -108,6 +108,20 @@ python -m lyric_dedup.cli generate-eval-set \
--positive-ratio 0.3
```
如需生成更贴近业务边界的 hard 口径测试集:
```bash
python -m lyric_dedup.cli generate-eval-set \
--profile hard \
--library-dir data/library \
--lyrics-dir data/generated_eval/hard_incoming \
--csv data/generated_eval/eval_hard_5000.csv \
--index outputs/indexes/library_lyrics.pkl \
--eval-index data/generated_eval/eval_hard_5000.csv.index.pkl \
--size 5000 \
--positive-ratio 0.3
```
默认生产评估口径:
```text
......@@ -120,7 +134,7 @@ python -m lyric_dedup.cli generate-eval-set \
业务口径:
```text
positive_* = 应去重,全曲歌词样式变化
positive_* = 应去重,全曲歌词样式变化,包括少量错别字/英文拼写错误扰动
negative_real_holdout_full_song = 不应去重,完整真实歌词,已从评估索引中排除
negative_fragment = 不应去重,单曲片段
negative_shared_chorus = 不应去重,重复副歌碰撞
......@@ -129,6 +143,15 @@ negative_same_theme_synthetic = 不应去重,同主题新歌词
edge_short_or_placeholder = 不应去重,短歌词/占位边界样本
```
hard 口径额外强调真实业务边界,而不是故意制造反常难题:
```text
positive_realistic_variant = 应去重,同曲平台版本噪声、较完整缺段、整段翻译附加、真实录入/OCR 错别字
negative_near_neighbor_holdout_full_song = 不应去重,和曲库有较多行重合的真实 holdout 新歌
negative_long_fragment = 不应去重,较长但不完整的单曲片段
negative_catalog_mashup = 不应去重,多首真实歌词片段组成的串烧/混剪式输入
```
生成器会扫描整个曲库并按有效歌词行数、语言类型、文件来源前缀分层采样。它会分出一批 holdout 完整歌词作为真实新歌负样本,并生成一个排除 holdout 的评估索引。每次还会输出:
```text
......
......@@ -5,7 +5,7 @@ from __future__ import annotations
import hashlib
import pickle
from dataclasses import dataclass
from enum import StrEnum
from enum import Enum
from pathlib import Path
from lyric_dedup.minhash_lsh import MinHashConfig
......@@ -16,7 +16,7 @@ from lyric_dedup.normalization import lyric_tokens
from lyric_dedup.normalization import normalize_lyrics
class DuplicateDecision(StrEnum):
class DuplicateDecision(str, Enum):
DUPLICATE = "duplicate"
REVIEW = "review"
NEW = "new"
......
......@@ -53,6 +53,12 @@ def main() -> None:
generate.add_argument("--seed", type=int, default=20260602)
generate.add_argument("--index", default="", help="optional source index path recorded in the manifest")
generate.add_argument("--eval-index", default="", help="output index built from non-holdout records for this eval set")
generate.add_argument(
"--profile",
choices=("standard", "hard"),
default="standard",
help="evaluation sample profile: standard production mix or harder business-realistic edge mix",
)
args = parser.parse_args()
if args.command == "build-index":
......@@ -80,6 +86,7 @@ def main() -> None:
seed=args.seed,
index_path=Path(args.index) if args.index else None,
eval_index_path=Path(args.eval_index) if args.eval_index else None,
profile=args.profile,
)
print(json.dumps(summary, ensure_ascii=False))
......
# Test runner
pytest>=8.0
# PostgreSQL storage prototype
psycopg[binary]>=3.2
# Existing MySQL/COS lyric download utilities
pymysql>=1.1
cos-python-sdk-v5>=1.9
tqdm>=4.66
"""Initialize PostgreSQL schema for lyric dedup storage."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SCHEMA_PATH = PROJECT_ROOT / "scripts" / "postgres_schema.sql"
def main() -> None:
    """Apply the lyric-dedup schema file to a PostgreSQL database.

    Command-line options:
        --dsn     (required) connection string passed to ``psycopg.connect``.
        --schema  path of the SQL file to run; defaults to ``SCHEMA_PATH``.

    The whole file is sent to the server in a single ``execute`` call and
    then committed. Exits with status 1 (via ``_import_psycopg``) when the
    psycopg driver is not installed.
    """
    cli = argparse.ArgumentParser(description="Initialize PostgreSQL schema for lyric dedup.")
    cli.add_argument(
        "--dsn",
        required=True,
        help="PostgreSQL DSN, e.g. postgresql://user:pass@localhost:5432/lyric_dedup",
    )
    cli.add_argument("--schema", default=str(SCHEMA_PATH))
    options = cli.parse_args()

    # Resolve the driver before touching the filesystem so a missing
    # dependency is reported first.
    driver = _import_psycopg()
    ddl = Path(options.schema).read_text(encoding="utf-8")

    with driver.connect(options.dsn) as connection:
        with connection.cursor() as cur:
            cur.execute(ddl)
        connection.commit()

    print(f"initialized schema from {options.schema}")
def _import_psycopg():
try:
import psycopg
return psycopg
except ModuleNotFoundError:
print(
"Missing dependency: psycopg. Install it with:\n"
" python -m pip install 'psycopg[binary]'",
file=sys.stderr,
)
raise SystemExit(1)
if __name__ == "__main__":
main()
-- Storage schema for lyric dedup: one row per lyric document, plus the
-- indexes used for exact-hash lookup and trigram similarity search.
-- Every statement uses "if not exists", so the script can be re-run safely.

-- pg_trgm provides the gin_trgm_ops operator class used by the trigram index below.
create extension if not exists pg_trgm;
-- One row per lyric record.
create table if not exists lyrics (
-- Surrogate key; referenced by lyric_lines.lyric_id.
id bigserial primary key,
-- Unique identifier of the record (uniqueness enforced by the constraint).
record_id text not null unique,
source_path text not null,
-- title / artist are optional metadata (nullable).
title text,
artist text,
-- Three text forms of the lyric are stored side by side.
-- NOTE(review): how normalized_text and primary_text are derived from
-- raw_text is decided by the importer, not visible here — confirm there.
raw_text text not null,
normalized_text text not null,
primary_text text not null,
-- Nullable: a record may have no translation part.
translation_text text,
-- NOTE(review): hash algorithm and the text it is computed over are not
-- visible in this schema — confirm against the importer.
exact_hash text not null,
-- NOTE(review): presumably describe how primary/translation text were split — confirm.
split_confidence text,
split_reason text,
line_count integer not null,
created_at timestamptz not null default now(),
-- Defaults to now() on insert only; no trigger here keeps it current on update.
updated_at timestamptz not null default now(),
-- Nullable; presumably a soft-delete marker (the exact_hash index below
-- only covers rows where it is null) — confirm.
deleted_at timestamptz
);
-- Partial index: only rows with deleted_at null are indexed, so a query must
-- include the same "deleted_at is null" predicate for the planner to use it.
create index if not exists lyrics_exact_hash_idx
on lyrics (exact_hash)
where deleted_at is null;
-- GIN trigram index over primary_text for pg_trgm similarity operators.
-- NOTE(review): unlike lyrics_exact_hash_idx this index is not restricted to
-- "deleted_at is null" — confirm that indexing deleted rows is intended.
create index if not exists lyrics_primary_text_trgm_idx
on lyrics using gin (primary_text gin_trgm_ops);
-- Per-line rows belonging to a lyrics record. Rows are removed automatically
-- when the parent lyrics row is deleted (on delete cascade).
create table if not exists lyric_lines (
lyric_id bigint not null references lyrics(id) on delete cascade,
-- NOTE(review): presumably distinguishes primary vs. translation lines —
-- the allowed values are not constrained here; confirm against the importer.
role text not null,
line_no integer not null,
normalized_line text not null,
-- NOTE(review): hash algorithm is not visible in this schema — confirm.
line_hash text not null,
-- A line is identified by its lyric, its role and its position.
primary key (lyric_id, role, line_no)
);
-- Lookup of lines by hash (line-level candidate recall).
create index if not exists lyric_lines_hash_idx
on lyric_lines (line_hash);
-- NOTE(review): lyric_id is already the leading column of the primary key
-- index, so this separate index may be redundant — confirm before keeping.
create index if not exists lyric_lines_lyric_id_idx
on lyric_lines (lyric_id);
......@@ -316,6 +316,40 @@ def test_generated_eval_set_uses_stratified_production_mix(tmp_path) -> None:
assert all(row["expected"] == "不应去重" for row in rows if row["sample_type"].startswith("negative_"))
def test_generated_hard_eval_set_uses_business_realistic_edge_mix(tmp_path) -> None:
    """The ``hard`` profile emits its edge-case sample types and records itself in the manifest.

    Builds a 24-song synthetic library with two source prefixes and some
    mixed-language lyrics, generates a 40-row hard eval set, then checks the
    row count, the manifest profile/plan, and the presence of the
    hard-specific negative sample types.
    """
    library = tmp_path / "library"
    incoming = tmp_path / "generated" / "incoming"
    eval_csv = tmp_path / "generated" / "eval_hard.csv"
    library.mkdir()

    for idx in range(24):
        # Two source prefixes so the generator sees more than one source bucket.
        prefix = "AY" if idx % 3 == 0 else "WHHY"
        # Make every song textually distinct while keeping the shared base structure.
        lyric = BASE_LYRIC.replace("我爱你", f"我想你{idx}").replace("城市", f"城市{idx}")
        if idx % 4 == 0:
            # Every fourth song gets English lines so the library is not Chinese-only.
            lyric += "\nI miss you tonight\nUnder the moonlight\nNever let me go\n"
        (library / f"{idx}_{prefix}{idx:06d}.txt").write_text(lyric, encoding="utf-8")

    generate_eval_set(
        library_dir=library,
        output_dir=incoming,
        csv_path=eval_csv,
        size=40,
        positive_ratio=0.3,
        profile="hard",
    )

    # Read the CSV inside a context manager so the handle is closed
    # deterministically (avoids a ResourceWarning); newline="" is the mode
    # the csv module documents for files handed to a reader.
    with eval_csv.open(encoding="utf-8", newline="") as handle:
        rows = list(csv.DictReader(handle))
    manifest_path = tmp_path / "generated" / "eval_hard.csv.manifest.json"
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    sample_types = {row["sample_type"] for row in rows}

    assert len(rows) == 40
    assert manifest["profile"] == "hard"
    assert "positive_realistic_variant" in manifest["plan"]
    assert "negative_near_neighbor_holdout_full_song" in manifest["plan"]
    assert "negative_long_fragment" in sample_types
    assert "negative_catalog_mashup" in sample_types
    assert any(row["sample_type"].startswith("positive_") for row in rows)
def test_foreign_original_with_added_chinese_translation_is_duplicate() -> None:
checker = DuplicateChecker()
checker.add_record(
......