接口化服务
Showing
11 changed files
with
981 additions
and
8 deletions
| ... | @@ -284,6 +284,7 @@ class DuplicateChecker: | ... | @@ -284,6 +284,7 @@ class DuplicateChecker: |
| 284 | query.normalized.primary_lines, | 284 | query.normalized.primary_lines, |
| 285 | candidate.normalized.primary_lines, | 285 | candidate.normalized.primary_lines, |
| 286 | ) | 286 | ) |
| 287 | query_primary_coverage = _matched_query_line_ratio(query.normalized.primary_lines, primary_matched_lines) | ||
| 287 | translation_jaccard = _jaccard(query.translation_tokens, candidate.translation_tokens) | 288 | translation_jaccard = _jaccard(query.translation_tokens, candidate.translation_tokens) |
| 288 | translation_coverage, translation_matched_lines = _line_coverage_lines( | 289 | translation_coverage, translation_matched_lines = _line_coverage_lines( |
| 289 | query.normalized.translation_lines, | 290 | query.normalized.translation_lines, |
| ... | @@ -299,6 +300,27 @@ class DuplicateChecker: | ... | @@ -299,6 +300,27 @@ class DuplicateChecker: |
| 299 | low_confidence_split = ( | 300 | low_confidence_split = ( |
| 300 | query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" | 301 | query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" |
| 301 | ) | 302 | ) |
| 303 | query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines) | ||
| 304 | has_review_level_overlap = ( | ||
| 305 | primary_jaccard >= self.review_jaccard_threshold | ||
| 306 | or jaccard >= self.review_jaccard_threshold | ||
| 307 | or ( | ||
| 308 | primary_coverage >= self.review_line_coverage_threshold | ||
| 309 | and query_primary_coverage >= 0.40 | ||
| 310 | ) | ||
| 311 | or ( | ||
| 312 | coverage >= self.review_line_coverage_threshold | ||
| 313 | and query_coverage >= 0.40 | ||
| 314 | ) | ||
| 315 | ) | ||
| 316 | has_material_chorus_overlap = chorus_only and ( | ||
| 317 | query.normalized.content_line_count <= 6 | ||
| 318 | or (primary_jaccard >= 0.20 and query_primary_coverage >= 0.40) | ||
| 319 | or (jaccard >= 0.20 and query_coverage >= 0.40) | ||
| 320 | or (primary_coverage >= 0.20 and query_primary_coverage >= 0.40) | ||
| 321 | or (coverage >= 0.20 and query_coverage >= 0.40) | ||
| 322 | ) | ||
| 323 | has_low_confidence_split_overlap = low_confidence_split and has_review_level_overlap | ||
| 302 | 324 | ||
| 303 | confidence = round((0.58 * primary_jaccard) + (0.42 * primary_coverage), 4) | 325 | confidence = round((0.58 * primary_jaccard) + (0.42 * primary_coverage), 4) |
| 304 | if ( | 326 | if ( |
| ... | @@ -314,21 +336,18 @@ class DuplicateChecker: | ... | @@ -314,21 +336,18 @@ class DuplicateChecker: |
| 314 | else: | 336 | else: |
| 315 | reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" | 337 | reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" |
| 316 | elif ( | 338 | elif ( |
| 317 | chorus_only | 339 | has_material_chorus_overlap |
| 318 | or translation_only | 340 | or translation_only |
| 319 | or low_confidence_split | 341 | or has_low_confidence_split_overlap |
| 320 | or primary_jaccard >= self.review_jaccard_threshold | 342 | or has_review_level_overlap |
| 321 | or primary_coverage >= self.review_line_coverage_threshold | ||
| 322 | or jaccard >= self.review_jaccard_threshold | ||
| 323 | or coverage >= self.review_line_coverage_threshold | ||
| 324 | ): | 343 | ): |
| 325 | decision = DuplicateDecision.REVIEW | 344 | decision = DuplicateDecision.REVIEW |
| 326 | reason = "候选相似度达到复核阈值,需要人工确认" | 345 | reason = "候选相似度达到复核阈值,需要人工确认" |
| 327 | if chorus_only: | 346 | if has_material_chorus_overlap: |
| 328 | reason = "重合内容主要集中在重复副歌行,不自动判重" | 347 | reason = "重合内容主要集中在重复副歌行,不自动判重" |
| 329 | elif translation_only: | 348 | elif translation_only: |
| 330 | reason = "仅翻译行相似,原文字面重合不足,不自动判重" | 349 | reason = "仅翻译行相似,原文字面重合不足,不自动判重" |
| 331 | elif low_confidence_split: | 350 | elif has_low_confidence_split_overlap: |
| 332 | reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" | 351 | reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" |
| 333 | else: | 352 | else: |
| 334 | decision = DuplicateDecision.NEW | 353 | decision = DuplicateDecision.NEW |
| ... | @@ -446,6 +465,13 @@ def _line_coverage_lines(left: tuple[str, ...], right: tuple[str, ...]) -> tuple | ... | @@ -446,6 +465,13 @@ def _line_coverage_lines(left: tuple[str, ...], right: tuple[str, ...]) -> tuple |
| 446 | return len(matched) / max(len(left_lines), len(right_lines)), matched | 465 | return len(matched) / max(len(left_lines), len(right_lines)), matched |
| 447 | 466 | ||
| 448 | 467 | ||
| 468 | def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[str]) -> float: | ||
| 469 | query_unique_lines = set(query_lines) | ||
| 470 | if not query_unique_lines: | ||
| 471 | return 0.0 | ||
| 472 | return len(set(matched_lines)) / len(query_unique_lines) | ||
| 473 | |||
| 474 | |||
| 449 | def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: | 475 | def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: |
| 450 | if not matched_lines: | 476 | if not matched_lines: |
| 451 | return False | 477 | return False | ... | ... |
lyric_dedup_server/__init__.py
0 → 100644
lyric_dedup_server/app.py
0 → 100644
| 1 | """FastAPI application for lyric duplicate checking.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
| 5 | import logging | ||
| 6 | from dataclasses import dataclass | ||
| 7 | from pathlib import Path | ||
| 8 | from typing import Any | ||
| 9 | |||
| 10 | from fastapi import FastAPI | ||
| 11 | from fastapi.responses import JSONResponse | ||
| 12 | from pydantic import BaseModel, Field | ||
| 13 | |||
| 14 | from .config import ServerConfig | ||
| 15 | from .service import DedupService | ||
| 16 | |||
| 17 | logger = logging.getLogger(__name__) | ||
| 18 | |||
| 19 | # --------------------------------------------------------------------------- | ||
| 20 | # App lifecycle | ||
| 21 | # --------------------------------------------------------------------------- | ||
| 22 | |||
| 23 | app = FastAPI(title="Lyric Dedup API", version="0.1.0") | ||
| 24 | |||
| 25 | _config: ServerConfig | None = None | ||
| 26 | _service: DedupService | None = None | ||
| 27 | |||
| 28 | |||
@app.on_event("startup")
def _startup() -> None:
    """Create the module-level config and service when the app starts.

    The DSN is written to the log with any password masked, so database
    credentials do not end up in application logs.
    """
    global _config, _service
    import re

    _config = ServerConfig()
    _service = DedupService(config=_config)
    # Mask URI-style secrets (user:secret@host) ...
    safe_dsn = re.sub(r"(://[^:/@\s]*):[^@\s]*@", r"\1:***@", _config.dsn)
    # ... and keyword-style secrets (password=secret).
    safe_dsn = re.sub(r"(?i)(password\s*=\s*)('[^']*'|\S+)", r"\1***", safe_dsn)
    logger.info("Lyric Dedup API started (DSN=%s, trgm=%s)", safe_dsn, _config.enable_trgm)
| 35 | |||
| 36 | |||
| 37 | # --------------------------------------------------------------------------- | ||
| 38 | # Request / response models | ||
| 39 | # --------------------------------------------------------------------------- | ||
| 40 | |||
| 41 | |||
# Request body accepted by POST /api/v1/check.
class CheckRequest(BaseModel):
    # Location of the lyric file; check_lyric rejects it with HTTP 400 when
    # _is_valid_lyric_url returns False.
    url: str = Field(..., description="URL of the LRC/TXT lyric file")
    # Optional metadata forwarded unchanged to the dedup service.
    title: str | None = Field(None, description="Song title (optional)")
    artist: str | None = Field(None, description="Artist name (optional)")
| 46 | |||
| 47 | |||
# Response body of POST /api/v1/check; check_lyric copies every field from the
# result object returned by the dedup service.
class CheckResponse(BaseModel):
    duplicate: bool
    # The three fields below default to None when not supplied.
    decision: str | None = None
    confidence: float | None = None
    reason: str | None = None
| 53 | |||
| 54 | |||
# Response body of GET /health.
class HealthResponse(BaseModel):
    status: str
| 57 | |||
| 58 | |||
| 59 | # --------------------------------------------------------------------------- | ||
| 60 | # Endpoints | ||
| 61 | # --------------------------------------------------------------------------- | ||
| 62 | |||
@app.get("/health", response_model=HealthResponse)
def health() -> dict[str, str]:
    # Liveness probe: the payload is constant and touches no dependencies.
    payload = {"status": "ok"}
    return payload
| 66 | |||
| 67 | |||
@app.post("/api/v1/check", response_model=CheckResponse)
def check_lyric(req: CheckRequest) -> Any:
    # Flow: service readiness -> URL validation -> download -> dedup check.
    # Every failure path answers with a JSON body of the form {"detail": ...}.
    def _fail(status_code: int, detail: str) -> JSONResponse:
        return JSONResponse(status_code=status_code, content={"detail": detail})

    if _service is None:
        return _fail(503, "service not initialized")

    # Only .txt / .lrc lyric files are accepted.
    if not _is_valid_lyric_url(req.url):
        return _fail(400, "仅支持 .txt 或 .lrc 格式的歌词文件")

    try:
        lyrics = _download_lyrics(req.url)
    except ValueError as download_error:
        # _download_lyrics reports download and decoding problems as ValueError.
        return _fail(400, str(download_error))
    except Exception as unexpected:
        logger.exception("unexpected error during download")
        return _fail(500, f"下载歌词失败: {unexpected}")

    try:
        outcome = _service.check(lyrics, title=req.title, artist=req.artist, source_url=req.url)
    except Exception as unexpected:
        logger.exception("unexpected error during dedup check")
        # NOTE(review): the exception text is echoed to the client here; confirm
        # that exposing internal error messages is acceptable.
        return _fail(500, f"歌词去重检测失败: {unexpected}")

    return CheckResponse(
        duplicate=outcome.duplicate,
        decision=outcome.decision,
        confidence=outcome.confidence,
        reason=outcome.reason,
    )
| 112 | |||
| 113 | |||
| 114 | # --------------------------------------------------------------------------- | ||
| 115 | # Helpers | ||
| 116 | # --------------------------------------------------------------------------- | ||
| 117 | |||
# Encodings tried in order by _download_lyrics; the first one that decodes the
# payload without error is used.
_ENCODING_CHAIN = ("utf-8-sig", "utf-8", "gb18030", "big5")
| 119 | |||
| 120 | |||
| 121 | _ALLOWED_EXTENSIONS = {".txt", ".lrc"} | ||
| 122 | |||
| 123 | |||
| 124 | def _is_valid_lyric_url(url: str) -> bool: | ||
| 125 | """Check if URL points to a .txt or .lrc file.""" | ||
| 126 | from urllib.parse import urlparse | ||
| 127 | |||
| 128 | ext = Path(urlparse(url).path).suffix.lower() | ||
| 129 | return ext in _ALLOWED_EXTENSIONS | ||
| 130 | |||
| 131 | |||
# Upper bound on the response body accepted from a lyric URL.
_MAX_LYRIC_BYTES = 5 * 1024 * 1024


def _download_lyrics(url: str) -> str:
    """Download a lyric file and decode it with the encoding fallback chain.

    Args:
        url: Location of the lyric file.

    Returns:
        The decoded text, using the first encoding in ``_ENCODING_CHAIN`` that
        decodes the payload without error.

    Raises:
        ValueError: If the download fails or times out, the body is larger
            than ``_MAX_LYRIC_BYTES``, or no encoding in the chain can decode
            it.
    """
    import urllib.error
    import urllib.request

    try:
        with urllib.request.urlopen(url, timeout=_config.download_timeout if _config else 10) as resp:
            # Read one byte past the cap so an oversized body can be detected
            # without buffering all of it.
            data = resp.read(_MAX_LYRIC_BYTES + 1)
    except urllib.error.HTTPError as exc:
        raise ValueError(f"下载失败: HTTP {exc.code}") from exc
    except urllib.error.URLError as exc:
        raise ValueError(f"下载失败: {exc.reason}") from exc
    except TimeoutError as exc:
        raise ValueError("下载超时") from exc
    except Exception as exc:
        raise ValueError(f"下载失败: {exc}") from exc

    if len(data) > _MAX_LYRIC_BYTES:
        raise ValueError("歌词文件过大")

    for encoding in _ENCODING_CHAIN:
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("无法解析文件编码,支持: utf-8-sig / utf-8 / gb18030 / big5")
lyric_dedup_server/config.py
0 → 100644
| 1 | """Server configuration loaded from environment variables.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
import os
from dataclasses import dataclass, field
| 7 | |||
| 8 | |||
@dataclass
class ServerConfig:
    """Server settings, each defaulting to a ``LYRIC_DEDUP_*`` environment variable.

    Defaults are resolved when an instance is created, not when the module is
    imported, so environment changes made before ``ServerConfig()`` is called
    are honoured. Any field can still be passed explicitly.
    """

    # PostgreSQL connection string.
    dsn: str = field(default_factory=lambda: os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup"))
    # Passed as max_candidates to the duplicate checker.
    max_candidates: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_MAX_CANDIDATES", "5")))
    # Row limit applied to each recall query.
    recall_limit: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_RECALL_LIMIT", "100")))
    # Only the string "true" (case-insensitive) enables trigram recall.
    enable_trgm: bool = field(default_factory=lambda: os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true")
    # Value applied to pg_trgm.similarity_threshold.
    trgm_threshold: float = field(default_factory=lambda: float(os.getenv("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3")))
    # Value applied to statement_timeout, in milliseconds.
    statement_timeout_ms: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000")))
    # Timeout handed to urlopen when downloading a lyric file.
    download_timeout: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10")))
lyric_dedup_server/service.py
0 → 100644
| 1 | """Core deduplication service: PostgreSQL recall + DuplicateChecker.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
| 5 | import hashlib | ||
| 6 | import logging | ||
| 7 | from dataclasses import dataclass, field | ||
| 8 | from typing import Any | ||
| 9 | |||
| 10 | import psycopg | ||
| 11 | |||
| 12 | from lyric_dedup.checker import DuplicateChecker | ||
| 13 | from lyric_dedup.checker import DuplicateDecision | ||
| 14 | from lyric_dedup.checker import LyricRecord | ||
| 15 | from lyric_dedup.normalization import fingerprint_text | ||
| 16 | from lyric_dedup.normalization import normalize_lyrics | ||
| 17 | |||
| 18 | from .config import ServerConfig | ||
| 19 | |||
| 20 | logger = logging.getLogger(__name__) | ||
| 21 | |||
| 22 | |||
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of one dedup check, as returned by DedupService.check."""

    # True when the checker decided DUPLICATE or REVIEW.
    duplicate: bool
    # String value of the checker's DuplicateDecision.
    decision: str = ""
    confidence: float = 0.0
    reason: str = ""
    # Number of candidates reported by the checker result.
    candidate_count: int = 0
| 30 | |||
| 31 | |||
@dataclass
class DedupService:
    """Thin wrapper around the PostgreSQL recall + DuplicateChecker pipeline."""

    # Connection and recall settings used by every call.
    config: ServerConfig
    # Defaults to this module's logger; excluded from the dataclass repr.
    _logger: logging.Logger = field(default_factory=lambda: logger, repr=False)

    def check(
        self,
        lyrics_text: str,
        title: str | None = None,
        artist: str | None = None,
        source_url: str | None = None,
    ) -> CheckResult:
        """Recall candidates from PostgreSQL, decide, and store the lyric if it is new.

        Args:
            lyrics_text: Already-downloaded lyric text to check.
            title: Optional song title attached to the query record.
            artist: Optional artist name attached to the query record.
            source_url: Where the lyric came from. The lyric is only inserted
                when this is non-empty and the decision is "new".

        Returns:
            The CheckResult built from the checker's verdict.
        """
        record = LyricRecord(
            record_id="__query__",
            lyrics=lyrics_text,
            title=title,
            artist=artist,
        )
        with psycopg.connect(self.config.dsn) as conn:
            with conn.cursor() as cursor:
                # Third argument false: not transaction-local, so the settings
                # stay in effect for the queries issued below.
                cursor.execute("select set_config('statement_timeout', %s, false)", (str(self.config.statement_timeout_ms),))
                cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(self.config.trgm_threshold),))
            candidates = self._recall_candidates(conn, record)
            result = self._check_against_candidates(record, candidates)
            # result.decision holds DuplicateDecision.<member>.value; presumably
            # "new" is the value of DuplicateDecision.NEW - confirm.
            if result.decision == "new" and source_url:
                self._insert_new_record(conn, record, source_url)
            return result

    def _insert_new_record(self, conn: Any, record: LyricRecord, source_url: str) -> None:
        """Insert new lyric into PostgreSQL (lyrics + lyric_lines tables).

        The lyrics row is upserted on record_id, then the lyric's line rows
        are deleted and rewritten before committing.
        """
        # NUL characters are stripped from every text value before it is sent.
        raw_text = _pg_text(record.lyrics)[0] or ""
        normalized = normalize_lyrics(raw_text)
        primary_text = _pg_text("\n".join(normalized.primary_lines))[0]
        # An empty translation is stored as NULL rather than ''.
        translation_text = _pg_text("\n".join(normalized.translation_lines))[0] or None
        normalized_text = _pg_text(normalized.normalized_full_text)[0]
        exact_text = fingerprint_text(normalized)
        exact_hash = hashlib.sha256(exact_text.encode("utf-8")).hexdigest()

        with conn.cursor() as cursor:
            cursor.execute(
                """
                insert into lyrics (
                    record_id, source_path, title, artist, raw_text, normalized_text,
                    primary_text, translation_text, exact_hash, split_confidence,
                    split_reason, line_count, updated_at, deleted_at
                ) values (
                    %(record_id)s, %(source_path)s, %(title)s, %(artist)s, %(raw_text)s,
                    %(normalized_text)s, %(primary_text)s, %(translation_text)s,
                    %(exact_hash)s, %(split_confidence)s, %(split_reason)s,
                    %(line_count)s, now(), null
                )
                on conflict (record_id) do update set
                    source_path = excluded.source_path, title = excluded.title,
                    artist = excluded.artist, raw_text = excluded.raw_text,
                    normalized_text = excluded.normalized_text, primary_text = excluded.primary_text,
                    translation_text = excluded.translation_text, exact_hash = excluded.exact_hash,
                    split_confidence = excluded.split_confidence, split_reason = excluded.split_reason,
                    line_count = excluded.line_count, updated_at = now(), deleted_at = null
                returning id
                """,
                {
                    "record_id": _build_record_id(source_url),
                    "source_path": source_url,
                    "title": _pg_text(record.title)[0],
                    "artist": _pg_text(record.artist)[0],
                    "raw_text": raw_text,
                    "normalized_text": normalized_text,
                    "primary_text": primary_text,
                    "translation_text": translation_text,
                    "exact_hash": exact_hash,
                    "split_confidence": _pg_text(normalized.split_confidence)[0],
                    "split_reason": _pg_text(normalized.split_reason)[0],
                    # Falls back to unique_lines when there are no primary lines.
                    "line_count": len(normalized.primary_lines or normalized.unique_lines),
                },
            )
            lyric_id = cursor.fetchone()[0]

            # Replace any line rows left by a previous version of this record.
            cursor.execute("delete from lyric_lines where lyric_id = %s", (lyric_id,))
            line_rows: list[tuple] = list(_line_rows(lyric_id, "primary", normalized.primary_lines))
            line_rows.extend(_line_rows(lyric_id, "translation", normalized.translation_lines))
            line_rows.extend(_line_rows(lyric_id, "unknown", normalized.unknown_lines))
            if line_rows:
                cursor.executemany(
                    "insert into lyric_lines (lyric_id, role, line_no, normalized_line, line_hash) values (%s, %s, %s, %s, %s)",
                    line_rows,
                )
        conn.commit()

    def _recall_candidates(self, conn: Any, record: LyricRecord) -> list[LyricRecord]:
        """Three-tier recall: exact_hash → pg_trgm → line_hash.

        Rows from all tiers are merged by record_id; when a record is returned
        by more than one tier, the first occurrence is kept.
        """
        query_lyrics = _pg_text(record.lyrics)[0] or ""
        normalized = normalize_lyrics(query_lyrics)
        exact_text = fingerprint_text(normalized)
        exact_hash = hashlib.sha256(exact_text.encode("utf-8")).hexdigest()
        primary_text = "\n".join(normalized.primary_lines)
        line_hashes = [hashlib.sha256(line.encode("utf-8")).hexdigest() for line in normalized.primary_lines if line]

        candidates: dict[str, LyricRecord] = {}
        # Never populated in this method, so the "not (record_id = any(...))"
        # filters below currently exclude nothing.
        exclude_record_ids: list[str] = []

        with conn.cursor() as cursor:
            # Tier 1: exact hash match
            cursor.execute(
                """
                select record_id, raw_text, title, artist
                from lyrics
                where deleted_at is null
                  and exact_hash = %s
                  and not (record_id = any(%s))
                limit %s
                """,
                (exact_hash, exclude_record_ids, self.config.recall_limit),
            )
            _add_rows(candidates, cursor.fetchall())

            # Tier 2: pg_trgm similarity (optional)
            if self.config.enable_trgm and primary_text:
                # "%%" is the escaped form of the pg_trgm "%" similarity operator.
                cursor.execute(
                    """
                    select record_id, raw_text, title, artist
                    from lyrics
                    where deleted_at is null
                      and not (record_id = any(%s))
                      and primary_text %% %s
                    order by similarity(primary_text, %s) desc
                    limit %s
                    """,
                    (exclude_record_ids, primary_text, primary_text, self.config.recall_limit),
                )
                _add_rows(candidates, cursor.fetchall())

            # Tier 3: line hash match
            if line_hashes:
                # Lyrics sharing the most primary-line hashes are returned first.
                cursor.execute(
                    """
                    select l.record_id, l.raw_text, l.title, l.artist
                    from lyric_lines ll
                    join lyrics l on l.id = ll.lyric_id
                    where l.deleted_at is null
                      and not (l.record_id = any(%s))
                      and ll.role = 'primary'
                      and ll.line_hash = any(%s)
                    group by l.id
                    order by count(*) desc
                    limit %s
                    """,
                    (exclude_record_ids, line_hashes, self.config.recall_limit),
                )
                _add_rows(candidates, cursor.fetchall())

        return list(candidates.values())

    def _check_against_candidates(
        self,
        record: LyricRecord,
        candidates: list[LyricRecord],
    ) -> CheckResult:
        """Run DuplicateChecker against recalled candidates."""
        # A fresh checker is built per call and loaded only with the recalled rows.
        checker = DuplicateChecker()
        for candidate in candidates:
            checker.add_record(candidate)
        result = checker.check_record(record, max_candidates=self.config.max_candidates)
        return CheckResult(
            # REVIEW verdicts are reported as duplicates as well.
            duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
            decision=result.decision.value,
            confidence=result.confidence,
            reason=result.reason,
            candidate_count=len(result.candidates),
        )
| 204 | |||
| 205 | |||
def _add_rows(candidates: dict[str, LyricRecord], rows: list[tuple[object, ...]]) -> None:
    """Merge recalled rows into *candidates*, keeping the first record per id.

    Each row is ``(record_id, raw_text, title, artist)``. ``title`` and
    ``artist`` stay ``None`` when the column is NULL; other values are
    converted with ``str``.
    """
    for row in rows:
        record_id, raw_text, title, artist = row
        key = str(record_id)
        record = LyricRecord(
            record_id=key,
            lyrics=str(raw_text),
            title=None if title is None else str(title),
            artist=None if artist is None else str(artist),
        )
        if key not in candidates:
            candidates[key] = record
| 217 | |||
| 218 | |||
| 219 | def _build_record_id(source_url: str) -> str: | ||
| 220 | """From URL to record_id, format url:{sha12}:{url}.""" | ||
| 221 | digest = hashlib.sha1(source_url.encode("utf-8")).hexdigest()[:12] | ||
| 222 | return f"url:{digest}:{source_url}" | ||
| 223 | |||
| 224 | |||
def _line_rows(lyric_id: int, role: str, lines: tuple[str, ...]) -> list[tuple]:
    """Build ``(lyric_id, role, line_no, text, sha256_hex)`` rows for *lines*.

    ``line_no`` is the zero-based position in *lines*. NUL characters are
    removed from each line before it is hashed.
    """
    cleaned_lines = [_pg_text(raw)[0] or "" for raw in lines]
    return [
        (lyric_id, role, position, text, hashlib.sha256(text.encode("utf-8")).hexdigest())
        for position, text in enumerate(cleaned_lines)
    ]
| 232 | |||
| 233 | |||
| 234 | def _pg_text(value: str | None) -> tuple[str | None, bool]: | ||
| 235 | """Return (text, had_nul).""" | ||
| 236 | if value is None: | ||
| 237 | return None, False | ||
| 238 | if "\x00" not in value: | ||
| 239 | return value, False | ||
| 240 | return value.replace("\x00", ""), True |
test_api/config.py
0 → 100644
| 1 | """测试环境配置,从 .env 或环境变量读取 OSS 凭据""" | ||
| 2 | import os | ||
| 3 | from pathlib import Path | ||
| 4 | |||
# Load the .env file that sits next to this module, if there is one.
_env_path = Path(__file__).parent / ".env"
if _env_path.exists():
    with open(_env_path, encoding="utf-8") as _f:
        for _line in _f:
            _line = _line.strip()
            # Skip blank lines, "#" comments, and lines without "=".
            if _line and not _line.startswith("#") and "=" in _line:
                # Split on the first "=" only, so values may contain "=".
                _key, _value = _line.split("=", 1)
                # setdefault: variables already in the environment win over .env.
                os.environ.setdefault(_key.strip(), _value.strip())

# OSS settings; credentials and bucket name default to the empty string.
OSS_ACCESS_KEY_ID = os.getenv("OSS_ACCESS_KEY_ID", "")
OSS_ACCESS_KEY_SECRET = os.getenv("OSS_ACCESS_KEY_SECRET", "")
OSS_ENDPOINT = os.getenv("OSS_ENDPOINT", "oss-cn-hangzhou.aliyuncs.com")
OSS_BUCKET_NAME = os.getenv("OSS_BUCKET_NAME", "")
# Falls back to the public endpoint when no internal endpoint is configured.
OSS_ENDPOINT_INTERNAL = os.getenv("OSS_ENDPOINT_INTERNAL", OSS_ENDPOINT)
test_api/oss_uploader.py
0 → 100644
| 1 | """ | ||
| 2 | 阿里云OSS文件上传模块 | ||
| 3 | """ | ||
| 4 | import uuid | ||
| 5 | |||
| 6 | import oss2 | ||
| 7 | import os | ||
| 8 | from datetime import datetime, timedelta | ||
| 9 | from .config import OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET_NAME, OSS_ENDPOINT_INTERNAL | ||
| 10 | |||
| 11 | |||
| 12 | |||
class OSSUploader:
    """Uploader for Alibaba Cloud OSS built on the ``oss2`` client."""

    def __init__(self):
        """Create the OSS auth and bucket handles from the module-level settings."""
        self.access_key_id = OSS_ACCESS_KEY_ID
        self.access_key_secret = OSS_ACCESS_KEY_SECRET
        self.endpoint = OSS_ENDPOINT
        self.bucket_name = OSS_BUCKET_NAME
        self.endpoint_internal = OSS_ENDPOINT_INTERNAL
        # Credentials object shared by all requests.
        self.auth = oss2.Auth(self.access_key_id, self.access_key_secret)

        # Bucket handle used for every upload.
        self.bucket = oss2.Bucket(self.auth, self.endpoint, self.bucket_name)

    @staticmethod
    def _endpoint_host(endpoint):
        """Return the bare host of *endpoint*, without scheme or path.

        ``str.lstrip("https://")`` must not be used for this: it strips any
        leading characters from the set ``h t p s : /`` and therefore eats the
        start of hosts such as ``tps.example.com``.
        """
        host = endpoint.strip()
        for scheme in ("https://", "http://"):
            if host.lower().startswith(scheme):
                host = host[len(scheme):]
                break
        return host.split("/")[0]

    def _public_url(self, oss_object_name):
        """Return the ``https://<bucket>.<host>/<key>`` URL of an object."""
        return f"https://{self.bucket_name}.{self._endpoint_host(self.endpoint)}/{oss_object_name}"

    def upload_file(self, local_file_path, oss_object_name=None):
        """
        Upload a local file to OSS.

        Args:
            local_file_path: Path of the local file.
            oss_object_name: Object name to use. When omitted, a random UUID
                plus the file's extension is used. The object is always stored
                under ``public_test/<YYYYMMDD>/``.

        Returns:
            tuple: ``(True, url)`` on success, ``(False, error_message)`` on failure.
        """
        try:
            if not os.path.exists(local_file_path):
                return False, "本地文件不存在"

            # Generate an object name when the caller did not supply one.
            if not oss_object_name:
                _, ext = os.path.splitext(local_file_path)
                oss_object_name = f"{uuid.uuid4()}{ext}"

            date = datetime.now().strftime("%Y%m%d")
            oss_object_name = f"public_test/{date}/{oss_object_name}"

            self.bucket.put_object_from_file(oss_object_name, local_file_path)

            return True, self._public_url(oss_object_name)

        except Exception as e:
            # Best effort: report the failure to the caller instead of raising.
            return False, str(e)

    def upload_data(self, data, oss_object_name):
        """
        Upload in-memory data to OSS.

        Args:
            data: Payload to upload (str or bytes).
            oss_object_name: Object name, used as given.

        Returns:
            dict: On success ``success``, ``oss_object_name``, ``file_url``,
            ``etag`` and ``size``; on failure ``success`` and ``error``.
        """
        try:
            result = self.bucket.put_object(oss_object_name, data)

            return {
                "success": True,
                "oss_object_name": oss_object_name,
                # Same URL format as upload_file.
                "file_url": self._public_url(oss_object_name),
                "etag": result.etag,
                # len() of the payload as passed in; 0 for other payload types.
                "size": len(data) if isinstance(data, (str, bytes)) else 0
            }

        except Exception as e:
            return {"success": False, "error": str(e)}
| 92 | |||
| 93 | |||
def get_bucket():
    """Build a Bucket handle from the module-level OSS settings."""
    credentials = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
    return oss2.Bucket(credentials, OSS_ENDPOINT, OSS_BUCKET_NAME)
| 99 | |||
| 100 | |||
def clean_expire_file():
    """Delete date-named directories under ``temp_ai/`` older than yesterday.

    Only first-level entries of the prefix are inspected. An entry whose last
    path segment parses as ``YYYYMMDD`` and is earlier than yesterday is
    removed with everything below it. Plain files and non-date directories
    are left alone. Any error is printed and swallowed.
    """
    print(f"\n[{datetime.now()}] 开始执行每日清理任务...")
    root_prefix = 'temp_ai/'
    # NOTE(review): OSSUploader.upload_file stores objects under "public_test/",
    # which this task never visits - confirm that is intended.
    bucket = get_bucket()

    # Entries dated strictly before this day are considered expired.
    cutoff = (datetime.now() - timedelta(days=1)).date()
    print(f"保留阈值: {cutoff} (即 {cutoff} 之前的数据将被删除)")

    try:
        for entry in oss2.ObjectIterator(bucket, prefix=root_prefix, delimiter='/'):
            if hasattr(entry, 'prefix'):
                # Entry exposes a prefix attribute: treat it as a directory.
                path, is_directory = entry.prefix, True
            elif hasattr(entry, 'key'):
                # A key ending in "/" is an explicitly created folder object.
                path = entry.key
                is_directory = path.endswith('/')
            else:
                path, is_directory = "", False

            # Skip loose files and the root prefix itself.
            if not is_directory or path == root_prefix:
                continue

            # Last path segment once the surrounding slashes are removed.
            folder_name = path.strip('/').split('/')[-1]

            try:
                folder_date = datetime.strptime(folder_name, "%Y%m%d").date()
                if folder_date < cutoff:
                    print(f"[删除] 发现过期目录: {path}")
                    # Removes every object under the prefix, including the
                    # folder object itself when one exists.
                    delete_objects_by_prefix(bucket, path)
            except ValueError:
                print(f"[跳过] 非日期命名目录: {path}")

    except Exception as exc:
        import traceback
        print(f"[严重错误] 任务执行失败: {exc}")
        traceback.print_exc()
| 170 | |||
| 171 | |||
def delete_objects_by_prefix(bucket, prefix):
    """Delete every object whose key starts with *prefix*.

    Keys are sent to ``batch_delete_objects`` in groups of 1000. Errors are
    printed, not raised.
    """
    print(f"  -> 正在清理目录: {prefix} ...")
    pending_keys = []
    try:
        for entry in oss2.ObjectIterator(bucket, prefix=prefix):
            pending_keys.append(entry.key)
            if len(pending_keys) < 1000:
                continue
            # A full batch has been collected: flush it and start a new one.
            bucket.batch_delete_objects(pending_keys)
            pending_keys = []

        # Flush the final, partially filled batch.
        if pending_keys:
            bucket.batch_delete_objects(pending_keys)
        print(f"  -> 目录 {prefix} 清理完毕。")
    except Exception as exc:
        print(f"  [错误] 删除过程出错: {exc}")
| 188 | |||
| 189 | |||
# Module-level uploader instance, created when this module is imported.
oss_uploader = OSSUploader()

if __name__ == '__main__':
    # Running the module as a script performs one cleanup pass.
    clean_expire_file()
| ... | \ No newline at end of file | ... | \ No newline at end of file |
test_api/test_dedup_api.py
0 → 100644
| 1 | """歌词去重 API 测试脚本 | ||
| 2 | |||
| 3 | 用法: | ||
| 4 | # 上传指定歌词文件并调用去重 API | ||
| 5 | python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc | ||
| 6 | |||
| 7 | # 指定标题和歌手 | ||
| 8 | python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --title "夜曲" --artist "周杰伦" | ||
| 9 | |||
| 10 | # 仅上传不调用 API | ||
| 11 | python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --upload-only | ||
| 12 | |||
| 13 | # 仅调用 API(使用已有 URL) | ||
| 14 | python test_api/test_dedup_api.py --url "https://hikoon-ai-test.oss-cn-hangzhou.aliyuncs.com/temp_ai/20250603/xxx.lrc" | ||
| 15 | |||
| 16 | # 指定 API 地址 | ||
| 17 | python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --api-url "http://localhost:8000" | ||
| 18 | """ | ||
| 19 | import argparse | ||
| 20 | import json | ||
| 21 | import os | ||
| 22 | import sys | ||
| 23 | |||
# Make sure the project root (the parent of this file's directory) is on
# sys.path so the test_api imports below resolve when run as a script.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
| 28 | |||
| 29 | import urllib.request | ||
| 30 | import urllib.error | ||
| 31 | |||
| 32 | from test_api.config import OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET_NAME | ||
| 33 | from test_api.oss_uploader import OSSUploader | ||
| 34 | |||
| 35 | |||
def upload_lyric_file(file_path: str) -> str:
    """Upload a lyric file to OSS and return its public URL.

    Prints the error and exits with status 1 when the upload fails.
    """
    ok, payload = OSSUploader().upload_file(file_path)
    if ok:
        return payload
    print(f"上传失败: {payload}")
    sys.exit(1)
| 44 | |||
| 45 | |||
| 46 | def call_dedup_api(url: str, title: str | None, artist: str | None, api_base: str) -> dict: | ||
| 47 | """调用去重 API""" | ||
| 48 | payload = json.dumps({ | ||
| 49 | "url": url, | ||
| 50 | "title": title, | ||
| 51 | "artist": artist, | ||
| 52 | }).encode("utf-8") | ||
| 53 | |||
| 54 | req = urllib.request.Request( | ||
| 55 | f"{api_base.rstrip('/')}/api/v1/check", | ||
| 56 | data=payload, | ||
| 57 | headers={"Content-Type": "application/json"}, | ||
| 58 | method="POST", | ||
| 59 | ) | ||
| 60 | |||
| 61 | try: | ||
| 62 | with urllib.request.urlopen(req, timeout=30) as resp: | ||
| 63 | body = json.loads(resp.read().decode("utf-8")) | ||
| 64 | return body | ||
| 65 | except urllib.error.HTTPError as exc: | ||
| 66 | error_body = exc.read().decode("utf-8", errors="replace") | ||
| 67 | print(f"API 请求失败 (HTTP {exc.code}): {error_body}") | ||
| 68 | sys.exit(1) | ||
| 69 | except urllib.error.URLError as exc: | ||
| 70 | print(f"API 请求失败: {exc.reason}") | ||
| 71 | print("请确认 API 服务已启动: uvicorn lyric_dedup_server.app:app --host 0.0.0.0 --port 8000") | ||
| 72 | sys.exit(1) | ||
| 73 | |||
| 74 | |||
def main():
    """Command-line entry point: optionally upload a lyric file, then call the API.

    Either ``--file`` or ``--url`` must be supplied; when both are given the
    file is uploaded and its resulting URL is used. A relative ``--file`` path
    is resolved against ``PROJECT_ROOT``. With ``--upload-only`` the function
    returns after the upload step (or after echoing the given URL) without
    calling the API. Otherwise selected fields of the API response are printed.
    """
    parser = argparse.ArgumentParser(description="歌词去重 API 测试")
    parser.add_argument("--file", "-f", help="本地歌词文件路径")
    parser.add_argument("--url", "-u", help="已上传的歌词 URL(跳过上传步骤)")
    parser.add_argument("--title", "-t", help="歌曲标题(可选)")
    parser.add_argument("--artist", "-a", help="歌手名(可选)")
    parser.add_argument("--api-url", default="http://localhost:8000", help="API 服务地址 (默认 http://localhost:8000)")
    parser.add_argument("--upload-only", action="store_true", help="仅上传到 OSS,不调用 API")
    args = parser.parse_args()

    if not (args.file or args.url):
        parser.error("需要指定 --file 或 --url")

    # Step 1: obtain the lyric URL, uploading the local file when one is given.
    if args.file:
        abs_path = args.file
        if not os.path.isabs(abs_path):
            # Relative paths are interpreted relative to the project root.
            abs_path = os.path.join(PROJECT_ROOT, abs_path)
        if not os.path.exists(abs_path):
            print(f"文件不存在: {abs_path}")
            sys.exit(1)
        print(f"正在上传: {abs_path}")
        lyric_url = upload_lyric_file(abs_path)
        print(f"上传成功: {lyric_url}")
    else:
        lyric_url = args.url
        print(f"使用已有 URL: {lyric_url}")

    if args.upload_only:
        return

    # Step 2: call the duplicate-check API and print a short report.
    print("\n正在调用去重 API...")
    result = call_dedup_api(
        lyric_url,
        title=args.title,
        artist=args.artist,
        api_base=args.api_url,
    )

    print("\n结果:")
    report = (
        # ``duplicate`` has no fallback text: a missing key prints ``None``.
        ("duplicate", result.get("duplicate")),
        ("decision", result.get("decision", "N/A")),
        ("confidence", result.get("confidence", "N/A")),
        ("reason", result.get("reason", "N/A")),
    )
    for label, value in report:
        print(f" {label}: {value}")
| 113 | |||
| 114 | |||
| 115 | if __name__ == "__main__": | ||
| 116 | main() |
test_api/test_lyric.txt
0 → 100644
| 1 | ## 消失的波段 | ||
| 2 | |||
| 3 | ### 【主歌 1】 — *(压抑、低沉的叙事)* | ||
| 4 | |||
| 5 | 霓虹灯……在车窗外退后, | ||
| 6 | 霓虹——和夜色融为一体。 | ||
| 7 | 收音机里,只剩沙沙的电流…… | ||
| 8 | (像你在旧地址留下的呼吸……) | ||
| 9 | 有些习惯……总是很难去修正, | ||
| 10 | 比如——在人群中,辨认你的背影。 | ||
| 11 | |||
| 12 | ### 【主歌 2】 — *(情绪渐进,带有一丝无奈)* | ||
| 13 | |||
| 14 | 朋友圈里……你更新了风景, | ||
| 15 | 坐标是——没听过的、陌、生、城、市。 | ||
| 16 | 我们从无话不说……退回到【静音】, | ||
| 17 | 像两条失去交集的——平行线。 | ||
| 18 | 那些没有寄出的长信…… | ||
| 19 | 最后都变成,草稿箱里的——灰、尘。 | ||
| 20 | |||
| 21 | ### 【副歌】 —— *(情感爆发,高亢而撕裂)* | ||
| 22 | |||
| 23 | 我们成了彼此消 逝 的 波 段 !! | ||
| 24 | 在同一个频段……却再也无法呼喊! | ||
| 25 | 那些同频共振的夜晚…… | ||
| 26 | 最终被淹没在——嘈杂的市中心!! | ||
| 27 | 我调整着微弱的接收信号…… | ||
| 28 | 却只听到——时光断裂的声音!!! | ||
| 29 | |||
| 30 | ### 【桥段】 —— *(节奏加快,连续的内心追问)* | ||
| 31 | |||
| 32 | 是不是所有的连接……都有保质期?! | ||
| 33 | 到期后……就自动切断了所有联系?! | ||
| 34 | 我们在各自的轨道里——加!速!运!行! | ||
| 35 | 再也找不到……那天傍晚的引力。 | ||
| 36 | |||
| 37 | ### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)* | ||
| 38 | |||
| 39 | 我们成了彼此消 逝 的 波 段 ——!! | ||
| 40 | 在同一个频段……却再也无法呼喊! | ||
| 41 | 那些同频共振的夜晚…… | ||
| 42 | 最终被淹没在——嘈杂的市中心!! | ||
| 43 | 我调整着微弱的接收信号…… | ||
| 44 | 却只听到……(时光断裂的声音)…… | ||
| 45 | |||
| 46 | ### 【尾奏】 —— *(情绪下沉,最终归于死寂)* | ||
| 47 | |||
| 48 | 【信号中断……请勿追赶。】 | ||
| 49 | 城市入睡……灯光渐暗…… | ||
| 50 | 一个人的波段。 | ||
| 51 | (查……无……此……人……) | ||
| 52 | 【 挂 断 。】 | ||
| 53 | ### 副歌 | ||
| 54 | |||
| 55 | 我们成了彼此消失的波段 | ||
| 56 | 在同一个频段却再也无法呼喊 | ||
| 57 | 那些同频共振的夜晚 | ||
| 58 | 最终被淹没在嘈杂的市中心 | ||
| 59 | 我调整着微弱的接收信号 | ||
| 60 | 却只听到时光断裂的声音 | ||
| 61 | |||
| 62 | ### 桥段 | ||
| 63 | |||
| 64 | 是不是所有的连接都有保质期 | ||
| 65 | 到期后就自动切断了所有联系 | ||
| 66 | 我们在各自的轨道里加速运行 | ||
| 67 | 再也找不到那天傍晚的引力 | ||
| 68 | |||
| 69 | ### 副歌 | ||
| 70 | |||
| 71 | 我们成了彼此消失的波段 | ||
| 72 | 在同一个频段却再也无法呼喊 | ||
| 73 | 那些同频共振的夜晚 | ||
| 74 | 最终被淹没在嘈杂的市中心 | ||
| 75 | 我调整着微弱的接收信号 | ||
| 76 | 却只听到时光断裂的声音 | ||
| 77 | |||
| 78 | ### 尾奏 | ||
| 79 | |||
| 80 | 信号中断,请勿追赶 | ||
| 81 | 城市入睡,灯光渐暗 | ||
| 82 | 一个人的波段 | ||
| 83 | 查无此人 | ||
| 84 | 挂断 | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -115,6 +115,121 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: | ... | @@ -115,6 +115,121 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: |
| 115 | assert result.candidates[0].primary_line_coverage < 0.72 | 115 | assert result.candidates[0].primary_line_coverage < 0.72 |
| 116 | 116 | ||
| 117 | 117 | ||
| 118 | def test_catalog_mashup_fragments_are_new_not_review() -> None: | ||
| 119 | checker = DuplicateChecker() | ||
| 120 | checker.add_record( | ||
| 121 | LyricRecord( | ||
| 122 | "song-1", | ||
| 123 | """ | ||
| 124 | 第一首歌的清晨 | ||
| 125 | 第一首歌的街口 | ||
| 126 | 每天都在伪装幸福快乐 | ||
| 127 | 还要瞒着所有人不说 | ||
| 128 | 第一首歌的结尾 | ||
| 129 | """, | ||
| 130 | ) | ||
| 131 | ) | ||
| 132 | checker.add_record( | ||
| 133 | LyricRecord( | ||
| 134 | "song-2", | ||
| 135 | """ | ||
| 136 | 第二首歌的海边 | ||
| 137 | 第二首歌的远方 | ||
| 138 | 想起那年夏天 | ||
| 139 | 我们走过人群 | ||
| 140 | 第二首歌的结尾 | ||
| 141 | """, | ||
| 142 | ) | ||
| 143 | ) | ||
| 144 | checker.add_record( | ||
| 145 | LyricRecord( | ||
| 146 | "song-3", | ||
| 147 | """ | ||
| 148 | 第三首歌的月光 | ||
| 149 | 第三首歌的旧梦 | ||
| 150 | 风吹过了窗前 | ||
| 151 | 你没有再回来 | ||
| 152 | 第三首歌的结尾 | ||
| 153 | """, | ||
| 154 | ) | ||
| 155 | ) | ||
| 156 | |||
| 157 | result = checker.check( | ||
| 158 | """ | ||
| 159 | 每天都在伪装幸福快乐 | ||
| 160 | 还要瞒着所有人不说 | ||
| 161 | 想起那年夏天 | ||
| 162 | 我们走过人群 | ||
| 163 | 风吹过了窗前 | ||
| 164 | 你没有再回来 | ||
| 165 | """ | ||
| 166 | ) | ||
| 167 | |||
| 168 | assert result.decision == DuplicateDecision.NEW | ||
| 169 | |||
| 170 | |||
| 171 | def test_large_mashup_with_one_recognizable_song_fragment_is_new() -> None: | ||
| 172 | checker = DuplicateChecker() | ||
| 173 | checker.add_record( | ||
| 174 | LyricRecord( | ||
| 175 | "song-1", | ||
| 176 | """ | ||
| 177 | 桃花春风十里 | ||
| 178 | 花瓣飘散满地 | ||
| 179 | 对不起我无法忘记你 | ||
| 180 | 一去遥遥无期 | ||
| 181 | 一个人一支笔 | ||
| 182 | 多想你能留在我这里 | ||
| 183 | 天空下起了雨 | ||
| 184 | 淋湿我的心里 | ||
| 185 | 久别中多少人都不是你 | ||
| 186 | 屋檐下一人想起 | ||
| 187 | 关于你的回忆 | ||
| 188 | 无人在只剩下我自己 | ||
| 189 | """, | ||
| 190 | ) | ||
| 191 | ) | ||
| 192 | |||
| 193 | result = checker.check( | ||
| 194 | """ | ||
| 195 | scroll through the pictures from a year ago | ||
| 196 | the pixels change but the feelings dont grow | ||
| 197 | an empty inbox and a dial tone heart | ||
| 198 | we built a network just to tear it apart | ||
| 199 | im tracking signals that have long gone cold | ||
| 200 | living a script that has already been sold | ||
| 201 | 当我睁开了眼睛 | ||
| 202 | 感受到一片的灰烬 | ||
| 203 | 我的梦一直都fighting 可是我没 | ||
| 204 | 也许我只有加足马力 | ||
| 205 | 让他们看见都诧异 | ||
| 206 | 留下的华丽的背影 才 | ||
| 207 | 桃花春风十里 | ||
| 208 | 花瓣飘散满地 | ||
| 209 | 对不起我无法忘记你 | ||
| 210 | 一去遥遥无期 | ||
| 211 | 一个人一支笔 | ||
| 212 | 多想你能留在我这里 | ||
| 213 | 天空下起了雨 | ||
| 214 | 淋湿我的心里 | ||
| 215 | 久别中多少人都不是你 | ||
| 216 | 屋檐下一人想起 | ||
| 217 | 关于你的回忆 | ||
| 218 | 无人在只剩下我自己 | ||
| 219 | 疼痛感很弱 | ||
| 220 | 我想我堕落 | ||
| 221 | 哎呦 我逃脱 | ||
| 222 | 是不是我的 | ||
| 223 | 不管你拿不拿走 | ||
| 224 | 我反正都不会动 | ||
| 225 | 哎呦 我难过 | ||
| 226 | 反复的折磨 | ||
| 227 | """ | ||
| 228 | ) | ||
| 229 | |||
| 230 | assert result.decision == DuplicateDecision.NEW | ||
| 231 | |||
| 232 | |||
| 118 | def test_no_effective_lyrics_use_metadata_fallback_without_empty_hash_collision() -> None: | 233 | def test_no_effective_lyrics_use_metadata_fallback_without_empty_hash_collision() -> None: |
| 119 | placeholder = """ | 234 | placeholder = """ |
| 120 | 作词:DJ金木 | 235 | 作词:DJ金木 | ... | ... |
-
Please register or sign in to post a comment