Commit 21677240 216772407a081c1c8d391a03bc7d687b995dd040 by 沈秋雨

接口化服务

1 parent ed19c4ee
...@@ -284,6 +284,7 @@ class DuplicateChecker: ...@@ -284,6 +284,7 @@ class DuplicateChecker:
284 query.normalized.primary_lines, 284 query.normalized.primary_lines,
285 candidate.normalized.primary_lines, 285 candidate.normalized.primary_lines,
286 ) 286 )
287 query_primary_coverage = _matched_query_line_ratio(query.normalized.primary_lines, primary_matched_lines)
287 translation_jaccard = _jaccard(query.translation_tokens, candidate.translation_tokens) 288 translation_jaccard = _jaccard(query.translation_tokens, candidate.translation_tokens)
288 translation_coverage, translation_matched_lines = _line_coverage_lines( 289 translation_coverage, translation_matched_lines = _line_coverage_lines(
289 query.normalized.translation_lines, 290 query.normalized.translation_lines,
...@@ -299,6 +300,27 @@ class DuplicateChecker: ...@@ -299,6 +300,27 @@ class DuplicateChecker:
299 low_confidence_split = ( 300 low_confidence_split = (
300 query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low" 301 query.normalized.split_confidence == "low" or candidate.normalized.split_confidence == "low"
301 ) 302 )
303 query_coverage = _matched_query_line_ratio(query.normalized.unique_lines, matched_lines)
304 has_review_level_overlap = (
305 primary_jaccard >= self.review_jaccard_threshold
306 or jaccard >= self.review_jaccard_threshold
307 or (
308 primary_coverage >= self.review_line_coverage_threshold
309 and query_primary_coverage >= 0.40
310 )
311 or (
312 coverage >= self.review_line_coverage_threshold
313 and query_coverage >= 0.40
314 )
315 )
316 has_material_chorus_overlap = chorus_only and (
317 query.normalized.content_line_count <= 6
318 or (primary_jaccard >= 0.20 and query_primary_coverage >= 0.40)
319 or (jaccard >= 0.20 and query_coverage >= 0.40)
320 or (primary_coverage >= 0.20 and query_primary_coverage >= 0.40)
321 or (coverage >= 0.20 and query_coverage >= 0.40)
322 )
323 has_low_confidence_split_overlap = low_confidence_split and has_review_level_overlap
302 324
303 confidence = round((0.58 * primary_jaccard) + (0.42 * primary_coverage), 4) 325 confidence = round((0.58 * primary_jaccard) + (0.42 * primary_coverage), 4)
304 if ( 326 if (
...@@ -314,21 +336,18 @@ class DuplicateChecker: ...@@ -314,21 +336,18 @@ class DuplicateChecker:
314 else: 336 else:
315 reason = "原文 n-gram 字面相似度高,且行级覆盖范围广" 337 reason = "原文 n-gram 字面相似度高,且行级覆盖范围广"
316 elif ( 338 elif (
317 chorus_only 339 has_material_chorus_overlap
318 or translation_only 340 or translation_only
319 or low_confidence_split 341 or has_low_confidence_split_overlap
320 or primary_jaccard >= self.review_jaccard_threshold 342 or has_review_level_overlap
321 or primary_coverage >= self.review_line_coverage_threshold
322 or jaccard >= self.review_jaccard_threshold
323 or coverage >= self.review_line_coverage_threshold
324 ): 343 ):
325 decision = DuplicateDecision.REVIEW 344 decision = DuplicateDecision.REVIEW
326 reason = "候选相似度达到复核阈值,需要人工确认" 345 reason = "候选相似度达到复核阈值,需要人工确认"
327 if chorus_only: 346 if has_material_chorus_overlap:
328 reason = "重合内容主要集中在重复副歌行,不自动判重" 347 reason = "重合内容主要集中在重复副歌行,不自动判重"
329 elif translation_only: 348 elif translation_only:
330 reason = "仅翻译行相似,原文字面重合不足,不自动判重" 349 reason = "仅翻译行相似,原文字面重合不足,不自动判重"
331 elif low_confidence_split: 350 elif has_low_confidence_split_overlap:
332 reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核" 351 reason = "疑似整段翻译结构但拆分置信度较低,需要人工复核"
333 else: 352 else:
334 decision = DuplicateDecision.NEW 353 decision = DuplicateDecision.NEW
...@@ -446,6 +465,13 @@ def _line_coverage_lines(left: tuple[str, ...], right: tuple[str, ...]) -> tuple ...@@ -446,6 +465,13 @@ def _line_coverage_lines(left: tuple[str, ...], right: tuple[str, ...]) -> tuple
446 return len(matched) / max(len(left_lines), len(right_lines)), matched 465 return len(matched) / max(len(left_lines), len(right_lines)), matched
447 466
448 467
def _matched_query_line_ratio(query_lines: tuple[str, ...], matched_lines: list[str]) -> float:
    """Return distinct matched lines divided by distinct query lines.

    Duplicates on either side are collapsed before counting. No intersection
    is taken, so the caller is expected to pass matched lines that come from
    the query. An empty query yields ``0.0`` instead of dividing by zero.
    """
    distinct_query = frozenset(query_lines)
    distinct_matched = frozenset(matched_lines)
    denominator = len(distinct_query)
    return len(distinct_matched) / denominator if denominator else 0.0
473
474
449 def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool: 475 def _is_chorus_only_match(left: NormalizedLyrics, right: NormalizedLyrics, matched_lines: list[str]) -> bool:
450 if not matched_lines: 476 if not matched_lines:
451 return False 477 return False
......
1 from .config import ServerConfig
2 from .service import DedupService
3
4 __all__ = ["ServerConfig", "DedupService"]
1 """FastAPI application for lyric duplicate checking."""
2
3 from __future__ import annotations
4
5 import logging
6 from dataclasses import dataclass
7 from pathlib import Path
8 from typing import Any
9
10 from fastapi import FastAPI
11 from fastapi.responses import JSONResponse
12 from pydantic import BaseModel, Field
13
14 from .config import ServerConfig
15 from .service import DedupService
16
17 logger = logging.getLogger(__name__)
18
19 # ---------------------------------------------------------------------------
20 # App lifecycle
21 # ---------------------------------------------------------------------------
22
23 app = FastAPI(title="Lyric Dedup API", version="0.1.0")
24
25 _config: ServerConfig | None = None
26 _service: DedupService | None = None
27
28
def _redact_dsn(dsn: str) -> str:
    """Return *dsn* with any embedded password replaced by ``***``.

    Handles both the URI form (``postgresql://user:secret@host/db``) and the
    keyword/value form (``host=... password=secret``), so that connection
    credentials never end up in log files.
    """
    import re

    # URI userinfo: keep the user name, hide everything between ':' and '@'.
    redacted = re.sub(r"(://[^:/?#@\s]*):[^@/\s]*@", r"\1:***@", dsn)
    # keyword/value or query-string password, optionally single-quoted.
    return re.sub(
        r"\b(password\s*=\s*)('(?:[^'\\]|\\.)*'|[^\s&]+)",
        r"\1***",
        redacted,
        flags=re.IGNORECASE,
    )


@app.on_event("startup")
def _startup() -> None:
    """Build the module-level config and service when the app starts.

    Populates the ``_config`` and ``_service`` globals that the endpoints
    read, then logs the effective settings with the DSN password masked.
    """
    global _config, _service
    _config = ServerConfig()
    _service = DedupService(config=_config)
    # The DSN may carry credentials, so only a redacted form is logged.
    logger.info("Lyric Dedup API started (DSN=%s, trgm=%s)", _redact_dsn(_config.dsn), _config.enable_trgm)
35
36
37 # ---------------------------------------------------------------------------
38 # Request / response models
39 # ---------------------------------------------------------------------------
40
41
class CheckRequest(BaseModel):
    """Request body accepted by ``POST /api/v1/check``."""

    # ``url`` is required; ``title`` and ``artist`` default to None.
    url: str = Field(..., description="URL of the LRC/TXT lyric file")
    title: str | None = Field(None, description="Song title (optional)")
    artist: str | None = Field(None, description="Artist name (optional)")
46
47
class CheckResponse(BaseModel):
    """Success payload of ``POST /api/v1/check``.

    Only ``duplicate`` is required; the remaining fields default to None.
    """

    duplicate: bool
    decision: str | None = None
    confidence: float | None = None
    reason: str | None = None
53
54
class HealthResponse(BaseModel):
    """Payload of ``GET /health``: a single ``status`` string."""

    status: str
57
58
59 # ---------------------------------------------------------------------------
60 # Endpoints
61 # ---------------------------------------------------------------------------
62
@app.get("/health", response_model=HealthResponse)
def health() -> dict[str, str]:
    """Report that the process is up; the payload is constant."""
    payload: dict[str, str] = {"status": "ok"}
    return payload
66
67
@app.post("/api/v1/check", response_model=CheckResponse)
def check_lyric(req: CheckRequest) -> Any:
    """Download the lyric file named by ``req.url`` and run the dedup check.

    Returns a ``CheckResponse`` on success. Failures are reported as a JSON
    body ``{"detail": ...}`` with one of these status codes:

    * 503 - the service has not been initialised by the startup hook.
    * 400 - the URL is not an accepted lyric file, or the download/decoding
      raised ``ValueError``.
    * 500 - any other exception during download or during the check.
    """

    def _fail(status_code: int, detail: str) -> JSONResponse:
        # Single place that shapes every error payload of this endpoint.
        return JSONResponse(status_code=status_code, content={"detail": detail})

    if _service is None:
        return _fail(503, "service not initialized")

    # Validate the file format first (only .txt / .lrc are accepted).
    if not _is_valid_lyric_url(req.url):
        return _fail(400, "仅支持 .txt 或 .lrc 格式的歌词文件")

    try:
        lyrics = _download_lyrics(req.url)
    except ValueError as exc:
        return _fail(400, str(exc))
    except Exception as exc:
        logger.exception("unexpected error during download")
        return _fail(500, f"下载歌词失败: {exc}")

    try:
        result = _service.check(lyrics, title=req.title, artist=req.artist, source_url=req.url)
    except Exception as exc:
        logger.exception("unexpected error during dedup check")
        return _fail(500, f"歌词去重检测失败: {exc}")

    response_fields = {
        "duplicate": result.duplicate,
        "decision": result.decision,
        "confidence": result.confidence,
        "reason": result.reason,
    }
    return CheckResponse(**response_fields)
112
113
114 # ---------------------------------------------------------------------------
115 # Helpers
116 # ---------------------------------------------------------------------------
117
118 _ENCODING_CHAIN = ("utf-8-sig", "utf-8", "gb18030", "big5")
119
120
_ALLOWED_EXTENSIONS = {".txt", ".lrc"}
# Only plain web URLs may be fetched. urllib also understands file:// and
# ftp://, which would let a caller make the server read local or internal files.
_ALLOWED_SCHEMES = {"http", "https"}


def _is_valid_lyric_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL whose path ends in .txt or .lrc.

    The check is purely syntactic: the scheme must be ``http`` or ``https``,
    a host part must be present, and the path suffix (case-insensitive, query
    string ignored) must be one of ``_ALLOWED_EXTENSIONS``. Malformed URLs
    that ``urlparse`` rejects are reported as invalid rather than raising.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
    except ValueError:
        # e.g. an unterminated IPv6 literal such as "http://[::1/a.lrc"
        return False
    # urlparse already lower-cases the scheme.
    if parsed.scheme not in _ALLOWED_SCHEMES or not parsed.netloc:
        return False
    return Path(parsed.path).suffix.lower() in _ALLOWED_EXTENSIONS
130
131
def _download_lyrics(url: str) -> str:
    """Download a lyric file and decode it with the encoding fallback chain.

    Args:
        url: http(s) URL of the lyric file. Other schemes are refused so that
            a request cannot make the server read ``file://`` or similar
            resources.

    Returns:
        The decoded text, using the first encoding in ``_ENCODING_CHAIN`` that
        decodes the payload without error.

    Raises:
        ValueError: for an unsupported scheme, any download failure (HTTP
            error, URL error, timeout, other exception), a payload larger
            than the size cap, or a payload none of the encodings can decode.
    """
    import urllib.error
    import urllib.parse
    import urllib.request

    # Lyric files are small; cap what is pulled into memory from an untrusted URL.
    max_bytes = 5 * 1024 * 1024

    try:
        scheme = urllib.parse.urlsplit(url).scheme
    except ValueError as exc:
        raise ValueError(f"下载失败: {exc}") from exc
    if scheme not in ("http", "https"):
        raise ValueError("下载失败: 仅支持 http/https 链接")

    try:
        with urllib.request.urlopen(url, timeout=_config.download_timeout if _config else 10) as resp:
            # Read one byte past the cap so an oversized body is detectable.
            data = resp.read(max_bytes + 1)
    except urllib.error.HTTPError as exc:
        raise ValueError(f"下载失败: HTTP {exc.code}") from exc
    except urllib.error.URLError as exc:
        raise ValueError(f"下载失败: {exc.reason}") from exc
    except TimeoutError as exc:
        raise ValueError("下载超时") from exc
    except Exception as exc:
        raise ValueError(f"下载失败: {exc}") from exc

    if len(data) > max_bytes:
        raise ValueError(f"下载失败: 文件超过 {max_bytes} 字节上限")

    for encoding in _ENCODING_CHAIN:
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("无法解析文件编码,支持: utf-8-sig / utf-8 / gb18030 / big5")
1 """Server configuration loaded from environment variables."""
2
3 from __future__ import annotations
4
import os
from dataclasses import dataclass
from dataclasses import field
7
8
@dataclass
class ServerConfig:
    """Server settings read from ``LYRIC_DEDUP_*`` environment variables.

    Every default is produced by a ``default_factory``, so the environment is
    read each time ``ServerConfig()`` is instantiated rather than once when
    the module is imported. Any field can still be overridden explicitly via
    the constructor. Numeric variables that do not parse raise ``ValueError``
    at instantiation.
    """

    dsn: str = field(default_factory=lambda: os.getenv("LYRIC_DEDUP_DSN", "postgresql:///lyric_dedup"))
    max_candidates: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_MAX_CANDIDATES", "5")))
    recall_limit: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_RECALL_LIMIT", "100")))
    # Only the literal string "true" (case-insensitive) enables trigram recall.
    enable_trgm: bool = field(default_factory=lambda: os.getenv("LYRIC_DEDUP_ENABLE_TRGM", "false").lower() == "true")
    trgm_threshold: float = field(default_factory=lambda: float(os.getenv("LYRIC_DEDUP_TRGM_THRESHOLD", "0.3")))
    statement_timeout_ms: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_STATEMENT_TIMEOUT_MS", "5000")))
    download_timeout: int = field(default_factory=lambda: int(os.getenv("LYRIC_DEDUP_DOWNLOAD_TIMEOUT", "10")))
1 """Core deduplication service: PostgreSQL recall + DuplicateChecker."""
2
3 from __future__ import annotations
4
5 import hashlib
6 import logging
7 from dataclasses import dataclass, field
8 from typing import Any
9
10 import psycopg
11
12 from lyric_dedup.checker import DuplicateChecker
13 from lyric_dedup.checker import DuplicateDecision
14 from lyric_dedup.checker import LyricRecord
15 from lyric_dedup.normalization import fingerprint_text
16 from lyric_dedup.normalization import normalize_lyrics
17
18 from .config import ServerConfig
19
20 logger = logging.getLogger(__name__)
21
22
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of one dedup check, as returned by ``DedupService.check``."""

    # True when the checker decided DUPLICATE or REVIEW.
    duplicate: bool
    # String value of the checker's decision enum.
    decision: str = ""
    confidence: float = 0.0
    reason: str = ""
    # Number of candidates reported by the checker result.
    candidate_count: int = 0
30
31
@dataclass
class DedupService:
    """Thin wrapper around the PostgreSQL recall + DuplicateChecker pipeline."""

    config: ServerConfig
    _logger: logging.Logger = field(default_factory=lambda: logger, repr=False)

    def check(
        self,
        lyrics_text: str,
        title: str | None = None,
        artist: str | None = None,
        source_url: str | None = None,
    ) -> CheckResult:
        """Core entry: recall candidates from PG for the given text, then decide.

        The lyric text is supplied by the caller; nothing is downloaded here.
        A new connection is opened per call. When the decision string equals
        ``"new"`` and ``source_url`` is truthy, the lyric is also inserted.

        Args:
            lyrics_text: Raw lyric text to check.
            title: Optional song title stored on the query record.
            artist: Optional artist name stored on the query record.
            source_url: Origin of the lyric; also enables the insert step.

        Returns:
            The ``CheckResult`` produced by ``_check_against_candidates``.
        """
        record = LyricRecord(
            record_id="__query__",
            lyrics=lyrics_text,
            title=title,
            artist=artist,
        )
        with psycopg.connect(self.config.dsn) as conn:
            with conn.cursor() as cursor:
                # Apply per-connection settings before any recall query runs.
                cursor.execute("select set_config('statement_timeout', %s, false)", (str(self.config.statement_timeout_ms),))
                cursor.execute("select set_config('pg_trgm.similarity_threshold', %s, false)", (str(self.config.trgm_threshold),))
            candidates = self._recall_candidates(conn, record)
            result = self._check_against_candidates(record, candidates)
            # NOTE(review): compares against the literal "new"; presumably this is
            # DuplicateDecision.NEW.value - confirm against the checker module.
            if result.decision == "new" and source_url:
                self._insert_new_record(conn, record, source_url)
        return result

    def _insert_new_record(self, conn: Any, record: LyricRecord, source_url: str) -> None:
        """Insert new lyric into PostgreSQL (lyrics + lyric_lines tables).

        The ``lyrics`` row is upserted on ``record_id`` (derived from
        ``source_url``); its ``lyric_lines`` rows are deleted and re-inserted.
        NUL characters are stripped from text values via ``_pg_text`` before
        they are sent. Commits on the given connection.
        """
        raw_text = _pg_text(record.lyrics)[0] or ""
        normalized = normalize_lyrics(raw_text)
        primary_text = _pg_text("\n".join(normalized.primary_lines))[0]
        # An empty translation is stored as NULL rather than an empty string.
        translation_text = _pg_text("\n".join(normalized.translation_lines))[0] or None
        normalized_text = _pg_text(normalized.normalized_full_text)[0]
        exact_text = fingerprint_text(normalized)
        exact_hash = hashlib.sha256(exact_text.encode("utf-8")).hexdigest()

        with conn.cursor() as cursor:
            cursor.execute(
                """
                insert into lyrics (
                    record_id, source_path, title, artist, raw_text, normalized_text,
                    primary_text, translation_text, exact_hash, split_confidence,
                    split_reason, line_count, updated_at, deleted_at
                ) values (
                    %(record_id)s, %(source_path)s, %(title)s, %(artist)s, %(raw_text)s,
                    %(normalized_text)s, %(primary_text)s, %(translation_text)s,
                    %(exact_hash)s, %(split_confidence)s, %(split_reason)s,
                    %(line_count)s, now(), null
                )
                on conflict (record_id) do update set
                    source_path = excluded.source_path, title = excluded.title,
                    artist = excluded.artist, raw_text = excluded.raw_text,
                    normalized_text = excluded.normalized_text, primary_text = excluded.primary_text,
                    translation_text = excluded.translation_text, exact_hash = excluded.exact_hash,
                    split_confidence = excluded.split_confidence, split_reason = excluded.split_reason,
                    line_count = excluded.line_count, updated_at = now(), deleted_at = null
                returning id
                """,
                {
                    "record_id": _build_record_id(source_url),
                    "source_path": source_url,
                    "title": _pg_text(record.title)[0],
                    "artist": _pg_text(record.artist)[0],
                    "raw_text": raw_text,
                    "normalized_text": normalized_text,
                    "primary_text": primary_text,
                    "translation_text": translation_text,
                    "exact_hash": exact_hash,
                    "split_confidence": _pg_text(normalized.split_confidence)[0],
                    "split_reason": _pg_text(normalized.split_reason)[0],
                    # Falls back to unique_lines when there are no primary lines.
                    "line_count": len(normalized.primary_lines or normalized.unique_lines),
                },
            )
            lyric_id = cursor.fetchone()[0]

            # Replace the per-line rows wholesale so a re-insert stays consistent.
            cursor.execute("delete from lyric_lines where lyric_id = %s", (lyric_id,))
            line_rows: list[tuple] = list(_line_rows(lyric_id, "primary", normalized.primary_lines))
            line_rows.extend(_line_rows(lyric_id, "translation", normalized.translation_lines))
            line_rows.extend(_line_rows(lyric_id, "unknown", normalized.unknown_lines))
            if line_rows:
                cursor.executemany(
                    "insert into lyric_lines (lyric_id, role, line_no, normalized_line, line_hash) values (%s, %s, %s, %s, %s)",
                    line_rows,
                )
        conn.commit()

    def _recall_candidates(self, conn: Any, record: LyricRecord) -> list[LyricRecord]:
        """Three-tier recall: exact_hash → pg_trgm → line_hash.

        Results of all tiers are merged by ``record_id``; the first tier that
        returns a record wins. Each tier is capped at ``config.recall_limit``.
        Tier 2 runs only when ``config.enable_trgm`` is set and there is
        primary text; tier 3 runs only when there are primary line hashes.
        """
        query_lyrics = _pg_text(record.lyrics)[0] or ""
        normalized = normalize_lyrics(query_lyrics)
        exact_text = fingerprint_text(normalized)
        exact_hash = hashlib.sha256(exact_text.encode("utf-8")).hexdigest()
        primary_text = "\n".join(normalized.primary_lines)
        line_hashes = [hashlib.sha256(line.encode("utf-8")).hexdigest() for line in normalized.primary_lines if line]

        candidates: dict[str, LyricRecord] = {}
        # Always empty here, so the "not any(...)" filters exclude nothing.
        exclude_record_ids: list[str] = []

        with conn.cursor() as cursor:
            # Tier 1: exact hash match
            cursor.execute(
                """
                select record_id, raw_text, title, artist
                from lyrics
                where deleted_at is null
                  and exact_hash = %s
                  and not (record_id = any(%s))
                limit %s
                """,
                (exact_hash, exclude_record_ids, self.config.recall_limit),
            )
            _add_rows(candidates, cursor.fetchall())

            # Tier 2: pg_trgm similarity (optional)
            if self.config.enable_trgm and primary_text:
                # "%%" is the escaped form of the pg_trgm "%" similarity operator.
                cursor.execute(
                    """
                    select record_id, raw_text, title, artist
                    from lyrics
                    where deleted_at is null
                      and not (record_id = any(%s))
                      and primary_text %% %s
                    order by similarity(primary_text, %s) desc
                    limit %s
                    """,
                    (exclude_record_ids, primary_text, primary_text, self.config.recall_limit),
                )
                _add_rows(candidates, cursor.fetchall())

            # Tier 3: line hash match
            if line_hashes:
                # Lyrics sharing the most primary-line hashes come first.
                cursor.execute(
                    """
                    select l.record_id, l.raw_text, l.title, l.artist
                    from lyric_lines ll
                    join lyrics l on l.id = ll.lyric_id
                    where l.deleted_at is null
                      and not (l.record_id = any(%s))
                      and ll.role = 'primary'
                      and ll.line_hash = any(%s)
                    group by l.id
                    order by count(*) desc
                    limit %s
                    """,
                    (exclude_record_ids, line_hashes, self.config.recall_limit),
                )
                _add_rows(candidates, cursor.fetchall())

        return list(candidates.values())

    def _check_against_candidates(
        self,
        record: LyricRecord,
        candidates: list[LyricRecord],
    ) -> CheckResult:
        """Run DuplicateChecker against recalled candidates.

        A fresh checker is built per call and loaded with the candidates only.
        Both DUPLICATE and REVIEW decisions set ``duplicate`` to True.
        """
        checker = DuplicateChecker()
        for candidate in candidates:
            checker.add_record(candidate)
        result = checker.check_record(record, max_candidates=self.config.max_candidates)
        return CheckResult(
            duplicate=result.decision in (DuplicateDecision.DUPLICATE, DuplicateDecision.REVIEW),
            decision=result.decision.value,
            confidence=result.confidence,
            reason=result.reason,
            candidate_count=len(result.candidates),
        )
204
205
206 def _add_rows(candidates: dict[str, LyricRecord], rows: list[tuple[object, ...]]) -> None:
207 for record_id, raw_text, title, artist in rows:
208 candidates.setdefault(
209 str(record_id),
210 LyricRecord(
211 record_id=str(record_id),
212 lyrics=str(raw_text),
213 title=str(title) if title is not None else None,
214 artist=str(artist) if artist is not None else None,
215 ),
216 )
217
218
def _build_record_id(source_url: str) -> str:
    """Build a record id of the form ``url:{sha1[:12]}:{url}`` from a URL."""
    sha1_hex = hashlib.sha1(source_url.encode("utf-8")).hexdigest()
    return "url:" + sha1_hex[:12] + ":" + source_url
223
224
def _line_rows(lyric_id: int, role: str, lines: tuple[str, ...]) -> list[tuple]:
    """Build ``lyric_lines`` insert tuples for one role.

    Each tuple is ``(lyric_id, role, line_no, line, sha256_hex)`` where
    ``line_no`` is the zero-based position, ``line`` has NUL characters
    removed, and the hash is taken over the cleaned line's UTF-8 bytes.
    """

    def _row(line_no: int, raw_line: str) -> tuple:
        cleaned = _pg_text(raw_line)[0] or ""
        digest = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()
        return (lyric_id, role, line_no, cleaned, digest)

    return [_row(line_no, raw_line) for line_no, raw_line in enumerate(lines)]
232
233
234 def _pg_text(value: str | None) -> tuple[str | None, bool]:
235 """Return (text, had_nul)."""
236 if value is None:
237 return None, False
238 if "\x00" not in value:
239 return value, False
240 return value.replace("\x00", ""), True
...@@ -8,3 +8,7 @@ psycopg[binary]>=3.2 ...@@ -8,3 +8,7 @@ psycopg[binary]>=3.2
8 pymysql>=1.1 8 pymysql>=1.1
9 cos-python-sdk-v5>=1.9 9 cos-python-sdk-v5>=1.9
10 tqdm>=4.66 10 tqdm>=4.66
11
12 # HTTP API server
13 fastapi>=0.110.0
14 uvicorn[standard]>=0.29.0
......
1 """测试环境配置,从 .env 或环境变量读取 OSS 凭据"""
2 import os
3 from pathlib import Path
4
# Auto-load a .env file located next to this module, if one exists.
_env_path = Path(__file__).parent / ".env"
if _env_path.exists():
    with open(_env_path, encoding="utf-8") as _f:
        for _line in _f:
            _line = _line.strip()
            # Skip blanks, comment lines and lines without a KEY=VALUE shape.
            if _line and not _line.startswith("#") and "=" in _line:
                _key, _value = _line.split("=", 1)
                # setdefault: variables already set in the environment win over .env.
                os.environ.setdefault(_key.strip(), _value.strip())

# OSS settings; the credentials and bucket name default to empty strings.
OSS_ACCESS_KEY_ID = os.getenv("OSS_ACCESS_KEY_ID", "")
OSS_ACCESS_KEY_SECRET = os.getenv("OSS_ACCESS_KEY_SECRET", "")
OSS_ENDPOINT = os.getenv("OSS_ENDPOINT", "oss-cn-hangzhou.aliyuncs.com")
OSS_BUCKET_NAME = os.getenv("OSS_BUCKET_NAME", "")
# The internal endpoint falls back to the public endpoint when unset.
OSS_ENDPOINT_INTERNAL = os.getenv("OSS_ENDPOINT_INTERNAL", OSS_ENDPOINT)
1 """
2 阿里云OSS文件上传模块
3 """
4 import uuid
5
6 import oss2
7 import os
8 from datetime import datetime, timedelta
9 from .config import OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET_NAME, OSS_ENDPOINT_INTERNAL
10
11
12
class OSSUploader:
    """Uploader for Alibaba Cloud OSS built on the ``oss2`` SDK."""

    def __init__(self):
        """Create the OSS auth and bucket handles from the module-level settings."""
        self.access_key_id = OSS_ACCESS_KEY_ID
        self.access_key_secret = OSS_ACCESS_KEY_SECRET
        self.endpoint = OSS_ENDPOINT
        self.bucket_name = OSS_BUCKET_NAME
        self.endpoint_internal = OSS_ENDPOINT_INTERNAL
        # Authentication object
        self.auth = oss2.Auth(self.access_key_id, self.access_key_secret)

        # Bucket handle used by every upload
        self.bucket = oss2.Bucket(self.auth, self.endpoint, self.bucket_name)

    @staticmethod
    def _endpoint_host(endpoint):
        """Return the bare host of *endpoint*, without scheme or path.

        ``str.lstrip("https://")`` must not be used for this: it strips a
        *set of characters*, so a host starting with h/t/p/s (for example
        ``https://test.example.com``) would lose its leading letters.
        """
        lowered = endpoint.lower()
        for scheme in ("https://", "http://"):
            if lowered.startswith(scheme):
                endpoint = endpoint[len(scheme):]
                break
        return endpoint.split("/")[0]

    def upload_file(self, local_file_path, oss_object_name=None):
        """
        Upload a local file to OSS.

        Args:
            local_file_path: Path of the local file.
            oss_object_name: Object name; when omitted a random UUID plus the
                file's extension is used. In both cases the final key is
                prefixed with ``public_test/{YYYYMMDD}/``.

        Returns:
            tuple: ``(True, url)`` on success, ``(False, error_message)`` on
            failure. Exceptions are caught and reported via the tuple.
        """
        try:
            if not os.path.exists(local_file_path):
                return False, "本地文件不存在"

            # Generate an object name when the caller did not supply one.
            if not oss_object_name:
                _, ext = os.path.splitext(local_file_path)
                oss_object_name = f"{uuid.uuid4()}{ext}"

            # Every upload goes under a dated folder.
            date = datetime.now().strftime("%Y%m%d")
            oss_object_name = f"public_test/{date}/{oss_object_name}"

            self.bucket.put_object_from_file(oss_object_name, local_file_path)

            # Build the URL in the bucket-subdomain form: https://{bucket}.{host}/{key}
            endpoint_host = self._endpoint_host(self.endpoint)
            file_url = f"https://{self.bucket_name}.{endpoint_host}/{oss_object_name}"

            return True, file_url

        except Exception as e:
            return False, str(e)

    def upload_data(self, data, oss_object_name):
        """
        Upload in-memory data to OSS.

        Args:
            data: Payload to upload (str or bytes).
            oss_object_name: Object name, used as-is.

        Returns:
            dict: on success ``success``, ``oss_object_name``, ``file_url``,
            ``etag`` and ``size`` (``len(data)`` for str/bytes, else 0); on
            failure ``{"success": False, "error": message}``.
        """
        try:
            result = self.bucket.put_object(oss_object_name, data)

            # NOTE(review): this URL is "{endpoint}/{bucket}/{key}", unlike the
            # bucket-subdomain form used by upload_file - confirm which form
            # callers expect before unifying.
            file_url = f"{self.endpoint.rstrip('/')}/{self.bucket_name}/{oss_object_name}"

            return {
                "success": True,
                "oss_object_name": oss_object_name,
                "file_url": file_url,
                "etag": result.etag,
                "size": len(data) if isinstance(data, (str, bytes)) else 0
            }

        except Exception as e:
            return {"success": False, "error": str(e)}
92
93
def get_bucket():
    """Build an ``oss2.Bucket`` from the module-level OSS settings."""
    credentials = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
    return oss2.Bucket(credentials, OSS_ENDPOINT, OSS_BUCKET_NAME)
99
100
def clean_expire_file():
    """Delete date-named folders under ``temp_ai/`` that are older than yesterday.

    Lists the first level below ``temp_ai/``, keeps only directory-like
    entries, parses each folder name as ``YYYYMMDD`` and removes every folder
    dated strictly before yesterday. Plain files, the root prefix itself and
    folders whose name is not a date are skipped. Any exception raised while
    listing is printed with a traceback instead of propagating.
    """
    print(f"\n[{datetime.now()}] 开始执行每日清理任务...")
    root_prefix = 'temp_ai/'
    bucket = get_bucket()

    # 1. Work out the retention threshold.
    yesterday_date = (datetime.now() - timedelta(days=1)).date()
    print(f"保留阈值: {yesterday_date} (即 {yesterday_date} 之前的数据将被删除)")

    # 2. Walk the first directory level.
    try:
        for entry in oss2.ObjectIterator(bucket, prefix=root_prefix, delimiter='/'):
            if hasattr(entry, 'prefix'):
                # Case A: entry exposes a ``prefix`` attribute - treated as a virtual directory.
                path, is_directory = entry.prefix, True
            elif hasattr(entry, 'key'):
                # Case B: entry exposes a ``key``; a trailing "/" marks a folder entry.
                path = entry.key
                is_directory = path.endswith('/')
            else:
                path, is_directory = "", False

            # Skip loose files and the root prefix itself.
            if not is_directory or path == root_prefix:
                continue

            # Folder name is the last path segment once the slashes are trimmed.
            folder_name = path.strip('/').split('/')[-1]

            try:
                folder_date = datetime.strptime(folder_name, "%Y%m%d").date()
                if folder_date < yesterday_date:
                    print(f"[删除] 发现过期目录: {path}")
                    # Removes everything under the prefix, including a folder
                    # placeholder object if one exists.
                    delete_objects_by_prefix(bucket, path)
            except ValueError:
                print(f"[跳过] 非日期命名目录: {path}")

    except Exception as e:
        import traceback
        print(f"[严重错误] 任务执行失败: {e}")
        traceback.print_exc()
170
171
def delete_objects_by_prefix(bucket, prefix):
    """Delete every object whose key starts with *prefix*, in batches of 1000.

    Errors are printed rather than raised.
    """
    print(f" -> 正在清理目录: {prefix} ...")
    pending = []
    try:
        for obj in oss2.ObjectIterator(bucket, prefix=prefix):
            pending.append(obj.key)
            if len(pending) < 1000:
                continue
            # Flush a full batch and start a new one.
            bucket.batch_delete_objects(pending)
            pending = []

        # Flush whatever is left over after the listing ends.
        if pending:
            bucket.batch_delete_objects(pending)
        print(f" -> 目录 {prefix} 清理完毕。")
    except Exception as e:
        print(f" [错误] 删除过程出错: {e}")
188
189
# Module-level OSS uploader instance, created when this module is imported.
oss_uploader = OSSUploader()

# Running this module directly performs the expired-folder cleanup.
if __name__ == '__main__':
    clean_expire_file()
...\ No newline at end of file ...\ No newline at end of file
1 """歌词去重 API 测试脚本
2
3 用法:
4 # 上传指定歌词文件并调用去重 API
5 python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc
6
7 # 指定标题和歌手
8 python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --title "夜曲" --artist "周杰伦"
9
10 # 仅上传不调用 API
11 python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --upload-only
12
13 # 仅调用 API(使用已有 URL)
14 python test_api/test_dedup_api.py --url "https://hikoon-ai-test.oss-cn-hangzhou.aliyuncs.com/temp_ai/20250603/xxx.lrc"
15
16 # 指定 API 地址
17 python test_api/test_dedup_api.py --file data/library/None_WHHY134166.lrc --api-url "http://localhost:8000"
18 """
19 import argparse
20 import json
21 import os
22 import sys
23
24 # 确保项目根目录在 path 中
25 PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
26 if str(PROJECT_ROOT) not in sys.path:
27 sys.path.insert(0, str(PROJECT_ROOT))
28
29 import urllib.request
30 import urllib.error
31
32 from test_api.config import OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT, OSS_BUCKET_NAME
33 from test_api.oss_uploader import OSSUploader
34
35
def upload_lyric_file(file_path: str) -> str:
    """Upload a lyric file to OSS and return its public URL.

    Prints the error and exits the process with status 1 when the upload fails.
    """
    ok, outcome = OSSUploader().upload_file(file_path)
    if ok:
        return outcome
    print(f"上传失败: {outcome}")
    sys.exit(1)
44
45
def call_dedup_api(url: str, title: str | None, artist: str | None, api_base: str) -> dict:
    """POST the lyric URL to ``{api_base}/api/v1/check`` and return the JSON body.

    On an HTTP error or a connection failure the problem is printed and the
    process exits with status 1.
    """
    body = json.dumps({"url": url, "title": title, "artist": artist}).encode("utf-8")
    endpoint = f"{api_base.rstrip('/')}/api/v1/check"
    request = urllib.request.Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # The API answered with an error status; show its body for diagnosis.
        error_body = exc.read().decode("utf-8", errors="replace")
        print(f"API 请求失败 (HTTP {exc.code}): {error_body}")
        sys.exit(1)
    except urllib.error.URLError as exc:
        print(f"API 请求失败: {exc.reason}")
        print("请确认 API 服务已启动: uvicorn lyric_dedup_server.app:app --host 0.0.0.0 --port 8000")
        sys.exit(1)
73
74
def main():
    """CLI entry: optionally upload a lyric file, then call the dedup API.

    Either ``--file`` (upload first) or ``--url`` (reuse an uploaded file) is
    required. With ``--upload-only`` the API call is skipped.
    """
    parser = argparse.ArgumentParser(description="歌词去重 API 测试")
    parser.add_argument("--file", "-f", help="本地歌词文件路径")
    parser.add_argument("--url", "-u", help="已上传的歌词 URL(跳过上传步骤)")
    parser.add_argument("--title", "-t", help="歌曲标题(可选)")
    parser.add_argument("--artist", "-a", help="歌手名(可选)")
    parser.add_argument("--api-url", default="http://localhost:8000", help="API 服务地址 (默认 http://localhost:8000)")
    parser.add_argument("--upload-only", action="store_true", help="仅上传到 OSS,不调用 API")
    args = parser.parse_args()

    if not (args.file or args.url):
        parser.error("需要指定 --file 或 --url")

    # Step 1: upload, or reuse the URL given on the command line.
    if args.file:
        # Relative paths are resolved against the project root.
        abs_path = args.file if os.path.isabs(args.file) else os.path.join(PROJECT_ROOT, args.file)
        if not os.path.exists(abs_path):
            print(f"文件不存在: {abs_path}")
            sys.exit(1)
        print(f"正在上传: {abs_path}")
        lyric_url = upload_lyric_file(abs_path)
        print(f"上传成功: {lyric_url}")
    else:
        lyric_url = args.url
        print(f"使用已有 URL: {lyric_url}")

    if args.upload_only:
        return

    # Step 2: call the dedup API and print the outcome.
    print("\n正在调用去重 API...")
    result = call_dedup_api(lyric_url, title=args.title, artist=args.artist, api_base=args.api_url)

    print("\n结果:")
    print(f" duplicate: {result.get('duplicate')}")
    print(f" decision: {result.get('decision', 'N/A')}")
    print(f" confidence: {result.get('confidence', 'N/A')}")
    print(f" reason: {result.get('reason', 'N/A')}")
113
114
115 if __name__ == "__main__":
116 main()
1 ## 消失的波段
2
3 ### 【主歌 1】 — *(压抑、低沉的叙事)*
4
5 霓虹灯……在车窗外退后,
6 霓虹——和夜色融为一体。
7 收音机里,只剩沙沙的电流……
8 (像你在旧地址留下的呼吸……)
9 有些习惯……总是很难去修正,
10 比如——在人群中,辨认你的背影。
11
12 ### 【主歌 2】 — *(情绪渐进,带有一丝无奈)*
13
14 朋友圈里……你更新了风景,
15 坐标是——没听过的、陌、生、城、市。
16 我们从无话不说……退回到【静音】,
17 像两条失去交集的——平行线。
18 那些没有寄出的长信……
19 最后都变成,草稿箱里的——灰、尘。
20
21 ### 【副歌】 —— *(情感爆发,高亢而撕裂)*
22
23 我们成了彼此消 逝 的 波 段 !!
24 在同一个频段……却再也无法呼喊!
25 那些同频共振的夜晚……
26 最终被淹没在——嘈杂的市中心!!
27 我调整着微弱的接收信号……
28 却只听到——时光断裂的声音!!!
29
30 ### 【桥段】 —— *(节奏加快,连续的内心追问)*
31
32 是不是所有的连接……都有保质期?!
33 到期后……就自动切断了所有联系?!
34 我们在各自的轨道里——加!速!运!行!
35 再也找不到……那天傍晚的引力。
36
37 ### 【副歌】 —— *(最后一次宣泄,带有哭腔的强音)*
38
39 我们成了彼此消 逝 的 波 段 ——!!
40 在同一个频段……却再也无法呼喊!
41 那些同频共振的夜晚……
42 最终被淹没在——嘈杂的市中心!!
43 我调整着微弱的接收信号……
44 却只听到……(时光断裂的声音)……
45
46 ### 【尾奏】 —— *(情绪下沉,最终归于死寂)*
47
48 【信号中断……请勿追赶。】
49 城市入睡……灯光渐暗……
50 一个人的波段。
51 (查……无……此……人……)
52 【 挂 断 。】
53 ### 副歌
54
55 我们成了彼此消失的波段
56 在同一个频段却再也无法呼喊
57 那些同频共振的夜晚
58 最终被淹没在嘈杂的市中心
59 我调整着微弱的接收信号
60 却只听到时光断裂的声音
61
62 ### 桥段
63
64 是不是所有的连接都有保质期
65 到期后就自动切断了所有联系
66 我们在各自的轨道里加速运行
67 再也找不到那天傍晚的引力
68
69 ### 副歌
70
71 我们成了彼此消失的波段
72 在同一个频段却再也无法呼喊
73 那些同频共振的夜晚
74 最终被淹没在嘈杂的市中心
75 我调整着微弱的接收信号
76 却只听到时光断裂的声音
77
78 ### 尾奏
79
80 信号中断,请勿追赶
81 城市入睡,灯光渐暗
82 一个人的波段
83 查无此人
84 挂断
...\ No newline at end of file ...\ No newline at end of file
...@@ -115,6 +115,121 @@ def test_fragment_of_full_song_is_not_duplicate() -> None: ...@@ -115,6 +115,121 @@ def test_fragment_of_full_song_is_not_duplicate() -> None:
115 assert result.candidates[0].primary_line_coverage < 0.72 115 assert result.candidates[0].primary_line_coverage < 0.72
116 116
117 117
def test_catalog_mashup_fragments_are_new_not_review() -> None:
    """A query stitched from two lines of each of three songs must be NEW.

    Three five-line songs are registered. The query takes two lines from each
    of them, and the expected decision is NEW rather than REVIEW.
    """
    checker = DuplicateChecker()
    checker.add_record(
        LyricRecord(
            "song-1",
            """
            第一首歌的清晨
            第一首歌的街口
            每天都在伪装幸福快乐
            还要瞒着所有人不说
            第一首歌的结尾
            """,
        )
    )
    checker.add_record(
        LyricRecord(
            "song-2",
            """
            第二首歌的海边
            第二首歌的远方
            想起那年夏天
            我们走过人群
            第二首歌的结尾
            """,
        )
    )
    checker.add_record(
        LyricRecord(
            "song-3",
            """
            第三首歌的月光
            第三首歌的旧梦
            风吹过了窗前
            你没有再回来
            第三首歌的结尾
            """,
        )
    )

    # Two lines borrowed from each registered song.
    result = checker.check(
        """
        每天都在伪装幸福快乐
        还要瞒着所有人不说
        想起那年夏天
        我们走过人群
        风吹过了窗前
        你没有再回来
        """
    )

    assert result.decision == DuplicateDecision.NEW
169
170
def test_large_mashup_with_one_recognizable_song_fragment_is_new() -> None:
    """A long mashup that embeds one registered song in full must still be NEW.

    One twelve-line song is registered. The query has 32 lines: the twelve
    lines of that song surrounded by twenty unrelated lines.
    """
    checker = DuplicateChecker()
    checker.add_record(
        LyricRecord(
            "song-1",
            """
            桃花春风十里
            花瓣飘散满地
            对不起我无法忘记你
            一去遥遥无期
            一个人一支笔
            多想你能留在我这里
            天空下起了雨
            淋湿我的心里
            久别中多少人都不是你
            屋檐下一人想起
            关于你的回忆
            无人在只剩下我自己
            """,
        )
    )

    # Unrelated lines first, then the registered song verbatim, then more unrelated lines.
    result = checker.check(
        """
        scroll through the pictures from a year ago
        the pixels change but the feelings dont grow
        an empty inbox and a dial tone heart
        we built a network just to tear it apart
        im tracking signals that have long gone cold
        living a script that has already been sold
        当我睁开了眼睛
        感受到一片的灰烬
        我的梦一直都fighting 可是我没
        也许我只有加足马力
        让他们看见都诧异
        留下的华丽的背影 才
        桃花春风十里
        花瓣飘散满地
        对不起我无法忘记你
        一去遥遥无期
        一个人一支笔
        多想你能留在我这里
        天空下起了雨
        淋湿我的心里
        久别中多少人都不是你
        屋檐下一人想起
        关于你的回忆
        无人在只剩下我自己
        疼痛感很弱
        我想我堕落
        哎呦 我逃脱
        是不是我的
        不管你拿不拿走
        我反正都不会动
        哎呦 我难过
        反复的折磨
        """
    )

    assert result.decision == DuplicateDecision.NEW
231
232
118 def test_no_effective_lyrics_use_metadata_fallback_without_empty_hash_collision() -> None: 233 def test_no_effective_lyrics_use_metadata_fallback_without_empty_hash_collision() -> None:
119 placeholder = """ 234 placeholder = """
120 作词:DJ金木 235 作词:DJ金木
......