file_import.py
2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""Import LRC/TXT lyric files into records."""
from __future__ import annotations
import hashlib
from pathlib import Path
from lyric_dedup.checker import LyricRecord
# File extensions (lowercase, leading dot included) that iter_lyric_files
# accepts; candidates are lowercased before being checked against this set.
SUPPORTED_SUFFIXES = {".lrc", ".txt"}
def iter_lyric_files(root: str | Path) -> list[Path]:
    """Collect every supported lyric file beneath *root*, in sorted order.

    The tree under *root* is walked recursively. An entry is kept when it is
    a file and its extension, compared case-insensitively, is listed in
    ``SUPPORTED_SUFFIXES``.

    Args:
        root: Directory to search.

    Returns:
        The matching paths as a sorted list.
    """
    matches = [
        candidate
        for candidate in Path(root).rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_SUFFIXES
    ]
    matches.sort()
    return matches
def read_lyric_file(path: str | Path) -> str:
    """Read a lyric file and return its contents as text.

    The raw bytes are decoded strictly with each candidate encoding in turn
    and the first one that succeeds wins. When every candidate fails, the
    bytes are decoded as UTF-8 with undecodable bytes replaced, so this
    function never raises ``UnicodeDecodeError``.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents.
    """
    raw = Path(path).read_bytes()
    # Order matters: the first encoding that decodes without error is used.
    candidates = ("utf-8-sig", "utf-8", "gb18030", "big5")
    for codec in candidates:
        try:
            text = raw.decode(codec)
        except UnicodeDecodeError:
            continue
        break
    else:
        # Nothing decoded cleanly; fall back to a lossy UTF-8 decode.
        text = raw.decode("utf-8", errors="replace")
    return text
def record_from_file(path: str | Path, *, base_dir: str | Path | None = None) -> LyricRecord:
    """Build a ``LyricRecord`` from a single lyric file.

    The lyrics are the decoded contents of the file, the title and artist
    are derived from the file name stem, and the record id is derived from
    the file's path together with *base_dir*.

    Args:
        path: Lyric file to import.
        base_dir: Optional directory passed on to the record-id helper.

    Returns:
        The populated record.
    """
    source = Path(path)
    text = read_lyric_file(source)
    song_title, song_artist = _metadata_from_name(source.stem)
    return LyricRecord(
        record_id=_record_id(source, base_dir),
        lyrics=text,
        title=song_title,
        artist=song_artist,
    )
def records_from_dir(root: str | Path) -> list[LyricRecord]:
    """Import every supported lyric file under *root* as a ``LyricRecord``.

    Files are processed in the order returned by ``iter_lyric_files`` and
    *root* is passed as the ``base_dir`` of each record.
    """
    records: list[LyricRecord] = []
    for lyric_path in iter_lyric_files(root):
        records.append(record_from_file(lyric_path, base_dir=root))
    return records
def _record_id(path: Path, base_dir: str | Path | None) -> str:
    """Return a record id of the form ``"<digest>:<source>"`` for *path*.

    ``source`` is the resolved path of the file, expressed relative to the
    resolved *base_dir* when one is given and the file lies beneath it, and
    as an absolute path otherwise. ``digest`` is the first 12 hex characters
    of the SHA-1 of ``source`` encoded as UTF-8.

    Args:
        path: The lyric file the id is for.
        base_dir: Directory the id should be relative to, or ``None``.

    Returns:
        The id string.
    """
    resolved = path.resolve()
    if base_dir is None:
        source = str(resolved)
    else:
        try:
            source = str(resolved.relative_to(Path(base_dir).resolve()))
        except ValueError:
            # The resolved file is not under base_dir (for example a symlink
            # whose target lives elsewhere). Use the absolute path instead of
            # letting one such file abort the import.
            source = str(resolved)
    digest = hashlib.sha1(source.encode("utf-8")).hexdigest()[:12]
    return f"{digest}:{source}"
def _metadata_from_name(stem: str) -> tuple[str | None, str | None]:
    """Guess ``(title, artist)`` from a lyric file's name stem.

    A trailing "歌词" ("lyrics") marker joined by ``-``, ``_`` or a space is
    removed first. The remaining name is then interpreted as:

    * ``"Artist - Title"`` when it contains ``" - "`` (split on the first
      occurrence);
    * ``"Title-Artist"`` or ``"Title_Artist"`` otherwise, split on the last
      ``-`` (tried first) or ``_``;
    * a bare title with no artist when no separator is present.

    Args:
        stem: File name without its extension.

    Returns:
        A ``(title, artist)`` tuple; either element is ``None`` when it is
        missing or empty after stripping whitespace.
    """
    cleaned = stem.removesuffix("-歌词").removesuffix("_歌词").removesuffix(" 歌词").strip()
    if " - " in cleaned:
        artist, title = cleaned.split(" - ", 1)
        return title.strip() or None, artist.strip() or None
    for sep in ("-", "_"):
        if sep in cleaned:
            title, artist = cleaned.rsplit(sep, 1)
            return title.strip() or None, artist.strip() or None
    # No separator: the whole name is the title. Use the cleaned name so the
    # "歌词" marker does not leak into the title; fall back to the raw stem
    # only when cleaning left nothing (the name was just the marker).
    return cleaned or stem.strip() or None, None