# file_import.py (2.04 KB)
"""Import LRC/TXT lyric files into records."""

from __future__ import annotations

import hashlib
from pathlib import Path

from lyric_dedup.checker import LyricRecord


# File extensions (lower-case, including the leading dot) that are treated as
# lyric files; candidates are compared against this set after lower-casing.
SUPPORTED_SUFFIXES = {".lrc", ".txt"}


def iter_lyric_files(root: str | Path) -> list[Path]:
    """Return every supported lyric file found under *root*, sorted.

    The directory tree is walked recursively. Only regular files whose
    extension, compared case-insensitively, is in ``SUPPORTED_SUFFIXES``
    are kept. The result is sorted by path.
    """
    matches: list[Path] = []
    for candidate in Path(root).rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in SUPPORTED_SUFFIXES:
            matches.append(candidate)
    matches.sort()
    return matches


def read_lyric_file(path: str | Path) -> str:
    """Read a lyric file and decode it to text, guessing the encoding.

    The raw bytes are decoded with the first candidate that succeeds:

    1. UTF-32 / UTF-16 when the data starts with the matching byte-order
       mark (the BOM is consumed by the codec).
    2. UTF-8, with or without a BOM (``utf-8-sig`` accepts both).
    3. GB18030, then Big5.

    If nothing decodes cleanly, the bytes are decoded as UTF-8 with
    undecodable bytes replaced, so this function never raises
    ``UnicodeDecodeError``.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be read.
    """
    data = Path(path).read_bytes()

    # BOM-tagged UTF-16/32 would otherwise fail every candidate below and
    # end up in the lossy fallback. UTF-32 is checked first because its
    # little-endian BOM starts with the UTF-16 little-endian BOM.
    bom_encodings = (
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )
    for bom, encoding in bom_encodings:
        if data.startswith(bom):
            try:
                return data.decode(encoding)
            except UnicodeDecodeError:
                continue

    # "utf-8-sig" also decodes BOM-less UTF-8, so a separate "utf-8" attempt
    # could never succeed after it has failed.
    for encoding in ("utf-8-sig", "gb18030", "big5"):
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            continue

    # Best effort: keep whatever is readable rather than failing the import.
    return data.decode("utf-8", errors="replace")


def record_from_file(path: str | Path, *, base_dir: str | Path | None = None) -> LyricRecord:
    """Build a ``LyricRecord`` from a single lyric file.

    The lyrics are the decoded file content, the title and artist are
    guessed from the file name without its extension, and the record id is
    derived from the file's path (relative to *base_dir* when it is given).

    Args:
        path: The lyric file to import.
        base_dir: Optional directory the record id should be relative to.
    """
    source = Path(path)
    text = read_lyric_file(source)
    guessed_title, guessed_artist = _metadata_from_name(source.stem)
    return LyricRecord(
        record_id=_record_id(source, base_dir),
        lyrics=text,
        title=guessed_title,
        artist=guessed_artist,
    )


def records_from_dir(root: str | Path) -> list[LyricRecord]:
    """Import every supported lyric file under *root* as a record.

    Files are processed in the order ``iter_lyric_files`` returns them, and
    *root* is passed as the base directory for each record id.
    """
    records = []
    for lyric_path in iter_lyric_files(root):
        records.append(record_from_file(lyric_path, base_dir=root))
    return records


def _record_id(path: Path, base_dir: str | Path | None) -> str:
    """Derive a record id of the form ``"<digest>:<source>"`` for *path*.

    ``source`` is the resolved path of the file, expressed relative to the
    resolved *base_dir* when one is given and the file lies inside it, and
    as an absolute path otherwise. ``digest`` is the first 12 hex characters
    of the SHA-1 of ``source`` encoded as UTF-8.

    Args:
        path: The lyric file.
        base_dir: Directory to make the id relative to, or ``None``.

    Returns:
        The record id string.
    """
    resolved = path.resolve()
    if base_dir is None:
        source = str(resolved)
    else:
        try:
            source = str(resolved.relative_to(Path(base_dir).resolve()))
        except ValueError:
            # resolve() follows symlinks, so a link inside base_dir may point
            # outside it. Fall back to the absolute path instead of letting
            # one such file abort the whole import.
            source = str(resolved)
    digest = hashlib.sha1(source.encode("utf-8")).hexdigest()[:12]
    return f"{digest}:{source}"


def _metadata_from_name(stem: str) -> tuple[str | None, str | None]:
    """Guess ``(title, artist)`` from a lyric file name without extension.

    A trailing "歌词" ("lyrics") marker attached with "-", "_" or a space is
    dropped first. The remaining name is then interpreted as:

    * ``"Artist - Title"`` when it contains a spaced hyphen (split at the
      first ``" - "``);
    * ``"Title-Artist"`` or ``"Title_Artist"`` otherwise (split at the last
      separator, with "-" checked before "_");
    * a bare title with no artist when there is no separator at all.

    Parts that are empty after stripping whitespace are returned as ``None``.
    """
    cleaned = stem
    for marker in ("-歌词", "_歌词", " 歌词"):
        cleaned = cleaned.removesuffix(marker)
    cleaned = cleaned.strip()

    if " - " in cleaned:
        artist, title = cleaned.split(" - ", 1)
        return title.strip() or None, artist.strip() or None

    for sep in ("-", "_"):
        if sep in cleaned:
            title, artist = cleaned.rsplit(sep, 1)
            return title.strip() or None, artist.strip() or None

    # Use the cleaned name, not the raw stem, so the "歌词" marker removed
    # above does not reappear in the title.
    return cleaned or None, None