# batch_analyze_xlsx.py (16.9 KB)

# Raw | Blame | History | Permalink

# -*- coding: utf-8 -*-
"""Batch analyze audio URLs from an xlsx file and export results to xlsx."""

from __future__ import annotations

import argparse
import json
import math
import os
import sys
import traceback
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from pathlib import Path
from typing import Any

import pandas as pd

# Allow running this file directly, e.g. `python pipeline/batch_analyze_xlsx.py`:
# put the parent of this file's directory on sys.path (once) so that the
# `app` package imported below can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from app.middleware.music_analyze import analyze_music


# Columns filled in by the analysis step. A row is treated as "completed"
# as soon as any one of them holds a non-blank value.
ANALYZE_COLUMNS = [
    "表演者类型",
    "语种",
    "BPM速度",
    "情绪",
    "网络/抖音歌曲",
    "音乐风格",
    "配器",
    "场景",
]

# Ordered layout of the exported sheet: the metadata columns copied from the
# input come first, followed by every analysis column.
DEFAULT_OUTPUT_COLUMNS = [
    "tmeid",
    "歌曲ID",
    "歌曲名",
    "表演者",
    "歌曲时长",
    *ANALYZE_COLUMNS,
]


def _is_blank(value: Any) -> bool:
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return str(value).strip() == ""


def _join_multi_value(value: Any) -> str:
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        parts = [str(v).strip() for v in value if str(v).strip()]
        return "、".join(parts)
    return str(value).strip()


def _pick_first_non_blank(row: pd.Series, candidates: list[str]) -> str:
    """Return the text of the first candidate column in *row* that holds a non-blank value.

    Columns missing from the row are skipped. Whole-number floats are
    rendered without the trailing ``.0``; every other value is stringified
    and stripped. Returns ``""`` when no candidate yields a value.
    """
    for name in candidates:
        if name not in row.index or _is_blank(row[name]):
            continue
        cell = row[name]
        whole_number = isinstance(cell, float) and cell.is_integer()
        return str(int(cell)) if whole_number else str(cell).strip()
    return ""


def _normalize_key_value(value: Any) -> str:
    """Turn a key cell into comparable text.

    Blank values become ``""``, whole-number floats lose their ``.0`` suffix,
    and anything else is stringified and stripped.
    """
    if _is_blank(value):
        return ""
    whole_number = isinstance(value, float) and value.is_integer()
    return str(int(value)) if whole_number else str(value).strip()


def _resolve_url_column(df: pd.DataFrame, requested_column: str) -> str:
    if requested_column in df.columns:
        return requested_column

    candidates = ["URL", "url", "cos访问地址", "cos_url", "audio_url"]
    for col in candidates:
        if col in df.columns:
            print(
                f"[run] url column `{requested_column}` not found, fallback to `{col}`"
            )
            return col

    raise ValueError(
        f"column `{requested_column}` not found, available={list(df.columns)}"
    )


def _is_row_completed(out_df: pd.DataFrame, idx: int) -> bool:
    """Return True when row *idx* has a non-blank value in at least one analysis column.

    Analysis columns that are absent from *out_df* are ignored.
    """
    return any(
        not _is_blank(out_df.at[idx, name])
        for name in ANALYZE_COLUMNS
        if name in out_df.columns
    )


def _resolve_checkpoint_path(output_path: Path, checkpoint_path: Path | None) -> Path:
    if checkpoint_path is not None:
        return checkpoint_path
    return output_path.with_suffix(output_path.suffix + ".checkpoint.json")


def _save_progress(
    out_df: pd.DataFrame,
    output_path: Path,
    checkpoint_path: Path,
    completed_indices: set[int],
) -> None:
    """Persist the current results sheet and the checkpoint file.

    Both files are first written to a sibling ``*.tmp`` path and then moved
    over the real path, so an interrupted write does not leave a truncated
    output or checkpoint behind.

    Args:
        out_df: Result frame; only ``DEFAULT_OUTPUT_COLUMNS`` are exported,
            in that order.
        output_path: Destination xlsx file.
        checkpoint_path: Destination JSON checkpoint file.
        completed_indices: Row indices that need no further processing.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # The checkpoint may live in a different directory than the output
    # (custom --checkpoint); make sure that directory exists as well.
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_output = output_path.with_suffix(output_path.suffix + ".tmp")
    export_df = out_df[DEFAULT_OUTPUT_COLUMNS]
    export_df.to_excel(tmp_output, index=False)
    tmp_output.replace(output_path)

    payload = {
        "completed_indices": sorted(completed_indices),
        "completed_count": len(completed_indices),
        "total": int(len(export_df)),
    }
    tmp_checkpoint = checkpoint_path.with_suffix(checkpoint_path.suffix + ".tmp")
    tmp_checkpoint.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    tmp_checkpoint.replace(checkpoint_path)


def _load_checkpoint(checkpoint_path: Path) -> set[int]:
    if not checkpoint_path.exists():
        return set()
    try:
        payload = json.loads(checkpoint_path.read_text(encoding="utf-8"))
        values = payload.get("completed_indices", [])
        return {int(v) for v in values if isinstance(v, int) or str(v).isdigit()}
    except Exception:
        return set()


def _filter_checkpoint_indices(
    checkpoint_indices: set[int],
    out_df: pd.DataFrame,
    df: pd.DataFrame,
    url_column: str,
) -> set[int]:
    """
    Filter the indices recorded in a checkpoint:
    - keep rows that already carry an analysis result (avoids re-analysis)
    - keep rows whose URL is still blank (they continue to be skipped)
    - drop rows whose URL has since been filled in but that have no analysis
      result yet (so they can be analyzed on this run)

    Indices outside ``0 <= idx < len(out_df)`` are dropped as well.
    """
    row_count = len(out_df)
    has_url_column = url_column in df.columns

    def _still_done(idx: int) -> bool:
        if _is_row_completed(out_df, idx):
            return True
        # A missing URL column is treated like a blank URL.
        url = df.at[idx, url_column] if has_url_column else None
        return _is_blank(url)

    return {
        idx
        for idx in checkpoint_indices
        if 0 <= idx < row_count and _still_done(idx)
    }


def _build_metadata(row: pd.Series, metadata_columns: list[str]) -> dict[str, Any]:
    """Collect the metadata dict passed along with one row's analysis request.

    The first non-blank song-id column is stored under its own column name
    and the first non-blank tmeid variant under the key ``"tmeid"``. Every
    requested metadata column with a non-blank value is added afterwards.
    """

    def _first_filled(names: tuple[str, ...]) -> str | None:
        for name in names:
            if name in row.index and not _is_blank(row[name]):
                return name
        return None

    metadata: dict[str, Any] = {}

    # Key fields are always passed through so that downstream code can map
    # results back to the song even when they were not explicitly requested.
    song_col = _first_filled(("歌曲ID", "song_id", "id"))
    if song_col is not None:
        metadata[song_col] = row[song_col]

    tmeid_col = _first_filled(("tmeid", "tmeID", "TMEID"))
    if tmeid_col is not None:
        metadata["tmeid"] = row[tmeid_col]

    metadata.update(
        (name, row[name])
        for name in metadata_columns
        if name in row.index and not _is_blank(row[name])
    )
    return metadata


def _normalize_result(result: dict[str, Any]) -> dict[str, Any]:
    """Map a raw analysis result onto the output sheet's analysis columns.

    Multi-valued fields are flattened with ``_join_multi_value``. The
    performer type falls back to ``vocal_texture``, and the music style falls
    back to the non-empty ``genre`` / ``sub_genre`` values when
    ``music_style_tags`` is empty or missing. ``bpm`` is passed through as is.
    """
    performer = result.get("performer_type") or result.get("vocal_texture") or ""

    style_tags = result.get("music_style_tags", [])
    if not style_tags:
        style_tags = [
            tag for tag in (result.get("genre"), result.get("sub_genre")) if tag
        ]

    normalized: dict[str, Any] = {}
    normalized["表演者类型"] = str(performer).strip()
    normalized["语种"] = str(result.get("language") or "").strip()
    normalized["BPM速度"] = result.get("bpm")
    normalized["情绪"] = _join_multi_value(result.get("emotion", []))
    normalized["网络/抖音歌曲"] = _join_multi_value(result.get("douyin_tags", []))
    normalized["音乐风格"] = _join_multi_value(style_tags)
    normalized["配器"] = _join_multi_value(result.get("instrument_tags", []))
    normalized["场景"] = _join_multi_value(result.get("scene", []))
    return normalized


def _build_song_tmeid_maps(df: pd.DataFrame) -> tuple[dict[str, int], dict[str, int]]:
    """Index the rows of *df* by song id and by tmeid.

    Returns:
        ``(song_id -> row index, tmeid -> row index)``. When a key occurs on
        several rows, the first row wins.
    """
    first_row_by_song: dict[str, int] = {}
    first_row_by_tmeid: dict[str, int] = {}
    key_specs = (
        (first_row_by_song, ["歌曲ID", "song_id", "id"]),
        (first_row_by_tmeid, ["tmeid", "tmeID", "TMEID"]),
    )

    for idx, row in df.iterrows():
        for mapping, candidates in key_specs:
            key = _pick_first_non_blank(row, candidates)
            if key and key not in mapping:
                mapping[key] = int(idx)

    return first_row_by_song, first_row_by_tmeid


def _resume_from_existing_by_keys(out_df: pd.DataFrame, existing: pd.DataFrame) -> set[int]:
    """Reuse previous results by song id / tmeid when the input row count has changed.

    For every row of *out_df*, the matching row of *existing* is looked up by
    song id first and by tmeid second. On a match, all output columns present
    in *existing* are copied into *out_df* (in place).

    Returns:
        Indices of the rows that carry an analysis result after the copy.
        An empty *existing* frame yields an empty set and prints nothing.
    """
    completed: set[int] = set()
    if existing.empty:
        return completed

    old_song_map, old_tmeid_map = _build_song_tmeid_maps(existing)
    copyable = [col for col in DEFAULT_OUTPUT_COLUMNS if col in existing.columns]

    song_hits = 0
    tmeid_hits = 0
    completed_hits = 0
    for idx in out_df.index:
        song_id = _normalize_key_value(out_df.at[idx, "歌曲ID"])
        tmeid = _normalize_key_value(out_df.at[idx, "tmeid"])

        # Song id takes precedence; tmeid is only consulted as a fallback.
        if song_id and song_id in old_song_map:
            source_idx = old_song_map[song_id]
            song_hits += 1
        elif tmeid and tmeid in old_tmeid_map:
            source_idx = old_tmeid_map[tmeid]
            tmeid_hits += 1
        else:
            continue

        for col in copyable:
            out_df.at[idx, col] = existing.at[source_idx, col]

        # A matched row only counts as done if the old row had results.
        if _is_row_completed(out_df, int(idx)):
            completed.add(int(idx))
            completed_hits += 1

    print(
        "[resume] row mismatch, reused by key: "
        f"song_id_match={song_hits}, tmeid_match={tmeid_hits}, "
        f"completed={completed_hits}/{len(out_df)}"
    )
    return completed


def _analyze_one(
    idx: int,
    row: pd.Series,
    url_column: str,
    provider: str,
    extract_lyrics: bool,
    label_level: int,
    metadata_columns: list[str],
) -> tuple[int, dict[str, Any]]:
    """Analyze the audio URL of one input row.

    Returns:
        ``(idx, columns)`` where ``columns`` maps output column names to
        values. ``columns`` is empty when the URL is blank, the analysis
        returns nothing, or the analysis raises; a failure is reported on
        stdout with a short traceback and never propagates to the caller.
    """
    url = row.get(url_column)
    if _is_blank(url):
        return idx, {}

    try:
        raw = analyze_music(
            metadata=_build_metadata(row, metadata_columns),
            music_url=str(url).strip(),
            provider=provider,
            extract_lyrics=extract_lyrics,
            label_level=label_level,
        )
        columns = _normalize_result(raw) if raw else {}
    except Exception as exc:
        print(f"[warn] row={idx} analyze failed: {type(exc).__name__}: {exc}")
        print(traceback.format_exc(limit=3))
        return idx, {}
    return idx, columns


def run_batch(
    input_path: Path,
    output_path: Path,
    checkpoint_path: Path | None,
    url_column: str,
    provider: str,
    extract_lyrics: bool,
    label_level: int,
    metadata_columns: list[str],
    workers: int,
    checkpoint_every: int,
    resume: bool,
) -> None:
    """Analyze every row of an input xlsx in parallel and export the results.

    Args:
        input_path: Input xlsx file.
        output_path: Output xlsx file (also read back when resuming).
        checkpoint_path: Checkpoint JSON path, or None to derive it from
            ``output_path``.
        url_column: Preferred name of the column holding the audio URL.
        provider: Passed through to ``analyze_music``.
        extract_lyrics: Passed through to ``analyze_music``.
        label_level: Passed through to ``analyze_music``.
        metadata_columns: Input columns forwarded as metadata for each row.
        workers: Number of worker threads (values below 1 are raised to 1).
        checkpoint_every: Save progress after this many finished rows
            (values below 1 are raised to 1).
        resume: Reuse results from an existing output / checkpoint.

    Rows with a blank URL are skipped. Rows whose analysis yields nothing are
    left unmarked so that a later resumed run retries them. On Ctrl-C the
    progress is saved and the process exits with status 130.
    """
    df = pd.read_excel(input_path)
    url_column = _resolve_url_column(df, url_column)
    checkpoint_path = _resolve_checkpoint_path(output_path, checkpoint_path)
    blank_url_indices = {int(idx) for idx, value in df[url_column].items() if _is_blank(value)}

    # Start from the reference columns taken from the input metadata.
    out_df = _build_output_frame(df)

    completed_indices: set[int] = set()
    output_aligned_by_index = False
    if resume:
        if output_path.exists():
            try:
                existing = pd.read_excel(output_path)
                if len(existing) == len(out_df):
                    output_aligned_by_index = True
                    for col in DEFAULT_OUTPUT_COLUMNS:
                        if col in existing.columns:
                            # Columns read back from xlsx may be numeric (an
                            # all-blank column comes back as float64); keep
                            # them as object so string results can still be
                            # written into them later.
                            out_df[col] = existing[col].astype(object)
                    for idx in out_df.index:
                        if _is_row_completed(out_df, idx):
                            completed_indices.add(int(idx))
                    print(
                        f"[resume] loaded existing output: {len(completed_indices)}/{len(out_df)} completed"
                    )
                else:
                    completed_indices |= _resume_from_existing_by_keys(out_df, existing)
            except Exception as exc:
                print(f"[resume] failed to read existing output: {type(exc).__name__}: {exc}")

        checkpoint_completed = _load_checkpoint(checkpoint_path)
        if checkpoint_completed:
            if output_aligned_by_index:
                checkpoint_completed = _filter_checkpoint_indices(
                    checkpoint_completed, out_df, df, url_column
                )
                before = len(completed_indices)
                completed_indices |= {idx for idx in checkpoint_completed if 0 <= idx < len(out_df)}
                if len(completed_indices) != before:
                    print(
                        f"[resume] loaded checkpoint: {len(completed_indices)}/{len(out_df)} completed"
                    )
            else:
                # Checkpoint indices are positional and only meaningful when
                # the previous output lines up row-for-row with the input.
                print("[resume] ignore checkpoint due to row mismatch with previous output")

    # Rows with a blank URL are skipped outright and never analyzed.
    if blank_url_indices:
        completed_indices |= blank_url_indices
        print(f"[run] skip blank `{url_column}` rows: {len(blank_url_indices)}")

    pending_indices = [int(idx) for idx in out_df.index if int(idx) not in completed_indices]
    if not pending_indices:
        print("[resume] no pending rows, nothing to do")
        _save_progress(out_df, output_path, checkpoint_path, completed_indices)
        return

    print(
        f"[run] total={len(out_df)}, completed={len(completed_indices)}, pending={len(pending_indices)}"
    )

    workers = max(1, workers)
    checkpoint_every = max(1, checkpoint_every)
    processed_since_checkpoint = 0
    executor = ThreadPoolExecutor(max_workers=workers)
    futures = []
    try:
        for idx in pending_indices:
            row = df.iloc[idx]
            futures.append(
                executor.submit(
                    _analyze_one,
                    idx,
                    row,
                    url_column,
                    provider,
                    extract_lyrics,
                    label_level,
                    metadata_columns,
                )
            )

        pending_futures = set(futures)
        while pending_futures:
            # Wake up at least once per second so Ctrl-C is handled promptly.
            done, pending_futures = wait(
                pending_futures,
                timeout=1.0,
                return_when=FIRST_COMPLETED,
            )
            if not done:
                continue

            for future in done:
                idx, result = future.result()
                for k, v in result.items():
                    out_df.at[idx, k] = v
                # Empty results (failures) stay pending for the next run.
                if result:
                    completed_indices.add(int(idx))

                processed_since_checkpoint += 1
                if processed_since_checkpoint >= checkpoint_every:
                    _save_progress(out_df, output_path, checkpoint_path, completed_indices)
                    processed_since_checkpoint = 0
    except KeyboardInterrupt:
        print("[interrupt] received keyboard interrupt, saving checkpoint...")
        try:
            _save_progress(out_df, output_path, checkpoint_path, completed_indices)
        except Exception as exc:
            print(f"[interrupt] failed to save checkpoint: {type(exc).__name__}: {exc}")

        for future in futures:
            future.cancel()
        executor.shutdown(wait=False, cancel_futures=True)
        print("[interrupt] force exit to avoid blocking on running worker threads")
        os._exit(130)
    except Exception:
        # The run is about to fail: drop the work that has not started yet so
        # the shutdown below only waits for rows already being analyzed
        # instead of processing the whole remaining queue for nothing.
        for future in futures:
            future.cancel()
        raise
    finally:
        try:
            executor.shutdown(wait=True, cancel_futures=False)
        except Exception as exc:
            print(f"[run] executor shutdown failed: {type(exc).__name__}: {exc}")

    _save_progress(out_df, output_path, checkpoint_path, completed_indices)


def _build_output_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Create the output frame: metadata columns from *df*, blank analysis columns.

    Every column is created with ``object`` dtype so that values of mixed
    types (text labels, numeric BPM, None) can be assigned cell by cell.
    """
    base_sources = {
        "tmeid": ["tmeid", "tmeID", "TMEID"],
        "歌曲ID": ["歌曲ID", "song_id", "id"],
        "歌曲名": ["歌曲名", "歌曲名称", "title"],
        "表演者": ["表演者", "歌手", "artist"],
        "歌曲时长": ["歌曲时长", "duration"],
    }

    # One pass over the input rows fills all metadata columns at once.
    collected: dict[str, list[str]] = {name: [] for name in base_sources}
    for _, row in df.iterrows():
        for name, candidates in base_sources.items():
            collected[name].append(_pick_first_non_blank(row, candidates))

    out_df = pd.DataFrame(index=df.index)
    for name, values in collected.items():
        out_df[name] = pd.Series(values, index=df.index, dtype=object)
    for col in DEFAULT_OUTPUT_COLUMNS:
        if col not in out_df.columns:
            out_df[col] = pd.Series("", index=df.index, dtype=object)
    return out_df


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the batch analysis script."""
    # (flags, add_argument keyword options), registered in this order.
    option_specs: list[tuple[tuple[str, ...], dict]] = [
        (("--input",), {"required": True, "help": "input xlsx path"}),
        (("--output",), {"required": True, "help": "output xlsx path"}),
        (
            ("--checkpoint",),
            {
                "default": "",
                "help": "checkpoint json path (default: <output>.checkpoint.json)",
            },
        ),
        (("--url-column",), {"default": "URL", "help": "url column name"}),
        (("--provider",), {"default": "qwen", "choices": ["qwen", "doubao"]}),
        (
            ("--extract-lyrics",),
            {"action": "store_true", "help": "enable lyrics extraction"},
        ),
        (("--label-level",), {"type": int, "default": 0, "choices": [0, 1]}),
        (
            ("--metadata-columns",),
            {
                "default": "tmeID,歌曲名称,歌曲名,歌手,表演者,版本,词作者,曲作者",
                "help": "comma separated metadata columns",
            },
        ),
        (("--workers",), {"type": int, "default": 3, "help": "parallel workers"}),
        (
            ("--checkpoint-every",),
            {
                "type": int,
                "default": 10,
                "help": "save checkpoint every N processed rows",
            },
        ),
        (
            ("--no-resume",),
            {
                "action": "store_true",
                "help": "disable resume from existing output/checkpoint",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description="Batch audio analysis from xlsx")
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()


def main() -> None:
    """Script entry point: parse the CLI options and run the batch analysis."""
    args = parse_args()

    # Comma-separated list -> trimmed, non-empty column names.
    trimmed_names = (part.strip() for part in args.metadata_columns.split(","))
    metadata_columns = [name for name in trimmed_names if name]

    # An empty or whitespace-only --checkpoint means "derive it from the output".
    checkpoint_path = Path(args.checkpoint) if args.checkpoint.strip() else None

    run_batch(
        input_path=Path(args.input),
        output_path=Path(args.output),
        checkpoint_path=checkpoint_path,
        url_column=args.url_column,
        provider=args.provider,
        extract_lyrics=args.extract_lyrics,
        label_level=args.label_level,
        metadata_columns=metadata_columns,
        workers=args.workers,
        checkpoint_every=args.checkpoint_every,
        resume=not args.no_resume,
    )


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()