# qwen_analyzer.py 65.3 KB
#
# Raw Blame History Permalink

# -*- coding: utf-8 -*-
"""
通义千问音乐分析器实现
"""

import os
import time
import tempfile
import subprocess
import threading
import hashlib
import csv
from datetime import datetime
from pathlib import Path
import requests
import logging
from typing import Dict, Any, Optional, Tuple, List
from concurrent.futures import ThreadPoolExecutor

from .base import AudioAnalyzer
from .prompts import (
    build_analyze_prompt,
    build_lyrics_prompt,
)
from .audio_features import (
    extract_audio_features,
    extract_beat_timestamps,
    extract_emotion_curve,
    aggregate_emotion_by_segments,
)

# 使用项目统一的配置
from app.core.config import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Column names, in order, used as the CSV field names when a row is appended
# to the music-file mapping CSV.
MUSIC_MAPPING_HEADERS = [
    "song_id",
    "audio_file_name",
    "audio_file_path",
    "source_url",
    "updated_at",
]

# For every canonical column, the header spellings that are tried (in order)
# when a value is looked up in a row read back from the mapping CSV. Each
# canonical name is paired with a Chinese header alias.
MUSIC_MAPPING_HEADER_ALIASES = {
    "song_id": ("song_id", "歌曲ID"),
    "audio_file_name": ("audio_file_name", "音频文件名"),
    "audio_file_path": ("audio_file_path", "音频文件路径"),
    "source_url": ("source_url", "原始URL"),
    "updated_at": ("updated_at", "更新时间"),
}


class QwenAnalyzer(AudioAnalyzer):
    """通义千问音乐分析器"""

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        max_retries: int = 3,
    ):
        """
        Initialise the Qwen (Tongyi Qianwen) music analyzer.

        Explicit arguments take precedence; anything left unset falls back to
        the project-wide ``settings`` object and then to a hard-coded default.

        Besides storing configuration, construction has file-system side
        effects: the music directory and the parent directory of the mapping
        CSV are created if missing, and an existing mapping CSV is read to
        seed the in-memory de-duplication set.

        Args:
            api_key: API key. When ``None``, ``settings.QWEN_API_KEY`` is
                used, then ``settings.QWEN_DASHSCOPE_API_KEY``.
            base_url: API base URL. Falls back to ``settings.QWEN_BASE_URL``,
                then to the DashScope compatible-mode endpoint.
            model: Model name. Falls back to ``settings.QWEN_MODEL``, then to
                ``"qwen3-omni-flash"``.
            max_retries: Maximum number of retries.

        Note:
            There is no ``timeout`` argument. ``self.timeout`` and
            ``self.lyrics_timeout`` are always taken from
            ``settings.QWEN_TIMEOUT`` / ``settings.QWEN_LYRICS_TIMEOUT``
            (defaults 15.0 and 90.0 seconds).
        """
        # Explicit arguments first, then the shared project settings.
        if api_key is None:
            # Priority order: QWEN_API_KEY -> QWEN_DASHSCOPE_API_KEY
            api_key = settings.QWEN_API_KEY or settings.QWEN_DASHSCOPE_API_KEY
        self.api_key = api_key
        self.base_url = (
            base_url
            or settings.QWEN_BASE_URL
            or "https://dashscope.aliyuncs.com/compatible-mode/v1"
        )
        self.model = model or settings.QWEN_MODEL or "qwen3-omni-flash"
        self.timeout =  settings.QWEN_TIMEOUT or 15.0
        self.lyrics_timeout =  settings.QWEN_LYRICS_TIMEOUT or 90.0
        # NOTE(review): ``max_retries`` defaults to 3, which is truthy, so
        # ``settings.QWEN_MAX_RETRIES`` is only consulted when a caller passes
        # a falsy value such as 0 — confirm whether that is intended.
        self.max_retries = max_retries or settings.QWEN_MAX_RETRIES or 3

        # The OpenAI-compatible client is created lazily by ``_get_client``.
        self._client = None
        # ``parents[3]`` of this file is used as the project root; relative
        # music paths from settings are resolved against it.
        self._project_root = Path(__file__).resolve().parents[3]
        self._music_dir = self._resolve_music_dir()
        self._music_mapping_path = self._resolve_music_mapping_path()
        # Guards reads/writes of ``_mapping_seen`` and appends to the CSV.
        self._mapping_lock = threading.Lock()
        # (song_id, resolved audio path) pairs already present in the CSV.
        self._mapping_seen: set[tuple[str, str]] = self._load_existing_mapping_keys()

    def _resolve_music_dir(self) -> Path:
        """Return the directory downloaded audio is stored in, creating it.

        The location comes from ``settings.MUSIC_DOWNLOAD_DIR`` (default
        ``"music"``); a relative value is resolved against the project root.
        """
        configured = getattr(settings, "MUSIC_DOWNLOAD_DIR", "music") or "music"
        music_dir = Path(str(configured).strip())
        if not music_dir.is_absolute():
            music_dir = self._project_root / music_dir
        music_dir.mkdir(parents=True, exist_ok=True)
        return music_dir

    def _resolve_music_mapping_path(self) -> Path:
        """Return the path of the mapping CSV, creating its parent directory.

        The location comes from ``settings.MUSIC_MAPPING_FILE`` (default
        ``"music/music_file_mapping.csv"``); a relative value is resolved
        against the project root. The file itself is not created here.
        """
        fallback = "music/music_file_mapping.csv"
        configured = getattr(settings, "MUSIC_MAPPING_FILE", fallback) or fallback
        mapping_file = Path(str(configured).strip())
        if not mapping_file.is_absolute():
            mapping_file = self._project_root / mapping_file
        mapping_file.parent.mkdir(parents=True, exist_ok=True)
        return mapping_file

    def _load_existing_mapping_keys(self) -> set[tuple[str, str]]:
        if not self._music_mapping_path.exists():
            return set()
        seen: set[tuple[str, str]] = set()
        try:
            with open(self._music_mapping_path, "r", encoding="utf-8-sig", newline="") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    song_id = self._get_mapping_value(row, "song_id")
                    file_path = self._get_mapping_value(row, "audio_file_path")
                    if file_path:
                        try:
                            file_path = str(Path(file_path).resolve())
                        except Exception:
                            pass
                        seen.add((song_id, file_path))
        except Exception:
            return set()
        return seen

    def _get_mapping_value(self, row: Dict[str, Any], field: str) -> str:
        """Return the stripped value of ``field`` from a mapping CSV row.

        Every header alias registered for ``field`` is tried in order (the
        field name itself when no aliases are registered); the first value
        that is not ``None`` and not blank wins. Returns ``""`` when none
        qualifies.
        """
        header_names = MUSIC_MAPPING_HEADER_ALIASES.get(field, (field,))
        for header in header_names:
            raw = row.get(header)
            if raw is None:
                continue
            text = str(raw).strip()
            if text:
                return text
        return ""

    def _extract_song_id(self, metadata: Optional[Dict[str, Any]]) -> str:
        if not metadata:
            return ""
        for key in ("歌曲ID", "song_id", "id", "track_id", "tmeid", "tmeID", "TMEID"):
            value = metadata.get(key)
            if value is not None and str(value).strip():
                return str(value).strip()
        return ""

    def _sanitize_filename_part(self, value: str) -> str:
        safe_chars = []
        for ch in value:
            if ch.isalnum() or ch in {"-", "_", "."}:
                safe_chars.append(ch)
            else:
                safe_chars.append("_")
        cleaned = "".join(safe_chars).strip("._")
        return cleaned[:80] if cleaned else "unknown"

    def _build_music_file_path(
        self,
        music_url: str,
        ext: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Path:
        song_id = self._extract_song_id(metadata)
        song_part = self._sanitize_filename_part(song_id or "unknown")
        url_hash = hashlib.md5(music_url.encode("utf-8")).hexdigest()[:12]
        return self._music_dir / f"{song_part}_{url_hash}{ext}"

    def _append_music_mapping(
        self,
        file_path: Path,
        music_url: str,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        song_id = self._extract_song_id(metadata)
        mapping_key = (song_id, str(file_path.resolve()))

        with self._mapping_lock:
            if mapping_key in self._mapping_seen:
                return

            write_header = not self._music_mapping_path.exists()
            encoding = "utf-8-sig" if write_header else "utf-8"
            with open(self._music_mapping_path, "a", encoding=encoding, newline="") as f:
                writer = csv.DictWriter(
                    f,
                    fieldnames=MUSIC_MAPPING_HEADERS,
                )
                if write_header:
                    writer.writeheader()
                writer.writerow(
                    {
                        "song_id": song_id,
                        "audio_file_name": file_path.name,
                        "audio_file_path": str(file_path.resolve()),
                        "source_url": music_url,
                        "updated_at": datetime.now().isoformat(timespec="seconds"),
                    }
                )
            self._mapping_seen.add(mapping_key)

    def _is_persisted_music_file(self, file_path: str) -> bool:
        try:
            candidate = Path(file_path).resolve()
            return candidate.parent == self._music_dir.resolve()
        except Exception:
            return False

    def _get_client(self):
        """获取 OpenAI 兼容客户端"""
        if self._client is None:
            from openai import OpenAI

            self._client = OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                timeout=self.timeout,
                max_retries=0,
            )
        return self._client

    def get_provider_name(self) -> str:
        """Return the provider identifier of this analyzer (``"qwen"``)."""
        provider = "qwen"
        return provider

    def get_model_name(self) -> str:
        return self.model

    def _call_songformer(self, music_url: str) -> Optional[Dict]:
        """
        Ask the SongFormer service for the structure of a track.

        The endpoint is read from ``settings.SONGFORMER_URL``; the request is
        a JSON POST carrying the audio URL and ``chorus_k=3`` with a
        60-second timeout.

        Args:
            music_url: URL of the audio file.

        Returns:
            The decoded JSON response, or ``None`` when the endpoint is not
            configured or the call fails for any reason.
        """
        endpoint = getattr(settings, "SONGFORMER_URL", None)
        if not endpoint:
            print("  [Qwen] SongFormer URL 未配置，跳过高潮点分析")
            return None

        request_body = {"url": music_url, "chorus_k": 3}
        try:
            print("  [Qwen] 调用 SongFormer 服务...")
            reply = requests.post(endpoint, json=request_body, timeout=60)
            reply.raise_for_status()
            payload = reply.json()
            print("  [Qwen] SongFormer 调用成功")
            return payload
        except Exception as e:
            # Best effort: the caller proceeds without SongFormer data.
            print(f"  [Qwen] SongFormer 调用失败: {e}")
            return None

    def _extract_climax_point(self, songformer_data: Optional[Dict]) -> str:
        """
        从 SongFormer 数据中提取高潮点

        Args:
            songformer_data: SongFormer 返回的数据

        Returns:
            str: "最强", "强", 或 ""
        """
        if not songformer_data:
            return ""

        # 首先尝试从 climax_points 字段获取（旧格式）
        climax_points = songformer_data.get("climax_points", {})
        if climax_points:
            # 检查是否有最强高潮
            if climax_points.get("strongest_climax"):
                return "最强"
            # 检查是否有强高潮
            if climax_points.get("strong_climax"):
                return "强"

        # 从 top_k_chorus 字段获取（新格式）
        top_k_chorus = songformer_data.get("top_k_chorus", [])
        if isinstance(top_k_chorus, list) and len(top_k_chorus) > 0:
            # 按 score 排序，取最高分作为最强高潮
            sorted_chorus = sorted(
                [
                    c
                    for c in top_k_chorus
                    if isinstance(c, dict) and c.get("score") is not None
                ],
                key=lambda x: x.get("score", 0),
                reverse=True,
            )
            if sorted_chorus:
                # 最高分 > 7.0 认为是"最强"，否则是"强"
                highest_score = sorted_chorus[0].get("score", 0)
                if highest_score > 7.0:
                    return "最强"
                else:
                    return "强"

        return ""

    def _build_climax_points(self, songformer_data: Optional[Dict]) -> Dict[str, Any]:
        """
        从 SongFormer 数据构建完整的 climax_points 对象

        Args:
            songformer_data: SongFormer 返回的数据

        Returns:
            包含 strong_climax 和 strongest_climax 的字典
        """
        if not songformer_data:
            return {
                "strong_climax": None,
                "strongest_climax": None,
                "analysis_time": 0.0,
            }

        # 首先尝试从 climax_points 字段获取（旧格式）
        climax_points = songformer_data.get("climax_points", {})
        if climax_points and (
            climax_points.get("strong_climax") or climax_points.get("strongest_climax")
        ):
            return {
                "strong_climax": climax_points.get("strong_climax"),
                "strongest_climax": climax_points.get("strongest_climax"),
                "analysis_time": climax_points.get("analysis_time", 0.0),
            }

        # 从 top_k_chorus 字段构建（新格式）
        top_k_chorus = songformer_data.get("top_k_chorus", [])
        segments = songformer_data.get("segments", [])

        if isinstance(top_k_chorus, list) and len(top_k_chorus) > 0:
            # 按 score 排序
            sorted_chorus = sorted(
                [
                    c
                    for c in top_k_chorus
                    if isinstance(c, dict) and c.get("score") is not None
                ],
                key=lambda x: x.get("score", 0),
                reverse=True,
            )

            if sorted_chorus:
                # 最高分作为 strongest_climax
                highest = sorted_chorus[0]
                highest_score = highest.get("score", 0)

                # 找到对应的段落标签
                start_time = highest.get("start", 0)
                section_label = "chorus"
                for seg in segments:
                    if isinstance(seg, dict):
                        seg_start = seg.get("start", 0)
                        seg_end = seg.get("end", 0)
                        if seg_start <= start_time < seg_end:
                            section_label = seg.get("label", "chorus")
                            break

                strongest_climax = {
                    "time": start_time,
                    "intensity": "strongest",
                    "section_label": section_label,
                    "reason": f"Highest chorus score: {highest_score:.2f}",
                }

                # 第二高作为 strong_climax（如果存在且分数差距不大）
                strong_climax = None
                if len(sorted_chorus) > 1:
                    second = sorted_chorus[1]
                    second_score = second.get("score", 0)
                    second_start = second.get("start", 0)

                    # 找到对应的段落标签
                    second_section_label = "chorus"
                    for seg in segments:
                        if isinstance(seg, dict):
                            seg_start = seg.get("start", 0)
                            seg_end = seg.get("end", 0)
                            if seg_start <= second_start < seg_end:
                                second_section_label = seg.get("label", "chorus")
                                break

                    strong_climax = {
                        "time": second_start,
                        "intensity": "strong",
                        "section_label": second_section_label,
                        "reason": f"Second highest chorus score: {second_score:.2f}",
                    }

                return {
                    "strong_climax": strong_climax,
                    "strongest_climax": strongest_climax,
                    "analysis_time": 0.0,
                }

        return {
            "strong_climax": None,
            "strongest_climax": None,
            "analysis_time": 0.0,
        }

    def analyze(
        self,
        metadata: Dict[str, Any],
        music_url: str,
        extract_lyrics: bool = False,
        label_level: int = 0,
    ) -> Optional[Dict[str, Any]]:
        """
        Analyse a track: local audio features first, then the LLM analysis.

        In light mode (``settings.MUSIC_ANALYZE_LIGHT_MODE``, default on) only
        the BPM is extracted locally and SongFormer is not called. Otherwise
        SongFormer is queried and the full local feature set plus the climax
        information derived from SongFormer is collected. A failure in the
        local stage is reported and does not stop the LLM analysis.

        Args:
            metadata: Track metadata.
            music_url: URL of the audio file.
            extract_lyrics: Whether lyrics should be recognised as well.
            label_level: Label level forwarded to the prompt builder.

        Returns:
            The analysis result with the locally extracted features merged on
            top (local values override the model's), or ``None`` when the LLM
            analysis produced no result.
        """
        client = self._get_client()

        light_mode = bool(getattr(settings, "MUSIC_ANALYZE_LIGHT_MODE", True))
        songformer_data = None if light_mode else self._call_songformer(music_url)

        # Download the audio and extract local features.
        local_features: Dict[str, Any] = {}
        tmp_file_path = None
        try:
            if light_mode:
                print("  [Qwen] 轻量模式: 仅提取 BPM")
                tmp_file_path, _ = self._download_audio(music_url, metadata=metadata)
                beat_info = extract_beat_timestamps(tmp_file_path)
                local_features = {"bpm": round(beat_info.tempo)}
                print(f"  [Qwen] 本地特征: BPM={local_features.get('bpm')}")
            else:
                print("  [Qwen] 下载音频并提取本地特征...")
                tmp_file_path, _ = self._download_audio(music_url, metadata=metadata)

                # SongFormer segments drive the per-segment emotion aggregation.
                segments = songformer_data.get("segments") if songformer_data else None
                local_features = self._extract_local_features(tmp_file_path, segments=segments)

                # Coarse climax label derived from the SongFormer data.
                climax_point = self._extract_climax_point(songformer_data)
                local_features["climax_point"] = climax_point

                # Full climax_points object.
                climax_points = self._build_climax_points(songformer_data)
                local_features["climax_points"] = climax_points

                print(
                    f"  [Qwen] 本地特征: BPM={local_features.get('bpm')}, "
                    f"段落情绪数={len(local_features.get('segment_emotions', []))}, "
                    f"高潮点={climax_point}"
                )
        except Exception as e:
            # Best effort: fall back to whatever the model estimates.
            print(f"  [Qwen] 本地特征提取失败，将使用LLM估算值: {e}")
        finally:
            # Remove the audio only when it is a throw-away file, i.e. not
            # stored directly in the persistent music directory.
            if (
                tmp_file_path
                and os.path.exists(tmp_file_path)
                and not self._is_persisted_music_file(tmp_file_path)
            ):
                try:
                    os.unlink(tmp_file_path)
                except OSError:
                    # Cleanup failure is not worth failing the analysis for;
                    # unlike a bare ``except`` this lets KeyboardInterrupt and
                    # SystemExit through.
                    pass

        # Run the LLM analysis.
        if extract_lyrics:
            result = self._analyze_with_lyrics(client, metadata, music_url, label_level)
        else:
            result = self._analyze_basic(client, metadata, music_url, label_level)

        # Locally measured values override the model's estimates.
        if result and local_features:
            result.update(local_features)

        return result

    def _analyze_basic(
        self,
        client,
        metadata: Dict[str, Any],
        music_url: str,
        label_level: int = 0,
    ) -> Optional[Dict[str, Any]]:
        """Run the single-pass label analysis without lyrics.

        The model reply is parsed; a list reply is reduced to its first
        element when that element is a dict. Anything that does not end up as
        a dict makes the method return ``None``.

        Args:
            client: Accepted for signature parity; not used by this method.
            metadata: Track metadata.
            music_url: URL of the audio file.
            label_level: Label level forwarded to the prompt builder.

        Returns:
            The normalised analysis result, or ``None`` on failure.
        """
        # The song id is passed along so failures can be traced to a track.
        song_id = self._extract_song_id(metadata)
        print(f"  [Qwen] 分析音频: 歌曲ID={song_id}")

        system_prompt, user_prompt = build_analyze_prompt(
            metadata=metadata,
            include_lyrics=False,
            label_level=label_level,
        )
        prompt = self._build_dashscope_prompt(system_prompt, user_prompt)

        response = self._call_with_retry_dashscope(
            music_url, prompt, song_id=song_id, metadata=metadata
        )
        if response is None:
            return None

        parsed = self._parse_response(response.get("content", ""))
        if isinstance(parsed, list):
            first_is_dict = bool(parsed) and isinstance(parsed[0], dict)
            parsed = parsed[0] if first_is_dict else None
        if not isinstance(parsed, dict):
            return None

        return self._normalize_result(parsed, self.model, response.get("usage"))

    def _download_audio(
        self, music_url: str, metadata: Optional[Dict[str, Any]] = None
    ) -> Tuple[str, str]:
        """
        下载音频文件到 music 目录（按 URL+歌曲ID 命名并复用缓存）

        Args:
            music_url: 音频URL
            metadata: 音乐元数据（用于提取歌曲ID生成映射表）

        Returns:
            (本地文件路径, 文件扩展名)
        """
        # 确定文件扩展名
        ext = ".mp3"
        if "." in music_url:
            url_ext = music_url.split(".")[-1].split("?")[0].lower()
            if url_ext in ["mp3", "wav", "flac", "aac", "m4a", "ogg"]:
                ext = f".{url_ext}"

        target_path = self._build_music_file_path(music_url, ext, metadata=metadata)

        if not target_path.exists():
            response = requests.get(music_url, timeout=60)
            response.raise_for_status()
            with open(target_path, "wb") as f:
                f.write(response.content)
            print(f"  [Qwen] 音频已保存: {target_path}")

        self._append_music_mapping(target_path, music_url, metadata=metadata)
        return str(target_path), ext

    def _extract_local_features(
        self,
        audio_path: str,
        segments: Optional[List[Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """
        Extract audio features from a local file.

        Args:
            audio_path: Path of the local audio file.
            segments: Optional SongFormer segment structure. When given, the
                emotion curve is aggregated per segment; otherwise the raw
                curve is returned.

        Returns:
            A dict with ``bpm`` and the beat information, plus either
            ``segment_emotions`` or ``emotion_curve``. An empty dict is
            returned when any extraction step fails.
        """
        try:
            # NOTE(review): the return value is not used below; the call is
            # kept because its side effects, if any, are not visible here.
            features = extract_audio_features(audio_path)

            beats = extract_beat_timestamps(audio_path)
            curve = extract_emotion_curve(audio_path)

            # The BPM is taken from the beat tracker rather than from
            # ``features``.
            extracted: Dict[str, Any] = {
                "bpm": round(beats.tempo),
                "beat_timestamps": beats.beat_timestamps,
                "downbeat_timestamps": beats.downbeat_timestamps,
                "beat_intervals": beats.beat_intervals,
            }

            if not segments:
                # No structure available: expose the raw emotion curve.
                extracted["emotion_curve"] = {
                    "timestamps": curve.timestamps,
                    "energy_values": curve.energy_values,
                    "valence_values": curve.valence_values,
                    "arousal_values": curve.arousal_values,
                    "values": curve.smoothed_curve,
                }
                return extracted

            # Structure available: one emotion summary per segment.
            summary_fields = (
                "start",
                "end",
                "label",
                "intensity",
                "energy",
                "valence",
                "arousal",
                "trend",
            )
            extracted["segment_emotions"] = [
                {name: getattr(summary, name) for name in summary_fields}
                for summary in aggregate_emotion_by_segments(curve, segments)
            ]
            return extracted
        except Exception as e:
            print(f"  [Qwen] 本地特征提取失败: {e}")
            return {}

    def _analyze_with_lyrics(
        self,
        client,
        metadata: Dict[str, Any],
        music_url: str,
        label_level: int = 0,
    ) -> Optional[Dict[str, Any]]:
        """Run the label analysis and the lyrics recognition concurrently.

        Two requests are submitted to a two-worker thread pool: the label
        analysis (DashScope call) and the lyrics recognition (OpenAI-compatible
        call through ``client``). The label analysis is mandatory; the lyrics
        are added to its result when they could be parsed.

        Args:
            client: OpenAI-compatible client used for the lyrics request.
            metadata: Track metadata.
            music_url: URL of the audio file.
            label_level: Label level forwarded to the prompt builder.

        Returns:
            The normalised result, extended with ``lyrics`` (when available),
            ``_timing`` and ``_token_info_parts``; ``None`` when the label
            analysis yielded no usable dict.
        """
        # The song id is passed along so failures can be traced to a track.
        song_id = self._extract_song_id(metadata)
        print(f"  [Qwen] 分析音频: 歌曲ID={song_id}")

        system_prompt, user_prompt = build_analyze_prompt(
            metadata=metadata,
            include_lyrics=False,
            label_level=label_level,
        )

        prompt = self._build_dashscope_prompt(system_prompt, user_prompt)

        lyrics_prompt = build_lyrics_prompt()

        messages_lyrics = self._build_messages(
            "请识别这段音频中的歌词内容", lyrics_prompt, music_url
        )

        print("  [Qwen] 并发执行基础标签分析和歌词识别...")
        start_time = time.time()

        result_main: Optional[Dict[str, Any]] = None
        usage_main: Optional[Dict[str, Any]] = None
        response_lyrics = None
        # Per-request wall-clock durations in seconds ("analysis" / "lyrics").
        timing: Dict[str, float] = {}

        def _timed_call_dashscope(prompt_text: str) -> tuple[Optional[Dict], float]:
            # Wraps the DashScope call and returns (response, elapsed seconds).
            call_start = time.time()
            resp = self._call_with_retry_dashscope(music_url, prompt_text, song_id=song_id, metadata=metadata)
            return resp, round(time.time() - call_start, 2)

        futures = {}
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures[executor.submit(_timed_call_dashscope, prompt)] = "main"
            futures[executor.submit(self._timed_call_openai, client, messages_lyrics)] = "lyrics"

            # Results are collected in submission order (main first); an
            # exception raised inside a worker is re-raised by ``result()``.
            for future in futures:
                part = futures[future]
                response, part_elapsed = future.result()
                if part == "lyrics":
                    timing["lyrics"] = part_elapsed
                    response_lyrics = response
                    continue
                timing["analysis"] = part_elapsed
                if response is None:
                    continue
                raw_content = response.get("content", "")
                parsed = self._parse_response(raw_content)
                if parsed is None:
                    continue
                # A list reply is reduced to its first element when that
                # element is a dict; anything else is discarded.
                if isinstance(parsed, list):
                    if parsed and isinstance(parsed[0], dict):
                        parsed = parsed[0]
                    else:
                        continue
                if not isinstance(parsed, dict):
                    continue
                result_main = parsed
                usage_main = response.get("usage")

        elapsed = time.time() - start_time
        print(f"  [Qwen] 并发调用完成，总耗时: {elapsed:.2f}s")

        # Without a label-analysis result there is nothing to return.
        if result_main is None:
            return None
        if not isinstance(result_main, dict):
            return None

        result: Dict[str, Any] = dict(result_main)

        # Attach the recognised lyrics, if any.
        if response_lyrics:
            raw_lyrics = response_lyrics.get("content", "")
            lyrics_result = self._parse_response(raw_lyrics)
            if isinstance(lyrics_result, list):
                if lyrics_result and isinstance(lyrics_result[0], dict):
                    lyrics_result = lyrics_result[0]
            # NOTE(review): this assumes ``lyrics_result`` is a dict at this
            # point; a list or string containing "lyrics" would pass the
            # membership test and then fail on indexing — confirm what
            # ``_parse_response`` can return.
            if lyrics_result and "lyrics" in lyrics_result:
                result["lyrics"] = lyrics_result["lyrics"]
        result["_timing"] = timing

        # Merge token usage. When the lyrics request reported usage, the
        # merged dict holds only the three summed token counters.
        usage: Dict[str, Any] = {}
        if usage_main:
            usage.update(usage_main)
        if response_lyrics and response_lyrics.get("usage"):
            usage_lyrics = response_lyrics["usage"]
            usage = {
                "prompt_tokens": usage.get("prompt_tokens", 0)
                + usage_lyrics.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0)
                + usage_lyrics.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0)
                + usage_lyrics.get("total_tokens", 0),
            }

        # Per-request usage is kept alongside the merged totals.
        result["_token_info_parts"] = {
            "main": usage_main,
            "lyrics": response_lyrics.get("usage") if response_lyrics else None,
        }

        return self._normalize_result(result, self.model, usage)

    def analyze_lyrics_only(
        self,
        metadata: Dict[str, Any],
        music_url: str,
    ) -> Optional[Dict[str, Any]]:
        """Recognise lyrics only, without the label analysis.

        The backend is chosen by the ``MUSIC_LYRICS_ASR_BACKEND`` environment
        variable, then by the setting of the same name (default ``funasr``):

        * ``whisper`` or ``fun``/``funasr``/``fun-asr``: that backend is
          tried up to three times with a 2-second pause in between;
        * ``omni``/``qwen-omni``/``qwen_omni``: qwen-omni is tried once, then
          funasr up to twice (2-second pause), then whisper once;
        * anything else: an error is logged and ``None`` is returned.

        Args:
            metadata: Track metadata; not used by this method.
            music_url: URL of the audio file.

        Returns:
            The first truthy backend result, or ``None`` when every attempt
            failed.
        """
        configured = os.getenv("MUSIC_LYRICS_ASR_BACKEND") or getattr(
            settings, "MUSIC_LYRICS_ASR_BACKEND", "funasr"
        )
        backend = str(configured).strip().lower()

        if backend in {"omni", "qwen-omni", "qwen_omni"}:
            # Fallback chain: qwen-omni -> funasr -> whisper.
            primary = self._analyze_lyrics_only_qwen_omni(music_url)
            if primary:
                return primary
            logger.warning(
                "qwen-omni 歌词识别失败，降级到 funasr (lyrics_timeout=%ss)",
                self.lyrics_timeout,
            )

            funasr_retries = 1
            funasr_pause = 2.0
            funasr_attempts = funasr_retries + 1
            for attempt in range(1, funasr_attempts + 1):
                secondary = self._analyze_lyrics_only_funasr(music_url)
                if secondary:
                    logger.info(
                        "funasr 降级成功: attempt=%s/%s",
                        attempt,
                        funasr_attempts,
                    )
                    return secondary

                if attempt < funasr_attempts:
                    logger.warning(
                        "funasr 降级失败，%s 秒后重试 (%s/%s)",
                        funasr_pause,
                        attempt,
                        funasr_retries,
                    )
                    time.sleep(funasr_pause)

            logger.warning("funasr 降级失败，继续降级到 whisper")
            last_resort = self._analyze_lyrics_only_whisper(music_url)
            if last_resort:
                logger.info("whisper 降级成功")
                return last_resort

            logger.error("歌词识别降级链全部失败: qwen-omni -> funasr -> whisper")
            return None

        # Single-backend mode.
        if backend == "whisper":
            analyze_fn = self._analyze_lyrics_only_whisper
        elif backend in {"fun", "funasr", "fun-asr"}:
            analyze_fn = self._analyze_lyrics_only_funasr
        else:
            logger.error(
                "不支持的歌词识别后端: %s，仅支持 whisper/funasr/qwen-omni",
                backend,
            )
            return None

        retries = 2
        pause = 2.0
        attempts = retries + 1
        for attempt in range(1, attempts + 1):
            outcome = analyze_fn(music_url)
            if outcome:
                return outcome
            # No pause after the final attempt.
            if attempt < attempts:
                logger.warning(
                    "歌词识别失败，%s 秒后重试 (%d/%d): backend=%s",
                    pause,
                    attempt,
                    retries,
                    backend,
                )
                time.sleep(pause)
        return None

    def _analyze_lyrics_only_qwen_omni(self, music_url: str) -> Optional[Dict[str, Any]]:
        """Recognise lyrics with the qwen-omni model.

        The request is issued with up to 3 retries. The parsed reply is
        unwrapped (``lyrics`` / ``lyric`` / ``歌词`` key of a dict reply) and
        converted to the common ``[{time, text}]`` shape.

        Returns:
            A result dict whose ``lyrics`` list may be empty, or ``None`` when
            the request itself failed.
        """
        # NOTE(review): an unparseable reply still yields a (truthy) result
        # with empty lyrics, so callers that test the result for truthiness
        # will not fall back to another backend — confirm this is intended.
        client = self._get_client()
        logger.info(
            "开始 qwen-omni 歌词识别: timeout=%ss, max_retries=%s",
            self.lyrics_timeout,
            3,
        )
        messages = self._build_messages(
            "请识别这段音频中的歌词内容",
            build_lyrics_prompt(),
            music_url,
        )

        response = self._call_with_retry(client, messages, max_retries=3)
        if response is None:
            return None

        parsed = self._parse_response(response.get("content", ""))
        payload: Any = parsed
        if isinstance(parsed, dict):
            # First truthy value among the known keys, else the dict itself.
            for key in ("lyrics", "lyric", "歌词"):
                value = parsed.get(key)
                if value:
                    payload = value
                    break

        return {
            "lyrics": self._convert_qwen_omni_payload_to_lyrics(payload),
            "_model": self.model,
            "_token_info": response.get("usage"),
            "_transcription_url": None,
            "_asr_task_id": None,
            "_asr_backend": "qwen-omni",
        }

    def _convert_qwen_omni_payload_to_lyrics(self, payload: Any) -> List[Dict[str, Any]]:
        """将 qwen-omni 返回的 lyric 结构统一为 [{time, text}]"""
        if payload is None:
            return []

        if isinstance(payload, str):
            lines = [line.strip() for line in payload.splitlines() if line.strip()]
            return [{"time": None, "text": line} for line in lines]

        if isinstance(payload, dict):
            candidate = (
                payload.get("lyrics")
                or payload.get("lines")
                or payload.get("歌词")
                or payload.get("lyric")
            )
            return self._convert_qwen_omni_payload_to_lyrics(candidate)

        if isinstance(payload, list):
            lyrics: List[Dict[str, Any]] = []
            for item in payload:
                if isinstance(item, str):
                    line = item.strip()
                    if line:
                        lyrics.append({"time": None, "text": line})
                    continue

                if not isinstance(item, dict):
                    continue

                text = item.get("text") or item.get("lyric") or item.get("歌词")
                if not isinstance(text, str):
                    text = str(text) if text is not None else ""
                text = text.strip()
                if not text:
                    continue

                time_str = item.get("time")
                if not isinstance(time_str, str):
                    time_str = None
                lyrics.append({"time": time_str, "text": text})
            return lyrics

        return []

    def _analyze_lyrics_only_whisper(self, music_url: str) -> Optional[Dict[str, Any]]:
        """Recognize lyrics through a whisper-1 compatible transcription endpoint.

        The audio is fetched with ``self._download_audio``; FLAC input is
        converted to WAV when the conversion succeeds, and the file is posted
        as multipart form data to the URL in ``WHISPER_API_URL``.

        Args:
            music_url: URL of the audio to transcribe.

        Returns:
            A dict holding ``lyrics`` (list of ``{time, text}``) and the
            bookkeeping keys ``_model``, ``_token_info``,
            ``_transcription_url``, ``_asr_task_id`` and ``_asr_backend``;
            ``None`` when the API key is missing or the download/request fails.
        """
        try:
            from dotenv import load_dotenv

            load_dotenv()
        except Exception:
            # Best effort: continue with the process environment as it is.
            pass

        api_key = (os.getenv("API_KEY_whisper") or os.getenv("91API_KEY") or "").strip()
        if not api_key:
            logger.error("whisper 调用失败: 缺少环境变量 API_KEY_whisper/91API_KEY")
            return None

        api_url = os.getenv(
            "WHISPER_API_URL",
            "https://xuedingmao.top/v1/audio/transcriptions",
        ).strip()
        headers = {"Authorization": f"Bearer {api_key}"}

        # Multipart content type by upload extension. Unknown extensions keep
        # the historical "audio/mpeg" default so existing callers are unaffected.
        content_types = {
            ".wav": "audio/wav",
            ".mp3": "audio/mpeg",
            ".flac": "audio/flac",
            ".m4a": "audio/mp4",
            ".aac": "audio/aac",
            ".ogg": "audio/ogg",
        }

        tmp_file_path = None
        upload_file_path = None
        try:
            tmp_file_path, ext = self._download_audio(music_url, metadata=None)
            upload_file_path = tmp_file_path
            upload_ext = ext
            if ext.lower() == ".flac":
                converted_wav = self._convert_audio_to_wav_for_whisper(tmp_file_path)
                if converted_wav:
                    upload_file_path = converted_wav
                    upload_ext = ".wav"
                    logger.info("whisper 上传文件已从 flac 转换为 wav")

            filename = f"audio{upload_ext}"
            print(f"下载完成:{filename}")

            content_type = content_types.get(upload_ext.lower(), "audio/mpeg")

            with open(upload_file_path, "rb") as audio_file:
                files = {
                    "file": (filename, audio_file, content_type),
                }
                data = {
                    "model": "whisper-1",
                    "response_format": "verbose_json",
                    "timestamp_granularities": ["segment"],
                    "prompt": "没有歌词的片段用...代替，时间戳需要精准与每句歌词进行对应，对于纯音乐直接输出‘纯音乐，禁止输出歌名，作词/作曲等元数据，仅输出歌词与时间戳’",
                }
                response = requests.post(
                    api_url,
                    headers=headers,
                    data=data,
                    files=files,
                    timeout=300,
                )
            if response.status_code >= 400:
                # Log the body before raise_for_status() discards it.
                logger.error(
                    "whisper API 返回错误: status=%s, body=%s",
                    response.status_code,
                    response.text,
                )
            response.raise_for_status()
            payload = response.json()
        except Exception as exc:
            logger.exception("whisper API 调用失败: %s", exc)
            return None
        finally:
            # Remove the downloaded file unless it is reported as persisted.
            if (
                tmp_file_path
                and os.path.exists(tmp_file_path)
                and not self._is_persisted_music_file(tmp_file_path)
            ):
                try:
                    os.unlink(tmp_file_path)
                except Exception as exc:
                    logger.debug("whisper 临时文件清理失败: %s (%s)", tmp_file_path, exc)
            # Remove the converted WAV, which is always a throw-away file.
            if (
                upload_file_path
                and upload_file_path != tmp_file_path
                and os.path.exists(upload_file_path)
            ):
                try:
                    os.unlink(upload_file_path)
                except Exception as exc:
                    logger.debug("whisper 临时文件清理失败: %s (%s)", upload_file_path, exc)

        lyrics = self._convert_whisper_payload_to_lyrics(payload)
        return {
            "lyrics": lyrics,
            "_model": "whisper-1",
            "_token_info": None,
            "_transcription_url": None,
            "_asr_task_id": None,
            "_asr_backend": "whisper",
        }

    def _convert_whisper_payload_to_lyrics(
        self, payload: Any
    ) -> List[Dict[str, Any]]:
        """将 whisper 接口响应转换为 lyrics: [{time, text}]"""
        if not isinstance(payload, dict):
            return []

        segments = payload.get("segments")
        if isinstance(segments, list):
            lyrics: List[Dict[str, Any]] = []
            for seg in segments:
                if not isinstance(seg, dict):
                    continue
                text = seg.get("text")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if not text:
                    continue

                start = seg.get("start")
                if not isinstance(start, (int, float)):
                    # 兼容部分接口返回 begin_time(毫秒)
                    begin_time = seg.get("begin_time")
                    if isinstance(begin_time, (int, float)):
                        start = float(begin_time) / 1000.0

                time_str = None
                if isinstance(start, (int, float)):
                    try:
                        time_str = self._format_asr_time_ms(float(start) * 1000)
                    except (TypeError, ValueError, OverflowError):
                        time_str = None
                lyrics.append({"time": time_str, "text": text})
            if lyrics:
                return lyrics

        text = payload.get("text")
        if isinstance(text, str) and text.strip():
            return [{"time": None, "text": text.strip()}]
        return []

    def _convert_audio_to_wav_for_whisper(self, source_audio_path: str) -> Optional[str]:
        """Convert an audio file to 16 kHz mono PCM WAV using ffmpeg.

        Args:
            source_audio_path: Path of the audio file to convert.

        Returns:
            Path of the newly created temporary ``.wav`` file (the caller is
            responsible for deleting it), or ``None`` when the conversion
            fails. On failure the temporary file is removed so nothing leaks.
        """
        wav_path: Optional[str] = None
        try:
            # Reserve a unique output path; ffmpeg overwrites it because of -y.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_tmp:
                wav_path = wav_tmp.name

            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                source_audio_path,
                "-acodec",
                "pcm_s16le",
                "-ac",
                "1",
                "-ar",
                "16000",
                wav_path,
            ]
            # Output is captured only to keep it off the console. It is left as
            # bytes so undecodable metadata in ffmpeg's log cannot fail the run.
            subprocess.run(cmd, check=True, capture_output=True)
            return wav_path
        except Exception as exc:
            logger.warning("flac 转 wav 失败，将继续使用原文件: %s", exc)
            if wav_path and os.path.exists(wav_path):
                try:
                    os.unlink(wav_path)
                except OSError as cleanup_exc:
                    logger.debug("wav 临时文件清理失败: %s (%s)", wav_path, cleanup_exc)
            return None

    def _analyze_lyrics_only_funasr(self, music_url: str) -> Optional[Dict[str, Any]]:
        """Recognize lyrics with the DashScope fun-asr SDK (asynchronous task).

        Submits ``music_url`` with ``Transcription.async_call``, polls with
        ``Transcription.fetch`` until the task reaches a terminal status or the
        polling budget is exhausted, then downloads the transcription JSON and
        converts it to lyrics.

        Args:
            music_url: URL of the audio file handed to the ASR service.

        Returns:
            A dict holding ``lyrics`` and the bookkeeping keys ``_model``,
            ``_token_info``, ``_transcription_url``, ``_asr_task_id`` and
            ``_asr_backend``; ``None`` on any failure (missing SDK or key,
            submit/fetch error, failed task, timeout, missing result URL).
        """
        try:
            from http import HTTPStatus
            import dashscope
            from dashscope.audio.asr import Transcription
        except Exception as exc:
            logger.exception("导入 dashscope.audio.asr.Transcription 失败: %s", exc)
            return None

        api_key = self._get_dashscope_api_key()
        if not api_key:
            logger.error("funasr 调用失败: 缺少 DashScope API Key")
            return None

        asr_model = getattr(settings, "DASHSCOPE_FUNASR_MODEL", "fun-asr")
        # NOTE: these are attributes of the dashscope module itself, so the
        # assignment is visible to every other user of the SDK in this process.
        dashscope.base_http_api_url = getattr(
            settings,
            "DASHSCOPE_BASE_HTTP_API_URL",
            "https://dashscope.aliyuncs.com/api/v1",
        )
        dashscope.api_key = api_key
        poll_interval = float(getattr(settings, "DASHSCOPE_ASR_POLL_INTERVAL", 1.0))
        poll_timeout = float(getattr(settings, "DASHSCOPE_ASR_POLL_TIMEOUT", 120.0))

        try:
            task_resp = Transcription.async_call(
                model=asr_model,
                file_urls=[music_url],
            )
        except Exception as exc:
            logger.exception("funasr async_call 失败: %s", exc)
            return None

        task_id = self._extract_task_id_from_asr_response(task_resp)
        latest_resp: Any = task_resp

        deadline = time.time() + poll_timeout
        while True:
            # Always inspect the most recent response before deciding to wait
            # or give up, so a task that finished on the last fetch is not
            # misreported as a timeout.
            task_status = self._extract_task_status_from_asr_response(latest_resp)
            if task_status == "SUCCEEDED":
                break
            if task_status in {"FAILED", "CANCELED"}:
                logger.error(
                    "funasr 任务失败: task_id=%s, status=%s",
                    task_id,
                    task_status,
                )
                return None
            if time.time() >= deadline:
                logger.error("funasr 轮询超时: task_id=%s", task_id)
                return None

            time.sleep(poll_interval)
            try:
                latest_resp = Transcription.fetch(
                    task=latest_resp,
                )
            except Exception as exc:
                logger.exception("funasr fetch 失败: %s", exc)
                return None

        status_code = getattr(latest_resp, "status_code", None)
        if status_code is not None and status_code != HTTPStatus.OK:
            logger.error(
                "funasr 返回非OK状态: task_id=%s, status_code=%s",
                task_id,
                status_code,
            )
            return None

        transcription_url = self._extract_transcription_url_from_asr_response(latest_resp)
        if not transcription_url:
            logger.error("funasr 结果缺少 transcription_url: task_id=%s", task_id)
            return None

        transcript_data = self._fetch_asr_transcription(transcription_url)
        if not transcript_data:
            return None

        lyrics = self._convert_asr_transcription_to_lyrics(transcript_data)
        token_info = self._extract_usage_from_asr_response(latest_resp)

        return {
            "lyrics": lyrics,
            "_model": asr_model,
            "_token_info": token_info,
            "_transcription_url": transcription_url,
            "_asr_task_id": task_id,
            "_asr_backend": "funasr",
        }

    def _submit_asr_transcription_task(self, music_url: str) -> Optional[str]:
        """Submit an asynchronous DashScope ASR task over HTTP.

        Args:
            music_url: URL of the audio file to transcribe.

        Returns:
            The stripped ``task_id`` from the response ``output``, or ``None``
            when the API key is missing, the request fails, or the response
            carries no usable task id.
        """
        api_key = self._get_dashscope_api_key()
        if not api_key:
            logger.error("提交ASR任务失败: 缺少 DashScope API Key")
            return None

        endpoint = getattr(
            settings,
            "DASHSCOPE_ASR_SUBMIT_URL",
            "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription",
        )
        model_name = getattr(settings, "DASHSCOPE_ASR_MODEL", "qwen3-asr-flash-filetrans")

        request_headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "X-DashScope-Async": "enable",
        }
        request_body = {
            "model": model_name,
            "input": {"file_url": music_url},
            "parameters": {
                "channel_id": [0],
                "enable_itn": False,
                "enable_words": False,
            },
        }

        try:
            reply = requests.post(
                endpoint,
                headers=request_headers,
                json=request_body,
                timeout=self.timeout,
            )
            reply.raise_for_status()
            body = reply.json()
        except Exception as exc:
            logger.exception("提交ASR任务异常: %s", exc)
            return None

        if not isinstance(body, dict) or not isinstance(body.get("output"), dict):
            logger.error("提交ASR任务失败: 缺少 output 字段")
            return None

        raw_task_id = body["output"].get("task_id")
        cleaned = raw_task_id.strip() if isinstance(raw_task_id, str) else ""
        if not cleaned:
            logger.error("提交ASR任务失败: 缺少 task_id")
            return None
        return cleaned

    def _poll_asr_task_result(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Poll the DashScope task endpoint until the task finishes.

        Args:
            task_id: Identifier of the task to poll.

        Returns:
            The full response dict once ``output.task_status`` is
            ``SUCCEEDED``; ``None`` when the API key is missing, a request
            fails, the task ends as ``FAILED``/``CANCELED``, or the polling
            budget runs out.
        """
        api_key = self._get_dashscope_api_key()
        if not api_key:
            logger.error("轮询ASR任务失败: 缺少 DashScope API Key")
            return None

        base = getattr(
            settings,
            "DASHSCOPE_TASK_STATUS_BASE_URL",
            "https://dashscope.aliyuncs.com/api/v1/tasks",
        )
        status_url = f"{base.rstrip('/')}/{task_id}"

        request_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-DashScope-Async": "enable",
            "Content-Type": "application/json",
        }

        interval = float(getattr(settings, "DASHSCOPE_ASR_POLL_INTERVAL", 1.0))
        budget = float(getattr(settings, "DASHSCOPE_ASR_POLL_TIMEOUT", 120.0))
        give_up_at = time.time() + budget
        failed_states = {"FAILED", "CANCELED"}

        while time.time() < give_up_at:
            try:
                reply = requests.get(
                    status_url, headers=request_headers, timeout=self.timeout
                )
                reply.raise_for_status()
                body = reply.json()
            except Exception as exc:
                logger.exception("轮询ASR任务异常: task_id=%s, error=%s", task_id, exc)
                return None

            # Dig out output.task_status, tolerating unexpected shapes.
            state = None
            if isinstance(body, dict):
                output = body.get("output")
                if isinstance(output, dict):
                    state = output.get("task_status")

            if state == "SUCCEEDED":
                return body
            if state in failed_states:
                logger.error(
                    "ASR任务失败: task_id=%s, status=%s, data=%s",
                    task_id,
                    state,
                    body,
                )
                return None

            time.sleep(interval)

        logger.error("轮询ASR任务超时: task_id=%s", task_id)
        return None

    def _fetch_asr_transcription(self, transcription_url: str) -> Optional[Dict[str, Any]]:
        """Download and parse the transcription JSON at ``transcription_url``.

        Returns the parsed document when it is a dict; ``None`` when the
        request or JSON parsing fails, or the document is not a dict.
        """
        try:
            reply = requests.get(transcription_url, timeout=self.timeout)
            reply.raise_for_status()
            parsed = reply.json()
        except Exception as exc:
            logger.exception("下载ASR转写结果失败: %s", exc)
            return None

        if isinstance(parsed, dict):
            return parsed
        return None

    def _convert_asr_transcription_to_lyrics(
        self, transcript_data: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """将ASR结果转换为 lyrics: [{time, text}]"""
        transcripts = transcript_data.get("transcripts")
        if not isinstance(transcripts, list):
            return []

        lyrics: List[Dict[str, Any]] = []
        for transcript in transcripts:
            if not isinstance(transcript, dict):
                continue

            sentences = transcript.get("sentences")
            if not isinstance(sentences, list):
                continue

            for sentence in sentences:
                if not isinstance(sentence, dict):
                    continue

                text = sentence.get("text")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if not text:
                    continue

                begin_time = sentence.get("begin_time")
                time_str = (
                    self._format_asr_time_ms(begin_time)
                    if isinstance(begin_time, (int, float))
                    else None
                )
                lyrics.append(
                    {
                        "time": time_str,
                        "text": text,
                    }
                )

        return lyrics

    @staticmethod
    def _format_asr_time_ms(ms_value: float) -> str:
        """毫秒转 mm:ss.xxx"""
        total_ms = int(max(0, ms_value))
        minutes = total_ms // 60000
        seconds = (total_ms % 60000) // 1000
        milliseconds = total_ms % 1000
        return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    def _get_dashscope_api_key(self) -> Optional[str]:
        """获取 DashScope API Key（ASR专用）"""
        return (
            self.api_key
            or settings.QWEN_DASHSCOPE_API_KEY
            or settings.QWEN_API_KEY
            or os.getenv("DASHSCOPE_API_KEY")
            or os.getenv("QWEN_DASHSCOPE_API_KEY")
            or os.getenv("QWEN_API_KEY")
        )

    @staticmethod
    def _as_dict(response_obj: Any) -> Dict[str, Any]:
        """尽可能将 SDK 响应对象转换为 dict"""
        if isinstance(response_obj, dict):
            return response_obj
        if response_obj is None:
            return {}

        for attr in ("to_dict", "as_dict", "dict"):
            fn = getattr(response_obj, attr, None)
            if callable(fn):
                try:
                    value = fn()
                    if isinstance(value, dict):
                        return value
                except Exception:
                    pass

        data: Dict[str, Any] = {}
        for key in ("request_id", "output", "usage"):
            val = getattr(response_obj, key, None)
            if val is not None:
                if key in ("output", "usage") and not isinstance(val, dict):
                    nested = QwenAnalyzer._as_dict(val)
                    data[key] = nested if nested else val
                else:
                    data[key] = val
        return data

    def _extract_task_id_from_asr_response(self, response_obj: Any) -> Optional[str]:
        data = self._as_dict(response_obj)
        output = data.get("output")
        if isinstance(output, dict):
            task_id = output.get("task_id")
            if isinstance(task_id, str) and task_id.strip():
                return task_id.strip()
        return None

    def _extract_task_status_from_asr_response(self, response_obj: Any) -> Optional[str]:
        data = self._as_dict(response_obj)
        output = data.get("output")
        if isinstance(output, dict):
            task_status = output.get("task_status")
            if isinstance(task_status, str):
                return task_status
        return None

    def _extract_transcription_url_from_asr_response(
        self, response_obj: Any
    ) -> Optional[str]:
        data = self._as_dict(response_obj)
        output = data.get("output")
        if not isinstance(output, dict):
            return None

        # 兼容 output.results: [{transcription_url: ...}]
        results = output.get("results")
        if isinstance(results, list) and results:
            first = results[0]
            if isinstance(first, dict):
                transcription_url = first.get("transcription_url")
                if isinstance(transcription_url, str) and transcription_url.strip():
                    return transcription_url.strip()

        result = output.get("result")
        if not isinstance(result, dict):
            # 兜底兼容 output.transcription_url
            transcription_url = output.get("transcription_url")
            if isinstance(transcription_url, str) and transcription_url.strip():
                return transcription_url.strip()
            return None
        transcription_url = result.get("transcription_url")
        if isinstance(transcription_url, str) and transcription_url.strip():
            return transcription_url.strip()
        return None

    def _extract_usage_from_asr_response(
        self, response_obj: Any
    ) -> Optional[Dict[str, Any]]:
        data = self._as_dict(response_obj)
        usage = data.get("usage")
        return usage if isinstance(usage, dict) else None

    def _build_messages(
        self,
        system_prompt: str,
        user_prompt: str,
        music_url: str,
    ) -> list:
        """构建消息格式"""
        messages = []

        # 添加系统提示词
        if system_prompt:
            messages.append(
                {
                    "role": "system",
                    "content": system_prompt,
                }
            )

        # 添加用户消息（包含音频和文本）
        messages.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {"data": music_url, "format": "mp3"},
                    },
                    {"type": "text", "text": user_prompt},
                ],
            }
        )

        return messages

    def _build_dashscope_prompt(self, system_prompt: str, user_prompt: str) -> str:
        """构建 DashScope 调用的文本提示词"""
        if system_prompt and system_prompt.strip():
            return f"{system_prompt.strip()}\n\n{user_prompt}".strip()
        return user_prompt.strip()

    def _timed_call_openai(
        self, client, messages: list
    ) -> tuple[Optional[Dict], float]:
        """为 OpenAI 兼容调用提供耗时统计"""
        call_start = time.time()
        resp = self._call_with_retry(client, messages)
        return resp, round(time.time() - call_start, 2)

    def _call_with_retry_dashscope(
        self,
        music_url: str,
        prompt: str,
        timeout: Optional[float] = None,
        song_id: str = "",
        metadata: Optional[Dict[str, Any]] = None,
        _allow_oss_fallback: bool = True,
    ) -> Optional[Dict]:
        """Call the DashScope multimodal API with retries and an OSS fallback.

        Each attempt goes through ``_dashscope_call_with_hard_timeout``. When
        the service reports "file size is too large", the audio is downloaded,
        converted to mono, uploaded to OSS and the request is re-issued once
        with the OSS URL.

        Args:
            music_url: Audio URL sent to the model.
            prompt: Text prompt sent alongside the audio.
            timeout: Per-call timeout in seconds; defaults to ``self.timeout``.
            song_id: Optional identifier printed with error diagnostics.
            metadata: Passed through unchanged to the fallback call.
            _allow_oss_fallback: Internal guard. The fallback call passes
                ``False`` so an OSS URL that is still too large cannot trigger
                another download/upload cycle (unbounded recursion).

        Returns:
            ``{"content": <text>, "usage": <token dict or None>}`` on success,
            ``None`` when no API key is configured or every attempt fails.
        """
        import dashscope

        dashscope_key = (
            self.api_key
            or settings.QWEN_DASHSCOPE_API_KEY
            or os.getenv("QWEN_OMNI_API_KEY")
            or os.getenv("DASHSCOPE_API_KEY")
        )
        if not dashscope_key:
            print("  ⚠ 未设置 DASHSCOPE_API_KEY 环境变量，请先配置")
            return None

        messages = [
            {
                "role": "user",
                "content": [
                    {"audio": music_url},
                    {"text": prompt},
                ],
            }
        ]

        timeout = timeout or self.timeout
        too_large_marker = "file size is too large"

        for attempt in range(1, self.max_retries + 1):
            try:
                print(
                    f"  [{self.model}] 正在分析 (DashScope 尝试 {attempt}/{self.max_retries}, timeout={timeout}s)..."
                )
                response = self._dashscope_call_with_hard_timeout(
                    dashscope=dashscope,
                    api_key=dashscope_key,
                    model=self.model,
                    messages=messages,
                    timeout=timeout,
                )

                if response.status_code != 200:
                    error_msg = getattr(response, "message", "")
                    error_code = getattr(response, "code", "")
                    error_output = getattr(response, "output", {})
                    print(
                        f"  ✗ [{self.model}] API 调用失败，状态码: {response.status_code}"
                    )
                    if song_id:
                        print(f"    歌曲ID: {song_id}")
                    if error_code:
                        print(f"    错误代码: {error_code}")
                    if error_msg:
                        print(f"    错误信息: {error_msg}")
                    if error_output:
                        print(f"    响应内容: {error_output}")

                    # Oversized file: retry once through an OSS-hosted mono copy.
                    is_too_large = (
                        too_large_marker in str(error_msg).lower()
                        or too_large_marker in str(error_output).lower()
                    )
                    if is_too_large and _allow_oss_fallback:
                        print("  [Qwen] 检测到文件过大，自动降级到 OSS 方式...")
                        try:
                            temp_audio_path = self._download_audio_temp(music_url)
                            if temp_audio_path:
                                mono_path = self._convert_to_mono(temp_audio_path)
                                oss_url = self._upload_audio_to_oss(mono_path)
                                # Delete only the converted mono file. When the
                                # conversion failed, mono_path is the original
                                # download, which must be kept.
                                if mono_path != temp_audio_path:
                                    self._cleanup_temp_audio(mono_path)
                                if oss_url:
                                    print(f"  [Qwen] 使用 OSS URL 重新请求: {oss_url[:60]}...")
                                    return self._call_with_retry_dashscope(
                                        oss_url,
                                        prompt,
                                        timeout=timeout,
                                        song_id=song_id,
                                        metadata=metadata,
                                        _allow_oss_fallback=False,
                                    )
                        except Exception as e:
                            print(f"  [Qwen] OSS 降级失败: {e}")
                            return None

                    if attempt < self.max_retries:
                        time.sleep(attempt)
                        continue
                    return None

                # Content is either a list of parts (text taken from the first
                # part) or a plain value that is returned as-is.
                content = response.output.choices[0].message.content
                if isinstance(content, list):
                    if content and isinstance(content[0], dict) and "text" in content[0]:
                        result_text = content[0]["text"]
                    else:
                        result_text = ""
                else:
                    result_text = content

                # Normalize usage to prompt/completion/total token counts,
                # whether the SDK exposes it as a dict or as an object.
                usage = None
                resp_usage = getattr(response, "usage", None)
                if isinstance(resp_usage, dict):
                    input_tokens = resp_usage.get(
                        "input_tokens", resp_usage.get("prompt_tokens", 0)
                    )
                    output_tokens = resp_usage.get(
                        "output_tokens", resp_usage.get("completion_tokens", 0)
                    )
                    total_tokens = resp_usage.get("total_tokens")
                    usage = {
                        "prompt_tokens": input_tokens or 0,
                        "completion_tokens": output_tokens or 0,
                        "total_tokens": total_tokens
                        if total_tokens is not None
                        else (input_tokens or 0) + (output_tokens or 0),
                    }
                elif resp_usage is not None:
                    input_tokens = getattr(resp_usage, "input_tokens", None)
                    output_tokens = getattr(resp_usage, "output_tokens", None)
                    total_tokens = getattr(resp_usage, "total_tokens", None)
                    usage = {
                        "prompt_tokens": input_tokens or 0,
                        "completion_tokens": output_tokens or 0,
                        "total_tokens": total_tokens
                        if total_tokens is not None
                        else (input_tokens or 0) + (output_tokens or 0),
                    }

                return {"content": result_text, "usage": usage}

            except TimeoutError:
                print(f"  ✗ [{self.model}] API 调用超时 (尝试 {attempt}/{self.max_retries})")
                if attempt < self.max_retries:
                    # Linear back-off: wait as many seconds as attempts made.
                    time.sleep(attempt)
                    continue
                return None
            except Exception as e:
                print(f"  ✗ [{self.model}] API 调用异常: {e}")
                if attempt < self.max_retries:
                    time.sleep(attempt)
                    continue
                return None

        return None

    def _download_audio_temp(self, music_url: str) -> Optional[str]:
        """
        临时下载音频文件到系统临时目录

        Args:
            music_url: 音频URL

        Returns:
            临时文件路径，如果下载失败返回 None
        """
        try:
            # 确定文件扩展名
            ext = ".mp3"
            if "." in music_url:
                url_ext = music_url.split(".")[-1].split("?")[0].lower()
                if url_ext in ["mp3", "wav", "flac", "aac", "m4a", "ogg"]:
                    ext = f".{url_ext}"

            # 下载到系统临时目录
            temp_dir = tempfile.gettempdir()
            url_hash = hashlib.md5(music_url.encode("utf-8")).hexdigest()[:12]
            temp_path = os.path.join(temp_dir, f"qwen_audio_{url_hash}{ext}")

            if not os.path.exists(temp_path):
                response = requests.get(music_url, timeout=60)
                response.raise_for_status()
                with open(temp_path, "wb") as f:
                    f.write(response.content)
                print(f"  [Qwen] 临时音频已下载: {temp_path}")
            else:
                print(f"  [Qwen] 使用缓存的临时音频")

            return temp_path
        except Exception as e:
            print(f"  [Qwen] 临时音频下载失败: {e}")
            return None

    def _convert_to_mono(self, audio_path: str) -> str:
        """
        将音频转换为单声道

        Args:
            audio_path: 原始音频文件路径

        Returns:
            转换后的音频文件路径
        """
        import time
        timestamp = int(time.time() * 1000)
        base_name = os.path.basename(audio_path)
        name_parts = base_name.rsplit(".", 1)
        if len(name_parts) == 2:
            mono_path = os.path.join(
                os.path.dirname(audio_path),
                f"{name_parts[0]}_mono_{timestamp}.{name_parts[1]}"
            )
        else:
            mono_path = f"{audio_path}_mono_{timestamp}"

        try:
            cmd = [
                "ffmpeg",
                "-i", audio_path,
                "-ac", "1",          # 转为单声道
                "-y",
                mono_path
            ]
            print(f"  [Qwen] 转换为单声道: ffmpeg -i ... -ac 1")
            subprocess.run(cmd, capture_output=True, timeout=60, check=True)
            original_size = os.path.getsize(audio_path)
            mono_size = os.path.getsize(mono_path)
            ratio = (1 - mono_size / original_size) * 100
            print(f"  [Qwen] 音频已转换: {original_size/1024/1024:.1f}MB -> {mono_size/1024/1024:.1f}MB (压缩率: {ratio:.1f}%)")
            return mono_path
        except Exception as e:
            print(f"  [Qwen] 音频转换失败: {e}，将使用原文件")
            return audio_path

    def _upload_audio_to_oss(self, audio_path: str) -> Optional[str]:
        """
        将音频文件上传到 OSS

        Args:
            audio_path: 音频文件路径

        Returns:
            OSS 文件 URL，如果上传失败返回 None
        """
        try:
            from app.utils.oss_uploader import oss_uploader

            success, result = oss_uploader.upload_file(audio_path)
            if not success:
                print(f"  [Qwen] 音频上传到 OSS 失败: {result}")
                return None

            oss_url = result
            print(f"  [Qwen] 音频已上传到 OSS: {oss_url}")
            return oss_url
        except Exception as e:
            print(f"  [Qwen] 音频上传到 OSS 失败: {e}")
            return None

    def _cleanup_temp_audio(self, temp_path: str) -> None:
        """清理临时音频文件"""
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"  [Qwen] 已清理临时音频文件")
            except:
                pass

    def _dashscope_call_with_hard_timeout(
        self,
        dashscope,
        api_key: str,
        model: str,
        messages: list,
        timeout: float,
    ):
        """
        DashScope SDK 某些版本下 request_timeout 可能无法稳定生效。
        这里增加线程级硬超时，避免单次调用无限阻塞。
        """
        box: Dict[str, Any] = {}
        done = threading.Event()

        def _target() -> None:
            try:
                box["response"] = dashscope.MultiModalConversation.call(
                    api_key=api_key,
                    model=model,
                    messages=messages,
                    request_timeout=timeout,
                )
            except Exception as exc:
                box["error"] = exc
            finally:
                done.set()

        worker = threading.Thread(target=_target, daemon=True)
        worker.start()
        hard_timeout = max(float(timeout), 1.0) + 3.0
        if not done.wait(hard_timeout):
            raise TimeoutError(f"DashScope hard timeout after {hard_timeout:.1f}s")
        if "error" in box:
            raise box["error"]
        return box.get("response")

    def _call_with_retry(
        self,
        client,
        messages: list,
        timeout: Optional[float] = None,
        max_retries: Optional[int] = None,
    ) -> Optional[Dict]:
        """带重试的 API 调用（非流式）"""
        timeout = timeout or self.lyrics_timeout
        retries = max_retries or self.max_retries

        for attempt in range(1, retries + 1):
            try:
                print(
                    f"  [Qwen] 调用模型 (尝试 {attempt}/{retries}, timeout={timeout}s)..."
                )

                response = client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    modalities=["text"],
                    stream=False,
                    timeout=timeout,
                    extra_body={"enable_thinking": False},
                )

                content = (
                    response.choices[0].message.content if response.choices else ""
                )
                usage = {
                    "prompt_tokens": response.usage.prompt_tokens
                    if response.usage
                    else 0,
                    "completion_tokens": response.usage.completion_tokens
                    if response.usage
                    else 0,
                    "total_tokens": response.usage.total_tokens
                    if response.usage
                    else 0,
                }

                print(f"  [Qwen] 响应: {content[:100]}...")

                return {"content": content, "usage": usage}

            except Exception as e:
                error_type = type(e).__name__
                print(f"  [Qwen] 错误 ({error_type}): {e}")

                if attempt < retries:
                    wait_time = attempt
                    print(f"    等待 {wait_time} 秒后重试...")
                    time.sleep(wait_time)
                else:
                    print(f"    已达到最大重试次数")
                    return None

        return None