# audio_features.py
"""
音频特征提取模块
提供音频特征提取、节奏强度和能量级别计算功能
"""

import os
import warnings
import numpy as np
import librosa
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass

from .bpm_analyzer_tools import RealtimeBPMAnalyzerTest

# 抑制 librosa 的 audioread 弃用警告
warnings.filterwarnings("ignore", category=FutureWarning, module="librosa")


@dataclass
class AudioFeatures:
    """Frame-level and global audio features extracted from one track."""

    # Time-domain features
    rms_energy: np.ndarray  # per-frame RMS energy in dB (relative to peak; see extract_audio_features)
    rms_times: np.ndarray  # timestamp (seconds) of each RMS frame

    # Frequency-domain features
    spectral_centroid: np.ndarray  # per-frame spectral centroid (perceived brightness)
    spectral_rolloff: np.ndarray  # per-frame spectral rolloff frequency
    spectral_bandwidth: np.ndarray  # per-frame spectral bandwidth

    # Rhythm features
    onset_strength: np.ndarray  # per-frame onset-strength envelope
    tempo: float  # estimated BPM (octave-corrected by the BPM analyzer)

    # Summary info
    duration: float  # total track duration in seconds
    sr: int  # sample rate in Hz


def extract_audio_features(audio_path: str, hop_length: int = 512) -> AudioFeatures:
    """
    Extract frame-level and global audio features from an audio file.

    Args:
        audio_path: path to the audio file
        hop_length: hop size in samples (default 512 ≈ 11.6 ms @ 44.1 kHz)

    Returns:
        AudioFeatures: populated feature container
    """
    # Load at the file's native sample rate, mixed down to mono.
    signal, sample_rate = librosa.load(audio_path, sr=None, mono=True)

    # 1. Time-domain loudness: per-frame RMS, later converted to dB vs. peak.
    rms_frames = librosa.feature.rms(y=signal, hop_length=hop_length)[0]
    frame_times = librosa.frames_to_time(
        np.arange(len(rms_frames)), sr=sample_rate, hop_length=hop_length
    )

    # 2. Spectral shape descriptors (all share the same framing).
    spec_args = dict(y=signal, sr=sample_rate, hop_length=hop_length)
    centroid = librosa.feature.spectral_centroid(**spec_args)[0]
    rolloff = librosa.feature.spectral_rolloff(**spec_args)[0]
    bandwidth = librosa.feature.spectral_bandwidth(**spec_args)[0]

    # 3. Onset-strength envelope for downstream rhythm analysis.
    onset_envelope = librosa.onset.onset_strength(
        y=signal, sr=sample_rate, hop_length=hop_length
    )

    # Unified BPM entry point (includes octave-error correction).
    analyzer = RealtimeBPMAnalyzerTest(verbose=False)
    tempo_bpm = analyzer.analyze_bpm(y=signal, sr=sample_rate).get('bpm', 120.0)

    return AudioFeatures(
        rms_energy=librosa.amplitude_to_db(rms_frames, ref=np.max),
        rms_times=frame_times,
        spectral_centroid=centroid,
        spectral_rolloff=rolloff,
        spectral_bandwidth=bandwidth,
        onset_strength=onset_envelope,
        tempo=tempo_bpm,
        duration=librosa.get_duration(y=signal, sr=sample_rate),
        sr=int(sample_rate),
    )


def calculate_rhythm_intensity(features: AudioFeatures) -> int:
    """
    Rate rhythm intensity on a 1-5 scale from extracted audio features.

    Blends three signals:
    - BPM (speed)
    - onset strength (rhythmic density)
    - RMS variation (dynamic range)

    Args:
        features: extracted audio features

    Returns:
        int: rhythm intensity (1-5)
    """
    def _grade(value, bounds):
        # bounds are descending thresholds mapping to scores 5, 4, 3, 2;
        # anything below all of them (including NaN) scores 1.
        score = 5
        for bound in bounds:
            if value >= bound:
                return score
            score -= 1
        return 1

    # 1. BPM score (roughly 40-200 BPM mapped onto 1-5).
    tempo_score = _grade(features.tempo, (160, 130, 100, 70))

    # 2. Onset-density score: mean onset strength relative to its peak.
    onset = features.onset_strength
    peak = np.max(onset) if len(onset) > 0 else 1
    density = np.mean(onset) / peak if peak > 0 else 0
    density_score = _grade(density, (0.5, 0.4, 0.3, 0.2))

    # 3. Dynamic-range score from the spread of the RMS curve (dB).
    dynamic_score = _grade(np.std(features.rms_energy), (15, 12, 9, 6))

    # Weighted blend: BPM 40%, density 35%, dynamics 25%.
    blended = tempo_score * 0.4 + density_score * 0.35 + dynamic_score * 0.25
    return int(round(blended))


def calculate_energy_level(
    features: AudioFeatures,
) -> Tuple[int, Dict[str, float]]:
    """
    Compute an energy level (1-5) plus per-component details.

    Args:
        features: extracted audio features

    Returns:
        Tuple[int, Dict]: (energy level 1-5, detail dict with component scores)
    """
    # 1. Loudness score from RMS (dB): map roughly -60..-10 dB onto 0..5,
    #    then take the 75th percentile so louder passages dominate.
    rms_db = features.rms_energy
    loudness_normalized = np.clip((rms_db + 60) / 10, 0, 5)
    loudness_score = float(np.percentile(loudness_normalized, 75))

    # 2. Brightness score from spectral centroid (normalized at 4 kHz).
    centroid = features.spectral_centroid
    centroid_normalized = np.clip(centroid / 4000, 0, 1)
    brightness_score = float(np.mean(centroid_normalized)) * 5

    # 3. Rhythm score from onset strength, normalized by its 90th percentile.
    #    Guard against an empty or all-zero onset envelope (e.g. silence),
    #    which previously divided by zero and produced NaN scores.
    onset = features.onset_strength
    onset_ref = float(np.percentile(onset, 90)) if len(onset) > 0 else 0.0
    if onset_ref > 0:
        onset_normalized = np.clip(onset / onset_ref, 0, 1)
        rhythm_score = float(np.mean(onset_normalized)) * 5
    else:
        rhythm_score = 0.0

    # 4. Tempo multiplier: faster music inflates the composite score.
    tempo = features.tempo
    if tempo > 140:
        tempo_factor = 1.3
    elif tempo > 120:
        tempo_factor = 1.15
    elif tempo > 100:
        tempo_factor = 1.0
    elif tempo > 80:
        tempo_factor = 0.9
    else:
        tempo_factor = 0.8

    # Weighted composite of the three component scores.
    weights = {"loudness": 0.40, "brightness": 0.25, "rhythm": 0.35}

    composite_score = (
        weights["loudness"] * loudness_score
        + weights["brightness"] * brightness_score
        + weights["rhythm"] * rhythm_score
    ) * tempo_factor

    # Map the composite score onto discrete levels 1-5.
    if composite_score < 1.5:
        energy_level = 1
    elif composite_score < 2.5:
        energy_level = 2
    elif composite_score < 3.5:
        energy_level = 3
    elif composite_score < 4.5:
        energy_level = 4
    else:
        energy_level = 5

    details = {
        "loudness_score": round(loudness_score, 2),
        "brightness_score": round(brightness_score, 2),
        "rhythm_score": round(rhythm_score, 2),
        "tempo_factor": tempo_factor,
        "composite_score": round(composite_score, 2),
    }

    return energy_level, details


def energy_level_to_string(level: int) -> str:
    """
    Convert a numeric energy level into its descriptive label.

    Args:
        level: energy level (1-5)

    Returns:
        str: energy-density description (falls back to the mid label
             for out-of-range input)
    """
    try:
        return {
            1: "舒缓",
            2: "柔和",
            3: "律动",
            4: "强烈",
            5: "爆发",
        }[level]
    except KeyError:
        return "律动"


@dataclass
class BeatInfo:
    """Beat-tracking results for one audio track."""
    beat_timestamps: List[float]      # all beat times, in seconds
    downbeat_timestamps: List[float]  # downbeat times (first beat of each bar)
    tempo: float                       # BPM
    beat_intervals: List[float]       # inter-beat intervals (for tempo-change detection)


@dataclass
class EmotionCurve:
    """Sliding-window emotion curve data."""
    timestamps: List[float]           # window start times, in seconds
    energy_values: List[float]        # energy (0-1)
    valence_values: List[float]       # valence (0-1; low = sad, high = happy)
    arousal_values: List[float]       # arousal (0-1; low = calm, high = excited)
    smoothed_curve: List[float]       # smoothed composite emotion curve


@dataclass
class SegmentEmotion:
    """Per-segment emotion data (aligned to songformer segment boundaries)."""
    start: float                      # segment start time (seconds)
    end: float                        # segment end time (seconds)
    label: str                        # segment label (intro/verse/chorus/bridge/outro)
    intensity: float                  # emotional intensity (0-1)
    energy: float                     # energy (0-1)
    valence: float                    # valence (0-1)
    arousal: float                    # arousal (0-1)
    trend: str                        # emotion trend (rising/falling/stable/peak)


@dataclass
class BeatDensityInfo:
    """Per-segment beat density info, used to plan shot durations."""
    segment_label: str                # segment label
    start: float                      # start time (seconds)
    end: float                        # end time (seconds)
    beat_count: int                   # number of beats in the segment
    avg_interval: float               # average inter-beat interval (seconds)
    density_level: str                # sparse/normal/dense/very_dense
    recommended_shot_duration: str    # recommended shot duration (human-readable)


@dataclass
class EnhancedClimaxInfo:
    """Climax point enriched with buildup / sustain / wind-down durations."""
    time: float                       # climax time point (seconds)
    intensity: str                    # strong/strongest
    buildup_start: float              # buildup start time (seconds)
    buildup_duration: float           # buildup duration (seconds)
    climax_duration: float            # climax sustain duration (seconds)
    winddown_duration: float          # wind-down duration (seconds)


def extract_beat_timestamps(audio_path: str) -> BeatInfo:
    """
    Extract beat timestamps (sync points) from an audio file.

    Uses the unified BPM analyzer (with octave-error correction).

    Args:
        audio_path: path to the audio file

    Returns:
        BeatInfo: beat timing information
    """
    signal, sample_rate = librosa.load(audio_path, sr=22050, mono=True)

    # Unified BPM entry point (octave correction + beat_times included).
    analysis = RealtimeBPMAnalyzerTest(verbose=False).analyze_bpm(y=signal, sr=sample_rate)
    tempo_bpm = analysis.get('bpm', 120.0)

    # beat_times has already been downsampled by analyze_bpm when the BPM
    # was halved during octave correction.
    beats = np.array(analysis.get('beat_times', []))

    # Downbeats: every 4th beat, assuming 4/4 time (and that the first
    # detected beat falls on a bar boundary).
    downbeats = beats[::4].tolist() if len(beats) > 0 else []

    # Inter-beat intervals, used downstream to detect tempo changes.
    intervals = np.diff(beats).tolist() if len(beats) > 1 else []

    return BeatInfo(
        beat_timestamps=beats.tolist(),
        downbeat_timestamps=downbeats,
        tempo=tempo_bpm,
        beat_intervals=intervals,
    )


def extract_emotion_curve(
    audio_path: str,
    window_size: float = 2.0,  # analysis window length (seconds)
    hop_size: float = 0.5      # hop between windows (seconds)
) -> EmotionCurve:
    """
    Build an emotion curve from sliding-window audio features.

    Feature-to-emotion mapping:
    - Energy: RMS level -> emotional intensity
    - Valence: spectral centroid (brighter = more positive)
    - Arousal: onset density (busier = more excited)

    Args:
        audio_path: path to the audio file
        window_size: sliding-window length in seconds
        hop_size: hop between windows in seconds

    Returns:
        EmotionCurve: per-window emotion values plus a smoothed composite curve
    """
    signal, sample_rate = librosa.load(audio_path, sr=None, mono=True)

    win = int(window_size * sample_rate)
    hop = int(hop_size * sample_rate)

    times: List[float] = []
    energies: List[float] = []
    valences: List[float] = []
    arousals: List[float] = []

    # NOTE: tracks shorter than one window produce empty curves.
    for offset in range(0, len(signal) - win, hop):
        chunk = signal[offset:offset + win]
        times.append(offset / sample_rate)

        # 1. Energy: RMS amplitude, normalized so 0.1 RMS maps to 1.0.
        rms = np.sqrt(np.mean(chunk ** 2))
        energies.append(float(min(rms / 0.1, 1.0)))

        # 2. Valence: spectral centroid (high = bright = positive), capped at 4 kHz.
        centroid = librosa.feature.spectral_centroid(y=chunk, sr=sample_rate)[0]
        valences.append(float(min(np.mean(centroid) / 4000, 1.0)))

        # 3. Arousal: mean onset strength, capped at 2.0.
        onsets = librosa.onset.onset_strength(y=chunk, sr=sample_rate)
        arousals.append(float(min(np.mean(onsets) / 2.0, 1.0)))

    # 4. Composite curve: energy 40%, arousal 40%, valence 20%.
    combined = [
        e * 0.4 + a * 0.4 + v * 0.2
        for e, a, v in zip(energies, arousals, valences)
    ]

    # Smooth with a centered 3-point moving average.
    if len(combined) >= 3:
        combined = np.convolve(combined, np.ones(3) / 3, mode='same').tolist()

    return EmotionCurve(
        timestamps=times,
        energy_values=energies,
        valence_values=valences,
        arousal_values=arousals,
        smoothed_curve=combined,
    )


def aggregate_emotion_by_segments(
    emotion_curve: EmotionCurve,
    segments: List[Dict[str, Any]],
) -> List[SegmentEmotion]:
    """
    Aggregate a raw emotion curve over songformer segment boundaries.

    Args:
        emotion_curve: raw emotion curve data
        segments: songformer segment list, e.g.
                  [{"start": 0.0, "end": 30.5, "label": "intro"}, ...]

    Returns:
        List[SegmentEmotion]: one aggregated emotion entry per segment
    """
    if not segments or not emotion_curve.timestamps:
        return []

    times = np.array(emotion_curve.timestamps)
    energy_arr = np.array(emotion_curve.energy_values)
    valence_arr = np.array(emotion_curve.valence_values)
    arousal_arr = np.array(emotion_curve.arousal_values)
    smoothed_arr = np.array(emotion_curve.smoothed_curve)

    aggregated: List[SegmentEmotion] = []
    for seg in segments:
        seg_start = float(seg.get("start", 0))
        seg_end = float(seg.get("end", 0))
        seg_label = str(seg.get("label", "unknown"))

        # Indices of curve samples that fall inside [start, end).
        idx = np.where((times >= seg_start) & (times < seg_end))[0]

        if len(idx) == 0:
            # No samples inside this segment: fall back to neutral defaults.
            aggregated.append(SegmentEmotion(
                start=seg_start,
                end=seg_end,
                label=seg_label,
                intensity=0.5,
                energy=0.5,
                valence=0.5,
                arousal=0.5,
                trend="stable",
            ))
            continue

        # Segment means for each emotion dimension.
        seg_intensity = float(np.mean(smoothed_arr[idx]))
        aggregated.append(SegmentEmotion(
            start=seg_start,
            end=seg_end,
            label=seg_label,
            intensity=round(seg_intensity, 3),
            energy=round(float(np.mean(energy_arr[idx])), 3),
            valence=round(float(np.mean(valence_arr[idx])), 3),
            arousal=round(float(np.mean(arousal_arr[idx])), 3),
            trend=_calculate_trend(smoothed_arr[idx], seg_intensity),
        ))

    return aggregated


def _calculate_trend(values: np.ndarray, avg_intensity: float) -> str:
    """
    Classify the emotional trend within one segment.

    Args:
        values: emotion values inside the segment
        avg_intensity: mean emotional intensity of the segment

    Returns:
        str: one of "rising", "falling", "stable", "peak"
    """
    threshold = 0.05  # 5% change threshold

    if len(values) < 3:
        return "stable"

    # Compare the mean of the first half against the second half.
    half = len(values) // 2
    delta = float(np.mean(values[half:])) - float(np.mean(values[:half]))

    # Sustained high intensity with little movement reads as a peak.
    if avg_intensity > 0.7 and abs(delta) < threshold:
        return "peak"
    if delta > threshold:
        return "rising"
    if delta < -threshold:
        return "falling"
    return "stable"


def extract_segment_emotions(
    audio_path: str,
    segments: List[Dict[str, Any]],
) -> List[SegmentEmotion]:
    """
    Convenience wrapper: extract the emotion curve and aggregate it per segment.

    Args:
        audio_path: path to the audio file
        segments: songformer segment list

    Returns:
        List[SegmentEmotion]: per-segment aggregated emotion data
    """
    return aggregate_emotion_by_segments(extract_emotion_curve(audio_path), segments)


def calculate_beat_density_by_segments(
    beat_timestamps: List[float],
    segments: List[Dict[str, Any]],
    tempo: float = 120.0,
) -> List[BeatDensityInfo]:
    """
    Compute per-segment beat density to guide shot-duration planning.

    Args:
        beat_timestamps: beat times in seconds
        segments: songformer segment list, e.g.
                  [{"start": 0.0, "end": 30.5, "label": "intro"}, ...]
        tempo: BPM, used as a fallback interval estimate and as a
               secondary signal when grading density

    Returns:
        List[BeatDensityInfo]: per-segment beat density info
    """
    if not segments or not beat_timestamps:
        return []

    beats = np.array(beat_timestamps)
    densities: List[BeatDensityInfo] = []

    for seg in segments:
        seg_start = float(seg.get("start", 0))
        seg_end = float(seg.get("end", 0))

        # Beats falling inside [start, end).
        in_segment = beats[(beats >= seg_start) & (beats < seg_end)]
        count = len(in_segment)

        # Average inter-beat interval; with fewer than two beats, fall back
        # to the interval implied by the global BPM.
        if count >= 2:
            mean_interval = float(np.mean(np.diff(in_segment)))
        else:
            mean_interval = 60.0 / tempo

        # Grade density: smaller interval or faster tempo = denser,
        # which recommends shorter shots.
        if mean_interval <= 0.3 or tempo >= 160:
            level, shot = "very_dense", "2-4秒"
        elif mean_interval <= 0.45 or tempo >= 130:
            level, shot = "dense", "3-5秒"
        elif mean_interval <= 0.6 or tempo >= 100:
            level, shot = "normal", "4-6秒"
        else:
            level, shot = "sparse", "6-10秒"

        densities.append(BeatDensityInfo(
            segment_label=str(seg.get("label", "unknown")),
            start=round(seg_start, 2),
            end=round(seg_end, 2),
            beat_count=count,
            avg_interval=round(mean_interval, 3),
            density_level=level,
            recommended_shot_duration=shot,
        ))

    return densities


def enhance_climax_points(
    climax_points: List[Dict[str, Any]],
    segments: List[Dict[str, Any]],
    music_duration: float,
) -> List[EnhancedClimaxInfo]:
    """
    Enrich climax points with buildup / sustain / wind-down duration guidance.

    Args:
        climax_points: raw climax points, e.g.
                       [{"time": 60.0, "intensity": "strong"}, ...]
        segments: songformer segment list (currently unused; kept for
                  API compatibility)
        music_duration: total music length in seconds

    Returns:
        List[EnhancedClimaxInfo]: enriched climax points, sorted by time
    """
    if not climax_points:
        return []

    # Phase durations keyed by intensity: (buildup, climax, winddown) seconds.
    phase_durations = {
        "strongest": (10.0, 20.0, 10.0),  # strongest climax: longer phases
        "strong": (5.0, 10.0, 5.0),       # default climax
    }
    default_phases = phase_durations["strong"]

    result: List[EnhancedClimaxInfo] = []

    # Process climax points in chronological order.
    sorted_climax = sorted(climax_points, key=lambda x: float(x.get("time", 0)))

    for i, climax in enumerate(sorted_climax):
        climax_time = float(climax.get("time", 0))
        intensity = str(climax.get("intensity", "strong"))
        buildup_duration, climax_duration, winddown_duration = phase_durations.get(
            intensity, default_phases
        )

        # Buildup cannot start before the track begins.
        buildup_start = max(0, climax_time - buildup_duration)

        # Avoid overlapping the previous climax's wind-down phase.
        if i > 0:
            prev = sorted_climax[i - 1]
            prev_winddown = phase_durations.get(
                str(prev.get("intensity", "strong")), default_phases
            )[2]
            prev_end = float(prev.get("time", 0)) + prev_winddown

            if buildup_start < prev_end:
                # Shift the buildup later, clamped to the climax time so the
                # buildup duration can never go negative (the previous code
                # produced negative durations when the prior wind-down ran
                # past this climax).
                buildup_start = min(prev_end, climax_time)
                buildup_duration = climax_time - buildup_start

        # Keep climax + wind-down within the end of the track; scale both
        # proportionally. remaining is clamped at 0 so a climax at or past
        # the track end yields zero-length phases instead of overrunning.
        overrun_window = climax_duration + winddown_duration
        if climax_time + overrun_window > music_duration:
            remaining = max(music_duration - climax_time, 0.0)
            ratio = remaining / overrun_window
            climax_duration *= ratio
            winddown_duration *= ratio

        result.append(EnhancedClimaxInfo(
            time=round(climax_time, 2),
            intensity=intensity,
            buildup_start=round(buildup_start, 2),
            buildup_duration=round(buildup_duration, 2),
            climax_duration=round(climax_duration, 2),
            winddown_duration=round(winddown_duration, 2),
        ))

    return result


def format_beat_density_for_prompt(beat_density_list: List[BeatDensityInfo]) -> str:
    """
    Render beat-density info as prompt-ready text, one line per segment.

    Args:
        beat_density_list: per-segment beat density info

    Returns:
        str: formatted text block
    """
    if not beat_density_list:
        return "(无节拍密度数据)"

    return "\n".join(
        f"- [{item.segment_label}] {item.start:.1f}s-{item.end:.1f}s: "
        f"节拍数={item.beat_count}, 平均间隔={item.avg_interval:.2f}s, "
        f"密度={item.density_level}, 推荐分镜时长={item.recommended_shot_duration}"
        for item in beat_density_list
    )


def format_enhanced_climax_for_prompt(enhanced_climax_list: List[EnhancedClimaxInfo]) -> str:
    """
    Render enhanced climax info as prompt-ready text, one multi-line entry
    per climax point (buildup / climax / wind-down phases).

    Args:
        enhanced_climax_list: enriched climax point info

    Returns:
        str: formatted text block
    """
    if not enhanced_climax_list:
        return "(无高潮点数据)"

    entries = []
    for item in enhanced_climax_list:
        climax_end = item.time + item.climax_duration
        winddown_end = climax_end + item.winddown_duration
        entries.append(
            f"- 高潮点 {item.time:.1f}s ({item.intensity}):\n"
            f"  · 铺垫阶段: {item.buildup_start:.1f}s - {item.time:.1f}s (约{item.buildup_duration:.1f}秒)\n"
            f"  · 高潮阶段: {item.time:.1f}s - {climax_end:.1f}s (约{item.climax_duration:.1f}秒)\n"
            f"  · 缓冲阶段: {climax_end:.1f}s - {winddown_end:.1f}s (约{item.winddown_duration:.1f}秒)"
        )
    return "\n".join(entries)