# dataset.py (22.3 KB)
#
# Raw | Blame | History | Permalink

import json
import random
from pathlib import Path
from typing import Dict, List, Optional

import librosa
import numpy as np
import torch
from torch.utils.data import Dataset

from src.utils.audio import AudioProcessor
from src.utils.augment import AugmentPipeline


def _fit_segment(y: np.ndarray, start: int, segment_len: int) -> np.ndarray:
    """Return ``y[start:start + segment_len]``, zero-padded on the right if it runs short."""
    seg = y[start : start + segment_len]
    if len(seg) < segment_len:
        seg = np.pad(seg, (0, segment_len - len(seg)))
    return seg


def _half_overlap_starts(n_samples: int, segment_len: int) -> List[int]:
    """List window start samples spaced half a segment apart (hop is at least one sample)."""
    hop = max(segment_len // 2, 1)
    return list(range(0, max(n_samples - segment_len + 1, 1), hop))


def _clamp_starts(positions: np.ndarray, n_samples: int, segment_len: int) -> List[int]:
    """Clamp each sample position into ``[0, max(n_samples - segment_len, 0)]``."""
    max_start = max(n_samples - segment_len, 0)
    return [max(0, min(int(pos), max_start)) for pos in positions.tolist()]


def _detect_onset_samples(y: np.ndarray, sr: int) -> np.ndarray:
    """Run librosa onset detection (hop 512) and convert the frames to sample indices."""
    onset_frames = librosa.onset.onset_detect(y=y, sr=sr, hop_length=512, units="frames")
    return librosa.frames_to_samples(onset_frames, hop_length=512)


def _silence_aware_offsets(y: np.ndarray, segment_len: int, silence_top_db: int) -> List[int]:
    """Offer the first and last full-segment start of every non-silent interval."""
    intervals = librosa.effects.split(y, top_db=silence_top_db)
    if intervals is None or len(intervals) == 0:
        return []
    found = set()
    for start, end in intervals:
        start, end = int(start), int(end)
        if end - start < segment_len:
            # Interval is too short to hold a whole segment.
            continue
        found.add(start)
        last = end - segment_len
        if last > start:
            found.add(last)
    return sorted(found)


def _high_energy_offsets(y: np.ndarray, segment_len: int) -> List[int]:
    """Return up to six window starts, ordered from highest to lowest RMS energy."""
    scored = []
    for start in _half_overlap_starts(len(y), segment_len):
        seg = _fit_segment(y, start, segment_len)
        # The epsilon keeps the value strictly positive for all-zero windows.
        rms = float(np.sqrt(np.mean(np.square(seg)) + 1e-12))
        scored.append((rms, start))
    # Stable sort on the score only: ties keep their chronological order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [start for _, start in scored[:6]]


def _onset_aware_offsets(y: np.ndarray, sr: int, segment_len: int) -> List[int]:
    """Return the (clamped, de-duplicated) starts of the first eight detected onsets."""
    try:
        onset_samples = _detect_onset_samples(y, sr)
    except Exception:
        # Best effort: a failed detection simply yields no candidates.
        onset_samples = np.array([], dtype=int)
    if onset_samples.size == 0:
        return []
    starts = _clamp_starts(onset_samples, len(y), segment_len)
    return sorted(set(starts[:8]))


def _beats_from_onsets(y: np.ndarray, sr: int) -> np.ndarray:
    """Approximate a beat grid from onsets when beat tracking found nothing.

    With two or more onsets, a regular grid is laid out from the first onset
    using the median inter-onset distance as the step. A single onset is
    returned unchanged. Otherwise an empty array is returned.
    """
    onset_samples = _detect_onset_samples(y, sr)
    if onset_samples.size == 1:
        return onset_samples
    if onset_samples.size >= 2:
        median_step = int(np.median(np.diff(onset_samples)))
        if median_step > 0:
            grid = [int(onset_samples[0])]
            while grid[-1] + median_step < len(y):
                grid.append(grid[-1] + median_step)
            return np.array(grid, dtype=int)
    return np.array([], dtype=int)


def _beat_aware_offsets(y: np.ndarray, sr: int, segment_len: int) -> List[int]:
    """Return up to eight beat-aligned starts, thinned evenly across the detected beats."""
    try:
        _, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=512, units="frames")
        beat_samples = librosa.frames_to_samples(beat_frames, hop_length=512)
    except Exception:
        # Best effort: fall through to the onset-based approximation below.
        beat_samples = np.array([], dtype=int)
    if beat_samples.size == 0:
        try:
            beat_samples = _beats_from_onsets(y, sr)
        except Exception:
            beat_samples = np.array([], dtype=int)
    if beat_samples.size == 0:
        return []
    starts = _clamp_starts(beat_samples, len(y), segment_len)
    if not starts:
        return []
    # Take every ``step``-th beat so the picks span the clip, capped at eight.
    step = max(1, len(starts) // 8)
    return sorted(set(starts[::step][:8]))


def _repeated_section_offsets(y: np.ndarray, sr: int, segment_len: int) -> List[int]:
    """Return up to six window starts whose mean chroma best matches another window."""
    starts = _half_overlap_starts(len(y), segment_len)
    if len(starts) < 2:
        return starts[:1]
    feats = []
    for start in starts:
        chroma = librosa.feature.chroma_cqt(y=_fit_segment(y, start, segment_len), sr=sr)
        feat = np.mean(chroma, axis=1)
        # L2-normalise so the dot product below is a cosine similarity.
        feats.append(feat / float(np.linalg.norm(feat) + 1e-12))
    scored = []
    for i, feat in enumerate(feats):
        sims = [float(np.dot(feat, other)) for j, other in enumerate(feats) if j != i]
        scored.append((max(sims) if sims else 0.0, starts[i]))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return sorted({start for _, start in scored[:6]})


def compute_candidate_offsets(
    y: np.ndarray,
    sr: int,
    segment_len: int,
    strategy: str,
    silence_top_db: int,
) -> List[int]:
    """Propose start offsets (in samples) for cutting a segment out of ``y``.

    Args:
        y: Waveform to analyse.
        sr: Sample rate of ``y``; forwarded to the librosa analyses.
        segment_len: Desired segment length in samples.
        strategy: One of ``"silence_aware"``, ``"high_energy"``,
            ``"onset_aware"``, ``"beat_aware"`` or ``"repeated_section_aware"``.
        silence_top_db: ``top_db`` threshold used by the silence-aware split.

    Returns:
        ``[0]`` when ``y`` is no longer than one segment. Otherwise the
        candidate starts produced by the chosen strategy, which may be empty.
        Any other strategy name yields ``[]``, leaving the choice to the
        caller. ``"high_energy"`` results are ordered by decreasing energy;
        all other strategies return ascending, de-duplicated offsets.
    """
    if len(y) <= segment_len:
        return [0]

    if strategy == "silence_aware":
        return _silence_aware_offsets(y, segment_len, silence_top_db)
    if strategy == "high_energy":
        return _high_energy_offsets(y, segment_len)
    if strategy == "onset_aware":
        return _onset_aware_offsets(y, sr, segment_len)
    if strategy == "beat_aware":
        return _beat_aware_offsets(y, sr, segment_len)
    if strategy == "repeated_section_aware":
        return _repeated_section_offsets(y, sr, segment_len)
    return []


class DualStreamFeatureExtractor:
    """Builds mel, melody and chroma tensors that share one time axis.

    The heavy lifting is delegated to an ``AudioProcessor``; this class only
    resamples the melody and chroma tracks so that their length matches the
    second dimension of the mel output.
    """

    def __init__(self, sr: int, n_mels: int, n_fft: int, hop_length: int):
        self.audio = AudioProcessor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.n_mels = n_mels

    def extract(self, y: np.ndarray) -> Dict[str, torch.Tensor]:
        """Compute the three feature streams for waveform ``y``.

        Returns:
            A dict with ``"mel"`` (as produced by the processor), ``"melody"``
            (MIDI pitch track with a leading singleton dimension) and
            ``"chroma"`` (one interpolated row per chroma row), all float
            tensors.
        """
        mel = self.audio.to_mel(y)
        f0_hz = self.audio.extract_f0(y)
        midi_track = librosa.hz_to_midi(f0_hz)
        # Non-finite pitches (NaN / +-inf after the Hz -> MIDI conversion) become 0.
        midi_track = np.nan_to_num(midi_track, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        chroma = self.audio.extract_chroma(y).astype(np.float32)

        # mel.shape[1] is used as the target number of time steps for all streams.
        n_frames = mel.shape[1]

        if midi_track.size != 0:
            target_grid = np.linspace(0, midi_track.size - 1, n_frames)
            source_grid = np.arange(midi_track.size)
            midi_track = np.interp(target_grid, source_grid, midi_track).astype(np.float32)
        else:
            midi_track = np.zeros(n_frames, dtype=np.float32)

        # Linearly interpolate every chroma row onto the same number of frames.
        aligned_rows = []
        for row in chroma:
            aligned_rows.append(
                np.interp(
                    np.linspace(0, chroma.shape[1] - 1, n_frames),
                    np.arange(chroma.shape[1]),
                    row,
                )
            )
        chroma_aligned = np.stack(aligned_rows, axis=0).astype(np.float32)

        features: Dict[str, torch.Tensor] = {}
        features["mel"] = torch.FloatTensor(mel)
        features["melody"] = torch.FloatTensor(midi_track).unsqueeze(0)
        features["chroma"] = torch.FloatTensor(chroma_aligned)
        return features


class PairSamplerDataset(Dataset):
    """Map-style dataset that yields (positive, positive, negative) feature triplets.

    Every index maps to a song; two items of that song form the positive pair
    and one item of a different song is the negative. Items whose normalised
    type is ``"reference"`` are excluded, as are items whose audio file does
    not exist under the asset root.
    """

    # Strategies pooled together when ``segment_strategy == "hybrid"``.
    _HYBRID_STRATEGIES = (
        "repeated_section_aware",
        "beat_aware",
        "high_energy",
        "onset_aware",
        "silence_aware",
    )

    def __init__(
        self,
        data_dir: str,
        split: str = "train",
        sr: int = 16000,
        n_mels: int = 80,
        n_fft: int = 512,
        hop_length: int = 160,
        segment_dur: float = 5.0,
        augment: bool = True,
        segment_strategy: str = "random",
        silence_top_db: int = 30,
        sample_type_weights: Optional[Dict[str, int]] = None,
        pair_type_weights: Optional[Dict[str, float]] = None,
        hard_negative_k: int = 1,
        noise_roots: Optional[List[str]] = None,
    ):
        """Load ``<data_dir>/<split>.json`` and index its items by song.

        Args:
            data_dir: Directory holding the manifest. If it is named
                ``manifests``, audio paths are resolved against its parent;
                otherwise against ``data_dir`` itself.
            split: Manifest base name (``<split>.json``).
            sr: Sample rate audio is loaded at.
            n_mels, n_fft, hop_length: Forwarded to the feature extractor.
            segment_dur: Clip length in seconds.
            augment: Whether waveform augmentation is applied.
            segment_strategy: ``"random"``, ``"hybrid"`` or any strategy name
                understood by ``compute_candidate_offsets``.
            silence_top_db: Threshold for the silence-aware strategy.
            sample_type_weights: Overrides for how many times a song is
                repeated in the index, keyed by normalised sample type.
            pair_type_weights: Overrides for the per-item ``hard_weight``.
            hard_negative_k: How many of the most similar songs may be drawn
                as hard negatives.
            noise_roots: Forwarded to both augmentation pipelines.
        """
        self.sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.segment_len = int(segment_dur * sr)
        self.augment = augment
        self.segment_strategy = segment_strategy
        self.silence_top_db = silence_top_db
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
        self.sample_type_weights = {
            "default": 1,
            "compressed": 2,
            "recording": 3,
            "environment": 4,
            **(sample_type_weights or {}),
        }
        self.pair_type_weights = {
            "default": 1.0,
            "compressed": 1.5,
            "recording": 2.0,
            "environment": 2.5,
            **(pair_type_weights or {}),
        }
        self.hard_negative_k = hard_negative_k
        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)
        self.aggressive_augmenter = AugmentPipeline(sr, aggressive=True, noise_roots=noise_roots)

        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(self.data_dir / f"{split}.json", encoding="utf-8") as f:
            metadata = json.load(f)

        self.by_song: Dict[str, List[Dict]] = {}
        for item in metadata:
            if not self._is_training_candidate(item):
                continue
            p = self.asset_root / item["audio_path"]
            if p.exists():
                self.by_song.setdefault(item["song_id"], []).append(item)

        self.song_ids = sorted(self.by_song)
        self.song_to_idx = {sid: i for i, sid in enumerate(self.song_ids)}
        self.sample_song_ids = []
        self.hard_negative_map: Dict[str, List[str]] = self._build_hard_negative_map()
        for sid, items in self.by_song.items():
            item_types = {self._normalize_sample_type(x.get("type")) for x in items}
            # A song is repeated according to the heaviest type among its items.
            weight = self.sample_type_weights.get("default", 1)
            for item_type in item_types:
                weight = max(weight, int(self.sample_type_weights.get(item_type, weight)))
            self.sample_song_ids.extend([sid] * weight)

    @staticmethod
    def _normalize_sample_type(sample_type: Optional[str]) -> str:
        """Map a manifest ``type`` onto the canonical type used for weighting.

        ``humming_like`` folds into ``recording`` and ``confused`` into
        ``environment``. ``None`` or an empty value becomes ``"default"``;
        unknown non-empty values pass through unchanged.
        """
        mapping = {
            "reference": "reference",
            "compressed": "compressed",
            "recording": "recording",
            "environment": "environment",
            "humming_like": "recording",
            "confused": "environment",
            None: "default",
        }
        return mapping.get(sample_type, sample_type or "default")

    def _is_training_candidate(self, item: Dict) -> bool:
        """Return True for every item except those typed ``reference``."""
        sample_type = self._normalize_sample_type(item.get("type"))
        return sample_type != "reference"

    def _build_hard_negative_map(self) -> Dict[str, List[str]]:
        """Rank, for every song, the other songs by cosine similarity of mean mel features.

        Each song is summarised by the time-averaged mel of the first eight
        seconds of up to two of its items. Items that fail to load are
        skipped; songs with no usable item get no entry.

        Returns:
            Song id -> most similar other song ids, best first, truncated to
            ``max(hard_negative_k, 1) * 4`` entries.
        """
        song_features: Dict[str, np.ndarray] = {}
        for song_id, items in self.by_song.items():
            feats = []
            for item in items[:2]:
                path = self.asset_root / item["audio_path"]
                try:
                    y, _ = librosa.load(str(path), sr=self.sr, mono=True, duration=8.0)
                    mel = self.feature_extractor.audio.to_mel(y)
                    feats.append(np.mean(mel, axis=1))
                except Exception:
                    # Best effort: an unreadable file must not abort dataset construction.
                    continue
            if feats:
                song_features[song_id] = np.mean(feats, axis=0)

        # Each norm is needed once per pair; compute it once per song instead.
        norms = {sid: np.linalg.norm(feat) + 1e-12 for sid, feat in song_features.items()}

        keep = max(self.hard_negative_k, 1) * 4
        hard_negative_map: Dict[str, List[str]] = {}
        song_ids = list(song_features)
        for song_id in song_ids:
            anchor = song_features[song_id]
            anchor_norm = norms[song_id]
            scored = []
            for other_song_id in song_ids:
                if other_song_id == song_id:
                    continue
                other = song_features[other_song_id]
                score = float(np.dot(anchor, other) / (anchor_norm * norms[other_song_id]))
                scored.append((score, other_song_id))
            scored.sort(reverse=True)
            hard_negative_map[song_id] = [other_song_id for _, other_song_id in scored[:keep]]
        return hard_negative_map

    def __len__(self):
        return len(self.sample_song_ids)

    def _max_offset_seconds(self, sample: Dict, n_samples: int) -> float:
        """Return the latest start time (seconds) that still leaves a full segment.

        The manifest ``duration`` is honoured but capped at the decoded length
        (``n_samples / sr``): a manifest that overstates the duration would
        otherwise allow offsets past the end of the audio, producing clips
        that are entirely zero padding. A missing or null duration falls back
        to the decoded length.
        """
        actual = n_samples / self.sr
        declared = sample.get("duration")
        duration = actual if declared is None else min(float(declared), actual)
        return max(0.0, duration - (self.segment_len / self.sr))

    def _candidate_offsets(self, full_y: np.ndarray, strategy: str) -> List[int]:
        """Call ``compute_candidate_offsets`` with this dataset's settings."""
        return compute_candidate_offsets(
            y=full_y,
            sr=self.sr,
            segment_len=self.segment_len,
            strategy=strategy,
            silence_top_db=self.silence_top_db,
        )

    def _choose_offset_seconds(self, full_y: np.ndarray, max_offset: float) -> float:
        """Pick a start time in ``[0, max_offset]`` according to ``segment_strategy``."""
        if max_offset <= 0:
            return 0.0
        if self.segment_strategy == "random":
            return random.uniform(0, max_offset)

        direct_candidates = self._candidate_offsets(full_y, self.segment_strategy)
        if direct_candidates:
            return min(random.choice(direct_candidates) / self.sr, max_offset)

        if self.segment_strategy == "hybrid":
            candidate_pool: List[int] = []
            for strategy in self._HYBRID_STRATEGIES:
                candidate_pool.extend(self._candidate_offsets(full_y, strategy))
            # 75% of the time use an analysed candidate, otherwise a uniform draw.
            if candidate_pool and random.random() < 0.75:
                return min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)

        # No candidates (or the hybrid coin flip failed): uniform random start.
        return random.uniform(0, max_offset)

    def _load_clip(self, sample: Dict) -> np.ndarray:
        """Load the item's audio and cut one ``segment_len`` clip, zero-padded if short."""
        path = self.asset_root / sample["audio_path"]
        full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
        max_offset = self._max_offset_seconds(sample, len(full_y))
        offset = self._choose_offset_seconds(full_y, max_offset)
        start = int(offset * self.sr)
        y = full_y[start : start + self.segment_len]
        if len(y) < self.segment_len:
            y = np.pad(y, (0, self.segment_len - len(y)))
        return y

    def _augment_wave(self, sample: Dict, y: np.ndarray) -> np.ndarray:
        """Augment ``y`` unless disabled; recording/environment items use the aggressive pipeline."""
        if not self.augment:
            return y
        sample_type = self._normalize_sample_type(sample.get("type"))
        if sample_type in {"recording", "environment"}:
            return self.aggressive_augmenter(y)
        return self.augmenter(y)

    def _load_features(self, sample: Dict) -> Dict[str, torch.Tensor]:
        """Load, augment and featurise one item."""
        y = self._load_clip(sample)
        y = self._augment_wave(sample, y)
        features = self.feature_extractor.extract(y)
        # NOTE(review): apply_to_mel runs even when ``self.augment`` is False —
        # confirm against AugmentPipeline whether that is intended.
        features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))
        return features

    def _pick_positive_pair(self, song_id: str) -> tuple[Dict, Dict]:
        """Return two distinct items of the song, or the same item twice if it has only one."""
        choices = self.by_song[song_id]
        if len(choices) == 1:
            return choices[0], choices[0]
        return tuple(random.sample(choices, 2))

    def _pick_negative(self, song_id: str) -> Dict:
        """Pick an item from a different song.

        With probability 0.8 (when hard negatives exist) the song comes from
        the top ``hard_negative_k`` most similar songs; otherwise it is drawn
        uniformly from all other songs.

        Raises:
            ValueError: If the dataset contains no other song to draw from.
        """
        hard_songs = self.hard_negative_map.get(song_id, [])
        candidate_song_ids = hard_songs[: self.hard_negative_k] if hard_songs else []
        if candidate_song_ids and random.random() < 0.8:
            negative_song_id = random.choice(candidate_song_ids)
        else:
            pool = [sid for sid in self.song_ids if sid != song_id]
            if not pool:
                # Not IndexError: Python's legacy __getitem__ iteration protocol
                # would treat that as a normal end of sequence.
                raise ValueError(
                    f"Cannot sample a negative for song {song_id!r}: "
                    "the dataset contains no other song."
                )
            negative_song_id = random.choice(pool)
        return random.choice(self.by_song[negative_song_id])

    def __getitem__(self, idx):
        """Build one triplet.

        Returns:
            A dict whose ``mel``/``melody``/``chroma`` tensors are stacked
            along a new leading dimension in the order
            ``[positive_a, positive_b, negative]``; ``song_id`` holds the
            matching class indices, ``song_name`` the anchor song id and
            ``hard_weight`` one weight per stacked item.
        """
        song_id = self.sample_song_ids[idx]
        pos_a, pos_b = self._pick_positive_pair(song_id)
        negative = self._pick_negative(song_id)

        positive_items = [pos_a, pos_b]
        positive_features = [self._load_features(sample) for sample in positive_items]
        negative_features = self._load_features(negative)

        hard_weights = [
            self.pair_type_weights.get(self._normalize_sample_type(sample.get("type")), self.pair_type_weights["default"])
            for sample in positive_items
        ]
        # The negative always receives the "environment" weight.
        hard_weights.append(self.pair_type_weights.get("environment", 2.5))

        label = self.song_to_idx[song_id]
        negative_label = self.song_to_idx[negative["song_id"]]
        return {
            "mel": torch.stack([feat["mel"] for feat in positive_features] + [negative_features["mel"]], dim=0),
            "melody": torch.stack([feat["melody"] for feat in positive_features] + [negative_features["melody"]], dim=0),
            "chroma": torch.stack([feat["chroma"] for feat in positive_features] + [negative_features["chroma"]], dim=0),
            "song_id": torch.tensor([label, label, negative_label], dtype=torch.long),
            "song_name": song_id,
            "hard_weight": torch.tensor(hard_weights, dtype=torch.float32),
        }


class ACRDataset(Dataset):
    """Map-style classification dataset yielding ``n_crops_per_song`` crops per manifest item.

    Each index selects an item (``idx // n_crops``), picks a start offset
    according to ``segment_strategy`` and returns the features of one clip of
    ``segment_dur`` seconds together with the song's class index.
    """

    # Strategies pooled together when ``segment_strategy == "hybrid"``.
    _HYBRID_STRATEGIES = (
        "repeated_section_aware",
        "beat_aware",
        "high_energy",
        "onset_aware",
        "silence_aware",
    )

    def __init__(
        self,
        data_dir: str,
        split: str = "train",
        sr: int = 16000,
        n_mels: int = 80,
        n_fft: int = 512,
        hop_length: int = 160,
        segment_dur: float = 5.0,
        augment: bool = True,
        n_crops_per_song: int = 4,
        song_to_idx: Optional[Dict[str, int]] = None,
        references_only: bool = False,
        segment_strategy: str = "random",
        silence_top_db: int = 30,
        noise_roots: Optional[List[str]] = None,
    ):
        """Load ``<data_dir>/<split>.json`` and keep the items whose audio file exists.

        Args:
            data_dir: Directory holding the manifest. If it is named
                ``manifests``, audio paths are resolved against its parent;
                otherwise against ``data_dir`` itself.
            split: Manifest base name (``<split>.json``).
            sr: Sample rate audio is loaded at.
            n_mels, n_fft, hop_length: Forwarded to the feature extractor.
            segment_dur: Clip length in seconds.
            augment: Whether non-reference items get waveform augmentation.
            n_crops_per_song: Number of dataset indices per manifest item.
            song_to_idx: Optional existing song-id -> class-index mapping;
                built from the sorted song ids when omitted or empty.
            references_only: Keep only items whose ``type`` is ``"reference"``.
            segment_strategy: ``"random"``, ``"hybrid"`` or any strategy name
                understood by ``compute_candidate_offsets``.
            silence_top_db: Threshold for the silence-aware strategy.
            noise_roots: Forwarded to the augmentation pipeline.
        """
        self.sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.segment_len = int(segment_dur * sr)
        self.augment = augment
        self.n_crops = n_crops_per_song
        self.segment_strategy = segment_strategy
        self.silence_top_db = silence_top_db
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.augmenter = AugmentPipeline(sr, noise_roots=noise_roots)

        meta_path = self.data_dir / f"{split}.json"
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(meta_path, encoding="utf-8") as f:
            self.metadata = json.load(f)

        self.samples = []
        for item in self.metadata:
            if references_only and item.get("type") != "reference":
                continue
            song_path = self.asset_root / item["audio_path"]
            if song_path.exists():
                self.samples.append(item)

        self.song_ids = sorted(set(s["song_id"] for s in self.samples))
        self.song_to_idx = song_to_idx or {sid: i for i, sid in enumerate(self.song_ids)}

    def __len__(self):
        return len(self.samples) * self.n_crops

    def _segment_seconds(self) -> float:
        """Return the configured clip length in seconds (``segment_len / sr``)."""
        return self.segment_len / self.sr

    def _load_segment(self, path: str, offset: float, duration: float) -> np.ndarray:
        """Load ``duration`` seconds starting at ``offset`` and force exactly ``segment_len`` samples."""
        y, _ = librosa.load(path, sr=self.sr, mono=True, offset=offset, duration=duration)
        if len(y) < self.segment_len:
            y = np.pad(y, (0, self.segment_len - len(y)))
        else:
            y = y[: self.segment_len]
        return y

    def _candidate_offsets(self, y: np.ndarray, strategy: str) -> List[int]:
        """Call ``compute_candidate_offsets`` with this dataset's settings."""
        return compute_candidate_offsets(
            y=y,
            sr=self.sr,
            segment_len=self.segment_len,
            strategy=strategy,
            silence_top_db=self.silence_top_db,
        )

    def _choose_offset(self, sample: Dict, audio_path: Path) -> float:
        """Pick a start time in seconds for one crop of ``sample``.

        The upper bound comes from the manifest ``duration``. The audio file
        is only decoded for non-random strategies, and only when the item is
        longer than one segment.
        """
        duration = float(sample["duration"])
        max_offset = max(0.0, duration - self._segment_seconds())
        if max_offset <= 0:
            return 0.0

        if self.segment_strategy == "random":
            return random.uniform(0, max_offset)

        y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True)
        direct_candidates = self._candidate_offsets(y, self.segment_strategy)
        if direct_candidates:
            chosen = random.choice(direct_candidates)
            return min(chosen / self.sr, max_offset)

        if self.segment_strategy == "hybrid":
            candidate_pool: List[int] = []
            for strategy in self._HYBRID_STRATEGIES:
                candidate_pool.extend(self._candidate_offsets(y, strategy))
            # 75% of the time use an analysed candidate, otherwise a uniform draw.
            if candidate_pool and random.random() < 0.75:
                chosen = random.choice(sorted(set(candidate_pool)))
                return min(chosen / self.sr, max_offset)

        # No candidates (or the hybrid coin flip failed): uniform random start.
        return random.uniform(0, max_offset)

    def __getitem__(self, idx):
        """Return the features, class index, song id and type of one crop."""
        sample = self.samples[idx // self.n_crops]
        audio_path = self.asset_root / sample["audio_path"]
        offset = self._choose_offset(sample, audio_path)
        # Load exactly the configured segment duration. A fixed 5.0 s here
        # would zero-pad or cut every clip whenever segment_dur != 5.
        y = self._load_segment(str(audio_path), offset, self._segment_seconds())

        if self.augment and sample.get("type") != "reference":
            y = self.augmenter(y)

        features = self.feature_extractor.extract(y)
        # NOTE(review): apply_to_mel runs even when ``self.augment`` is False and
        # for reference items — confirm against AugmentPipeline that this is intended.
        features["mel"] = torch.FloatTensor(self.augmenter.apply_to_mel(features["mel"].numpy()))

        song_id = sample["song_id"]
        class_id = self.song_to_idx[song_id]
        return {
            "mel": features["mel"],
            "melody": features["melody"],
            "chroma": features["chroma"],
            "song_id": torch.tensor(class_id, dtype=torch.long),
            "song_name": song_id,
            "type": sample.get("type", "unknown"),
        }


class ACRTestDataset(Dataset):
    """Map-style evaluation dataset: one unaugmented clip from the start of every item."""

    def __init__(
        self,
        data_dir: str,
        split: str = "test",
        sr: int = 16000,
        n_mels: int = 80,
        n_fft: int = 512,
        hop_length: int = 160,
        song_to_idx: Optional[Dict[str, int]] = None,
        segment_dur: float = 5.0,
    ):
        """Load ``<data_dir>/<split>.json`` and keep the items whose audio file exists.

        Args:
            data_dir: Directory holding the manifest. If it is named
                ``manifests``, audio paths are resolved against its parent;
                otherwise against ``data_dir`` itself.
            split: Manifest base name (``<split>.json``).
            sr: Sample rate audio is loaded at.
            n_mels, n_fft, hop_length: Forwarded to the feature extractor.
            song_to_idx: Optional existing song-id -> class-index mapping;
                built from the sorted song ids when omitted or empty.
            segment_dur: Clip length in seconds. Defaults to the 5 seconds
                that were previously hard-coded, so existing callers are
                unaffected.
        """
        self.sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.segment_len = int(segment_dur * sr)
        self.data_dir = Path(data_dir)
        self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir
        self.feature_extractor = DualStreamFeatureExtractor(sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)

        meta_path = self.data_dir / f"{split}.json"
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(meta_path, encoding="utf-8") as f:
            self.metadata = json.load(f)

        self.samples = []
        for item in self.metadata:
            p = self.asset_root / item["audio_path"]
            if p.exists():
                self.samples.append(item)

        self.song_ids = sorted(set(s["song_id"] for s in self.samples))
        self.song_to_idx = song_to_idx or {sid: i for i, sid in enumerate(self.song_ids)}

    def __len__(self):
        return len(self.samples)

    def _load_duration(self, sample: Dict) -> float:
        """Return how many seconds to request from the loader for ``sample``.

        This is the segment length, shortened to the manifest ``duration``
        when that is smaller. A missing or null duration falls back to the
        full segment length.
        """
        segment_seconds = self.segment_len / self.sr
        declared = sample.get("duration")
        if declared is None:
            return segment_seconds
        return min(declared, segment_seconds)

    def __getitem__(self, idx):
        """Return the features, class index, song id and type of item ``idx``."""
        sample = self.samples[idx]
        audio_path = self.asset_root / sample["audio_path"]
        y, _ = librosa.load(
            str(audio_path),
            sr=self.sr,
            mono=True,
            offset=0,
            duration=self._load_duration(sample),
        )
        # Force exactly segment_len samples: zero-pad short clips, trim long ones.
        if len(y) < self.segment_len:
            y = np.pad(y, (0, self.segment_len - len(y)))
        else:
            y = y[: self.segment_len]

        features = self.feature_extractor.extract(y)
        class_id = self.song_to_idx[sample["song_id"]]
        return {
            "mel": features["mel"],
            "melody": features["melody"],
            "chroma": features["chroma"],
            "song_id": torch.tensor(class_id, dtype=torch.long),
            "song_name": sample["song_id"],
            "type": sample.get("type", "unknown"),
        }


class SongPairDataset(PairSamplerDataset):
    """Subclass of ``PairSamplerDataset`` that adds no behaviour of its own.

    NOTE(review): presumably kept as an alternative name for existing callers — confirm.
    """