Bias music training crops toward salient energy and attack regions
Constraint: Music ACR queries should be closer to choruses, strong rhythmic sections, and attack regions without giving up the existing random and silence-aware fallbacks.
Rejected: Add only heavier beat/chorus modeling first | higher complexity and more brittle than lightweight energy/onset heuristics for the current training pipeline.
Confidence: high
Scope-risk: moderate
Directive: Keep high_energy/onset_aware as heuristic candidate generators; future beat/chorus logic should layer on top of them rather than replace the fallback stack.
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy high_energy and onset_aware; handcrafted 20s audio fixture with high_energy/onset_aware query offset checks.
Not-tested: Full retraining/evaluation impact on FMA or internal production datasets.
Showing
6 changed files
with
206 additions
and
77 deletions
| ... | @@ -9,6 +9,61 @@ import torch | ... | @@ -9,6 +9,61 @@ import torch |
| 9 | from torch.utils.data import Dataset | 9 | from torch.utils.data import Dataset |
| 10 | 10 | ||
| 11 | 11 | ||
def compute_candidate_offsets(
    y: np.ndarray,
    sr: int,
    segment_len: int,
    strategy: str,
    silence_top_db: int,
    max_energy_candidates: int = 6,
    max_onset_candidates: int = 8,
) -> List[int]:
    """Return candidate crop start offsets (in samples) for one audio signal.

    Args:
        y: Mono audio samples.
        sr: Sample rate of ``y``; only used by the onset detector.
        segment_len: Length of the crop, in samples.
        strategy: ``"silence_aware"``, ``"high_energy"`` or ``"onset_aware"``.
            Any other value (e.g. ``"random"`` or ``"hybrid"``) yields ``[]``
            so that the caller can apply its own fallback.
        silence_top_db: ``top_db`` threshold passed to ``librosa.effects.split``.
        max_energy_candidates: Upper bound on ``high_energy`` candidates.
        max_onset_candidates: Upper bound on ``onset_aware`` candidates.

    Returns:
        ``[0]`` when the signal is not longer than one segment. Otherwise a
        list of start offsets in ``[0, len(y) - segment_len]``:

        * ``silence_aware``: first and last valid start of every non-silent
          interval that can hold a full segment, sorted ascending.
        * ``high_energy``: starts of the highest-RMS windows, loudest first.
        * ``onset_aware``: starts at the strongest detected onsets, sorted
          ascending.

        An empty list means "no candidate found".
    """
    if len(y) <= segment_len:
        return [0]

    # Strictly positive thanks to the guard above.
    max_start = len(y) - segment_len

    if strategy == "silence_aware":
        intervals = librosa.effects.split(y, top_db=silence_top_db)
        if intervals is None or len(intervals) == 0:
            return []
        starts = set()
        for begin, end in intervals:
            begin, end = int(begin), int(end)
            if end - begin < segment_len:
                continue  # interval too short to hold a full segment
            starts.add(begin)
            starts.add(end - segment_len)  # equals ``begin`` for an exact fit
        return sorted(starts)

    if strategy == "high_energy":
        hop = max(segment_len // 2, 1)
        window_starts = list(range(0, max_start + 1, hop))
        # The hop grid may stop short of the end of the signal; add an
        # end-aligned window so the tail is scored as well.
        if window_starts[-1] != max_start:
            window_starts.append(max_start)
        scored = []
        for begin in window_starts:
            window = y[begin : begin + segment_len]
            # Accumulate in float64 so low-precision inputs do not lose accuracy.
            power = np.mean(np.square(window), dtype=np.float64)
            scored.append((float(np.sqrt(power + 1e-12)), begin))
        # Stable sort: windows with equal RMS keep their temporal order.
        scored.sort(key=lambda item: item[0], reverse=True)
        return [begin for _, begin in scored[: max(max_energy_candidates, 0)]]

    if strategy == "onset_aware":
        limit = max(max_onset_candidates, 0)
        if limit == 0:
            return []
        hop_length = 512
        try:
            envelope = np.asarray(
                librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
            )
            onset_frames = np.asarray(
                librosa.onset.onset_detect(
                    onset_envelope=envelope,
                    sr=sr,
                    hop_length=hop_length,
                    units="frames",
                ),
                dtype=int,
            )
            strengths = envelope[onset_frames]
            onset_samples = np.asarray(
                librosa.frames_to_samples(onset_frames, hop_length=hop_length)
            )
        except Exception:
            # Onset detection is best-effort: report "no candidates" and let
            # the caller fall back to another strategy.
            return []
        if onset_samples.size == 0:
            return []
        # Keep the strongest onsets instead of the earliest ones, otherwise
        # every candidate sits in the first seconds of a long track.
        # De-duplicate after clamping so that several onsets collapsing onto
        # the same start (e.g. near the end) do not use up the budget.
        picked: List[int] = []
        seen = set()
        for idx in np.argsort(-strengths, kind="stable"):
            begin = max(0, min(int(onset_samples[idx]), max_start))
            if begin in seen:
                continue
            seen.add(begin)
            picked.append(begin)
            if len(picked) >= limit:
                break
        return sorted(picked)

    return []
| 65 | |||
| 66 | |||
| 12 | class ACRDataset(Dataset): | 67 | class ACRDataset(Dataset): |
| 13 | def __init__( | 68 | def __init__( |
| 14 | self, | 69 | self, |
| ... | @@ -74,15 +129,9 @@ class ACRDataset(Dataset): | ... | @@ -74,15 +129,9 @@ class ACRDataset(Dataset): |
| 74 | ) | 129 | ) |
| 75 | return librosa.power_to_db(mel, ref=np.max) | 130 | return librosa.power_to_db(mel, ref=np.max) |
| 76 | 131 | ||
| 77 | def _find_non_silent_intervals(self, y: np.ndarray) -> List[tuple[int, int]]: | ||
| 78 | intervals = librosa.effects.split(y, top_db=self.silence_top_db) | ||
| 79 | if intervals is None or len(intervals) == 0: | ||
| 80 | return [(0, len(y))] | ||
| 81 | return [(int(s), int(e)) for s, e in intervals] | ||
| 82 | |||
| 83 | def _choose_offset(self, sample: Dict, audio_path: Path) -> float: | 132 | def _choose_offset(self, sample: Dict, audio_path: Path) -> float: |
| 84 | duration = float(sample["duration"]) | 133 | duration = float(sample["duration"]) |
| 85 | max_offset = max(0.0, duration - 5.0) | 134 | max_offset = max(0.0, duration - (self.segment_len / self.sr)) |
| 86 | if max_offset <= 0: | 135 | if max_offset <= 0: |
| 87 | return 0.0 | 136 | return 0.0 |
| 88 | 137 | ||
| ... | @@ -90,26 +139,31 @@ class ACRDataset(Dataset): | ... | @@ -90,26 +139,31 @@ class ACRDataset(Dataset): |
| 90 | return random.uniform(0, max_offset) | 139 | return random.uniform(0, max_offset) |
| 91 | 140 | ||
| 92 | y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) | 141 | y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) |
| 93 | target_len = self.segment_len | 142 | direct_candidates = compute_candidate_offsets( |
| 94 | intervals = self._find_non_silent_intervals(y) | 143 | y=y, |
| 95 | valid_intervals = [] | 144 | sr=self.sr, |
| 96 | for start, end in intervals: | 145 | segment_len=self.segment_len, |
| 97 | if end - start >= target_len: | 146 | strategy=self.segment_strategy, |
| 98 | valid_intervals.append((start, end)) | 147 | silence_top_db=self.silence_top_db, |
| 99 | 148 | ) | |
| 100 | if self.segment_strategy == "silence_aware": | 149 | if direct_candidates: |
| 101 | if valid_intervals: | 150 | chosen = random.choice(direct_candidates) |
| 102 | start, end = random.choice(valid_intervals) | 151 | return min(chosen / self.sr, max_offset) |
| 103 | seg_max_start = max(start, end - target_len) | ||
| 104 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 105 | return min(chosen / self.sr, max_offset) | ||
| 106 | return random.uniform(0, max_offset) | ||
| 107 | 152 | ||
| 108 | if self.segment_strategy == "hybrid": | 153 | if self.segment_strategy == "hybrid": |
| 109 | if valid_intervals and random.random() < 0.7: | 154 | candidate_pool: List[int] = [] |
| 110 | start, end = random.choice(valid_intervals) | 155 | for strategy in ("high_energy", "onset_aware", "silence_aware"): |
| 111 | seg_max_start = max(start, end - target_len) | 156 | candidate_pool.extend( |
| 112 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | 157 | compute_candidate_offsets( |
| 158 | y=y, | ||
| 159 | sr=self.sr, | ||
| 160 | segment_len=self.segment_len, | ||
| 161 | strategy=strategy, | ||
| 162 | silence_top_db=self.silence_top_db, | ||
| 163 | ) | ||
| 164 | ) | ||
| 165 | if candidate_pool and random.random() < 0.75: | ||
| 166 | chosen = random.choice(sorted(set(candidate_pool))) | ||
| 113 | return min(chosen / self.sr, max_offset) | 167 | return min(chosen / self.sr, max_offset) |
| 114 | return random.uniform(0, max_offset) | 168 | return random.uniform(0, max_offset) |
| 115 | 169 | ||
| ... | @@ -260,24 +314,37 @@ class SongPairDataset(Dataset): | ... | @@ -260,24 +314,37 @@ class SongPairDataset(Dataset): |
| 260 | path = self.asset_root / sample["audio_path"] | 314 | path = self.asset_root / sample["audio_path"] |
| 261 | full_y, _ = librosa.load(str(path), sr=self.sr, mono=True) | 315 | full_y, _ = librosa.load(str(path), sr=self.sr, mono=True) |
| 262 | duration = float(sample.get("duration", len(full_y) / self.sr)) | 316 | duration = float(sample.get("duration", len(full_y) / self.sr)) |
| 263 | max_offset = max(0.0, duration - 5.0) | 317 | max_offset = max(0.0, duration - (self.segment_len / self.sr)) |
| 264 | offset = 0.0 | 318 | offset = 0.0 |
| 265 | if max_offset > 0: | 319 | if max_offset > 0: |
| 266 | if self.segment_strategy == "random": | 320 | if self.segment_strategy == "random": |
| 267 | offset = random.uniform(0, max_offset) | 321 | offset = random.uniform(0, max_offset) |
| 268 | else: | 322 | else: |
| 269 | intervals = librosa.effects.split(full_y, top_db=self.silence_top_db) | 323 | direct_candidates = compute_candidate_offsets( |
| 270 | valid = [(int(s), int(e)) for s, e in intervals if int(e) - int(s) >= self.segment_len] if len(intervals) else [] | 324 | y=full_y, |
| 271 | if self.segment_strategy == "silence_aware" and valid: | 325 | sr=self.sr, |
| 272 | start, end = random.choice(valid) | 326 | segment_len=self.segment_len, |
| 273 | seg_max_start = max(start, end - self.segment_len) | 327 | strategy=self.segment_strategy, |
| 274 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | 328 | silence_top_db=self.silence_top_db, |
| 275 | offset = min(chosen / self.sr, max_offset) | 329 | ) |
| 276 | elif self.segment_strategy == "hybrid" and valid and random.random() < 0.7: | 330 | if direct_candidates: |
| 277 | start, end = random.choice(valid) | 331 | offset = min(random.choice(direct_candidates) / self.sr, max_offset) |
| 278 | seg_max_start = max(start, end - self.segment_len) | 332 | elif self.segment_strategy == "hybrid": |
| 279 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | 333 | candidate_pool: List[int] = [] |
| 280 | offset = min(chosen / self.sr, max_offset) | 334 | for strategy in ("high_energy", "onset_aware", "silence_aware"): |
| 335 | candidate_pool.extend( | ||
| 336 | compute_candidate_offsets( | ||
| 337 | y=full_y, | ||
| 338 | sr=self.sr, | ||
| 339 | segment_len=self.segment_len, | ||
| 340 | strategy=strategy, | ||
| 341 | silence_top_db=self.silence_top_db, | ||
| 342 | ) | ||
| 343 | ) | ||
| 344 | if candidate_pool and random.random() < 0.75: | ||
| 345 | offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset) | ||
| 346 | else: | ||
| 347 | offset = random.uniform(0, max_offset) | ||
| 281 | else: | 348 | else: |
| 282 | offset = random.uniform(0, max_offset) | 349 | offset = random.uniform(0, max_offset) |
| 283 | start = int(offset * self.sr) | 350 | start = int(offset * self.sr) | ... | ... |
| ... | @@ -516,7 +516,7 @@ def main(): | ... | @@ -516,7 +516,7 @@ def main(): |
| 516 | p.add_argument("--eval-ratio", type=float, default=0.2) | 516 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 517 | p.add_argument("--query-duration", type=float, default=8.0) | 517 | p.add_argument("--query-duration", type=float, default=8.0) |
| 518 | p.add_argument("--query-stride", type=float, default=None) | 518 | p.add_argument("--query-stride", type=float, default=None) |
| 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") |
| 520 | p.add_argument("--silence-top-db", type=int, default=30) | 520 | p.add_argument("--silence-top-db", type=int, default=30) |
| 521 | p.add_argument("--seed", type=int, default=42) | 521 | p.add_argument("--seed", type=int, default=42) |
| 522 | 522 | ||
| ... | @@ -548,8 +548,8 @@ def main(): | ... | @@ -548,8 +548,8 @@ def main(): |
| 548 | p.add_argument("--eval-ratio", type=float, default=0.2) | 548 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 549 | p.add_argument("--query-duration", type=float, default=8.0) | 549 | p.add_argument("--query-duration", type=float, default=8.0) |
| 550 | p.add_argument("--query-stride", type=float, default=None) | 550 | p.add_argument("--query-stride", type=float, default=None) |
| 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") |
| 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") | 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") |
| 553 | p.add_argument("--silence-top-db", type=int, default=30) | 553 | p.add_argument("--silence-top-db", type=int, default=30) |
| 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) | 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) |
| 555 | p.add_argument("--seed", type=int, default=42) | 555 | p.add_argument("--seed", type=int, default=42) | ... | ... |
| ... | @@ -7,12 +7,19 @@ import csv | ... | @@ -7,12 +7,19 @@ import csv |
| 7 | import json | 7 | import json |
| 8 | import random | 8 | import random |
| 9 | import shutil | 9 | import shutil |
| 10 | import sys | ||
| 10 | from pathlib import Path | 11 | from pathlib import Path |
| 11 | from typing import List, Dict | 12 | from typing import List, Dict |
| 12 | import numpy as np | 13 | import numpy as np |
| 13 | import soundfile as sf | 14 | import soundfile as sf |
| 14 | import librosa | 15 | import librosa |
| 15 | 16 | ||
| 17 | ROOT = Path(__file__).resolve().parents[2] | ||
| 18 | if str(ROOT) not in sys.path: | ||
| 19 | sys.path.insert(0, str(ROOT)) | ||
| 20 | |||
| 21 | from src.data.dataset import compute_candidate_offsets | ||
| 22 | |||
| 16 | 23 | ||
| 17 | def write_catalog(records: List[Dict], output_path: Path): | 24 | def write_catalog(records: List[Dict], output_path: Path): |
| 18 | output_path.parent.mkdir(parents=True, exist_ok=True) | 25 | output_path.parent.mkdir(parents=True, exist_ok=True) |
| ... | @@ -62,34 +69,26 @@ def build_train_eval_from_audio_dir( | ... | @@ -62,34 +69,26 @@ def build_train_eval_from_audio_dir( |
| 62 | train = [] | 69 | train = [] |
| 63 | test = [] | 70 | test = [] |
| 64 | 71 | ||
| 65 | def compute_silence_aware_offsets(path: Path, duration: float) -> List[float]: | 72 | def compute_strategy_offsets(path: Path, duration: float, strategy: str) -> List[float]: |
| 66 | if duration < query_duration: | 73 | if duration < query_duration: |
| 67 | return [] | 74 | return [] |
| 68 | try: | 75 | try: |
| 69 | y, sr = librosa.load(str(path), sr=None, mono=True) | 76 | y, sr = librosa.load(str(path), sr=None, mono=True) |
| 70 | intervals = librosa.effects.split(y, top_db=silence_top_db) | ||
| 71 | if intervals is None or len(intervals) == 0: | ||
| 72 | raise ValueError("no_non_silent_intervals") | ||
| 73 | offsets = [] | ||
| 74 | target_len = int(query_duration * sr) | 77 | target_len = int(query_duration * sr) |
| 75 | for start, end in intervals: | 78 | candidates = compute_candidate_offsets( |
| 79 | y=y, | ||
| 80 | sr=sr, | ||
| 81 | segment_len=target_len, | ||
| 82 | strategy=strategy, | ||
| 83 | silence_top_db=silence_top_db, | ||
| 84 | ) | ||
| 85 | offsets = [] | ||
| 86 | for start in candidates: | ||
| 76 | start = int(start) | 87 | start = int(start) |
| 77 | end = int(end) | 88 | if query_stride and query_stride > 0 and strategy in {"silence_aware"}: |
| 78 | if end - start < target_len: | 89 | offsets.append(round(start / sr, 3)) |
| 79 | continue | ||
| 80 | if query_stride and query_stride > 0: | ||
| 81 | stride = int(query_stride * sr) | ||
| 82 | local_positions = list(range(start, max(start + 1, end - target_len + 1), stride)) | ||
| 83 | if not local_positions: | ||
| 84 | local_positions = [start] | ||
| 85 | last_pos = end - target_len | ||
| 86 | if last_pos >= start and local_positions[-1] != last_pos: | ||
| 87 | local_positions.append(last_pos) | ||
| 88 | offsets.extend([round(pos / sr, 3) for pos in local_positions]) | ||
| 89 | else: | 90 | else: |
| 90 | seg_max_start = max(start, end - target_len) | 91 | offsets.append(round(start / sr, 3)) |
| 91 | chosen = rng.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 92 | offsets.append(round(chosen / sr, 3)) | ||
| 93 | return sorted(set(x for x in offsets if x <= max(0.0, duration - query_duration))) | 92 | return sorted(set(x for x in offsets if x <= max(0.0, duration - query_duration))) |
| 94 | except Exception: | 93 | except Exception: |
| 95 | return [] | 94 | return [] |
| ... | @@ -117,20 +116,23 @@ def build_train_eval_from_audio_dir( | ... | @@ -117,20 +116,23 @@ def build_train_eval_from_audio_dir( |
| 117 | refs.append(ref) | 116 | refs.append(ref) |
| 118 | 117 | ||
| 119 | if duration >= query_duration: | 118 | if duration >= query_duration: |
| 120 | if query_strategy in {"silence_aware", "hybrid"}: | 119 | strategy_offsets = [] |
| 121 | silence_offsets = compute_silence_aware_offsets(path, duration) | 120 | if query_strategy in {"silence_aware", "high_energy", "onset_aware"}: |
| 122 | else: | 121 | strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) |
| 123 | silence_offsets = [] | 122 | elif query_strategy == "hybrid": |
| 124 | 123 | for strategy in ("high_energy", "onset_aware", "silence_aware"): | |
| 125 | if query_strategy == "silence_aware" and silence_offsets: | 124 | strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) |
| 126 | offsets = silence_offsets | 125 | strategy_offsets = sorted(set(strategy_offsets)) |
| 127 | elif query_strategy == "hybrid" and silence_offsets: | 126 | |
| 127 | if query_strategy in {"silence_aware", "high_energy", "onset_aware"} and strategy_offsets: | ||
| 128 | offsets = strategy_offsets | ||
| 129 | elif query_strategy == "hybrid" and strategy_offsets: | ||
| 128 | if query_stride and query_stride > 0: | 130 | if query_stride and query_stride > 0: |
| 129 | offsets = silence_offsets | 131 | offsets = strategy_offsets |
| 130 | else: | 132 | else: |
| 131 | max_offset = max(0.0, duration - query_duration) | 133 | max_offset = max(0.0, duration - query_duration) |
| 132 | random_offset = round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3) | 134 | random_offset = round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3) |
| 133 | offsets = sorted(set(silence_offsets + [random_offset])) | 135 | offsets = sorted(set(strategy_offsets + [random_offset])) |
| 134 | elif query_stride and query_stride > 0: | 136 | elif query_stride and query_stride > 0: |
| 135 | max_offset = max(0.0, duration - query_duration) | 137 | max_offset = max(0.0, duration - query_duration) |
| 136 | offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] | 138 | offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] |
| ... | @@ -275,7 +277,7 @@ def main(): | ... | @@ -275,7 +277,7 @@ def main(): |
| 275 | p.add_argument("--eval-ratio", type=float, default=0.2) | 277 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 276 | p.add_argument("--query-duration", type=float, default=8.0) | 278 | p.add_argument("--query-duration", type=float, default=8.0) |
| 277 | p.add_argument("--query-stride", type=float, default=None) | 279 | p.add_argument("--query-stride", type=float, default=None) |
| 278 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | 280 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") |
| 279 | p.add_argument("--silence-top-db", type=int, default=30) | 281 | p.add_argument("--silence-top-db", type=int, default=30) |
| 280 | p.add_argument("--seed", type=int, default=42) | 282 | p.add_argument("--seed", type=int, default=42) |
| 281 | 283 | ... | ... |
| ... | @@ -125,7 +125,7 @@ def main(): | ... | @@ -125,7 +125,7 @@ def main(): |
| 125 | parser.add_argument("--epochs", type=int, default=None) | 125 | parser.add_argument("--epochs", type=int, default=None) |
| 126 | parser.add_argument("--batch-size", type=int, default=None) | 126 | parser.add_argument("--batch-size", type=int, default=None) |
| 127 | parser.add_argument("--lr", type=float, default=None) | 127 | parser.add_argument("--lr", type=float, default=None) |
| 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") | 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") |
| 129 | parser.add_argument("--silence-top-db", type=int, default=30) | 129 | parser.add_argument("--silence-top-db", type=int, default=30) |
| 130 | parser.add_argument("--dry-run", action="store_true") | 130 | parser.add_argument("--dry-run", action="store_true") |
| 131 | args = parser.parse_args() | 131 | args = parser.parse_args() | ... | ... |
| ... | @@ -5522,3 +5522,50 @@ | ... | @@ -5522,3 +5522,50 @@ |
| 5522 | 结论: | 5522 | 结论: |
| 5523 | - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力 | 5523 | - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力 |
| 5524 | - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index | 5524 | - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index |
| 5525 | |||
| 5526 | ### Stage: high-energy / onset-aware music segmentation | ||
| 5527 | |||
| 5528 | 完成项: | ||
| 5529 | - 在 `acr-engine/src/data/dataset.py` 新增训练切片候选策略: | ||
| 5530 | - `high_energy` | ||
| 5531 | - `onset_aware` | ||
| 5532 | - 在 `acr-engine/src/data/manifest_tools.py` 新增外部 query 生成策略: | ||
| 5533 | - `--query-strategy high_energy` | ||
| 5534 | - `--query-strategy onset_aware` | ||
| 5535 | - 将 `hybrid` 升级为可复用: | ||
| 5536 | - `high_energy` | ||
| 5537 | - `onset_aware` | ||
| 5538 | - `silence_aware` | ||
| 5539 | 三类音乐感知候选,再补随机 fallback | ||
| 5540 | - 在 `train.py` 与 `external_adapters.py` 暴露新策略选项 | ||
| 5541 | - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 增补策略说明与使用建议 | ||
| 5542 | |||
| 5543 | 验证结果: | ||
| 5544 | - 编译验证: | ||
| 5545 | - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py` | ||
| 5546 | - 人造音频验证: | ||
| 5547 | - 构造 `20s` 音频: | ||
| 5548 | - `4-6s` 低能 tone | ||
| 5549 | - `8/10/12s` 强起音脉冲 | ||
| 5550 | - `14-19s` 高能 tone | ||
| 5551 | - query 生成结果: | ||
| 5552 | - `high_energy` offsets: | ||
| 5553 | - `2.5, 7.5, 10.0, 12.5, 15.0` | ||
| 5554 | - `onset_aware` offsets: | ||
| 5555 | - `4.032, 6.048, 8.032, 10.016, 10.048, 12.032` | ||
| 5556 | - 训练侧偏移验证: | ||
| 5557 | - `TRAIN_HIGH_ENERGY_OFFSETS`: | ||
| 5558 | - `2.5, 15.0, 15.0, 2.5, 10.0, 12.5` | ||
| 5559 | - `TRAIN_ONSET_OFFSETS`: | ||
| 5560 | - `4.064, 4.032, 10.016, 8.032, 8.032, 6.048` | ||
| 5561 | - 说明新策略已明显偏向强能量区或起音邻域,而不是纯随机 | ||
| 5562 | - dry-run 验证: | ||
| 5563 | - `train.py --data data/synthetic_v2 --dry-run --segment-strategy high_energy` | ||
| 5564 | - forward/backward 成功,`Embedding shape: torch.Size([64, 192])` | ||
| 5565 | |||
| 5566 | 结论: | ||
| 5567 | - 当前项目的音乐感知切片已经从“避静音”扩展到了“偏主段 / 偏起音” | ||
| 5568 | - 下一步若继续增强,可在此基础上叠加: | ||
| 5569 | - beat-aware | ||
| 5570 | - chorus-aware | ||
| 5571 | - repeated-section-aware | ... | ... |
| ... | @@ -354,12 +354,14 @@ flowchart TD | ... | @@ -354,12 +354,14 @@ flowchart TD |
| 354 | | `random` | 训练 query | 增强泛化,模拟未知用户截取点 | 是 | | 354 | | `random` | 训练 query | 增强泛化,模拟未知用户截取点 | 是 | |
| 355 | | `sliding` | 建库 / query 生成 | 保证覆盖率,减少漏召回 | 是 | | 355 | | `sliding` | 建库 / query 生成 | 保证覆盖率,减少漏召回 | 是 | |
| 356 | | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | | 356 | | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | |
| 357 | | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | | ||
| 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | | ||
| 357 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | | 359 | `hybrid` | 训练 query / 外部 query 生成 | 混合 high-energy / onset-aware / silence-aware 候选 + random fallback,兼顾稳定性与泛化 | 是 | |
| 358 | 360 | ||
| 359 | 推荐理解: | 361 | 推荐理解: |
| 360 | 362 | ||
| 361 | 1. **训练不是全部随机切** | 363 | 1. **训练不是全部随机切** |
| 362 | 当前训练集可用 `random / silence_aware / hybrid` | 364 | 当前训练集可用 `random / silence_aware / high_energy / onset_aware / hybrid` |
| 363 | 2. **reference 建库不是随机切** | 365 | 2. **reference 建库不是随机切** |
| 364 | 建库仍然是固定滑窗 | 366 | 建库仍然是固定滑窗 |
| 365 | 3. **外部数据 query 生成也不是只能随机切** | 367 | 3. **外部数据 query 生成也不是只能随机切** |
| ... | @@ -384,6 +386,8 @@ flowchart TD | ... | @@ -384,6 +386,8 @@ flowchart TD |
| 384 | - baseline:`random` | 386 | - baseline:`random` |
| 385 | - 更稳的音乐任务:`hybrid` | 387 | - 更稳的音乐任务:`hybrid` |
| 386 | - 已知原始音频静音很多:`silence_aware` | 388 | - 已知原始音频静音很多:`silence_aware` |
| 389 | - 更想贴近副歌/强节奏:`high_energy` | ||
| 390 | - 更想贴近短音起点/打点:`onset_aware` | ||
| 387 | 391 | ||
| 388 | ### 外部数据 query 生成推荐 | 392 | ### 外部数据 query 生成推荐 |
| 389 | 393 | ||
| ... | @@ -392,11 +396,20 @@ flowchart TD | ... | @@ -392,11 +396,20 @@ flowchart TD |
| 392 | --output-root data/external_ingested \ | 396 | --output-root data/external_ingested \ |
| 393 | --query-duration 8 \ | 397 | --query-duration 8 \ |
| 394 | --query-stride 4 \ | 398 | --query-stride 4 \ |
| 395 | --query-strategy silence_aware \ | 399 | --query-strategy high_energy \ |
| 396 | --silence-top-db 30 | 400 | --silence-top-db 30 |
| 397 | ``` | 401 | ``` |
| 398 | 402 | ||
| 399 | 这会优先从非静音区生成 query,而不是从长静音头尾随机采样。 | 403 | 这会优先从高能区生成 query,而不是从长静音头尾或低能过门里随机采样。 |
| 404 | |||
| 405 | 补充建议: | ||
| 406 | |||
| 407 | | 场景 | 推荐策略 | | ||
| 408 | |---|---| | ||
| 409 | | 录音静音头尾很多 | `silence_aware` | | ||
| 410 | | 更想贴近副歌/主段 | `high_energy` | | ||
| 411 | | 更想贴近打点/起唱点 | `onset_aware` | | ||
| 412 | | 既要音乐感知,又要保留泛化 | `hybrid` | | ||
| 400 | 413 | ||
| 401 | --- | 414 | --- |
| 402 | 415 | ... | ... |
-
Please register or sign in to post a comment