Commit b6cdf668 b6cdf668df34481116416b05c30c2a2756ebe7d4 by cnb.bofCdSsphPA

Bias music training crops toward salient energy and attack regions

Constraint: Music ACR queries should be closer to choruses, strong rhythmic sections, and attack regions without giving up the existing random and silence-aware fallbacks
Rejected: Add only heavier beat/chorus modeling first | higher complexity and more brittle than lightweight energy/onset heuristics for the current training pipeline
Confidence: high
Scope-risk: moderate
Directive: Keep high_energy/onset_aware as heuristic candidate generators; future beat/chorus logic should layer on top of them rather than replace the fallback stack
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy high_energy and onset_aware; handcrafted 20s audio fixture with high_energy/onset_aware query offset checks
Not-tested: Full retraining/evaluation impact on FMA or internal production datasets
1 parent 4ceaa995
...@@ -9,6 +9,61 @@ import torch ...@@ -9,6 +9,61 @@ import torch
9 from torch.utils.data import Dataset 9 from torch.utils.data import Dataset
10 10
11 11
12 def compute_candidate_offsets(
13 y: np.ndarray,
14 sr: int,
15 segment_len: int,
16 strategy: str,
17 silence_top_db: int,
18 ) -> List[int]:
19 if len(y) <= segment_len:
20 return [0]
21
22 if strategy == "silence_aware":
23 intervals = librosa.effects.split(y, top_db=silence_top_db)
24 if intervals is None or len(intervals) == 0:
25 return []
26 offsets = []
27 for start, end in intervals:
28 start = int(start)
29 end = int(end)
30 if end - start >= segment_len:
31 offsets.append(start)
32 last = end - segment_len
33 if last > start:
34 offsets.append(last)
35 return sorted(set(offsets))
36
37 if strategy == "high_energy":
38 hop = max(segment_len // 2, 1)
39 scores: List[tuple[float, int]] = []
40 for start in range(0, max(len(y) - segment_len + 1, 1), hop):
41 seg = y[start : start + segment_len]
42 if len(seg) < segment_len:
43 seg = np.pad(seg, (0, segment_len - len(seg)))
44 rms = float(np.sqrt(np.mean(np.square(seg)) + 1e-12))
45 scores.append((rms, start))
46 scores.sort(key=lambda x: x[0], reverse=True)
47 return [start for _, start in scores[: min(6, len(scores))]]
48
49 if strategy == "onset_aware":
50 try:
51 onset_frames = librosa.onset.onset_detect(y=y, sr=sr, hop_length=512, units="frames")
52 onset_samples = librosa.frames_to_samples(onset_frames, hop_length=512)
53 except Exception:
54 onset_samples = np.array([], dtype=int)
55 if onset_samples.size == 0:
56 return []
57 offsets = []
58 max_start = max(len(y) - segment_len, 0)
59 for onset in onset_samples.tolist():
60 start = max(0, min(int(onset), max_start))
61 offsets.append(start)
62 return sorted(set(offsets[: min(8, len(offsets))]))
63
64 return []
65
66
12 class ACRDataset(Dataset): 67 class ACRDataset(Dataset):
13 def __init__( 68 def __init__(
14 self, 69 self,
...@@ -74,15 +129,9 @@ class ACRDataset(Dataset): ...@@ -74,15 +129,9 @@ class ACRDataset(Dataset):
74 ) 129 )
75 return librosa.power_to_db(mel, ref=np.max) 130 return librosa.power_to_db(mel, ref=np.max)
76 131
77 def _find_non_silent_intervals(self, y: np.ndarray) -> List[tuple[int, int]]:
78 intervals = librosa.effects.split(y, top_db=self.silence_top_db)
79 if intervals is None or len(intervals) == 0:
80 return [(0, len(y))]
81 return [(int(s), int(e)) for s, e in intervals]
82
83 def _choose_offset(self, sample: Dict, audio_path: Path) -> float: 132 def _choose_offset(self, sample: Dict, audio_path: Path) -> float:
84 duration = float(sample["duration"]) 133 duration = float(sample["duration"])
85 max_offset = max(0.0, duration - 5.0) 134 max_offset = max(0.0, duration - (self.segment_len / self.sr))
86 if max_offset <= 0: 135 if max_offset <= 0:
87 return 0.0 136 return 0.0
88 137
...@@ -90,26 +139,31 @@ class ACRDataset(Dataset): ...@@ -90,26 +139,31 @@ class ACRDataset(Dataset):
90 return random.uniform(0, max_offset) 139 return random.uniform(0, max_offset)
91 140
92 y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) 141 y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True)
93 target_len = self.segment_len 142 direct_candidates = compute_candidate_offsets(
94 intervals = self._find_non_silent_intervals(y) 143 y=y,
95 valid_intervals = [] 144 sr=self.sr,
96 for start, end in intervals: 145 segment_len=self.segment_len,
97 if end - start >= target_len: 146 strategy=self.segment_strategy,
98 valid_intervals.append((start, end)) 147 silence_top_db=self.silence_top_db,
99 148 )
100 if self.segment_strategy == "silence_aware": 149 if direct_candidates:
101 if valid_intervals: 150 chosen = random.choice(direct_candidates)
102 start, end = random.choice(valid_intervals)
103 seg_max_start = max(start, end - target_len)
104 chosen = random.randint(start, seg_max_start) if seg_max_start > start else start
105 return min(chosen / self.sr, max_offset) 151 return min(chosen / self.sr, max_offset)
106 return random.uniform(0, max_offset)
107 152
108 if self.segment_strategy == "hybrid": 153 if self.segment_strategy == "hybrid":
109 if valid_intervals and random.random() < 0.7: 154 candidate_pool: List[int] = []
110 start, end = random.choice(valid_intervals) 155 for strategy in ("high_energy", "onset_aware", "silence_aware"):
111 seg_max_start = max(start, end - target_len) 156 candidate_pool.extend(
112 chosen = random.randint(start, seg_max_start) if seg_max_start > start else start 157 compute_candidate_offsets(
158 y=y,
159 sr=self.sr,
160 segment_len=self.segment_len,
161 strategy=strategy,
162 silence_top_db=self.silence_top_db,
163 )
164 )
165 if candidate_pool and random.random() < 0.75:
166 chosen = random.choice(sorted(set(candidate_pool)))
113 return min(chosen / self.sr, max_offset) 167 return min(chosen / self.sr, max_offset)
114 return random.uniform(0, max_offset) 168 return random.uniform(0, max_offset)
115 169
...@@ -260,24 +314,37 @@ class SongPairDataset(Dataset): ...@@ -260,24 +314,37 @@ class SongPairDataset(Dataset):
260 path = self.asset_root / sample["audio_path"] 314 path = self.asset_root / sample["audio_path"]
261 full_y, _ = librosa.load(str(path), sr=self.sr, mono=True) 315 full_y, _ = librosa.load(str(path), sr=self.sr, mono=True)
262 duration = float(sample.get("duration", len(full_y) / self.sr)) 316 duration = float(sample.get("duration", len(full_y) / self.sr))
263 max_offset = max(0.0, duration - 5.0) 317 max_offset = max(0.0, duration - (self.segment_len / self.sr))
264 offset = 0.0 318 offset = 0.0
265 if max_offset > 0: 319 if max_offset > 0:
266 if self.segment_strategy == "random": 320 if self.segment_strategy == "random":
267 offset = random.uniform(0, max_offset) 321 offset = random.uniform(0, max_offset)
268 else: 322 else:
269 intervals = librosa.effects.split(full_y, top_db=self.silence_top_db) 323 direct_candidates = compute_candidate_offsets(
270 valid = [(int(s), int(e)) for s, e in intervals if int(e) - int(s) >= self.segment_len] if len(intervals) else [] 324 y=full_y,
271 if self.segment_strategy == "silence_aware" and valid: 325 sr=self.sr,
272 start, end = random.choice(valid) 326 segment_len=self.segment_len,
273 seg_max_start = max(start, end - self.segment_len) 327 strategy=self.segment_strategy,
274 chosen = random.randint(start, seg_max_start) if seg_max_start > start else start 328 silence_top_db=self.silence_top_db,
275 offset = min(chosen / self.sr, max_offset) 329 )
276 elif self.segment_strategy == "hybrid" and valid and random.random() < 0.7: 330 if direct_candidates:
277 start, end = random.choice(valid) 331 offset = min(random.choice(direct_candidates) / self.sr, max_offset)
278 seg_max_start = max(start, end - self.segment_len) 332 elif self.segment_strategy == "hybrid":
279 chosen = random.randint(start, seg_max_start) if seg_max_start > start else start 333 candidate_pool: List[int] = []
280 offset = min(chosen / self.sr, max_offset) 334 for strategy in ("high_energy", "onset_aware", "silence_aware"):
335 candidate_pool.extend(
336 compute_candidate_offsets(
337 y=full_y,
338 sr=self.sr,
339 segment_len=self.segment_len,
340 strategy=strategy,
341 silence_top_db=self.silence_top_db,
342 )
343 )
344 if candidate_pool and random.random() < 0.75:
345 offset = min(random.choice(sorted(set(candidate_pool))) / self.sr, max_offset)
346 else:
347 offset = random.uniform(0, max_offset)
281 else: 348 else:
282 offset = random.uniform(0, max_offset) 349 offset = random.uniform(0, max_offset)
283 start = int(offset * self.sr) 350 start = int(offset * self.sr)
......
...@@ -516,7 +516,7 @@ def main(): ...@@ -516,7 +516,7 @@ def main():
516 p.add_argument("--eval-ratio", type=float, default=0.2) 516 p.add_argument("--eval-ratio", type=float, default=0.2)
517 p.add_argument("--query-duration", type=float, default=8.0) 517 p.add_argument("--query-duration", type=float, default=8.0)
518 p.add_argument("--query-stride", type=float, default=None) 518 p.add_argument("--query-stride", type=float, default=None)
519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") 519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random")
520 p.add_argument("--silence-top-db", type=int, default=30) 520 p.add_argument("--silence-top-db", type=int, default=30)
521 p.add_argument("--seed", type=int, default=42) 521 p.add_argument("--seed", type=int, default=42)
522 522
...@@ -548,8 +548,8 @@ def main(): ...@@ -548,8 +548,8 @@ def main():
548 p.add_argument("--eval-ratio", type=float, default=0.2) 548 p.add_argument("--eval-ratio", type=float, default=0.2)
549 p.add_argument("--query-duration", type=float, default=8.0) 549 p.add_argument("--query-duration", type=float, default=8.0)
550 p.add_argument("--query-stride", type=float, default=None) 550 p.add_argument("--query-stride", type=float, default=None)
551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") 551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random")
552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") 552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random")
553 p.add_argument("--silence-top-db", type=int, default=30) 553 p.add_argument("--silence-top-db", type=int, default=30)
554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100) 554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100)
555 p.add_argument("--seed", type=int, default=42) 555 p.add_argument("--seed", type=int, default=42)
......
...@@ -7,12 +7,19 @@ import csv ...@@ -7,12 +7,19 @@ import csv
7 import json 7 import json
8 import random 8 import random
9 import shutil 9 import shutil
10 import sys
10 from pathlib import Path 11 from pathlib import Path
11 from typing import List, Dict 12 from typing import List, Dict
12 import numpy as np 13 import numpy as np
13 import soundfile as sf 14 import soundfile as sf
14 import librosa 15 import librosa
15 16
17 ROOT = Path(__file__).resolve().parents[2]
18 if str(ROOT) not in sys.path:
19 sys.path.insert(0, str(ROOT))
20
21 from src.data.dataset import compute_candidate_offsets
22
16 23
17 def write_catalog(records: List[Dict], output_path: Path): 24 def write_catalog(records: List[Dict], output_path: Path):
18 output_path.parent.mkdir(parents=True, exist_ok=True) 25 output_path.parent.mkdir(parents=True, exist_ok=True)
...@@ -62,34 +69,26 @@ def build_train_eval_from_audio_dir( ...@@ -62,34 +69,26 @@ def build_train_eval_from_audio_dir(
62 train = [] 69 train = []
63 test = [] 70 test = []
64 71
65 def compute_silence_aware_offsets(path: Path, duration: float) -> List[float]: 72 def compute_strategy_offsets(path: Path, duration: float, strategy: str) -> List[float]:
66 if duration < query_duration: 73 if duration < query_duration:
67 return [] 74 return []
68 try: 75 try:
69 y, sr = librosa.load(str(path), sr=None, mono=True) 76 y, sr = librosa.load(str(path), sr=None, mono=True)
70 intervals = librosa.effects.split(y, top_db=silence_top_db)
71 if intervals is None or len(intervals) == 0:
72 raise ValueError("no_non_silent_intervals")
73 offsets = []
74 target_len = int(query_duration * sr) 77 target_len = int(query_duration * sr)
75 for start, end in intervals: 78 candidates = compute_candidate_offsets(
79 y=y,
80 sr=sr,
81 segment_len=target_len,
82 strategy=strategy,
83 silence_top_db=silence_top_db,
84 )
85 offsets = []
86 for start in candidates:
76 start = int(start) 87 start = int(start)
77 end = int(end) 88 if query_stride and query_stride > 0 and strategy in {"silence_aware"}:
78 if end - start < target_len: 89 offsets.append(round(start / sr, 3))
79 continue
80 if query_stride and query_stride > 0:
81 stride = int(query_stride * sr)
82 local_positions = list(range(start, max(start + 1, end - target_len + 1), stride))
83 if not local_positions:
84 local_positions = [start]
85 last_pos = end - target_len
86 if last_pos >= start and local_positions[-1] != last_pos:
87 local_positions.append(last_pos)
88 offsets.extend([round(pos / sr, 3) for pos in local_positions])
89 else: 90 else:
90 seg_max_start = max(start, end - target_len) 91 offsets.append(round(start / sr, 3))
91 chosen = rng.randint(start, seg_max_start) if seg_max_start > start else start
92 offsets.append(round(chosen / sr, 3))
93 return sorted(set(x for x in offsets if x <= max(0.0, duration - query_duration))) 92 return sorted(set(x for x in offsets if x <= max(0.0, duration - query_duration)))
94 except Exception: 93 except Exception:
95 return [] 94 return []
...@@ -117,20 +116,23 @@ def build_train_eval_from_audio_dir( ...@@ -117,20 +116,23 @@ def build_train_eval_from_audio_dir(
117 refs.append(ref) 116 refs.append(ref)
118 117
119 if duration >= query_duration: 118 if duration >= query_duration:
120 if query_strategy in {"silence_aware", "hybrid"}: 119 strategy_offsets = []
121 silence_offsets = compute_silence_aware_offsets(path, duration) 120 if query_strategy in {"silence_aware", "high_energy", "onset_aware"}:
122 else: 121 strategy_offsets = compute_strategy_offsets(path, duration, query_strategy)
123 silence_offsets = [] 122 elif query_strategy == "hybrid":
124 123 for strategy in ("high_energy", "onset_aware", "silence_aware"):
125 if query_strategy == "silence_aware" and silence_offsets: 124 strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy))
126 offsets = silence_offsets 125 strategy_offsets = sorted(set(strategy_offsets))
127 elif query_strategy == "hybrid" and silence_offsets: 126
127 if query_strategy in {"silence_aware", "high_energy", "onset_aware"} and strategy_offsets:
128 offsets = strategy_offsets
129 elif query_strategy == "hybrid" and strategy_offsets:
128 if query_stride and query_stride > 0: 130 if query_stride and query_stride > 0:
129 offsets = silence_offsets 131 offsets = strategy_offsets
130 else: 132 else:
131 max_offset = max(0.0, duration - query_duration) 133 max_offset = max(0.0, duration - query_duration)
132 random_offset = round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3) 134 random_offset = round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3)
133 offsets = sorted(set(silence_offsets + [random_offset])) 135 offsets = sorted(set(strategy_offsets + [random_offset]))
134 elif query_stride and query_stride > 0: 136 elif query_stride and query_stride > 0:
135 max_offset = max(0.0, duration - query_duration) 137 max_offset = max(0.0, duration - query_duration)
136 offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] 138 offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()]
...@@ -275,7 +277,7 @@ def main(): ...@@ -275,7 +277,7 @@ def main():
275 p.add_argument("--eval-ratio", type=float, default=0.2) 277 p.add_argument("--eval-ratio", type=float, default=0.2)
276 p.add_argument("--query-duration", type=float, default=8.0) 278 p.add_argument("--query-duration", type=float, default=8.0)
277 p.add_argument("--query-stride", type=float, default=None) 279 p.add_argument("--query-stride", type=float, default=None)
278 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") 280 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random")
279 p.add_argument("--silence-top-db", type=int, default=30) 281 p.add_argument("--silence-top-db", type=int, default=30)
280 p.add_argument("--seed", type=int, default=42) 282 p.add_argument("--seed", type=int, default=42)
281 283
......
...@@ -125,7 +125,7 @@ def main(): ...@@ -125,7 +125,7 @@ def main():
125 parser.add_argument("--epochs", type=int, default=None) 125 parser.add_argument("--epochs", type=int, default=None)
126 parser.add_argument("--batch-size", type=int, default=None) 126 parser.add_argument("--batch-size", type=int, default=None)
127 parser.add_argument("--lr", type=float, default=None) 127 parser.add_argument("--lr", type=float, default=None)
128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") 128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random")
129 parser.add_argument("--silence-top-db", type=int, default=30) 129 parser.add_argument("--silence-top-db", type=int, default=30)
130 parser.add_argument("--dry-run", action="store_true") 130 parser.add_argument("--dry-run", action="store_true")
131 args = parser.parse_args() 131 args = parser.parse_args()
......
...@@ -5522,3 +5522,50 @@ ...@@ -5522,3 +5522,50 @@
5522 结论: 5522 结论:
5523 - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力 5523 - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力
5524 - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index 5524 - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index
5525
5526 ### Stage: high-energy / onset-aware music segmentation
5527
5528 完成项:
5529 - `acr-engine/src/data/dataset.py` 新增训练切片候选策略:
5530 - `high_energy`
5531 - `onset_aware`
5532 - `acr-engine/src/data/manifest_tools.py` 新增外部 query 生成策略:
5533 - `--query-strategy high_energy`
5534 - `--query-strategy onset_aware`
5535 - `hybrid` 升级为可复用:
5536 - `high_energy`
5537 - `onset_aware`
5538 - `silence_aware`
5539 三类音乐感知候选,再补随机 fallback
5540 - `train.py` / `external_adapters.py` 暴露新策略选项
5541 - [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 增补策略说明与使用建议
5542
5543 验证结果:
5544 - 编译验证:
5545 - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py`
5546 - 人造音频验证:
5547 - 构造 `20s` 音频:
5548 - `4-6s` 低能 tone
5549 - `8/10/12s` 强起音脉冲
5550 - `14-19s` 高能 tone
5551 - query 生成结果:
5552 - `high_energy` offsets:
5553 - `2.5, 7.5, 10.0, 12.5, 15.0`
5554 - `onset_aware` offsets:
5555 - `4.032, 6.048, 8.032, 10.016, 10.048, 12.032`
5556 - 训练侧偏移验证:
5557 - `TRAIN_HIGH_ENERGY_OFFSETS`
5558 - `2.5, 15.0, 15.0, 2.5, 10.0, 12.5`
5559 - `TRAIN_ONSET_OFFSETS`
5560 - `4.064, 4.032, 10.016, 8.032, 8.032, 6.048`
5561 - 说明新策略已明显偏向强能量区或起音邻域,而不是纯随机
5562 - dry-run 验证:
5563 - `train.py --data data/synthetic_v2 --dry-run --segment-strategy high_energy`
5564 - forward/backward 成功,`Embedding shape: torch.Size([64, 192])`
5565
5566 结论:
5567 - 当前项目的音乐感知切片已经从“避静音”扩展到了“偏主段 / 偏起音”
5568 - 下一步若继续增强,可在此基础上叠加:
5569 - beat-aware
5570 - chorus-aware
5571 - repeated-section-aware
......
...@@ -354,12 +354,14 @@ flowchart TD ...@@ -354,12 +354,14 @@ flowchart TD
354 | `random` | 训练 query | 增强泛化,模拟未知用户截取点 | 是 | 354 | `random` | 训练 query | 增强泛化,模拟未知用户截取点 | 是 |
355 | `sliding` | 建库 / query 生成 | 保证覆盖率,减少漏召回 | 是 | 355 | `sliding` | 建库 / query 生成 | 保证覆盖率,减少漏召回 | 是 |
356 | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | 356 | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 |
357 | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 |
358 | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 |
357 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | 359 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 |
358 360
359 推荐理解: 361 推荐理解:
360 362
361 1. **训练不是全部随机切** 363 1. **训练不是全部随机切**
362 当前训练集可用 `random / silence_aware / hybrid` 364 当前训练集可用 `random / silence_aware / high_energy / onset_aware / hybrid`
363 2. **reference 建库不是随机切** 365 2. **reference 建库不是随机切**
364 建库仍然是固定滑窗 366 建库仍然是固定滑窗
365 3. **外部数据 query 生成也不是只能随机切** 367 3. **外部数据 query 生成也不是只能随机切**
...@@ -384,6 +386,8 @@ flowchart TD ...@@ -384,6 +386,8 @@ flowchart TD
384 - baseline:`random` 386 - baseline:`random`
385 - 更稳的音乐任务:`hybrid` 387 - 更稳的音乐任务:`hybrid`
386 - 已知原始音频静音很多:`silence_aware` 388 - 已知原始音频静音很多:`silence_aware`
389 - 更想贴近副歌/强节奏:`high_energy`
390 - 更想贴近短音起点/打点:`onset_aware`
387 391
388 ### 外部数据 query 生成推荐 392 ### 外部数据 query 生成推荐
389 393
...@@ -392,11 +396,20 @@ flowchart TD ...@@ -392,11 +396,20 @@ flowchart TD
392 --output-root data/external_ingested \ 396 --output-root data/external_ingested \
393 --query-duration 8 \ 397 --query-duration 8 \
394 --query-stride 4 \ 398 --query-stride 4 \
395 --query-strategy silence_aware \ 399 --query-strategy high_energy \
396 --silence-top-db 30 400 --silence-top-db 30
397 ``` 401 ```
398 402
399 这会优先从非静音区生成 query,而不是从长静音头尾随机采样。 403 这会优先从高能区生成 query,而不是从长静音头尾或低能过门里随机采样。
404
405 补充建议:
406
407 | 场景 | 推荐策略 |
408 |---|---|
409 | 录音静音头尾很多 | `silence_aware` |
410 | 更想贴近副歌/主段 | `high_energy` |
411 | 更想贴近打点/起唱点 | `onset_aware` |
412 | 既要音乐感知,又要保留泛化 | `hybrid` |
400 413
401 --- 414 ---
402 415
......