Commit d7a08944 d7a08944003d0ceb67427dae605dfa7d46507600 by cnb.bofCdSsphPA

Align music crop sampling with rhythmic grid candidates

Constraint: Music queries often begin near stable pulse locations, but beat tracking can fail on sparse or synthetic signals and must degrade safely
Rejected: Depend on beat tracking alone for all rhythmic sampling | too brittle when beat extraction is weak or absent
Confidence: high
Scope-risk: moderate
Directive: Keep beat_aware as a lightweight candidate generator with onset fallback; future chorus/repeated-section logic should compose with beat-aware rather than bypass it
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy beat_aware; handcrafted 20s pulse-track fixture with beat_aware and hybrid offset checks
Not-tested: Full retraining/evaluation impact on open/internal datasets using beat_aware end-to-end
1 parent b6cdf668
...@@ -61,6 +61,40 @@ def compute_candidate_offsets( ...@@ -61,6 +61,40 @@ def compute_candidate_offsets(
61 offsets.append(start) 61 offsets.append(start)
62 return sorted(set(offsets[: min(8, len(offsets))])) 62 return sorted(set(offsets[: min(8, len(offsets))]))
63 63
64 if strategy == "beat_aware":
65 try:
66 tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=512, units="frames")
67 beat_samples = librosa.frames_to_samples(beat_frames, hop_length=512)
68 except Exception:
69 beat_samples = np.array([], dtype=int)
70 if beat_samples.size == 0:
71 try:
72 onset_frames = librosa.onset.onset_detect(y=y, sr=sr, hop_length=512, units="frames")
73 onset_samples = librosa.frames_to_samples(onset_frames, hop_length=512)
74 if onset_samples.size >= 2:
75 diffs = np.diff(onset_samples)
76 median_step = int(np.median(diffs)) if diffs.size else 0
77 if median_step > 0:
78 approx = [int(onset_samples[0])]
79 while approx[-1] + median_step < len(y):
80 approx.append(approx[-1] + median_step)
81 beat_samples = np.array(approx, dtype=int)
82 elif onset_samples.size == 1:
83 beat_samples = onset_samples
84 except Exception:
85 beat_samples = np.array([], dtype=int)
86 if beat_samples.size == 0:
87 return []
88 offsets = []
89 max_start = max(len(y) - segment_len, 0)
90 for beat in beat_samples.tolist():
91 start = max(0, min(int(beat), max_start))
92 offsets.append(start)
93 if not offsets:
94 return []
95 step = max(1, len(offsets) // 8)
96 return sorted(set(offsets[::step][:8]))
97
64 return [] 98 return []
65 99
66 100
...@@ -152,7 +186,7 @@ class ACRDataset(Dataset): ...@@ -152,7 +186,7 @@ class ACRDataset(Dataset):
152 186
153 if self.segment_strategy == "hybrid": 187 if self.segment_strategy == "hybrid":
154 candidate_pool: List[int] = [] 188 candidate_pool: List[int] = []
155 for strategy in ("high_energy", "onset_aware", "silence_aware"): 189 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"):
156 candidate_pool.extend( 190 candidate_pool.extend(
157 compute_candidate_offsets( 191 compute_candidate_offsets(
158 y=y, 192 y=y,
...@@ -331,7 +365,7 @@ class SongPairDataset(Dataset): ...@@ -331,7 +365,7 @@ class SongPairDataset(Dataset):
331 offset = min(random.choice(direct_candidates) / self.sr, max_offset) 365 offset = min(random.choice(direct_candidates) / self.sr, max_offset)
332 elif self.segment_strategy == "hybrid": 366 elif self.segment_strategy == "hybrid":
333 candidate_pool: List[int] = [] 367 candidate_pool: List[int] = []
334 for strategy in ("high_energy", "onset_aware", "silence_aware"): 368 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"):
335 candidate_pool.extend( 369 candidate_pool.extend(
336 compute_candidate_offsets( 370 compute_candidate_offsets(
337 y=full_y, 371 y=full_y,
......
...@@ -516,7 +516,7 @@ def main(): ...@@ -516,7 +516,7 @@ def main():
516 p.add_argument("--eval-ratio", type=float, default=0.2) 516 p.add_argument("--eval-ratio", type=float, default=0.2)
517 p.add_argument("--query-duration", type=float, default=8.0) 517 p.add_argument("--query-duration", type=float, default=8.0)
518 p.add_argument("--query-stride", type=float, default=None) 518 p.add_argument("--query-stride", type=float, default=None)
519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") 519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random")
520 p.add_argument("--silence-top-db", type=int, default=30) 520 p.add_argument("--silence-top-db", type=int, default=30)
521 p.add_argument("--seed", type=int, default=42) 521 p.add_argument("--seed", type=int, default=42)
522 522
...@@ -548,8 +548,8 @@ def main(): ...@@ -548,8 +548,8 @@ def main():
548 p.add_argument("--eval-ratio", type=float, default=0.2) 548 p.add_argument("--eval-ratio", type=float, default=0.2)
549 p.add_argument("--query-duration", type=float, default=8.0) 549 p.add_argument("--query-duration", type=float, default=8.0)
550 p.add_argument("--query-stride", type=float, default=None) 550 p.add_argument("--query-stride", type=float, default=None)
551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") 551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random")
552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") 552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random")
553 p.add_argument("--silence-top-db", type=int, default=30) 553 p.add_argument("--silence-top-db", type=int, default=30)
554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100) 554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100)
555 p.add_argument("--seed", type=int, default=42) 555 p.add_argument("--seed", type=int, default=42)
......
...@@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( ...@@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir(
117 117
118 if duration >= query_duration: 118 if duration >= query_duration:
119 strategy_offsets = [] 119 strategy_offsets = []
120 if query_strategy in {"silence_aware", "high_energy", "onset_aware"}: 120 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"}:
121 strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) 121 strategy_offsets = compute_strategy_offsets(path, duration, query_strategy)
122 elif query_strategy == "hybrid": 122 elif query_strategy == "hybrid":
123 for strategy in ("high_energy", "onset_aware", "silence_aware"): 123 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"):
124 strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) 124 strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy))
125 strategy_offsets = sorted(set(strategy_offsets)) 125 strategy_offsets = sorted(set(strategy_offsets))
126 126
127 if query_strategy in {"silence_aware", "high_energy", "onset_aware"} and strategy_offsets: 127 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"} and strategy_offsets:
128 offsets = strategy_offsets 128 offsets = strategy_offsets
129 elif query_strategy == "hybrid" and strategy_offsets: 129 elif query_strategy == "hybrid" and strategy_offsets:
130 if query_stride and query_stride > 0: 130 if query_stride and query_stride > 0:
...@@ -277,7 +277,7 @@ def main(): ...@@ -277,7 +277,7 @@ def main():
277 p.add_argument("--eval-ratio", type=float, default=0.2) 277 p.add_argument("--eval-ratio", type=float, default=0.2)
278 p.add_argument("--query-duration", type=float, default=8.0) 278 p.add_argument("--query-duration", type=float, default=8.0)
279 p.add_argument("--query-stride", type=float, default=None) 279 p.add_argument("--query-stride", type=float, default=None)
280 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") 280 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random")
281 p.add_argument("--silence-top-db", type=int, default=30) 281 p.add_argument("--silence-top-db", type=int, default=30)
282 p.add_argument("--seed", type=int, default=42) 282 p.add_argument("--seed", type=int, default=42)
283 283
......
...@@ -125,7 +125,7 @@ def main(): ...@@ -125,7 +125,7 @@ def main():
125 parser.add_argument("--epochs", type=int, default=None) 125 parser.add_argument("--epochs", type=int, default=None)
126 parser.add_argument("--batch-size", type=int, default=None) 126 parser.add_argument("--batch-size", type=int, default=None)
127 parser.add_argument("--lr", type=float, default=None) 127 parser.add_argument("--lr", type=float, default=None)
128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") 128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random")
129 parser.add_argument("--silence-top-db", type=int, default=30) 129 parser.add_argument("--silence-top-db", type=int, default=30)
130 parser.add_argument("--dry-run", action="store_true") 130 parser.add_argument("--dry-run", action="store_true")
131 args = parser.parse_args() 131 args = parser.parse_args()
......
...@@ -5569,3 +5569,52 @@ ...@@ -5569,3 +5569,52 @@
5569 - beat-aware 5569 - beat-aware
5570 - chorus-aware 5570 - chorus-aware
5571 - repeated-section-aware 5571 - repeated-section-aware
5572
5573 ### Stage: beat-aware music segmentation
5574
5575 完成项:
5576 - `acr-engine/src/data/dataset.py` 新增:
5577 - `beat_aware` 候选切片策略
5578 - `acr-engine/src/data/manifest_tools.py` 新增:
5579 - `--query-strategy beat_aware`
5580 - `train.py` / `external_adapters.py` 暴露:
5581 - `beat_aware` 选项
5582 - `beat_aware` 增加容错:
5583 - 优先使用 `librosa.beat.beat_track`
5584 - 若 beat 检测失败,则回退到 onset 间隔估计生成近似节拍点
5585 - `hybrid` 扩展为优先复用:
5586 - `beat_aware`
5587 - `high_energy`
5588 - `onset_aware`
5589 - `silence_aware`
5590
5591 验证结果:
5592 - 编译验证:
5593 - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py`
5594 - 人造节拍音频验证:
5595 - 构造 `20s` 音频
5596 - `4s-18s` 区间每 `0.5s` 注入一次脉冲(约 120 BPM)
5597 - 再叠加轻微 tonal bed
5598 - 直接 beat 候选结果:
5599 - `DIRECT_BEAT_CANDIDATES_SEC`
5600 - `4.032, 5.952, 7.872, 9.792, 11.712, 13.632, 15.0`
5601 - query 生成结果:
5602 - `BEAT_QUERY_OFFSETS`
5603 - `4.032, 7.872, 9.792, 11.712, 13.632, 15.0`
5604 - `HYBRID_QUERY_OFFSETS`
5605 - `3.968, 4.032, 4.064, 4.544, 5.0, 5.536, 6.016, 6.048, 7.872, 9.591, 9.792, 10.0`
5606 - 训练侧偏移验证:
5607 - `TRAIN_BEAT_AWARE_OFFSETS`
5608 - `13.632, 4.032, 4.032, 13.632, 7.872, 5.952`
5609 - `TRAIN_HYBRID_OFFSETS`
5610 - `2.5, 5.536, 4.064, 12.5, 7.872, 4.032`
5611 - 说明 beat-aware 已明显偏向规则拍点,hybrid 也已吸收 beat-aware 候选
5612 - dry-run 验证:
5613 - `train.py --data data/synthetic_v2 --dry-run --segment-strategy beat_aware`
5614 - forward/backward 成功,`Embedding shape: torch.Size([64, 192])`
5615
5616 结论:
5617 - 当前项目的音乐感知切片已经进一步从“高能/起音”扩展到“规则拍点”
5618 - 下一步可继续叠加:
5619 - repeated-section-aware
5620 - chorus-like candidate mining
......
...@@ -356,12 +356,13 @@ flowchart TD ...@@ -356,12 +356,13 @@ flowchart TD
356 | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | 356 | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 |
357 | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | 357 | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 |
358 | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | 358 | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 |
359 | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 |
359 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | 360 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 |
360 361
361 推荐理解: 362 推荐理解:
362 363
363 1. **训练不是全部随机切** 364 1. **训练不是全部随机切**
364 当前训练集可用 `random / silence_aware / high_energy / onset_aware / hybrid` 365 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / hybrid`
365 2. **reference 建库不是随机切** 366 2. **reference 建库不是随机切**
366 建库仍然是固定滑窗 367 建库仍然是固定滑窗
367 3. **外部数据 query 生成也不是只能随机切** 368 3. **外部数据 query 生成也不是只能随机切**
...@@ -388,6 +389,7 @@ flowchart TD ...@@ -388,6 +389,7 @@ flowchart TD
388 - 已知原始音频静音很多:`silence_aware` 389 - 已知原始音频静音很多:`silence_aware`
389 - 更想贴近副歌/强节奏:`high_energy` 390 - 更想贴近副歌/强节奏:`high_energy`
390 - 更想贴近短音起点/打点:`onset_aware` 391 - 更想贴近短音起点/打点:`onset_aware`
392 - 更想贴近稳定节拍网格:`beat_aware`
391 393
392 ### 外部数据 query 生成推荐 394 ### 外部数据 query 生成推荐
393 395
...@@ -409,6 +411,7 @@ flowchart TD ...@@ -409,6 +411,7 @@ flowchart TD
409 | 录音静音头尾很多 | `silence_aware` | 411 | 录音静音头尾很多 | `silence_aware` |
410 | 更想贴近副歌/主段 | `high_energy` | 412 | 更想贴近副歌/主段 | `high_energy` |
411 | 更想贴近打点/起唱点 | `onset_aware` | 413 | 更想贴近打点/起唱点 | `onset_aware` |
414 | 更想贴近规则拍点/律动骨架 | `beat_aware` |
412 | 既要音乐感知,又要保留泛化 | `hybrid` | 415 | 既要音乐感知,又要保留泛化 | `hybrid` |
413 416
414 --- 417 ---
......