Align music crop sampling with rhythmic grid candidates
Constraint: Music queries often begin near stable pulse locations, but beat tracking can fail on sparse or synthetic signals and must degrade safely
Rejected: Depend on beat tracking alone for all rhythmic sampling | too brittle when beat extraction is weak or absent
Confidence: high
Scope-risk: moderate
Directive: Keep beat_aware as a lightweight candidate generator with onset fallback; future chorus/repeated-section logic should compose with beat-aware rather than bypass it
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy beat_aware; handcrafted 20s pulse-track fixture with beat_aware and hybrid offset checks
Not-tested: Full retraining/evaluation impact on open/internal datasets using beat_aware end-to-end
Showing 6 changed files with 97 additions and 11 deletions
| ... | @@ -61,6 +61,40 @@ def compute_candidate_offsets( | ... | @@ -61,6 +61,40 @@ def compute_candidate_offsets( |
| 61 | offsets.append(start) | 61 | offsets.append(start) |
| 62 | return sorted(set(offsets[: min(8, len(offsets))])) | 62 | return sorted(set(offsets[: min(8, len(offsets))])) |
| 63 | 63 | ||
| 64 | if strategy == "beat_aware": | ||
| 65 | try: | ||
| 66 | tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=512, units="frames") | ||
| 67 | beat_samples = librosa.frames_to_samples(beat_frames, hop_length=512) | ||
| 68 | except Exception: | ||
| 69 | beat_samples = np.array([], dtype=int) | ||
| 70 | if beat_samples.size == 0: | ||
| 71 | try: | ||
| 72 | onset_frames = librosa.onset.onset_detect(y=y, sr=sr, hop_length=512, units="frames") | ||
| 73 | onset_samples = librosa.frames_to_samples(onset_frames, hop_length=512) | ||
| 74 | if onset_samples.size >= 2: | ||
| 75 | diffs = np.diff(onset_samples) | ||
| 76 | median_step = int(np.median(diffs)) if diffs.size else 0 | ||
| 77 | if median_step > 0: | ||
| 78 | approx = [int(onset_samples[0])] | ||
| 79 | while approx[-1] + median_step < len(y): | ||
| 80 | approx.append(approx[-1] + median_step) | ||
| 81 | beat_samples = np.array(approx, dtype=int) | ||
| 82 | elif onset_samples.size == 1: | ||
| 83 | beat_samples = onset_samples | ||
| 84 | except Exception: | ||
| 85 | beat_samples = np.array([], dtype=int) | ||
| 86 | if beat_samples.size == 0: | ||
| 87 | return [] | ||
| 88 | offsets = [] | ||
| 89 | max_start = max(len(y) - segment_len, 0) | ||
| 90 | for beat in beat_samples.tolist(): | ||
| 91 | start = max(0, min(int(beat), max_start)) | ||
| 92 | offsets.append(start) | ||
| 93 | if not offsets: | ||
| 94 | return [] | ||
| 95 | step = max(1, len(offsets) // 8) | ||
| 96 | return sorted(set(offsets[::step][:8])) | ||
| 97 | |||
| 64 | return [] | 98 | return [] |
| 65 | 99 | ||
| 66 | 100 | ||
| ... | @@ -152,7 +186,7 @@ class ACRDataset(Dataset): | ... | @@ -152,7 +186,7 @@ class ACRDataset(Dataset): |
| 152 | 186 | ||
| 153 | if self.segment_strategy == "hybrid": | 187 | if self.segment_strategy == "hybrid": |
| 154 | candidate_pool: List[int] = [] | 188 | candidate_pool: List[int] = [] |
| 155 | for strategy in ("high_energy", "onset_aware", "silence_aware"): | 189 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 156 | candidate_pool.extend( | 190 | candidate_pool.extend( |
| 157 | compute_candidate_offsets( | 191 | compute_candidate_offsets( |
| 158 | y=y, | 192 | y=y, |
| ... | @@ -331,7 +365,7 @@ class SongPairDataset(Dataset): | ... | @@ -331,7 +365,7 @@ class SongPairDataset(Dataset): |
| 331 | offset = min(random.choice(direct_candidates) / self.sr, max_offset) | 365 | offset = min(random.choice(direct_candidates) / self.sr, max_offset) |
| 332 | elif self.segment_strategy == "hybrid": | 366 | elif self.segment_strategy == "hybrid": |
| 333 | candidate_pool: List[int] = [] | 367 | candidate_pool: List[int] = [] |
| 334 | for strategy in ("high_energy", "onset_aware", "silence_aware"): | 368 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 335 | candidate_pool.extend( | 369 | candidate_pool.extend( |
| 336 | compute_candidate_offsets( | 370 | compute_candidate_offsets( |
| 337 | y=full_y, | 371 | y=full_y, | ... | ... |
| ... | @@ -516,7 +516,7 @@ def main(): | ... | @@ -516,7 +516,7 @@ def main(): |
| 516 | p.add_argument("--eval-ratio", type=float, default=0.2) | 516 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 517 | p.add_argument("--query-duration", type=float, default=8.0) | 517 | p.add_argument("--query-duration", type=float, default=8.0) |
| 518 | p.add_argument("--query-stride", type=float, default=None) | 518 | p.add_argument("--query-stride", type=float, default=None) |
| 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") | 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") |
| 520 | p.add_argument("--silence-top-db", type=int, default=30) | 520 | p.add_argument("--silence-top-db", type=int, default=30) |
| 521 | p.add_argument("--seed", type=int, default=42) | 521 | p.add_argument("--seed", type=int, default=42) |
| 522 | 522 | ||
| ... | @@ -548,8 +548,8 @@ def main(): | ... | @@ -548,8 +548,8 @@ def main(): |
| 548 | p.add_argument("--eval-ratio", type=float, default=0.2) | 548 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 549 | p.add_argument("--query-duration", type=float, default=8.0) | 549 | p.add_argument("--query-duration", type=float, default=8.0) |
| 550 | p.add_argument("--query-stride", type=float, default=None) | 550 | p.add_argument("--query-stride", type=float, default=None) |
| 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") | 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") |
| 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") | 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") |
| 553 | p.add_argument("--silence-top-db", type=int, default=30) | 553 | p.add_argument("--silence-top-db", type=int, default=30) |
| 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) | 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) |
| 555 | p.add_argument("--seed", type=int, default=42) | 555 | p.add_argument("--seed", type=int, default=42) | ... | ... |
| ... | @@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( | ... | @@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( |
| 117 | 117 | ||
| 118 | if duration >= query_duration: | 118 | if duration >= query_duration: |
| 119 | strategy_offsets = [] | 119 | strategy_offsets = [] |
| 120 | if query_strategy in {"silence_aware", "high_energy", "onset_aware"}: | 120 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"}: |
| 121 | strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) | 121 | strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) |
| 122 | elif query_strategy == "hybrid": | 122 | elif query_strategy == "hybrid": |
| 123 | for strategy in ("high_energy", "onset_aware", "silence_aware"): | 123 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 124 | strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) | 124 | strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) |
| 125 | strategy_offsets = sorted(set(strategy_offsets)) | 125 | strategy_offsets = sorted(set(strategy_offsets)) |
| 126 | 126 | ||
| 127 | if query_strategy in {"silence_aware", "high_energy", "onset_aware"} and strategy_offsets: | 127 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"} and strategy_offsets: |
| 128 | offsets = strategy_offsets | 128 | offsets = strategy_offsets |
| 129 | elif query_strategy == "hybrid" and strategy_offsets: | 129 | elif query_strategy == "hybrid" and strategy_offsets: |
| 130 | if query_stride and query_stride > 0: | 130 | if query_stride and query_stride > 0: |
| ... | @@ -277,7 +277,7 @@ def main(): | ... | @@ -277,7 +277,7 @@ def main(): |
| 277 | p.add_argument("--eval-ratio", type=float, default=0.2) | 277 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 278 | p.add_argument("--query-duration", type=float, default=8.0) | 278 | p.add_argument("--query-duration", type=float, default=8.0) |
| 279 | p.add_argument("--query-stride", type=float, default=None) | 279 | p.add_argument("--query-stride", type=float, default=None) |
| 280 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") | 280 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") |
| 281 | p.add_argument("--silence-top-db", type=int, default=30) | 281 | p.add_argument("--silence-top-db", type=int, default=30) |
| 282 | p.add_argument("--seed", type=int, default=42) | 282 | p.add_argument("--seed", type=int, default=42) |
| 283 | 283 | ... | ... |
| ... | @@ -125,7 +125,7 @@ def main(): | ... | @@ -125,7 +125,7 @@ def main(): |
| 125 | parser.add_argument("--epochs", type=int, default=None) | 125 | parser.add_argument("--epochs", type=int, default=None) |
| 126 | parser.add_argument("--batch-size", type=int, default=None) | 126 | parser.add_argument("--batch-size", type=int, default=None) |
| 127 | parser.add_argument("--lr", type=float, default=None) | 127 | parser.add_argument("--lr", type=float, default=None) |
| 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "hybrid"], default="random") | 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") |
| 129 | parser.add_argument("--silence-top-db", type=int, default=30) | 129 | parser.add_argument("--silence-top-db", type=int, default=30) |
| 130 | parser.add_argument("--dry-run", action="store_true") | 130 | parser.add_argument("--dry-run", action="store_true") |
| 131 | args = parser.parse_args() | 131 | args = parser.parse_args() | ... | ... |
| ... | @@ -5569,3 +5569,52 @@ | ... | @@ -5569,3 +5569,52 @@ |
| 5569 | - beat-aware | 5569 | - beat-aware |
| 5570 | - chorus-aware | 5570 | - chorus-aware |
| 5571 | - repeated-section-aware | 5571 | - repeated-section-aware |
| 5572 | |||
| 5573 | ### Stage: beat-aware music segmentation | ||
| 5574 | |||
| 5575 | 完成项: | ||
| 5576 | - 在 `acr-engine/src/data/dataset.py` 新增: | ||
| 5577 | - `beat_aware` 候选切片策略 | ||
| 5578 | - 在 `acr-engine/src/data/manifest_tools.py` 新增: | ||
| 5579 | - `--query-strategy beat_aware` | ||
| 5580 | - 在 `train.py` 与 `external_adapters.py` 暴露: | ||
| 5581 | - `beat_aware` 选项 | ||
| 5582 | - 为 `beat_aware` 增加容错: | ||
| 5583 | - 优先使用 `librosa.beat.beat_track` | ||
| 5584 | - 若 beat 检测失败,则回退到 onset 间隔估计生成近似节拍点 | ||
| 5585 | - 将 `hybrid` 扩展为优先复用: | ||
| 5586 | - `beat_aware` | ||
| 5587 | - `high_energy` | ||
| 5588 | - `onset_aware` | ||
| 5589 | - `silence_aware` | ||
| 5590 | |||
| 5591 | 验证结果: | ||
| 5592 | - 编译验证: | ||
| 5593 | - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py` | ||
| 5594 | - 人造节拍音频验证: | ||
| 5595 | - 构造 `20s` 音频 | ||
| 5596 | - `4s-18s` 区间每 `0.5s` 注入一次脉冲(约 120 BPM) | ||
| 5597 | - 再叠加轻微 tonal bed | ||
| 5598 | - 直接 beat 候选结果: | ||
| 5599 | - `DIRECT_BEAT_CANDIDATES_SEC`: | ||
| 5600 | - `4.032, 5.952, 7.872, 9.792, 11.712, 13.632, 15.0` | ||
| 5601 | - query 生成结果: | ||
| 5602 | - `BEAT_QUERY_OFFSETS`: | ||
| 5603 | - `4.032, 7.872, 9.792, 11.712, 13.632, 15.0` | ||
| 5604 | - `HYBRID_QUERY_OFFSETS`: | ||
| 5605 | - `3.968, 4.032, 4.064, 4.544, 5.0, 5.536, 6.016, 6.048, 7.872, 9.591, 9.792, 10.0` | ||
| 5606 | - 训练侧偏移验证: | ||
| 5607 | - `TRAIN_BEAT_AWARE_OFFSETS`: | ||
| 5608 | - `13.632, 4.032, 4.032, 13.632, 7.872, 5.952` | ||
| 5609 | - `TRAIN_HYBRID_OFFSETS`: | ||
| 5610 | - `2.5, 5.536, 4.064, 12.5, 7.872, 4.032` | ||
| 5611 | - 说明 beat-aware 已明显偏向规则拍点,hybrid 也已吸收 beat-aware 候选 | ||
| 5612 | - dry-run 验证: | ||
| 5613 | - `train.py --data data/synthetic_v2 --dry-run --segment-strategy beat_aware` | ||
| 5614 | - forward/backward 成功,`Embedding shape: torch.Size([64, 192])` | ||
| 5615 | |||
| 5616 | 结论: | ||
| 5617 | - 当前项目的音乐感知切片已经进一步从“高能/起音”扩展到“规则拍点” | ||
| 5618 | - 下一步可继续叠加: | ||
| 5619 | - repeated-section-aware | ||
| 5620 | - chorus-like candidate mining | ... | ... |
| ... | @@ -356,12 +356,13 @@ flowchart TD | ... | @@ -356,12 +356,13 @@ flowchart TD |
| 356 | | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | | 356 | | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | |
| 357 | | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | | 357 | | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | |
| 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | | 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | |
| 359 | | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | | ||
| 359 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | | 360 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | |
| 360 | 361 | ||
| 361 | 推荐理解: | 362 | 推荐理解: |
| 362 | 363 | ||
| 363 | 1. **训练不是全部随机切** | 364 | 1. **训练不是全部随机切** |
| 364 | 当前训练集可用 `random / silence_aware / high_energy / onset_aware / hybrid` | 365 | 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / hybrid` |
| 365 | 2. **reference 建库不是随机切** | 366 | 2. **reference 建库不是随机切** |
| 366 | 建库仍然是固定滑窗 | 367 | 建库仍然是固定滑窗 |
| 367 | 3. **外部数据 query 生成也不是只能随机切** | 368 | 3. **外部数据 query 生成也不是只能随机切** |
| ... | @@ -388,6 +389,7 @@ flowchart TD | ... | @@ -388,6 +389,7 @@ flowchart TD |
| 388 | - 已知原始音频静音很多:`silence_aware` | 389 | - 已知原始音频静音很多:`silence_aware` |
| 389 | - 更想贴近副歌/强节奏:`high_energy` | 390 | - 更想贴近副歌/强节奏:`high_energy` |
| 390 | - 更想贴近短音起点/打点:`onset_aware` | 391 | - 更想贴近短音起点/打点:`onset_aware` |
| 392 | - 更想贴近稳定节拍网格:`beat_aware` | ||
| 391 | 393 | ||
| 392 | ### 外部数据 query 生成推荐 | 394 | ### 外部数据 query 生成推荐 |
| 393 | 395 | ||
| ... | @@ -409,6 +411,7 @@ flowchart TD | ... | @@ -409,6 +411,7 @@ flowchart TD |
| 409 | | 录音静音头尾很多 | `silence_aware` | | 411 | | 录音静音头尾很多 | `silence_aware` | |
| 410 | | 更想贴近副歌/主段 | `high_energy` | | 412 | | 更想贴近副歌/主段 | `high_energy` | |
| 411 | | 更想贴近打点/起唱点 | `onset_aware` | | 413 | | 更想贴近打点/起唱点 | `onset_aware` | |
| 414 | | 更想贴近规则拍点/律动骨架 | `beat_aware` | | ||
| 412 | | 既要音乐感知,又要保留泛化 | `hybrid` | | 415 | | 既要音乐感知,又要保留泛化 | `hybrid` | |
| 413 | 416 | ||
| 414 | --- | 417 | --- | ... | ... |
-
Please register or sign in to post a comment