Prioritize repeated chorus-like regions in music crop selection
Constraint: Music retrieval should sample repeated hook-like regions without adding heavyweight structure models or breaking the existing lightweight candidate stack
Rejected: Reserve repeated-section logic for a later dedicated chorus detector | delays a practical chorus-like signal that can already improve query realism today
Confidence: medium
Scope-risk: moderate
Directive: Treat repeated_section_aware as a lightweight chorus proxy; future chorus ranking should refine rather than discard these candidates
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy repeated_section_aware; handcrafted 24s repeated-motif fixture with repeated_section_aware and hybrid offset checks
Not-tested: Full end-to-end metric impact on FMA/internal datasets with repeated_section_aware enabled
Showing 6 changed files with 97 additions and 11 deletions
| ... | @@ -95,6 +95,32 @@ def compute_candidate_offsets( | ... | @@ -95,6 +95,32 @@ def compute_candidate_offsets( |
| 95 | step = max(1, len(offsets) // 8) | 95 | step = max(1, len(offsets) // 8) |
| 96 | return sorted(set(offsets[::step][:8])) | 96 | return sorted(set(offsets[::step][:8])) |
| 97 | 97 | ||
| 98 | if strategy == "repeated_section_aware": | ||
| 99 | hop = max(segment_len // 2, 1) | ||
| 100 | starts = list(range(0, max(len(y) - segment_len + 1, 1), hop)) | ||
| 101 | if len(starts) < 2: | ||
| 102 | return starts[:1] | ||
| 103 | feats = [] | ||
| 104 | for start in starts: | ||
| 105 | seg = y[start : start + segment_len] | ||
| 106 | if len(seg) < segment_len: | ||
| 107 | seg = np.pad(seg, (0, segment_len - len(seg))) | ||
| 108 | chroma = librosa.feature.chroma_cqt(y=seg, sr=sr) | ||
| 109 | feat = np.mean(chroma, axis=1) | ||
| 110 | norm = float(np.linalg.norm(feat) + 1e-12) | ||
| 111 | feats.append(feat / norm) | ||
| 112 | scores: List[tuple[float, int]] = [] | ||
| 113 | for i, feat in enumerate(feats): | ||
| 114 | sims = [] | ||
| 115 | for j, other in enumerate(feats): | ||
| 116 | if i == j: | ||
| 117 | continue | ||
| 118 | sims.append(float(np.dot(feat, other))) | ||
| 119 | repeat_score = max(sims) if sims else 0.0 | ||
| 120 | scores.append((repeat_score, starts[i])) | ||
| 121 | scores.sort(key=lambda x: x[0], reverse=True) | ||
| 122 | return sorted(set(start for _, start in scores[: min(6, len(scores))])) | ||
| 123 | |||
| 98 | return [] | 124 | return [] |
| 99 | 125 | ||
| 100 | 126 | ||
| ... | @@ -186,7 +212,7 @@ class ACRDataset(Dataset): | ... | @@ -186,7 +212,7 @@ class ACRDataset(Dataset): |
| 186 | 212 | ||
| 187 | if self.segment_strategy == "hybrid": | 213 | if self.segment_strategy == "hybrid": |
| 188 | candidate_pool: List[int] = [] | 214 | candidate_pool: List[int] = [] |
| 189 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): | 215 | for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 190 | candidate_pool.extend( | 216 | candidate_pool.extend( |
| 191 | compute_candidate_offsets( | 217 | compute_candidate_offsets( |
| 192 | y=y, | 218 | y=y, |
| ... | @@ -365,7 +391,7 @@ class SongPairDataset(Dataset): | ... | @@ -365,7 +391,7 @@ class SongPairDataset(Dataset): |
| 365 | offset = min(random.choice(direct_candidates) / self.sr, max_offset) | 391 | offset = min(random.choice(direct_candidates) / self.sr, max_offset) |
| 366 | elif self.segment_strategy == "hybrid": | 392 | elif self.segment_strategy == "hybrid": |
| 367 | candidate_pool: List[int] = [] | 393 | candidate_pool: List[int] = [] |
| 368 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): | 394 | for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 369 | candidate_pool.extend( | 395 | candidate_pool.extend( |
| 370 | compute_candidate_offsets( | 396 | compute_candidate_offsets( |
| 371 | y=full_y, | 397 | y=full_y, | ... | ... |
| ... | @@ -516,7 +516,7 @@ def main(): | ... | @@ -516,7 +516,7 @@ def main(): |
| 516 | p.add_argument("--eval-ratio", type=float, default=0.2) | 516 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 517 | p.add_argument("--query-duration", type=float, default=8.0) | 517 | p.add_argument("--query-duration", type=float, default=8.0) |
| 518 | p.add_argument("--query-stride", type=float, default=None) | 518 | p.add_argument("--query-stride", type=float, default=None) |
| 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") | 519 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 520 | p.add_argument("--silence-top-db", type=int, default=30) | 520 | p.add_argument("--silence-top-db", type=int, default=30) |
| 521 | p.add_argument("--seed", type=int, default=42) | 521 | p.add_argument("--seed", type=int, default=42) |
| 522 | 522 | ||
| ... | @@ -548,8 +548,8 @@ def main(): | ... | @@ -548,8 +548,8 @@ def main(): |
| 548 | p.add_argument("--eval-ratio", type=float, default=0.2) | 548 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 549 | p.add_argument("--query-duration", type=float, default=8.0) | 549 | p.add_argument("--query-duration", type=float, default=8.0) |
| 550 | p.add_argument("--query-stride", type=float, default=None) | 550 | p.add_argument("--query-stride", type=float, default=None) |
| 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") | 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") | 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 553 | p.add_argument("--silence-top-db", type=int, default=30) | 553 | p.add_argument("--silence-top-db", type=int, default=30) |
| 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) | 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) |
| 555 | p.add_argument("--seed", type=int, default=42) | 555 | p.add_argument("--seed", type=int, default=42) | ... | ... |
| ... | @@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( | ... | @@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( |
| 117 | 117 | ||
| 118 | if duration >= query_duration: | 118 | if duration >= query_duration: |
| 119 | strategy_offsets = [] | 119 | strategy_offsets = [] |
| 120 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"}: | 120 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware"}: |
| 121 | strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) | 121 | strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) |
| 122 | elif query_strategy == "hybrid": | 122 | elif query_strategy == "hybrid": |
| 123 | for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): | 123 | for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"): |
| 124 | strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) | 124 | strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) |
| 125 | strategy_offsets = sorted(set(strategy_offsets)) | 125 | strategy_offsets = sorted(set(strategy_offsets)) |
| 126 | 126 | ||
| 127 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"} and strategy_offsets: | 127 | if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware"} and strategy_offsets: |
| 128 | offsets = strategy_offsets | 128 | offsets = strategy_offsets |
| 129 | elif query_strategy == "hybrid" and strategy_offsets: | 129 | elif query_strategy == "hybrid" and strategy_offsets: |
| 130 | if query_stride and query_stride > 0: | 130 | if query_stride and query_stride > 0: |
| ... | @@ -277,7 +277,7 @@ def main(): | ... | @@ -277,7 +277,7 @@ def main(): |
| 277 | p.add_argument("--eval-ratio", type=float, default=0.2) | 277 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 278 | p.add_argument("--query-duration", type=float, default=8.0) | 278 | p.add_argument("--query-duration", type=float, default=8.0) |
| 279 | p.add_argument("--query-stride", type=float, default=None) | 279 | p.add_argument("--query-stride", type=float, default=None) |
| 280 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") | 280 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 281 | p.add_argument("--silence-top-db", type=int, default=30) | 281 | p.add_argument("--silence-top-db", type=int, default=30) |
| 282 | p.add_argument("--seed", type=int, default=42) | 282 | p.add_argument("--seed", type=int, default=42) |
| 283 | 283 | ... | ... |
| ... | @@ -125,7 +125,7 @@ def main(): | ... | @@ -125,7 +125,7 @@ def main(): |
| 125 | parser.add_argument("--epochs", type=int, default=None) | 125 | parser.add_argument("--epochs", type=int, default=None) |
| 126 | parser.add_argument("--batch-size", type=int, default=None) | 126 | parser.add_argument("--batch-size", type=int, default=None) |
| 127 | parser.add_argument("--lr", type=float, default=None) | 127 | parser.add_argument("--lr", type=float, default=None) |
| 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") | 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 129 | parser.add_argument("--silence-top-db", type=int, default=30) | 129 | parser.add_argument("--silence-top-db", type=int, default=30) |
| 130 | parser.add_argument("--dry-run", action="store_true") | 130 | parser.add_argument("--dry-run", action="store_true") |
| 131 | args = parser.parse_args() | 131 | args = parser.parse_args() | ... | ... |
| ... | @@ -5618,3 +5618,60 @@ | ... | @@ -5618,3 +5618,60 @@ |
| 5618 | - 下一步可继续叠加: | 5618 | - 下一步可继续叠加: |
| 5619 | - repeated-section-aware | 5619 | - repeated-section-aware |
| 5620 | - chorus-like candidate mining | 5620 | - chorus-like candidate mining |
| 5621 | |||
| 5622 | ### Stage: repeated-section-aware / chorus-like candidate sampling | ||
| 5623 | |||
| 5624 | 完成项: | ||
| 5625 | - 在 `acr-engine/src/data/dataset.py` 新增: | ||
| 5626 | - `repeated_section_aware` | ||
| 5627 | - 在 `acr-engine/src/data/manifest_tools.py` 新增: | ||
| 5628 | - `--query-strategy repeated_section_aware` | ||
| 5629 | - 在 `train.py` 与 `external_adapters.py` 暴露: | ||
| 5630 | - `repeated_section_aware` | ||
| 5631 | - 实现方式: | ||
| 5632 | - 对滑窗片段提取 `chroma_cqt` | ||
| 5633 | - 取窗口级平均 chroma 向量 | ||
| 5634 | - 计算窗口间相似度 | ||
| 5635 | - 优先选择“与其它窗口最相似”的片段,作为重复主段 / 副歌 hook 的轻量近似 | ||
| 5636 | - 将 `hybrid` 扩展为优先复用: | ||
| 5637 | - `repeated_section_aware` | ||
| 5638 | - `beat_aware` | ||
| 5639 | - `high_energy` | ||
| 5640 | - `onset_aware` | ||
| 5641 | - `silence_aware` | ||
| 5642 | |||
| 5643 | 验证结果: | ||
| 5644 | - 编译验证: | ||
| 5645 | - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py` | ||
| 5646 | - 人造“重复副歌”音频验证: | ||
| 5647 | - 构造 `24s` 音频 | ||
| 5648 | - `8-12s` 与 `16-20s` 放置两段重复 motif | ||
| 5649 | - 直接重复候选结果: | ||
| 5650 | - `DIRECT_REPEAT_CANDIDATES_SEC`: | ||
| 5651 | - `6.0, 8.0, 10.0, 14.0, 16.0, 18.0` | ||
| 5652 | - query 生成结果: | ||
| 5653 | - `REPEAT_QUERY_OFFSETS`: | ||
| 5654 | - `6.0, 10.0, 14.0, 16.0, 18.0` | ||
| 5655 | - `HYBRID_QUERY_OFFSETS`: | ||
| 5656 | - `2.016, 2.048, 2.08, 6.0, 6.048, 8.0, 8.064, 10.0, 12.789, 14.0, 15.968, 16.0` | ||
| 5657 | - 训练侧偏移验证: | ||
| 5658 | - `TRAIN_REPEAT_OFFSETS`: | ||
| 5659 | - `17.5, 0.0, 0.0, 17.5, 7.5, 2.5` | ||
| 5660 | - `TRAIN_HYBRID_OFFSETS`: | ||
| 5661 | - `0.0, 8.032, 2.5, 2.048, 2.016, 7.5` | ||
| 5662 | - 说明 repeated-section-aware 已能明显偏向重复主段周边,而 hybrid 也已吸收这类候选 | ||
| 5663 | - dry-run 验证: | ||
| 5664 | - `train.py --data data/synthetic_v2 --dry-run --segment-strategy repeated_section_aware` | ||
| 5665 | - forward/backward 成功,`Embedding shape: torch.Size([64, 192])` | ||
| 5666 | |||
| 5667 | 结论: | ||
| 5668 | - 当前项目的音乐感知切片已经从: | ||
| 5669 | - 避静音 | ||
| 5670 | - 高能区 | ||
| 5671 | - 起音点 | ||
| 5672 | - 拍点 | ||
| 5673 | 进一步扩展到: | ||
| 5674 | - **重复主段 / 近似副歌** | ||
| 5675 | - 下一步可继续做更强的: | ||
| 5676 | - chorus-like multi-feature ranking | ||
| 5677 | - 小规模真实数据策略 A/B 对比 | ... | ... |
| ... | @@ -357,12 +357,13 @@ flowchart TD | ... | @@ -357,12 +357,13 @@ flowchart TD |
| 357 | | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | | 357 | | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | |
| 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | | 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | |
| 359 | | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | | 359 | | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | |
| 360 | | `repeated_section_aware` | 训练 query / 外部 query 生成 | 优先抽取与其它窗口最相似的重复主段,近似副歌/重复 hook | 是 | | ||
| 360 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | | 361 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | |
| 361 | 362 | ||
| 362 | 推荐理解: | 363 | 推荐理解: |
| 363 | 364 | ||
| 364 | 1. **训练不是全部随机切** | 365 | 1. **训练不是全部随机切** |
| 365 | 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / hybrid` | 366 | 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / repeated_section_aware / hybrid` |
| 366 | 2. **reference 建库不是随机切** | 367 | 2. **reference 建库不是随机切** |
| 367 | 建库仍然是固定滑窗 | 368 | 建库仍然是固定滑窗 |
| 368 | 3. **外部数据 query 生成也不是只能随机切** | 369 | 3. **外部数据 query 生成也不是只能随机切** |
| ... | @@ -390,6 +391,7 @@ flowchart TD | ... | @@ -390,6 +391,7 @@ flowchart TD |
| 390 | - 更想贴近副歌/强节奏:`high_energy` | 391 | - 更想贴近副歌/强节奏:`high_energy` |
| 391 | - 更想贴近短音起点/打点:`onset_aware` | 392 | - 更想贴近短音起点/打点:`onset_aware` |
| 392 | - 更想贴近稳定节拍网格:`beat_aware` | 393 | - 更想贴近稳定节拍网格:`beat_aware` |
| 394 | - 更想贴近副歌/重复 hook:`repeated_section_aware` | ||
| 393 | 395 | ||
| 394 | ### 外部数据 query 生成推荐 | 396 | ### 外部数据 query 生成推荐 |
| 395 | 397 | ||
| ... | @@ -412,6 +414,7 @@ flowchart TD | ... | @@ -412,6 +414,7 @@ flowchart TD |
| 412 | | 更想贴近副歌/主段 | `high_energy` | | 414 | | 更想贴近副歌/主段 | `high_energy` | |
| 413 | | 更想贴近打点/起唱点 | `onset_aware` | | 415 | | 更想贴近打点/起唱点 | `onset_aware` | |
| 414 | | 更想贴近规则拍点/律动骨架 | `beat_aware` | | 416 | | 更想贴近规则拍点/律动骨架 | `beat_aware` | |
| 417 | | 更想贴近重复主段/副歌 hook | `repeated_section_aware` | | ||
| 415 | | 既要音乐感知,又要保留泛化 | `hybrid` | | 418 | | 既要音乐感知,又要保留泛化 | `hybrid` | |
| 416 | 419 | ||
| 417 | --- | 420 | --- | ... | ... |
-
Please register or sign in to post a comment