Commit 8ed3e34e 8ed3e34ef1f40d16d160d5581a423c51b8dd57ce by cnb.bofCdSsphPA

Prioritize repeated chorus-like regions in music crop selection

Constraint: Music retrieval should sample repeated hook-like regions without adding heavyweight structure models or breaking the existing lightweight candidate stack
Rejected: Reserve repeated-section logic for a later dedicated chorus detector | delays a practical chorus-like signal that can already improve query realism today
Confidence: medium
Scope-risk: moderate
Directive: Treat repeated_section_aware as a lightweight chorus proxy; future chorus ranking should refine rather than discard these candidates
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; synthetic_v2 dry-run with --segment-strategy repeated_section_aware; handcrafted 24s repeated-motif fixture with repeated_section_aware and hybrid offset checks
Not-tested: Full end-to-end metric impact on FMA/internal datasets with repeated_section_aware enabled
1 parent d7a08944
...@@ -95,6 +95,32 @@ def compute_candidate_offsets( ...@@ -95,6 +95,32 @@ def compute_candidate_offsets(
95 step = max(1, len(offsets) // 8) 95 step = max(1, len(offsets) // 8)
96 return sorted(set(offsets[::step][:8])) 96 return sorted(set(offsets[::step][:8]))
97 97
98 if strategy == "repeated_section_aware":
99 hop = max(segment_len // 2, 1)
100 starts = list(range(0, max(len(y) - segment_len + 1, 1), hop))
101 if len(starts) < 2:
102 return starts[:1]
103 feats = []
104 for start in starts:
105 seg = y[start : start + segment_len]
106 if len(seg) < segment_len:
107 seg = np.pad(seg, (0, segment_len - len(seg)))
108 chroma = librosa.feature.chroma_cqt(y=seg, sr=sr)
109 feat = np.mean(chroma, axis=1)
110 norm = float(np.linalg.norm(feat) + 1e-12)
111 feats.append(feat / norm)
112 scores: List[tuple[float, int]] = []
113 for i, feat in enumerate(feats):
114 sims = []
115 for j, other in enumerate(feats):
116 if i == j:
117 continue
118 sims.append(float(np.dot(feat, other)))
119 repeat_score = max(sims) if sims else 0.0
120 scores.append((repeat_score, starts[i]))
121 scores.sort(key=lambda x: x[0], reverse=True)
122 return sorted(set(start for _, start in scores[: min(6, len(scores))]))
123
98 return [] 124 return []
99 125
100 126
...@@ -186,7 +212,7 @@ class ACRDataset(Dataset): ...@@ -186,7 +212,7 @@ class ACRDataset(Dataset):
186 212
187 if self.segment_strategy == "hybrid": 213 if self.segment_strategy == "hybrid":
188 candidate_pool: List[int] = [] 214 candidate_pool: List[int] = []
189 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): 215 for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
190 candidate_pool.extend( 216 candidate_pool.extend(
191 compute_candidate_offsets( 217 compute_candidate_offsets(
192 y=y, 218 y=y,
...@@ -365,7 +391,7 @@ class SongPairDataset(Dataset): ...@@ -365,7 +391,7 @@ class SongPairDataset(Dataset):
365 offset = min(random.choice(direct_candidates) / self.sr, max_offset) 391 offset = min(random.choice(direct_candidates) / self.sr, max_offset)
366 elif self.segment_strategy == "hybrid": 392 elif self.segment_strategy == "hybrid":
367 candidate_pool: List[int] = [] 393 candidate_pool: List[int] = []
368 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): 394 for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
369 candidate_pool.extend( 395 candidate_pool.extend(
370 compute_candidate_offsets( 396 compute_candidate_offsets(
371 y=full_y, 397 y=full_y,
......
...@@ -516,7 +516,7 @@ def main(): ...@@ -516,7 +516,7 @@ def main():
516 p.add_argument("--eval-ratio", type=float, default=0.2) 516 p.add_argument("--eval-ratio", type=float, default=0.2)
517 p.add_argument("--query-duration", type=float, default=8.0) 517 p.add_argument("--query-duration", type=float, default=8.0)
518 p.add_argument("--query-stride", type=float, default=None) 518 p.add_argument("--query-stride", type=float, default=None)
519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") 519 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
520 p.add_argument("--silence-top-db", type=int, default=30) 520 p.add_argument("--silence-top-db", type=int, default=30)
521 p.add_argument("--seed", type=int, default=42) 521 p.add_argument("--seed", type=int, default=42)
522 522
...@@ -548,8 +548,8 @@ def main(): ...@@ -548,8 +548,8 @@ def main():
548 p.add_argument("--eval-ratio", type=float, default=0.2) 548 p.add_argument("--eval-ratio", type=float, default=0.2)
549 p.add_argument("--query-duration", type=float, default=8.0) 549 p.add_argument("--query-duration", type=float, default=8.0)
550 p.add_argument("--query-stride", type=float, default=None) 550 p.add_argument("--query-stride", type=float, default=None)
551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") 551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") 552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
553 p.add_argument("--silence-top-db", type=int, default=30) 553 p.add_argument("--silence-top-db", type=int, default=30)
554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100) 554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100)
555 p.add_argument("--seed", type=int, default=42) 555 p.add_argument("--seed", type=int, default=42)
......
...@@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir( ...@@ -117,14 +117,14 @@ def build_train_eval_from_audio_dir(
117 117
118 if duration >= query_duration: 118 if duration >= query_duration:
119 strategy_offsets = [] 119 strategy_offsets = []
120 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"}: 120 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware"}:
121 strategy_offsets = compute_strategy_offsets(path, duration, query_strategy) 121 strategy_offsets = compute_strategy_offsets(path, duration, query_strategy)
122 elif query_strategy == "hybrid": 122 elif query_strategy == "hybrid":
123 for strategy in ("beat_aware", "high_energy", "onset_aware", "silence_aware"): 123 for strategy in ("repeated_section_aware", "beat_aware", "high_energy", "onset_aware", "silence_aware"):
124 strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy)) 124 strategy_offsets.extend(compute_strategy_offsets(path, duration, strategy))
125 strategy_offsets = sorted(set(strategy_offsets)) 125 strategy_offsets = sorted(set(strategy_offsets))
126 126
127 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware"} and strategy_offsets: 127 if query_strategy in {"silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware"} and strategy_offsets:
128 offsets = strategy_offsets 128 offsets = strategy_offsets
129 elif query_strategy == "hybrid" and strategy_offsets: 129 elif query_strategy == "hybrid" and strategy_offsets:
130 if query_stride and query_stride > 0: 130 if query_stride and query_stride > 0:
...@@ -277,7 +277,7 @@ def main(): ...@@ -277,7 +277,7 @@ def main():
277 p.add_argument("--eval-ratio", type=float, default=0.2) 277 p.add_argument("--eval-ratio", type=float, default=0.2)
278 p.add_argument("--query-duration", type=float, default=8.0) 278 p.add_argument("--query-duration", type=float, default=8.0)
279 p.add_argument("--query-stride", type=float, default=None) 279 p.add_argument("--query-stride", type=float, default=None)
280 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") 280 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
281 p.add_argument("--silence-top-db", type=int, default=30) 281 p.add_argument("--silence-top-db", type=int, default=30)
282 p.add_argument("--seed", type=int, default=42) 282 p.add_argument("--seed", type=int, default=42)
283 283
......
...@@ -125,7 +125,7 @@ def main(): ...@@ -125,7 +125,7 @@ def main():
125 parser.add_argument("--epochs", type=int, default=None) 125 parser.add_argument("--epochs", type=int, default=None)
126 parser.add_argument("--batch-size", type=int, default=None) 126 parser.add_argument("--batch-size", type=int, default=None)
127 parser.add_argument("--lr", type=float, default=None) 127 parser.add_argument("--lr", type=float, default=None)
128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "hybrid"], default="random") 128 parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random")
129 parser.add_argument("--silence-top-db", type=int, default=30) 129 parser.add_argument("--silence-top-db", type=int, default=30)
130 parser.add_argument("--dry-run", action="store_true") 130 parser.add_argument("--dry-run", action="store_true")
131 args = parser.parse_args() 131 args = parser.parse_args()
......
...@@ -5618,3 +5618,60 @@ ...@@ -5618,3 +5618,60 @@
5618 - 下一步可继续叠加: 5618 - 下一步可继续叠加:
5619 - repeated-section-aware 5619 - repeated-section-aware
5620 - chorus-like candidate mining 5620 - chorus-like candidate mining
5621
5622 ### Stage: repeated-section-aware / chorus-like candidate sampling
5623
5624 完成项:
5625 - `acr-engine/src/data/dataset.py` 新增:
5626 - `repeated_section_aware`
5627 - `acr-engine/src/data/manifest_tools.py` 新增:
5628 - `--query-strategy repeated_section_aware`
5629 - `train.py` 与 `external_adapters.py` 暴露:
5630 - `repeated_section_aware`
5631 - 实现方式:
5632 - 对滑窗片段提取 `chroma_cqt`
5633 - 取窗口级平均 chroma 向量
5634 - 计算窗口间相似度
5635 - 优先选择“与其它窗口最相似”的片段,作为重复主段 / 副歌 hook 的轻量近似
5636 - `hybrid` 扩展为优先复用:
5637 - `repeated_section_aware`
5638 - `beat_aware`
5639 - `high_energy`
5640 - `onset_aware`
5641 - `silence_aware`
5642
5643 验证结果:
5644 - 编译验证:
5645 - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py`
5646 - 人造“重复副歌”音频验证:
5647 - 构造 `24s` 音频
5648 - 在 `8-12s` 与 `16-20s` 放置两段重复 motif
5649 - 直接重复候选结果:
5650 - `DIRECT_REPEAT_CANDIDATES_SEC`
5651 - `6.0, 8.0, 10.0, 14.0, 16.0, 18.0`
5652 - query 生成结果:
5653 - `REPEAT_QUERY_OFFSETS`
5654 - `6.0, 10.0, 14.0, 16.0, 18.0`
5655 - `HYBRID_QUERY_OFFSETS`
5656 - `2.016, 2.048, 2.08, 6.0, 6.048, 8.0, 8.064, 10.0, 12.789, 14.0, 15.968, 16.0`
5657 - 训练侧偏移验证:
5658 - `TRAIN_REPEAT_OFFSETS`
5659 - `17.5, 0.0, 0.0, 17.5, 7.5, 2.5`
5660 - `TRAIN_HYBRID_OFFSETS`
5661 - `0.0, 8.032, 2.5, 2.048, 2.016, 7.5`
5662 - 说明 repeated-section-aware 已能明显偏向重复主段周边,而 hybrid 也已吸收这类候选
5663 - dry-run 验证:
5664 - `train.py --data data/synthetic_v2 --dry-run --segment-strategy repeated_section_aware`
5665 - forward/backward 成功,`Embedding shape: torch.Size([64, 192])`
5666
5667 结论:
5668 - 当前项目的音乐感知切片已经从:
5669 - 避静音
5670 - 高能区
5671 - 起音点
5672 - 拍点
5673 进一步扩展到:
5674 - **重复主段 / 近似副歌**
5675 - 下一步可继续做更强的:
5676 - chorus-like multi-feature ranking
5677 - 小规模真实数据策略 A/B 对比
......
...@@ -357,12 +357,13 @@ flowchart TD ...@@ -357,12 +357,13 @@ flowchart TD
357 | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 | 357 | `high_energy` | 训练 query / 外部 query 生成 | 优先抽取 RMS 高能区,更接近副歌/主唱/强节奏段 | 是 |
358 | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | 358 | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 |
359 | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | 359 | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 |
360 | `repeated_section_aware` | 训练 query / 外部 query 生成 | 优先抽取与其它窗口最相似的重复主段,近似副歌/重复 hook | 是 |
360 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | 361 | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 |
361 362
362 推荐理解: 363 推荐理解:
363 364
364 1. **训练不是全部随机切** 365 1. **训练不是全部随机切**
365 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / hybrid` 366 当前训练集可用 `random / silence_aware / high_energy / onset_aware / beat_aware / repeated_section_aware / hybrid`
366 2. **reference 建库不是随机切** 367 2. **reference 建库不是随机切**
367 建库仍然是固定滑窗 368 建库仍然是固定滑窗
368 3. **外部数据 query 生成也不是只能随机切** 369 3. **外部数据 query 生成也不是只能随机切**
...@@ -390,6 +391,7 @@ flowchart TD ...@@ -390,6 +391,7 @@ flowchart TD
390 - 更想贴近副歌/强节奏:`high_energy` 391 - 更想贴近副歌/强节奏:`high_energy`
391 - 更想贴近短音起点/打点:`onset_aware` 392 - 更想贴近短音起点/打点:`onset_aware`
392 - 更想贴近稳定节拍网格:`beat_aware` 393 - 更想贴近稳定节拍网格:`beat_aware`
394 - 更想贴近副歌/重复 hook:`repeated_section_aware`
393 395
394 ### 外部数据 query 生成推荐 396 ### 外部数据 query 生成推荐
395 397
...@@ -412,6 +414,7 @@ flowchart TD ...@@ -412,6 +414,7 @@ flowchart TD
412 | 更想贴近副歌/主段 | `high_energy` | 414 | 更想贴近副歌/主段 | `high_energy` |
413 | 更想贴近打点/起唱点 | `onset_aware` | 415 | 更想贴近打点/起唱点 | `onset_aware` |
414 | 更想贴近规则拍点/律动骨架 | `beat_aware` | 416 | 更想贴近规则拍点/律动骨架 | `beat_aware` |
417 | 更想贴近重复主段/副歌 hook | `repeated_section_aware` |
415 | 既要音乐感知,又要保留泛化 | `hybrid` | 418 | 既要音乐感知,又要保留泛化 | `hybrid` |
416 419
417 --- 420 ---
......