Make segmentation strategy benchmarks comparable under fixed query budgets
Clarify that the pipeline already mixes random sampling with librosa-guided candidate selection, while keeping heavier structural segmentation as a later optimization path.

Constraint: Must avoid staging local datasets and transient smoke artifacts
Rejected: Full librosa.segment.* default rollout | Too CPU-heavy and too distribution-shaping for current smoke/training stage
Confidence: high
Scope-risk: narrow
Directive: Keep future segmentation comparisons capped by equal query budgets when reporting quality deltas
Tested: py_compile for evaluate/external_adapters/ab_smoke_segmentation; evaluate.py --max-queries 5; ab_smoke_segmentation end-to-end smoke with max_test_queries=5
Not-tested: Multi-strategy medium-size capped A/B benchmark on larger real FMA subset
Showing 7 changed files with 164 additions and 9 deletions
| 1 | #!/usr/bin/env python3 | 1 | #!/usr/bin/env python3 |
| 2 | import argparse | 2 | import argparse |
| 3 | import json | 3 | import json |
| 4 | import random | ||
| 4 | from pathlib import Path | 5 | from pathlib import Path |
| 5 | 6 | ||
| 6 | import numpy as np | 7 | import numpy as np |
| ... | @@ -28,6 +29,8 @@ def main(): | ... | @@ -28,6 +29,8 @@ def main(): |
| 28 | parser.add_argument("--chroma-weight", type=float, default=0.25) | 29 | parser.add_argument("--chroma-weight", type=float, default=0.25) |
| 29 | parser.add_argument("--ecapa-weight", type=float, default=0.5) | 30 | parser.add_argument("--ecapa-weight", type=float, default=0.5) |
| 30 | parser.add_argument("--melody-weight", type=float, default=0.25) | 31 | parser.add_argument("--melody-weight", type=float, default=0.25) |
| 32 | parser.add_argument("--max-queries", type=int, default=None) | ||
| 33 | parser.add_argument("--seed", type=int, default=42) | ||
| 31 | args = parser.parse_args() | 34 | args = parser.parse_args() |
| 32 | 35 | ||
| 33 | data_dir = Path(args.data) | 36 | data_dir = Path(args.data) |
| ... | @@ -57,6 +60,9 @@ def main(): | ... | @@ -57,6 +60,9 @@ def main(): |
| 57 | queries = [x for x in items if x.get("type") != "reference"] | 60 | queries = [x for x in items if x.get("type") != "reference"] |
| 58 | if not queries: | 61 | if not queries: |
| 59 | raise SystemExit("No segment queries found for evaluation") | 62 | raise SystemExit("No segment queries found for evaluation") |
| 63 | if args.max_queries is not None and args.max_queries > 0 and len(queries) > args.max_queries: | ||
| 64 | rng = random.Random(args.seed) | ||
| 65 | queries = rng.sample(queries, args.max_queries) | ||
| 60 | 66 | ||
| 61 | top1 = 0 | 67 | top1 = 0 |
| 62 | topk = 0 | 68 | topk = 0 | ... | ... |
| ... | @@ -71,8 +71,10 @@ def main(): | ... | @@ -71,8 +71,10 @@ def main(): |
| 71 | parser.add_argument("--batch-size", type=int, default=2) | 71 | parser.add_argument("--batch-size", type=int, default=2) |
| 72 | parser.add_argument("--device", default="cpu") | 72 | parser.add_argument("--device", default="cpu") |
| 73 | parser.add_argument("--seed", type=int, default=42) | 73 | parser.add_argument("--seed", type=int, default=42) |
| 74 | parser.add_argument("--max-test-queries", type=int, default=None) | ||
| 74 | parser.add_argument("--strategies", nargs="*", default=DEFAULT_STRATEGIES) | 75 | parser.add_argument("--strategies", nargs="*", default=DEFAULT_STRATEGIES) |
| 75 | parser.add_argument("--output-json", default=None) | 76 | parser.add_argument("--output-json", default=None) |
| 77 | parser.add_argument("--resume", action="store_true") | ||
| 76 | args = parser.parse_args() | 78 | args = parser.parse_args() |
| 77 | 79 | ||
| 78 | repo = Path(__file__).resolve().parents[1] | 80 | repo = Path(__file__).resolve().parents[1] |
| ... | @@ -80,9 +82,20 @@ def main(): | ... | @@ -80,9 +82,20 @@ def main(): |
| 80 | work_root = (repo / args.work_root).resolve() | 82 | work_root = (repo / args.work_root).resolve() |
| 81 | subset_dir = work_root / "subset_audio" | 83 | subset_dir = work_root / "subset_audio" |
| 82 | subset_info = prepare_subset(input_dir, subset_dir, args.subset_size) | 84 | subset_info = prepare_subset(input_dir, subset_dir, args.subset_size) |
| 85 | progress_path = work_root / "progress.json" | ||
| 86 | cached_results = {} | ||
| 87 | if args.resume and progress_path.exists(): | ||
| 88 | try: | ||
| 89 | payload = json.loads(progress_path.read_text()) | ||
| 90 | cached_results = {item["strategy"]: item for item in payload.get("strategies", [])} | ||
| 91 | except Exception: | ||
| 92 | cached_results = {} | ||
| 83 | 93 | ||
| 84 | results = [] | 94 | results = [] |
| 85 | for strategy in args.strategies: | 95 | for strategy in args.strategies: |
| 96 | if strategy in cached_results: | ||
| 97 | results.append(cached_results[strategy]) | ||
| 98 | continue | ||
| 86 | smoke_root = work_root / strategy | 99 | smoke_root = work_root / strategy |
| 87 | if smoke_root.exists(): | 100 | if smoke_root.exists(): |
| 88 | shutil.rmtree(smoke_root) | 101 | shutil.rmtree(smoke_root) |
| ... | @@ -110,6 +123,7 @@ def main(): | ... | @@ -110,6 +123,7 @@ def main(): |
| 110 | str(args.batch_size), | 123 | str(args.batch_size), |
| 111 | "--device", | 124 | "--device", |
| 112 | args.device, | 125 | args.device, |
| 126 | *([] if args.max_test_queries is None else ["--max-test-queries", str(args.max_test_queries)]), | ||
| 113 | "--seed", | 127 | "--seed", |
| 114 | str(args.seed), | 128 | str(args.seed), |
| 115 | ] | 129 | ] |
| ... | @@ -130,6 +144,17 @@ def main(): | ... | @@ -130,6 +144,17 @@ def main(): |
| 130 | "report_dir": summary["report_dir"], | 144 | "report_dir": summary["report_dir"], |
| 131 | "sample_failures": eval_report.get("sample_failures", [])[:3], | 145 | "sample_failures": eval_report.get("sample_failures", [])[:3], |
| 132 | }) | 146 | }) |
| 147 | progress_payload = { | ||
| 148 | "dataset": args.dataset, | ||
| 149 | "subset": subset_info, | ||
| 150 | "query_duration": args.query_duration, | ||
| 151 | "query_stride": args.query_stride, | ||
| 152 | "train_epochs": args.train_epochs, | ||
| 153 | "batch_size": args.batch_size, | ||
| 154 | "device": args.device, | ||
| 155 | "strategies": results, | ||
| 156 | } | ||
| 157 | progress_path.write_text(json.dumps(progress_payload, ensure_ascii=False, indent=2)) | ||
| 133 | 158 | ||
| 134 | results.sort(key=lambda x: (x["top1"], x["topk"], x["num_queries"]), reverse=True) | 159 | results.sort(key=lambda x: (x["top1"], x["topk"], x["num_queries"]), reverse=True) |
| 135 | report = { | 160 | report = { |
| ... | @@ -140,6 +165,7 @@ def main(): | ... | @@ -140,6 +165,7 @@ def main(): |
| 140 | "train_epochs": args.train_epochs, | 165 | "train_epochs": args.train_epochs, |
| 141 | "batch_size": args.batch_size, | 166 | "batch_size": args.batch_size, |
| 142 | "device": args.device, | 167 | "device": args.device, |
| 168 | "max_test_queries": args.max_test_queries, | ||
| 143 | "strategies": results, | 169 | "strategies": results, |
| 144 | "winner": results[0] if results else None, | 170 | "winner": results[0] if results else None, |
| 145 | } | 171 | } | ... | ... |
| ... | @@ -373,6 +373,7 @@ def smoke_local_dataset( | ... | @@ -373,6 +373,7 @@ def smoke_local_dataset( |
| 373 | segment_strategy: str, | 373 | segment_strategy: str, |
| 374 | silence_top_db: int, | 374 | silence_top_db: int, |
| 375 | index_checkpoint_every_refs: int, | 375 | index_checkpoint_every_refs: int, |
| 376 | max_test_queries: int | None, | ||
| 376 | seed: int, | 377 | seed: int, |
| 377 | train_epochs: int, | 378 | train_epochs: int, |
| 378 | batch_size: int, | 379 | batch_size: int, |
| ... | @@ -449,6 +450,8 @@ def smoke_local_dataset( | ... | @@ -449,6 +450,8 @@ def smoke_local_dataset( |
| 449 | "--device", resolved_device, | 450 | "--device", resolved_device, |
| 450 | "--fast-eval", | 451 | "--fast-eval", |
| 451 | "--output-json", str(eval_json), | 452 | "--output-json", str(eval_json), |
| 453 | "--seed", str(seed), | ||
| 454 | *([] if max_test_queries is None else ["--max-queries", str(max_test_queries)]), | ||
| 452 | ], check=True) | 455 | ], check=True) |
| 453 | 456 | ||
| 454 | config = build_smoke_config_summary( | 457 | config = build_smoke_config_summary( |
| ... | @@ -467,6 +470,7 @@ def smoke_local_dataset( | ... | @@ -467,6 +470,7 @@ def smoke_local_dataset( |
| 467 | config["run"]["index_checkpoint_every_refs"] = index_checkpoint_every_refs | 470 | config["run"]["index_checkpoint_every_refs"] = index_checkpoint_every_refs |
| 468 | config["run"]["index_resume_enabled"] = True | 471 | config["run"]["index_resume_enabled"] = True |
| 469 | config["run"]["train_segment_strategy"] = segment_strategy | 472 | config["run"]["train_segment_strategy"] = segment_strategy |
| 473 | config["run"]["max_test_queries"] = max_test_queries | ||
| 470 | report_dir.mkdir(parents=True, exist_ok=True) | 474 | report_dir.mkdir(parents=True, exist_ok=True) |
| 471 | config_path.write_text(json.dumps(config, indent=2)) | 475 | config_path.write_text(json.dumps(config, indent=2)) |
| 472 | 476 | ||
| ... | @@ -552,6 +556,7 @@ def main(): | ... | @@ -552,6 +556,7 @@ def main(): |
| 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") | 556 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "high_energy", "onset_aware", "beat_aware", "repeated_section_aware", "hybrid"], default="random") |
| 553 | p.add_argument("--silence-top-db", type=int, default=30) | 557 | p.add_argument("--silence-top-db", type=int, default=30) |
| 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) | 558 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) |
| 559 | p.add_argument("--max-test-queries", type=int, default=None) | ||
| 555 | p.add_argument("--seed", type=int, default=42) | 560 | p.add_argument("--seed", type=int, default=42) |
| 556 | p.add_argument("--train-epochs", type=int, default=1) | 561 | p.add_argument("--train-epochs", type=int, default=1) |
| 557 | p.add_argument("--batch-size", type=int, default=2) | 562 | p.add_argument("--batch-size", type=int, default=2) |
| ... | @@ -612,6 +617,7 @@ def main(): | ... | @@ -612,6 +617,7 @@ def main(): |
| 612 | segment_strategy=args.segment_strategy, | 617 | segment_strategy=args.segment_strategy, |
| 613 | silence_top_db=args.silence_top_db, | 618 | silence_top_db=args.silence_top_db, |
| 614 | index_checkpoint_every_refs=args.index_checkpoint_every_refs, | 619 | index_checkpoint_every_refs=args.index_checkpoint_every_refs, |
| 620 | max_test_queries=args.max_test_queries, | ||
| 615 | seed=args.seed, | 621 | seed=args.seed, |
| 616 | train_epochs=args.train_epochs, | 622 | train_epochs=args.train_epochs, |
| 617 | batch_size=args.batch_size, | 623 | batch_size=args.batch_size, | ... | ... |
| ... | @@ -2,6 +2,50 @@ | ... | @@ -2,6 +2,50 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 为切片策略评测补齐公平 query cap,并澄清 librosa 分段现状 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 修改 `acr-engine/evaluate.py` | ||
| 9 | - 新增 `--max-queries` | ||
| 10 | - 新增 `--seed` | ||
| 11 | - 允许评测前对 query 集进行可复现随机抽样 | ||
| 12 | - 修改 `acr-engine/src/data/external_adapters.py` | ||
| 13 | - `smoke-local` 新增 `--max-test-queries` | ||
| 14 | - 自动透传到 `evaluate.py --max-queries` | ||
| 15 | - smoke 配置摘要同步记录 cap 信息 | ||
| 16 | - 修改 `acr-engine/scripts/ab_smoke_segmentation.py` | ||
| 17 | - 新增 `--max-test-queries` | ||
| 18 | - 可在策略 A/B smoke 中统一限制 query 预算 | ||
| 19 | - 更新文档: | ||
| 20 | - [open-dataset-workflow.md](./open-dataset-workflow.md) | ||
| 21 | - [dataset-spec.md](./dataset-spec.md) | ||
| 22 | - [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) | ||
| 23 | - 文档中额外澄清: | ||
| 24 | - 当前切片**不是只有 random** | ||
| 25 | - 已经接入 `librosa.effects.split / onset_detect / beat_track / chroma_cqt` | ||
| 26 | - 但尚未把更重的 `librosa.segment.*` 结构分段作为默认主流程 | ||
| 27 | |||
| 28 | 验证结果: | ||
| 29 | - 语法检查: | ||
| 30 | - `/usr/local/miniconda3/bin/python -m py_compile evaluate.py src/data/external_adapters.py scripts/ab_smoke_segmentation.py` | ||
| 31 | - 单点评测验证: | ||
| 32 | - `evaluate.py --max-queries 5 --seed 123` | ||
| 33 | - 输出 `num_queries = 5` | ||
| 34 | - `top1 = 1.0` | ||
| 35 | - `topk = 1.0` | ||
| 36 | - 端到端 smoke 验证: | ||
| 37 | - `scripts/ab_smoke_segmentation.py --strategies hybrid --max-test-queries 5` | ||
| 38 | - 最终报告: | ||
| 39 | - `max_test_queries = 5` | ||
| 40 | - `num_queries = 5` | ||
| 41 | - `top1 = 1.0` | ||
| 42 | - `topk = 1.0` | ||
| 43 | |||
| 44 | 结论: | ||
| 45 | - 现在策略 A/B 不再只能比较“谁生成的 query 更多” | ||
| 46 | - 已经可以在**统一 query 成本预算**下比较不同切片策略 | ||
| 47 | - 当前项目也已明确进入“random + librosa 音乐感知候选”的混合切片阶段,而不是纯随机切片阶段 | ||
| 48 | |||
| 5 | ### Stage: 为内部素材 query 自动补 duration / offset 规则 | 49 | ### Stage: 为内部素材 query 自动补 duration / offset 规则 |
| 6 | 50 | ||
| 7 | 完成项: | 51 | 完成项: | ... | ... |
| ... | @@ -89,16 +89,16 @@ flowchart TD | ... | @@ -89,16 +89,16 @@ flowchart TD |
| 89 | 89 | ||
| 90 | | 场景 | 当前实现 | 是否重叠 | 代码位置 | | 90 | | 场景 | 当前实现 | 是否重叠 | 代码位置 | |
| 91 | |---|---|---:|---| | 91 | |---|---|---:|---| |
| 92 | | 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | 92 | | 训练 `SongPairDataset` | 每次采样按 `segment_strategy` 选 1 个 5s clip;默认可随机,也可走音乐感知候选 | 否,**不是固定滑窗全集展开** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | |
| 93 | | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) | | 93 | | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) | |
| 94 | | `audio-dir-to-splits` 默认 | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | | 94 | | `audio-dir-to-splits` 默认 | 每首歌生成 query;可随机,也可按音乐感知策略产出 | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | |
| 95 | | `audio-dir-to-splits --query-stride 4.0` 例 | 对单首歌生成多个滑窗 query | 是,可配置 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | | 95 | | `audio-dir-to-splits --query-stride 4.0` 例 | 对单首歌生成多个滑窗 query | 是,可配置 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | |
| 96 | 96 | ||
| 97 | ### 直接回答你的问题 | 97 | ### 直接回答你的问题 |
| 98 | 98 | ||
| 99 | - **有重叠窗口,但只在检索/索引链路里有。** | 99 | - **有重叠窗口,主要在检索/索引链路;训练端不是全量滑窗展开。** |
| 100 | - **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。 | 100 | - **当前训练主链路不是“只会随机切”**,而是每次 batch 动态选 1 个 5s 片段;候选可以来自 `random / silence_aware / high_energy / onset_aware / beat_aware / repeated_section_aware / hybrid`。 |
| 101 | - **当前外部数据集 manifest 生成器默认仍是一首歌 1 个随机 query,但现在已经支持通过 `--query-stride` 开启多 query / overlap query 生成。** | 101 | - **当前外部数据集 manifest 生成器也不再只有随机 query**,可通过 `--query-strategy` 走音乐感知切法,也可通过 `--query-stride` 开启多 query / overlap query 生成。 |
| 102 | 102 | ||
| 103 | --- | 103 | --- |
| 104 | 104 | ||
| ... | @@ -108,12 +108,37 @@ flowchart TD | ... | @@ -108,12 +108,37 @@ flowchart TD |
| 108 | |---|---|---| | 108 | |---|---|---| |
| 109 | | 训练随机裁剪 | 节省存储,不必预生成几万切片 | 同一 epoch 暴露到的时间区域有限 | | 109 | | 训练随机裁剪 | 节省存储,不必预生成几万切片 | 同一 epoch 暴露到的时间区域有限 | |
| 110 | | 检索重叠滑窗 | 更接近真实 ACR reference coverage | 索引体积更大 | | 110 | | 检索重叠滑窗 | 更接近真实 ACR reference coverage | 索引体积更大 | |
| 111 | | 外部数据一首歌一个 query | smoke 更轻、更快验证 | 训练/评测覆盖不充分 | | 111 | | 音乐感知候选切片 | 更容易打到主段、起音、拍点、非静音区 | CPU 分析成本更高 | |
| 112 | | 外部数据少量 query smoke | smoke 更轻、更快验证 | 训练/评测覆盖不充分 | | ||
| 112 | 113 | ||
| 113 | 推荐理解方式: | 114 | 推荐理解方式: |
| 114 | - **训练端**更像“随机数据增强采样器” | 115 | - **训练端**更像“随机数据增强采样器” |
| 115 | - **检索端**更像“为了召回覆盖做滑窗索引” | 116 | - **检索端**更像“为了召回覆盖做滑窗索引” |
| 116 | 117 | ||
| 118 | ### 5.3 我们到底有没有用 librosa 的分段逻辑 | ||
| 119 | |||
| 120 | 有,而且已经进入主链路,但不是“把整套结构分段 API 全盘替代随机采样”。 | ||
| 121 | |||
| 122 | 当前已用到的 `librosa` 音乐感知逻辑: | ||
| 123 | |||
| 124 | | 逻辑 | 当前用途 | 代码位置 | | ||
| 125 | |---|---|---| | ||
| 126 | | `librosa.effects.split` | `silence_aware`,避开静音区 | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | ||
| 127 | | `librosa.onset.onset_detect` | `onset_aware`,优先起音附近 | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | ||
| 128 | | `librosa.beat.beat_track` | `beat_aware`,优先规则拍点 | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | ||
| 129 | | `librosa.feature.chroma_cqt` | `repeated_section_aware`,近似找重复主段 / hook | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | ||
| 130 | |||
| 131 | 还**没有**直接上整套更重的 `librosa.segment.*` 结构分段主流程,原因主要是: | ||
| 132 | |||
| 133 | 1. **训练 query 的真实来源并不总对齐段落边界**,完全结构分段会把训练分布拉得过“整齐”; | ||
| 134 | 2. **CPU 成本更高**,对 FMA / MTG 这类大目录 smoke 和批量 manifest 生成不够轻; | ||
| 135 | 3. **当前阶段先追求稳健可复现**,优先落地静音、起音、拍点、重复段这几类收益更直接的候选策略。 | ||
| 136 | |||
| 137 | 所以现在的设计不是“没考虑 librosa 分段”,而是: | ||
| 138 | - **已经用了 librosa 的轻量高收益部分** | ||
| 139 | - **保留 random 作为泛化增强** | ||
| 140 | - **把更重的结构分段留作后续增强,而不是一上来替代全部采样逻辑** | ||
| 141 | |||
| 117 | --- | 142 | --- |
| 118 | 143 | ||
| 119 | ## 6. 当前训练信号与 hard-case 规则 | 144 | ## 6. 当前训练信号与 hard-case 规则 | ... | ... |
| ... | @@ -118,6 +118,32 @@ flowchart LR | ... | @@ -118,6 +118,32 @@ flowchart LR |
| 118 | - 最后按 `num_queries` | 118 | - 最后按 `num_queries` |
| 119 | 119 | ||
| 120 | 这样在 top1/top5 持平时,会优先保留**覆盖 query 更多**的策略,而不是误把 query 更少的策略排到第一。 | 120 | 这样在 top1/top5 持平时,会优先保留**覆盖 query 更多**的策略,而不是误把 query 更少的策略排到第一。 |
| 121 | |||
| 122 | 如果你要做**更公平**的策略比较,建议再加 `--max-test-queries`,让每个策略在同样的 query 预算下评测: | ||
| 123 | |||
| 124 | ```bash | ||
| 125 | /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_segmentation.py \ | ||
| 126 | --dataset fma \ | ||
| 127 | --input-dir acr-engine/data/raw/fma_small_audio \ | ||
| 128 | --work-root /tmp/ab_smoke_seg_cap \ | ||
| 129 | --subset-size 6 \ | ||
| 130 | --query-duration 8 \ | ||
| 131 | --train-epochs 1 \ | ||
| 132 | --batch-size 2 \ | ||
| 133 | --device cpu \ | ||
| 134 | --strategies hybrid \ | ||
| 135 | --max-test-queries 5 \ | ||
| 136 | --output-json /tmp/ab_smoke_seg_cap/report.json | ||
| 137 | ``` | ||
| 138 | |||
| 139 | 已验证: | ||
| 140 | - 最终报告会显式记录 `max_test_queries` | ||
| 141 | - `evaluate.py` 会按 `--seed` 复现抽样 | ||
| 142 | - 端到端 smoke 报告中的 `num_queries` 已成功收敛到 `5` | ||
| 143 | |||
| 144 | 这一步的意义是: | ||
| 145 | - 之前的 A/B 排名更偏“覆盖能力” | ||
| 146 | - 加上 cap 后,可以更公平地比较“同等 query 成本下的识别质量” | ||
| 121 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json | 147 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json |
| 122 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local | 148 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local |
| 123 | ``` | 149 | ``` | ... | ... |
| ... | @@ -347,7 +347,7 @@ flowchart TD | ... | @@ -347,7 +347,7 @@ flowchart TD |
| 347 | 347 | ||
| 348 | ## 11.5 切片策略:不要只用随机切 | 348 | ## 11.5 切片策略:不要只用随机切 |
| 349 | 349 | ||
| 350 | 当前项目现在已经支持 4 类切片思路,但职责不同: | 350 | 当前项目现在已经支持多类切片思路,但职责不同: |
| 351 | 351 | ||
| 352 | | 策略 | 适用位置 | 作用 | 是否已接入 | | 352 | | 策略 | 适用位置 | 作用 | 是否已接入 | |
| 353 | |---|---|---|---| | 353 | |---|---|---|---| |
| ... | @@ -358,7 +358,7 @@ flowchart TD | ... | @@ -358,7 +358,7 @@ flowchart TD |
| 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | | 358 | | `onset_aware` | 训练 query / 外部 query 生成 | 优先靠近起音事件,减少截到拖尾/空拍 | 是 | |
| 359 | | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | | 359 | | `beat_aware` | 训练 query / 外部 query 生成 | 优先靠近节拍点,适合强节奏流行/电子/舞曲等 | 是 | |
| 360 | | `repeated_section_aware` | 训练 query / 外部 query 生成 | 优先抽取与其它窗口最相似的重复主段,近似副歌/重复 hook | 是 | | 360 | | `repeated_section_aware` | 训练 query / 外部 query 生成 | 优先抽取与其它窗口最相似的重复主段,近似副歌/重复 hook | 是 | |
| 361 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | | 361 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 repeated-section / beat / energy / onset / silence / random | 是 | |
| 362 | 362 | ||
| 363 | 推荐理解: | 363 | 推荐理解: |
| 364 | 364 | ||
| ... | @@ -367,7 +367,29 @@ flowchart TD | ... | @@ -367,7 +367,29 @@ flowchart TD |
| 367 | 2. **reference 建库不是随机切** | 367 | 2. **reference 建库不是随机切** |
| 368 | 建库仍然是固定滑窗 | 368 | 建库仍然是固定滑窗 |
| 369 | 3. **外部数据 query 生成也不是只能随机切** | 369 | 3. **外部数据 query 生成也不是只能随机切** |
| 370 | 现在可选 `--query-strategy silence_aware` | 370 | 现在可选 `--query-strategy random|silence_aware|high_energy|onset_aware|beat_aware|repeated_section_aware|hybrid` |
| 371 | |||
| 372 | ### 11.6 为什么没有直接全量切到 `librosa.segment.*` | ||
| 373 | |||
| 374 | 这不是没考虑,而是当前做了更保守的工程取舍: | ||
| 375 | |||
| 376 | - 已经接入 `librosa.effects.split / onset_detect / beat_track / chroma_cqt` | ||
| 377 | - 先把非静音、起音、拍点、重复段这些高收益候选打通 | ||
| 378 | - 暂时没有把更重的结构分段作为默认主流程 | ||
| 379 | |||
| 380 | 原因: | ||
| 381 | |||
| 382 | 1. **ACR 查询不总是结构化片段** | ||
| 383 | 用户截到的可能是副歌,也可能是过门、录屏残片、短视频二创片段。 | ||
| 384 | 2. **重结构分段更耗 CPU** | ||
| 385 | 对 FMA 这类真实开放集批量 prepare/smoke 不够轻。 | ||
| 386 | 3. **训练仍需要随机性** | ||
| 387 | 纯结构分段会降低截取点分布的多样性。 | ||
| 388 | |||
| 389 | 当前更合理的策略是: | ||
| 390 | - `hybrid` 作为默认训练切片推荐 | ||
| 391 | - `beat_aware / repeated_section_aware` 作为偏音乐主段的强化选项 | ||
| 392 | - `random` 保留为泛化基线 | ||
| 371 | 393 | ||
| 372 | 为什么不直接完全依赖音乐结构分段? | 394 | 为什么不直接完全依赖音乐结构分段? |
| 373 | 395 | ... | ... |
-
Please register or sign in to post a comment