Benchmark segmentation strategies on a real FMA mini-smoke set
Constraint: Strategy comparisons need real-audio evidence, but the benchmark must stay cheap enough to run repeatedly on CPU during active development.
Rejected: Judge winners only by top1/topk on a tiny subset | ties hide the practical value of strategies that generate far more usable queries.
Confidence: medium
Scope-risk: narrow
Directive: Keep num_queries as a tie-breaker for tiny-smoke comparisons; increase subset size before promoting benchmark winners to default training policy.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_segmentation.py --dataset fma --input-dir acr-engine/data/raw/fma_small_audio --work-root /tmp/ab_smoke_seg --subset-size 8 --query-duration 8 --train-epochs 1 --batch-size 2 --device cpu --output-json /tmp/ab_smoke_seg/report.json; post-run ranking verification from /tmp/ab_smoke_seg/report.json
Not-tested: Larger FMA subsets or difficult internal query mixes in the same benchmark script.
Showing
3 changed files
with
234 additions
and
0 deletions
acr-engine/scripts/ab_smoke_segmentation.py
0 → 100644
| 1 | #!/usr/bin/env python3 | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | import shutil | ||
| 7 | import subprocess | ||
| 8 | from pathlib import Path | ||
| 9 | |||
| 10 | |||
# Interpreter used to launch the smoke sub-command for every strategy.
# NOTE(review): hard-coded absolute path; confirm it exists on the target host.
PYTHON = "/usr/local/miniconda3/bin/python"
# Strategy names benchmarked when --strategies is not given on the command line.
DEFAULT_STRATEGIES = [
    "random",
    "silence_aware",
    "high_energy",
    "beat_aware",
    "repeated_section_aware",
    "hybrid",
]
| 20 | |||
| 21 | |||
def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* inside *cwd* and return its captured standard output.

    The output is decoded as text. A non-zero exit status propagates as
    ``subprocess.CalledProcessError``.
    """
    working_dir = str(cwd)
    captured = subprocess.check_output(cmd, cwd=working_dir, text=True)
    return captured
| 24 | |||
| 25 | |||
def parse_last_json(text: str) -> dict:
    """Return the last top-level JSON object embedded in *text*.

    The text is scanned left to right. Every ``{`` that starts a valid JSON
    object is decoded and the scan resumes after that object, so nested
    objects are never mistaken for the result. Unlike a plain
    ``json.loads(text[start:])``, this tolerates arbitrary non-JSON output
    (log lines, progress messages) both before and after the object.

    Args:
        text: Raw command output that contains at least one JSON object.

    Returns:
        The decoded object that appears last in *text*.

    Raises:
        ValueError: If *text* contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    last_object = None
    position = text.find("{")
    while position != -1:
        try:
            candidate, end = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            # Not a JSON object here (e.g. a brace in a log line); try the next one.
            position = text.find("{", position + 1)
            continue
        last_object = candidate
        # Skip past the decoded object so its inner braces are not re-scanned.
        position = text.find("{", end)
    if last_object is None:
        raise ValueError("No JSON object found in command output")
    return last_object
| 35 | |||
| 36 | |||
def prepare_subset(src_dir: Path, subset_dir: Path, limit: int) -> dict:
    """Mirror the first *limit* ``.mp3`` files of *src_dir* into *subset_dir*.

    Files are chosen in sorted path order and copied with their relative
    directory layout preserved. Existing copies are reused. ``.mp3`` files
    left in *subset_dir* by an earlier run that are not part of the current
    selection are removed, so the directory holds exactly the reported subset.

    Args:
        src_dir: Directory searched recursively for ``*.mp3`` files.
        subset_dir: Destination directory; created if missing.
        limit: Maximum number of files to select. Values below zero are
            treated as zero.

    Returns:
        A dict with ``source_dir``, ``subset_dir``, ``num_files`` and up to
        five ``sample_files`` (destination paths as strings).
    """
    # A negative limit would slice from the end of the list; clamp it instead.
    selected = sorted(src_dir.rglob("*.mp3"))[: max(limit, 0)]
    subset_dir.mkdir(parents=True, exist_ok=True)
    targets = [subset_dir / src.relative_to(src_dir) for src in selected]

    # Drop stale copies so a smaller rerun does not silently keep old files.
    # Skipped when the two trees overlap, where "stale" files may be sources.
    src_root = src_dir.resolve()
    subset_root = subset_dir.resolve()
    overlapping = (
        src_root == subset_root
        or src_root in subset_root.parents
        or subset_root in src_root.parents
    )
    if not overlapping:
        expected = set(targets)
        for existing in list(subset_dir.rglob("*.mp3")):
            if existing not in expected and existing.is_file():
                existing.unlink()

    for src, dst in zip(selected, targets):
        dst.parent.mkdir(parents=True, exist_ok=True)
        if not dst.exists():
            shutil.copy2(src, dst)

    copied = [str(dst) for dst in targets]
    return {
        "source_dir": str(src_dir),
        "subset_dir": str(subset_dir),
        "num_files": len(copied),
        "sample_files": copied[:5],
    }
| 54 | |||
| 55 | |||
def train_strategy_for_query(strategy: str) -> str:
    """Return the training segment strategy paired with a query strategy.

    ``"sliding"`` is paired with ``"random"``; any other name is returned
    unchanged.
    """
    return "random" if strategy == "sliding" else strategy
| 60 | |||
| 61 | |||
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the segmentation A/B smoke benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="fma")
    parser.add_argument("--input-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--work-root", default="data/ab_smoke_segmentation")
    parser.add_argument("--subset-size", type=int, default=12)
    parser.add_argument("--query-duration", type=float, default=8.0)
    parser.add_argument("--query-stride", type=float, default=None)
    parser.add_argument("--train-epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--strategies", nargs="*", default=DEFAULT_STRATEGIES)
    parser.add_argument("--output-json", default=None)
    return parser


def _smoke_command(
    args: argparse.Namespace,
    strategy: str,
    train_strategy: str,
    subset_dir: Path,
    smoke_root: Path,
) -> list[str]:
    """Build the ``smoke-local`` command line for one query strategy."""
    cmd = [
        PYTHON,
        "src/data/external_adapters.py",
        "smoke-local",
        args.dataset,
        str(subset_dir),
        "--output-root",
        str(smoke_root),
        "--eval-ratio",
        "0.2",
        "--query-duration",
        str(args.query_duration),
        "--query-strategy",
        strategy,
        "--segment-strategy",
        train_strategy,
        "--train-epochs",
        str(args.train_epochs),
        "--batch-size",
        str(args.batch_size),
        "--device",
        args.device,
        "--seed",
        str(args.seed),
    ]
    # The stride flag is forwarded only when the user supplied one.
    if args.query_stride is not None:
        cmd.extend(["--query-stride", str(args.query_stride)])
    return cmd


def main() -> None:
    """Run the smoke pipeline once per strategy and print a ranked JSON report.

    A fixed-size audio subset is prepared under the work root, then the
    ``smoke-local`` sub-command is run for each requested strategy in a
    fresh per-strategy directory. Results are ranked by ``top1``, then
    ``topk``, then ``num_queries`` (all descending). The report is printed
    to stdout and also written to ``--output-json`` when given.
    """
    args = _build_arg_parser().parse_args()

    # Relative --input-dir / --work-root values are resolved against the
    # directory one level above this script's folder.
    repo = Path(__file__).resolve().parents[1]
    input_dir = (repo / args.input_dir).resolve()
    work_root = (repo / args.work_root).resolve()
    subset_dir = work_root / "subset_audio"
    subset_info = prepare_subset(input_dir, subset_dir, args.subset_size)

    results = []
    for strategy in args.strategies:
        # Start every strategy from an empty output directory.
        smoke_root = work_root / strategy
        if smoke_root.exists():
            shutil.rmtree(smoke_root)
        smoke_root.mkdir(parents=True, exist_ok=True)

        train_strategy = train_strategy_for_query(strategy)
        cmd = _smoke_command(args, strategy, train_strategy, subset_dir, smoke_root)
        summary = parse_last_json(run(cmd, cwd=repo))

        eval_json = Path(summary["eval_json"])
        # The child runs with cwd=repo, so a relative path it reports is
        # presumably relative to repo rather than to this process's cwd.
        eval_path = eval_json if eval_json.is_absolute() else repo / eval_json
        eval_report = json.loads(eval_path.read_text(encoding="utf-8"))
        results.append({
            "strategy": strategy,
            "train_segment_strategy": train_strategy,
            "num_queries": eval_report["num_queries"],
            "top1": eval_report["top1"],
            "topk": eval_report["topk"],
            "eval_json": str(eval_json),
            "report_dir": summary["report_dir"],
            "sample_failures": eval_report.get("sample_failures", [])[:3],
        })

    # num_queries is the final tie-breaker so that, at equal accuracy, the
    # strategy that produced more queries ranks first.
    results.sort(key=lambda x: (x["top1"], x["topk"], x["num_queries"]), reverse=True)
    report = {
        "dataset": args.dataset,
        "subset": subset_info,
        "query_duration": args.query_duration,
        "query_stride": args.query_stride,
        "train_epochs": args.train_epochs,
        "batch_size": args.batch_size,
        "device": args.device,
        # Recorded because the seed is forwarded to every sub-command run.
        "seed": args.seed,
        "strategies": results,
        "winner": results[0] if results else None,
    }
    text = json.dumps(report, ensure_ascii=False, indent=2)
    if args.output_json:
        out = Path(args.output_json)
        out.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False may emit non-ASCII text; do not rely on the locale.
        out.write_text(text, encoding="utf-8")
    print(text)
| 152 | |||
| 153 | |||
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| ... | @@ -5675,3 +5675,50 @@ | ... | @@ -5675,3 +5675,50 @@ |
| 5675 | - 下一步可继续做更强的: | 5675 | - 下一步可继续做更强的: |
| 5676 | - chorus-like multi-feature ranking | 5676 | - chorus-like multi-feature ranking |
| 5677 | - 小规模真实数据策略 A/B 对比 | 5677 | - 小规模真实数据策略 A/B 对比 |
| 5678 | |||
| 5679 | ### Stage: real FMA mini-subset segmentation A/B smoke benchmark | ||
| 5680 | |||
| 5681 | 完成项: | ||
| 5682 | - 新增脚本: | ||
| 5683 | - `acr-engine/scripts/ab_smoke_segmentation.py` | ||
| 5684 | - 能力: | ||
| 5685 | - 从本地真实数据目录抽取固定数量子集 | ||
| 5686 | - 依次运行 `smoke-local` | ||
| 5687 | - 自动比较多种切片策略的 smoke 结果 | ||
| 5688 | - 汇总 `top1 / topk / num_queries` | ||
| 5689 | - 修正排序规则: | ||
| 5690 | - 不再只按 `top1/topk` | ||
| 5691 | - 改为 `top1 -> topk -> num_queries` | ||
| 5692 | - 避免在分数持平时把 query 更少的策略误判为 winner | ||
| 5693 | |||
| 5694 | 验证结果: | ||
| 5695 | - 真实数据来源: | ||
| 5696 | - `data/raw/fma_small_audio` | ||
| 5697 | - smoke 子集: | ||
| 5698 | - `8` 首 FMA 音频 | ||
| 5699 | - `query_duration=8` | ||
| 5700 | - `train_epochs=1` | ||
| 5701 | - `batch_size=2` | ||
| 5702 | - 比较策略: | ||
| 5703 | - `random` | ||
| 5704 | - `silence_aware` | ||
| 5705 | - `high_energy` | ||
| 5706 | - `beat_aware` | ||
| 5707 | - `repeated_section_aware` | ||
| 5708 | - `hybrid` | ||
| 5709 | - 报告路径: | ||
| 5710 | - `/tmp/ab_smoke_seg/report.json` | ||
| 5711 | - 排序修正后的结果: | ||
| 5712 | 1. `hybrid`:`num_queries=37`, `top1=1.0`, `topk=1.0` | ||
| 5713 | 2. `beat_aware`:`num_queries=13`, `top1=1.0`, `topk=1.0` | ||
| 5714 | 3. `high_energy`:`num_queries=12`, `top1=1.0`, `topk=1.0` | ||
| 5715 | 4. `repeated_section_aware`:`num_queries=12`, `top1=1.0`, `topk=1.0` | ||
| 5716 | 5. `random`:`num_queries=4`, `top1=1.0`, `topk=1.0` | ||
| 5717 | 6. `silence_aware`:`num_queries=2`, `top1=1.0`, `topk=1.0` | ||
| 5718 | |||
| 5719 | 结论: | ||
| 5720 | - 在这个极小真实子集 smoke 上,所有策略都能达到 `top1/topk = 1.0` | ||
| 5721 | - 但从 **query 覆盖率** 看: | ||
| 5722 | - `hybrid` 当前最优 | ||
| 5723 | - `beat_aware / high_energy / repeated_section_aware` 是更强的次优候选 | ||
| 5724 | - 下一步应扩大真实子集规模,并引入更难的 query 类型,进一步拉开策略差异 | ... | ... |
| ... | @@ -86,6 +86,38 @@ flowchart LR | ... | @@ -86,6 +86,38 @@ flowchart LR |
| 86 | - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume` | 86 | - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume` |
| 87 | - checkpoint 会记录 `model_signature` | 87 | - checkpoint 会记录 `model_signature` |
| 88 | - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 不是同一个模型,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding | 88 | - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 不是同一个模型,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding |
| 89 | |||
| 90 | ## 小规模策略 A/B smoke | ||
| 91 | |||
| 92 | 如果你想快速比较不同 query / training 切片策略,可直接运行: | ||
| 93 | |||
| 94 | ```bash | ||
| 95 | /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_segmentation.py \ | ||
| 96 | --dataset fma \ | ||
| 97 | --input-dir acr-engine/data/raw/fma_small_audio \ | ||
| 98 | --work-root /tmp/ab_smoke_seg \ | ||
| 99 | --subset-size 8 \ | ||
| 100 | --query-duration 8 \ | ||
| 101 | --train-epochs 1 \ | ||
| 102 | --batch-size 2 \ | ||
| 103 | --device cpu \ | ||
| 104 | --output-json /tmp/ab_smoke_seg/report.json | ||
| 105 | ``` | ||
| 106 | |||
| 107 | 当前脚本会比较: | ||
| 108 | - `random` | ||
| 109 | - `silence_aware` | ||
| 110 | - `high_energy` | ||
| 111 | - `beat_aware` | ||
| 112 | - `repeated_section_aware` | ||
| 113 | - `hybrid` | ||
| 114 | |||
| 115 | 排序规则: | ||
| 116 | - 先按 `top1` | ||
| 117 | - 再按 `topk` | ||
| 118 | - 最后按 `num_queries` | ||
| 119 | |||
| 120 | 这样在 `top1/topk` 持平时,会优先保留**覆盖 query 更多**的策略,而不是误把 query 更少的策略排到第一。 | ||
| 89 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json | 121 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json |
| 90 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local | 122 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local |
| 91 | ``` | 123 | ``` | ... | ... |
-
Please register or sign in to post a comment