Commit f04a314e f04a314e7e9cb2f11def143390a13ea273036667 by cnb.bofCdSsphPA

Benchmark segmentation strategies on a real FMA mini-smoke set

Constraint: Strategy comparisons need real-audio evidence, but the benchmark must stay cheap enough to run repeatedly on CPU during active development
Rejected: Judge winners only by top1/topk on a tiny subset | ties hide the practical value of strategies that generate far more usable queries
Confidence: medium
Scope-risk: narrow
Directive: Keep num_queries as a tie-breaker for tiny-smoke comparisons; increase subset size before promoting benchmark winners to default training policy
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_segmentation.py --dataset fma --input-dir acr-engine/data/raw/fma_small_audio --work-root /tmp/ab_smoke_seg --subset-size 8 --query-duration 8 --train-epochs 1 --batch-size 2 --device cpu --output-json /tmp/ab_smoke_seg/report.json; post-run ranking verification from /tmp/ab_smoke_seg/report.json
Not-tested: Larger FMA subsets or difficult internal query mixes in the same benchmark script
1 parent 8ed3e34e
1 #!/usr/bin/env python3
2 from __future__ import annotations
3
import argparse
import json
import shutil
import subprocess
import sys
from pathlib import Path
9
10
# Interpreter used to launch the smoke subprocesses. The project-standard
# miniconda interpreter is preferred when it is installed; otherwise fall back
# to the interpreter running this script so the benchmark still works on hosts
# without that exact install location.
_CONDA_PYTHON = "/usr/local/miniconda3/bin/python"
PYTHON = (
    _CONDA_PYTHON
    if Path(_CONDA_PYTHON).is_file() or not sys.executable
    else sys.executable
)

# Strategies compared when --strategies is not given on the command line.
DEFAULT_STRATEGIES = [
    "random",
    "silence_aware",
    "high_energy",
    "beat_aware",
    "repeated_section_aware",
    "hybrid",
]
20
21
def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* with *cwd* as working directory and return its stdout.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout
24
25
def parse_last_json(text: str) -> dict:
    """Return the last top-level JSON object embedded in *text*.

    The text is scanned left to right. Every ``{`` that starts a decodable
    JSON object is decoded and the scan resumes after that object, so nested
    objects are never returned on their own. Non-JSON text before, between or
    after the objects (log lines, warnings, a trailing status message) is
    ignored.

    Args:
        text: Raw command output that contains at least one JSON object.

    Returns:
        The decoded object that appears last in *text*.

    Raises:
        ValueError: if *text* contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    last = None
    pos = text.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not the start of a JSON object (e.g. a brace inside a log line);
            # try the next candidate.
            pos = text.find("{", pos + 1)
            continue
        last = obj
        # Resume after the decoded object so its nested objects are skipped.
        pos = text.find("{", end)
    if last is None:
        raise ValueError("No JSON object found in command output")
    return last
35
36
def prepare_subset(src_dir: Path, subset_dir: Path, limit: int) -> dict:
    """Mirror the first *limit* ``.mp3`` files of *src_dir* into *subset_dir*.

    Files are chosen by sorted path order, so the selection is deterministic.
    The relative directory layout is preserved and files that already exist
    in the subset are not copied again. ``.mp3`` files left in *subset_dir*
    by an earlier run (for example one with a larger *limit*) are removed so
    that the directory content matches the reported ``num_files``.

    Args:
        src_dir: Directory searched recursively for ``*.mp3`` files.
        subset_dir: Destination directory; created if missing.
        limit: Maximum number of files to select.

    Returns:
        A dict with ``source_dir``, ``subset_dir``, ``num_files`` and
        ``sample_files`` (at most five destination paths).
    """
    files = sorted(src_dir.rglob("*.mp3"))[:limit]
    subset_dir.mkdir(parents=True, exist_ok=True)

    copied = []
    wanted = set()
    for src in files:
        dst = subset_dir / src.relative_to(src_dir)
        dst.parent.mkdir(parents=True, exist_ok=True)
        if not dst.exists():
            shutil.copy2(src, dst)
        wanted.add(dst)
        copied.append(str(dst))

    # Only prune when the two trees are disjoint; otherwise "stale" files
    # could be original source audio.
    src_root = src_dir.resolve()
    dst_root = subset_dir.resolve()
    overlapping = (
        src_root == dst_root
        or src_root in dst_root.parents
        or dst_root in src_root.parents
    )
    if not overlapping:
        for stale in sorted(subset_dir.rglob("*.mp3")):
            if stale not in wanted:
                stale.unlink()

    return {
        "source_dir": str(src_dir),
        "subset_dir": str(subset_dir),
        "num_files": len(copied),
        "sample_files": copied[:5],
    }
54
55
def train_strategy_for_query(strategy: str) -> str:
    """Map a query strategy name to the training segment strategy to use.

    ``"sliding"`` is mapped to ``"random"``; every other name is returned
    unchanged.
    """
    return "random" if strategy == "sliding" else strategy
60
61
def _smoke_command(
    args: argparse.Namespace,
    strategy: str,
    subset_dir: Path,
    smoke_root: Path,
) -> list[str]:
    """Build the ``smoke-local`` command line for one query strategy."""
    cmd = [
        PYTHON,
        "src/data/external_adapters.py",
        "smoke-local",
        args.dataset,
        str(subset_dir),
        "--output-root",
        str(smoke_root),
        "--eval-ratio",
        "0.2",
        "--query-duration",
        str(args.query_duration),
        "--query-strategy",
        strategy,
        "--segment-strategy",
        train_strategy_for_query(strategy),
        "--train-epochs",
        str(args.train_epochs),
        "--batch-size",
        str(args.batch_size),
        "--device",
        args.device,
        "--seed",
        str(args.seed),
    ]
    if args.query_stride is not None:
        cmd.extend(["--query-stride", str(args.query_stride)])
    return cmd


def main() -> None:
    """Run the segmentation-strategy A/B smoke benchmark and print a report.

    For every requested strategy a fresh ``<work-root>/<strategy>`` directory
    is created and ``smoke-local`` is run on a shared audio subset. The last
    JSON object printed by that command must contain ``eval_json`` and
    ``report_dir``; the evaluation report it points to must contain
    ``num_queries``, ``top1`` and ``topk``. Results are ranked by
    ``top1``, then ``topk``, then ``num_queries`` (all descending) and the
    combined report is printed to stdout and optionally written to
    ``--output-json``.

    Relative ``--input-dir`` and ``--work-root`` values are resolved against
    the repository root (the parent of this script's directory).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="fma")
    parser.add_argument("--input-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--work-root", default="data/ab_smoke_segmentation")
    parser.add_argument("--subset-size", type=int, default=12)
    parser.add_argument("--query-duration", type=float, default=8.0)
    parser.add_argument("--query-stride", type=float, default=None)
    parser.add_argument("--train-epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--strategies", nargs="*", default=DEFAULT_STRATEGIES)
    parser.add_argument("--output-json", default=None)
    args = parser.parse_args()

    repo = Path(__file__).resolve().parents[1]
    input_dir = (repo / args.input_dir).resolve()
    work_root = (repo / args.work_root).resolve()
    subset_dir = work_root / "subset_audio"
    subset_info = prepare_subset(input_dir, subset_dir, args.subset_size)
    if subset_info["num_files"] == 0:
        # Fail before launching any training run: without audio every
        # strategy run would be meaningless.
        parser.error(
            f"no .mp3 files selected from {input_dir} "
            f"(subset size {args.subset_size}); relative --input-dir values "
            f"are resolved against {repo}"
        )

    results = []
    for strategy in args.strategies:
        # Start every strategy from an empty output directory.
        smoke_root = work_root / strategy
        if smoke_root.exists():
            shutil.rmtree(smoke_root)
        smoke_root.mkdir(parents=True, exist_ok=True)

        output = run(_smoke_command(args, strategy, subset_dir, smoke_root), cwd=repo)
        summary = parse_last_json(output)

        eval_json = Path(summary["eval_json"])
        # The subprocess runs with cwd=repo, so a relative path it reports is
        # relative to the repo root, not to this process's working directory.
        eval_json_file = eval_json if eval_json.is_absolute() else repo / eval_json
        eval_report = json.loads(eval_json_file.read_text(encoding="utf-8"))
        results.append({
            "strategy": strategy,
            "train_segment_strategy": train_strategy_for_query(strategy),
            "num_queries": eval_report["num_queries"],
            "top1": eval_report["top1"],
            "topk": eval_report["topk"],
            "eval_json": str(eval_json),
            "report_dir": summary["report_dir"],
            "sample_failures": eval_report.get("sample_failures", [])[:3],
        })

    # num_queries is the final tie-breaker so that, at equal accuracy, the
    # strategy that produced more queries ranks first.
    results.sort(key=lambda x: (x["top1"], x["topk"], x["num_queries"]), reverse=True)
    report = {
        "dataset": args.dataset,
        "subset": subset_info,
        "query_duration": args.query_duration,
        "query_stride": args.query_stride,
        "train_epochs": args.train_epochs,
        "batch_size": args.batch_size,
        "device": args.device,
        "strategies": results,
        "winner": results[0] if results else None,
    }
    text = json.dumps(report, ensure_ascii=False, indent=2)
    if args.output_json:
        out = Path(args.output_json)
        out.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False may emit non-ASCII characters; write UTF-8
        # explicitly instead of relying on the locale encoding.
        out.write_text(text, encoding="utf-8")
    print(text)
152
153
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    raise SystemExit(main())
...@@ -5675,3 +5675,50 @@ ...@@ -5675,3 +5675,50 @@
5675 - 下一步可继续做更强的: 5675 - 下一步可继续做更强的:
5676 - chorus-like multi-feature ranking 5676 - chorus-like multi-feature ranking
5677 - 小规模真实数据策略 A/B 对比 5677 - 小规模真实数据策略 A/B 对比
5678
5679 ### Stage: real FMA mini-subset segmentation A/B smoke benchmark
5680
5681 完成项:
5682 - 新增脚本:
5683 - `acr-engine/scripts/ab_smoke_segmentation.py`
5684 - 能力:
5685 - 从本地真实数据目录抽取固定数量子集
5686 - 依次运行 `smoke-local`
5687 - 自动比较多种切片策略的 smoke 结果
5688 - 汇总 `top1 / topk / num_queries`
5689 - 修正排序规则:
5690 - 不再只按 `top1/topk`
5691 - 改为 `top1 -> topk -> num_queries`
5692 - 避免在分数持平时把 query 更少的策略误判为 winner
5693
5694 验证结果:
5695 - 真实数据来源:
5696 - `data/raw/fma_small_audio`
5697 - smoke 子集:
5698 - `8` 首 FMA 音频
5699 - `query_duration=8`
5700 - `train_epochs=1`
5701 - `batch_size=2`
5702 - 比较策略:
5703 - `random`
5704 - `silence_aware`
5705 - `high_energy`
5706 - `beat_aware`
5707 - `repeated_section_aware`
5708 - `hybrid`
5709 - 报告路径:
5710 - `/tmp/ab_smoke_seg/report.json`
5711 - 排序修正后的结果:
5712 1. `hybrid`: `num_queries=37`, `top1=1.0`, `topk=1.0`
5713 2. `beat_aware`: `num_queries=13`, `top1=1.0`, `topk=1.0`
5714 3. `high_energy`: `num_queries=12`, `top1=1.0`, `topk=1.0`
5715 4. `repeated_section_aware`: `num_queries=12`, `top1=1.0`, `topk=1.0`
5716 5. `random`: `num_queries=4`, `top1=1.0`, `topk=1.0`
5717 6. `silence_aware`: `num_queries=2`, `top1=1.0`, `topk=1.0`
5718
5719 结论:
5720 - 在这个极小真实子集 smoke 上,所有策略都能达到 `top1/top5 = 1.0`
5721 - 但从 **query 覆盖率** 看:
5722 - `hybrid` 当前最优
5723 - `beat_aware / high_energy / repeated_section_aware` 是更强的次优候选
5724 - 下一步应扩大真实子集规模,并引入更难的 query 类型,进一步拉开策略差异
......
...@@ -86,6 +86,38 @@ flowchart LR ...@@ -86,6 +86,38 @@ flowchart LR
86 - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume` 86 - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume`
87 - checkpoint 会记录 `model_signature` 87 - checkpoint 会记录 `model_signature`
88 - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 不是同一个模型,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding 88 - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 不是同一个模型,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding
89
90 ## 小规模策略 A/B smoke
91
92 如果你想快速比较不同 query / training 切片策略,可直接运行:
93
94 ```bash
95 /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_segmentation.py \
96 --dataset fma \
97 --input-dir acr-engine/data/raw/fma_small_audio \
98 --work-root /tmp/ab_smoke_seg \
99 --subset-size 8 \
100 --query-duration 8 \
101 --train-epochs 1 \
102 --batch-size 2 \
103 --device cpu \
104 --output-json /tmp/ab_smoke_seg/report.json
105 ```
106
107 当前脚本会比较:
108 - `random`
109 - `silence_aware`
110 - `high_energy`
111 - `beat_aware`
112 - `repeated_section_aware`
113 - `hybrid`
114
115 排序规则:
116 - 先按 `top1`
117 - 再按 `topk`
118 - 最后按 `num_queries`
119
120 这样在 `top1` / `topk` 持平时,会优先保留**覆盖 query 更多**的策略,而不是误把 query 更少的策略排到第一。
89 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json 121 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json
90 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local 122 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local
91 ``` 123 ```
......