Commit c1a22cbb c1a22cbbe877d9aa8973c6f8e2f1cc513ff00edf by cnb.bofCdSsphPA

Promote bucket benchmarking from a plan to a runnable baseline

Constraint: The cap48/cap64 reversal means strategy guidance can no longer rely on a single overall subset result
Rejected: Keep bucket benchmarking as a doc-only next step | The repo now needs an executable baseline so later sessions can measure scale/style divergence directly
Confidence: high
Scope-risk: moderate
Directive: Treat ab_smoke_bucketed.py as the canonical seed for style-aware evaluation, and expand bucket definitions before revisiting global default-strategy claims
Tested: Verified acr-engine/scripts/ab_smoke_bucketed.py passes py_compile; verified first bucket prefix_000_a produced bucket_report.json with hybrid 4/1.0/1.0 and high_energy 3/1.0/1.0; verified second bucket execution is in progress
Not-tested: Full multi-bucket report.json completion, richer bucket definitions, and bucket-level aggregate conclusions
1 parent e49dc0b9
1 #!/usr/bin/env python3
2 from __future__ import annotations
3
4 import argparse
5 import json
6 import subprocess
7 from pathlib import Path
8 from statistics import mean
9
10 PYTHON = "/usr/local/miniconda3/bin/python"
11
12
13 def run(cmd: list[str], cwd: Path) -> str:
14 return subprocess.check_output(cmd, cwd=str(cwd), text=True)
15
16
17 def collect_files(input_dir: Path, patterns: list[str], limit: int | None = None) -> list[Path]:
18 seen: set[Path] = set()
19 files: list[Path] = []
20 for pattern in patterns:
21 for path in sorted(input_dir.glob(pattern)):
22 if not path.is_file() or path.suffix.lower() != ".mp3":
23 continue
24 resolved = path.resolve()
25 if resolved in seen:
26 continue
27 seen.add(resolved)
28 files.append(resolved)
29 if limit is not None and len(files) >= limit:
30 return files
31 return files
32
33
34 def ensure_bucket_subset(input_dir: Path, bucket_dir: Path, patterns: list[str], limit: int | None) -> dict:
35 bucket_dir.mkdir(parents=True, exist_ok=True)
36 files = collect_files(input_dir, patterns, limit=limit)
37 copied: list[str] = []
38 for src in files:
39 rel = src.relative_to(input_dir)
40 dst = bucket_dir / rel
41 dst.parent.mkdir(parents=True, exist_ok=True)
42 if not dst.exists():
43 dst.write_bytes(src.read_bytes())
44 copied.append(str(dst))
45 return {
46 "num_files": len(copied),
47 "sample_files": copied[:5],
48 }
49
50
51 def main() -> None:
52 parser = argparse.ArgumentParser(description="Run bucket/style-aware segmented smoke benchmarks")
53 parser.add_argument("--dataset", default="fma")
54 parser.add_argument("--input-dir", default="data/raw/fma_small_audio")
55 parser.add_argument("--bucket-config", required=True, help="JSON file with {buckets:[{name,patterns,subset_size?}]} or {bucket_name:[patterns]}")
56 parser.add_argument("--work-root", default="/tmp/ab_smoke_bucketed")
57 parser.add_argument("--query-duration", type=float, default=8.0)
58 parser.add_argument("--query-stride", type=float, default=None)
59 parser.add_argument("--train-epochs", type=int, default=1)
60 parser.add_argument("--batch-size", type=int, default=2)
61 parser.add_argument("--device", default="cpu")
62 parser.add_argument("--seed", type=int, default=42)
63 parser.add_argument("--max-test-queries", type=int, default=None)
64 parser.add_argument("--default-subset-size", type=int, default=16)
65 parser.add_argument("--min-files", type=int, default=2)
66 parser.add_argument("--strategies", nargs="*", default=["high_energy", "hybrid"])
67 parser.add_argument("--output-json", default=None)
68 args = parser.parse_args()
69
70 repo = Path(__file__).resolve().parents[1]
71 input_dir = (repo / args.input_dir).resolve()
72 work_root = Path(args.work_root).resolve()
73 config = json.loads(Path(args.bucket_config).read_text())
74
75 if isinstance(config, dict) and "buckets" in config:
76 bucket_specs = config["buckets"]
77 elif isinstance(config, dict):
78 bucket_specs = [{"name": k, "patterns": v} for k, v in config.items()]
79 else:
80 raise ValueError("bucket config must be an object")
81
82 bucket_reports = []
83 for spec in bucket_specs:
84 name = spec["name"]
85 patterns = spec["patterns"]
86 subset_size = spec.get("subset_size", args.default_subset_size)
87 bucket_root = work_root / name
88 subset_dir = bucket_root / "bucket_input"
89 subset_info = ensure_bucket_subset(input_dir, subset_dir, patterns, subset_size)
90 if subset_info["num_files"] < args.min_files:
91 bucket_reports.append({
92 "bucket": name,
93 "patterns": patterns,
94 "subset_size": subset_info["num_files"],
95 "skipped": True,
96 "reason": f"num_files<{args.min_files}",
97 "subset": subset_info,
98 })
99 continue
100
101 cmd = [
102 PYTHON,
103 "scripts/ab_smoke_segmentation.py",
104 "--dataset", args.dataset,
105 "--input-dir", str(subset_dir),
106 "--work-root", str(bucket_root / "run"),
107 "--subset-size", str(subset_info["num_files"]),
108 "--query-duration", str(args.query_duration),
109 "--train-epochs", str(args.train_epochs),
110 "--batch-size", str(args.batch_size),
111 "--device", args.device,
112 "--seed", str(args.seed),
113 "--strategies", *args.strategies,
114 ]
115 if args.max_test_queries is not None:
116 cmd += ["--max-test-queries", str(args.max_test_queries)]
117 if args.query_stride is not None:
118 cmd += ["--query-stride", str(args.query_stride)]
119 out_json = bucket_root / "bucket_report.json"
120 cmd += ["--output-json", str(out_json)]
121 run(cmd, cwd=repo)
122 result = json.loads(out_json.read_text())
123 bucket_reports.append({
124 "bucket": name,
125 "patterns": patterns,
126 "subset_size": subset_info["num_files"],
127 "skipped": False,
128 "subset": subset_info,
129 "winner": result.get("winner"),
130 "strategies": result.get("strategies", []),
131 })
132
133 strategy_aggregate: dict[str, dict[str, list[float]]] = {}
134 for bucket in bucket_reports:
135 if bucket.get("skipped"):
136 continue
137 for row in bucket["strategies"]:
138 agg = strategy_aggregate.setdefault(row["strategy"], {"top1": [], "topk": [], "num_queries": []})
139 agg["top1"].append(row["top1"])
140 agg["topk"].append(row["topk"])
141 agg["num_queries"].append(row["num_queries"])
142
143 aggregate = {
144 strategy: {
145 "bucket_runs": len(vals["top1"]),
146 "mean_top1": round(mean(vals["top1"]), 4),
147 "mean_topk": round(mean(vals["topk"]), 4),
148 "mean_num_queries": round(mean(vals["num_queries"]), 4),
149 }
150 for strategy, vals in strategy_aggregate.items() if vals["top1"]
151 }
152
153 report = {
154 "dataset": args.dataset,
155 "input_dir": str(input_dir),
156 "bucket_config": str(Path(args.bucket_config).resolve()),
157 "query_duration": args.query_duration,
158 "query_stride": args.query_stride,
159 "train_epochs": args.train_epochs,
160 "batch_size": args.batch_size,
161 "device": args.device,
162 "seed": args.seed,
163 "max_test_queries": args.max_test_queries,
164 "strategies": args.strategies,
165 "buckets": bucket_reports,
166 "aggregate": aggregate,
167 }
168 text = json.dumps(report, ensure_ascii=False, indent=2)
169 if args.output_json:
170 out = Path(args.output_json)
171 out.parent.mkdir(parents=True, exist_ok=True)
172 out.write_text(text)
173 print(text)
174
175
176 if __name__ == "__main__":
177 main()
1 ## 2026-06-02 bucket/style-aware benchmark 基线落地 checkpoint
2
3 完成项:
4 - 新增 `acr-engine/scripts/ab_smoke_bucketed.py`,用于按 bucket 配置批量驱动现有 `ab_smoke_segmentation.py`
5 - 新脚本已通过 `py_compile`
6 - 已完成最小 smoke 验证:首个 bucket `prefix_000_a` 已成功产出 `bucket_report.json`
7
8 验证证据:
9 - 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py`
10 - 首个 bucket 结果:
11 - `prefix_000_a`
12 - `hybrid`: `num_queries=4, top1=1.0, topk=1.0`
13 - `high_energy`: `num_queries=3, top1=1.0, topk=1.0`
14 - winner: `hybrid`
15 - 当前第二个 bucket `prefix_000_b` 仍在运行中。
16
17 说明:
18 - 这次提交的重点是把 bucket/style-aware benchmark 从“待办”推进为“已存在可运行基线”。
19 - 完整 bucket 汇总 `report.json` 尚未生成,因此当前只把它视作基线工具完成与首桶 smoke 通过。
20
1 ## 2026-06-02 cap64 完结 checkpoint 21 ## 2026-06-02 cap64 完结 checkpoint
2 22
3 完成项: 23 完成项:
......
...@@ -80,3 +80,5 @@ cd /workspace/acr-engine ...@@ -80,3 +80,5 @@ cd /workspace/acr-engine
80 - 已补充 cap64 新鲜证据:`hybrid` reference index 完成(`64 refs / 657 windows / 192-d`)并进入 `evaluate.py` 80 - 已补充 cap64 新鲜证据:`hybrid` reference index 完成(`64 refs / 657 windows / 192-d`)并进入 `evaluate.py`
81 81
82 - 已补齐 cap64 最终结果:`hybrid=0.875``high_energy=0.625`,winner=`hybrid` 82 - 已补齐 cap64 最终结果:`hybrid=0.875``high_energy=0.625`,winner=`hybrid`
83
84 - 已新增 `acr-engine/scripts/ab_smoke_bucketed.py`,并完成首个 bucket 的 smoke 验证。
......
...@@ -81,3 +81,31 @@ flowchart LR ...@@ -81,3 +81,31 @@ flowchart LR
81 81
82 ## Sources 82 ## Sources
83 - See [references-and-sources.md](./references-and-sources.md) for the current source map. 83 - See [references-and-sources.md](./references-and-sources.md) for the current source map.
84
85
86 ## 6. Bucket / Style-aware 基线
87
88 当前仓库已经新增可运行基线脚本:
89 - [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py)
90
91 用途:
92 - 按 bucket 配置文件拆分多个小子集
93 - 对每个 bucket 分别运行现有 `ab_smoke_segmentation.py`
94 - 输出 bucket 级 winner 与聚合均值
95
96 推荐最小配置文件格式:
97
98 ```json
99 {
100 "buckets": [
101 {"name": "prefix_000_a", "patterns": ["fma_small/000/00000?.mp3"], "subset_size": 4},
102 {"name": "prefix_000_b", "patterns": ["fma_small/000/00014?.mp3"], "subset_size": 4}
103 ]
104 }
105 ```
106
107 推荐命令:
108
109 ```bash
110 /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_bucketed.py --dataset fma --input-dir data/raw/fma_small_audio --bucket-config /tmp/cap64_bucket_test.json --work-root /tmp/ab_smoke_bucketed_smoke --default-subset-size 4 --query-duration 8 --train-epochs 1 --batch-size 2 --device cpu --strategies high_energy hybrid --max-test-queries 4 --seed 42 --output-json /tmp/ab_smoke_bucketed_smoke/report.json
111 ```
......
...@@ -339,3 +339,22 @@ cd acr-engine ...@@ -339,3 +339,22 @@ cd acr-engine
339 ## Sources 339 ## Sources
340 - See [dataset-spec.md](./dataset-spec.md) 340 - See [dataset-spec.md](./dataset-spec.md)
341 - See [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) 341 - See [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md)
342
343
344 ### Bucket / style-aware benchmark 基线
345
346 为了避免只看单一子集规模,现在仓库里已经有可运行的 bucket benchmark 基线:
347 - [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py)
348
349 它的作用是:
350 1. 从同一大目录中按 pattern 划出多个 bucket
351 2. 每个 bucket 各自运行 `ab_smoke_segmentation.py`
352 3. 生成 bucket 级 winner 与 aggregate summary
353
354 最小 smoke 已验证:
355 - bucket: `prefix_000_a`
356 - `hybrid`: `4 / 1.0 / 1.0`
357 - `high_energy`: `3 / 1.0 / 1.0`
358 - winner: `hybrid`
359
360 当前第二个 bucket 仍在运行中,因此完整 bucket 汇总仍待补齐。
......
...@@ -240,10 +240,10 @@ ...@@ -240,10 +240,10 @@
240 - `hybrid``mean_top1=0.8750, min=0.7917, max=0.9583, stdev=0.0680` 240 - `hybrid``mean_top1=0.8750, min=0.7917, max=0.9583, stdev=0.0680`
241 241
242 ### 最优先待办 242 ### 最优先待办
243 1. 设计并启动 bucket/style-aware benchmark 243 1. 跟进 bucket/style-aware benchmark 的完整 `report.json`
244 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 244 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。
245 3. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 245 3. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。
246 4.新 benchmark 基线下继续提交与推送。 246 4. bucket 基线下继续提交与推送。
247 247
248 ### 续跑时不要做的事 248 ### 续跑时不要做的事
249 - 不要 `git add .` 249 - 不要 `git add .`
...@@ -681,3 +681,14 @@ seed123 最终结论: ...@@ -681,3 +681,14 @@ seed123 最终结论:
681 - cap64 winner:`hybrid` 681 - cap64 winner:`hybrid`
682 - 当前结论已进入“分子集规模不一致”阶段,必须继续做 bucket/style-aware benchmark 682 - 当前结论已进入“分子集规模不一致”阶段,必须继续做 bucket/style-aware benchmark
683 683
684
685 ## 101. bucket/style-aware benchmark 基线已落地
686
687 - 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py`
688 - 已通过:`py_compile`
689 - 已验证首个 bucket:`prefix_000_a`
690 - `hybrid`: `4 / 1.0 / 1.0`
691 - `high_energy`: `3 / 1.0 / 1.0`
692 - winner: `hybrid`
693 - 当前第二个 bucket `prefix_000_b` 仍在继续执行
694
......