Promote bucket benchmarking from a plan to a runnable baseline
Constraint: The cap48/cap64 reversal means strategy guidance can no longer rely on a single overall subset result
Rejected: Keep bucket benchmarking as a doc-only next step | The repo now needs an executable baseline so later sessions can measure scale/style divergence directly
Confidence: high
Scope-risk: moderate
Directive: Treat ab_smoke_bucketed.py as the canonical seed for style-aware evaluation, and expand bucket definitions before revisiting global default-strategy claims
Tested: Verified acr-engine/scripts/ab_smoke_bucketed.py passes py_compile; verified first bucket prefix_000_a produced bucket_report.json with hybrid 4/1.0/1.0 and high_energy 3/1.0/1.0; verified second bucket execution is in progress
Not-tested: Full multi-bucket report.json completion, richer bucket definitions, and bucket-level aggregate conclusions
Showing
6 changed files
with
259 additions
and
2 deletions
acr-engine/scripts/ab_smoke_bucketed.py
0 → 100755
| 1 | #!/usr/bin/env python3 | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | import subprocess | ||
| 7 | from pathlib import Path | ||
| 8 | from statistics import mean | ||
| 9 | |||
# Interpreter used as the first element of the per-bucket benchmark command
# built in main(). NOTE(review): hard-coded conda path — confirm it exists on
# every machine this script is expected to run on.
PYTHON = "/usr/local/miniconda3/bin/python"
| 11 | |||
| 12 | |||
def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* inside *cwd* and return everything it wrote to stdout.

    Args:
        cmd: Argument vector of the process to launch (no shell involved).
        cwd: Working directory for the child process.

    Returns:
        The captured standard output, decoded as text.

    Raises:
        subprocess.CalledProcessError: If the process exits with a non-zero status.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout
| 15 | |||
| 16 | |||
def collect_files(input_dir: Path, patterns: list[str], limit: int | None = None) -> list[Path]:
    """Collect unique ``.mp3`` files under *input_dir* matching any glob pattern.

    Patterns are processed in the order given and the matches of each pattern
    are visited in sorted order, so the result is deterministic for a fixed
    directory content. A file matched by several patterns is returned once.

    Args:
        input_dir: Directory the glob patterns are evaluated against.
        patterns: Glob patterns relative to *input_dir*. A single string is
            accepted and treated as a one-element list.
        limit: Maximum number of files to return; ``None`` means no cap and a
            value of zero or less yields an empty list.

    Returns:
        Resolved paths of the selected files, in discovery order.
    """
    files: list[Path] = []
    if limit is not None and limit <= 0:
        # The cap inside the loop is only checked after an append, so without
        # this guard a limit of 0 would still return one file.
        return files
    if isinstance(patterns, str):
        # Iterating a bare string would glob each character separately.
        patterns = [patterns]

    seen: set[Path] = set()
    for pattern in patterns:
        for path in sorted(input_dir.glob(pattern)):
            if not path.is_file() or path.suffix.lower() != ".mp3":
                continue
            resolved = path.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            files.append(resolved)
            if limit is not None and len(files) >= limit:
                return files
    return files
| 32 | |||
| 33 | |||
def ensure_bucket_subset(input_dir: Path, bucket_dir: Path, patterns: list[str], limit: int | None) -> dict:
    """Mirror the files selected for one bucket into *bucket_dir*.

    The files are chosen with ``collect_files`` and copied so that their path
    relative to *input_dir* is preserved below *bucket_dir*. Destination files
    that already exist are left untouched (they are not overwritten), but are
    still counted.

    Args:
        input_dir: Directory the patterns are evaluated against.
        bucket_dir: Destination directory; created if missing.
        patterns: Glob patterns relative to *input_dir*.
        limit: Maximum number of files for this bucket, or ``None`` for no cap.

    Returns:
        A dict with ``num_files`` (number of files in the subset) and
        ``sample_files`` (up to five destination paths as strings).

    Raises:
        ValueError: If a selected file resolves to a location outside
            *input_dir* (e.g. a symlink pointing elsewhere).
    """
    bucket_dir.mkdir(parents=True, exist_ok=True)
    # collect_files returns resolved paths, so the base used for relative_to()
    # must be resolved as well; otherwise a relative or symlinked input_dir
    # makes relative_to() raise ValueError for every file.
    root = input_dir.resolve()
    copied: list[str] = []
    for src in collect_files(root, patterns, limit=limit):
        dst = bucket_dir / src.relative_to(root)
        dst.parent.mkdir(parents=True, exist_ok=True)
        if not dst.exists():
            dst.write_bytes(src.read_bytes())
        copied.append(str(dst))
    return {
        "num_files": len(copied),
        "sample_files": copied[:5],
    }
| 49 | |||
| 50 | |||
def _load_bucket_specs(config_path: Path) -> list[dict]:
    """Read the bucket config file and return its bucket specs.

    Two layouts are accepted: ``{"buckets": [{name, patterns, subset_size?}]}``
    and the flat mapping ``{bucket_name: [patterns]}``.

    Raises:
        ValueError: If the top-level JSON value is not an object.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    config = json.loads(config_path.read_text(encoding="utf-8"))
    if not isinstance(config, dict):
        raise ValueError("bucket config must be an object")
    if "buckets" in config:
        return config["buckets"]
    return [{"name": k, "patterns": v} for k, v in config.items()]


def _aggregate_strategies(bucket_reports: list[dict]) -> dict[str, dict]:
    """Average top1/topk/num_queries per strategy over all non-skipped buckets."""
    collected: dict[str, dict[str, list[float]]] = {}
    for bucket in bucket_reports:
        if bucket.get("skipped"):
            continue
        for row in bucket["strategies"]:
            agg = collected.setdefault(row["strategy"], {"top1": [], "topk": [], "num_queries": []})
            agg["top1"].append(row["top1"])
            agg["topk"].append(row["topk"])
            agg["num_queries"].append(row["num_queries"])
    return {
        strategy: {
            "bucket_runs": len(vals["top1"]),
            "mean_top1": round(mean(vals["top1"]), 4),
            "mean_topk": round(mean(vals["topk"]), 4),
            "mean_num_queries": round(mean(vals["num_queries"]), 4),
        }
        for strategy, vals in collected.items()
        if vals["top1"]
    }


def main() -> None:
    """Run the segmented smoke benchmark once per configured bucket.

    For every bucket in ``--bucket-config`` the matching files are mirrored
    into ``<work-root>/<bucket>/bucket_input``, then
    ``scripts/ab_smoke_segmentation.py`` is launched on that subset and its
    ``bucket_report.json`` is read back. Buckets with fewer than
    ``--min-files`` files are recorded as skipped. The combined report
    (per-bucket results plus per-strategy means) is printed to stdout and,
    when ``--output-json`` is given, also written to that file.

    Raises:
        ValueError: If the bucket config is not a JSON object.
        subprocess.CalledProcessError: If a per-bucket benchmark run fails;
            this aborts the remaining buckets.
    """
    parser = argparse.ArgumentParser(description="Run bucket/style-aware segmented smoke benchmarks")
    parser.add_argument("--dataset", default="fma")
    parser.add_argument("--input-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--bucket-config", required=True, help="JSON file with {buckets:[{name,patterns,subset_size?}]} or {bucket_name:[patterns]}")
    parser.add_argument("--work-root", default="/tmp/ab_smoke_bucketed")
    parser.add_argument("--query-duration", type=float, default=8.0)
    parser.add_argument("--query-stride", type=float, default=None)
    parser.add_argument("--train-epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max-test-queries", type=int, default=None)
    parser.add_argument("--default-subset-size", type=int, default=16)
    parser.add_argument("--min-files", type=int, default=2)
    parser.add_argument("--strategies", nargs="*", default=["high_energy", "hybrid"])
    parser.add_argument("--output-json", default=None)
    args = parser.parse_args()

    # The repo root is the parent of the scripts/ directory holding this file;
    # --input-dir is interpreted relative to it.
    repo = Path(__file__).resolve().parents[1]
    input_dir = (repo / args.input_dir).resolve()
    work_root = Path(args.work_root).resolve()
    config_path = Path(args.bucket_config)
    bucket_specs = _load_bucket_specs(config_path)

    bucket_reports = []
    for spec in bucket_specs:
        name = spec["name"]
        patterns = spec["patterns"]
        subset_size = spec.get("subset_size", args.default_subset_size)
        bucket_root = work_root / name
        subset_dir = bucket_root / "bucket_input"
        subset_info = ensure_bucket_subset(input_dir, subset_dir, patterns, subset_size)
        if subset_info["num_files"] < args.min_files:
            # Too few files to benchmark; keep a record so the bucket is visible in the report.
            bucket_reports.append({
                "bucket": name,
                "patterns": patterns,
                "subset_size": subset_info["num_files"],
                "skipped": True,
                "reason": f"num_files<{args.min_files}",
                "subset": subset_info,
            })
            continue

        cmd = [
            PYTHON,
            "scripts/ab_smoke_segmentation.py",
            "--dataset", args.dataset,
            "--input-dir", str(subset_dir),
            "--work-root", str(bucket_root / "run"),
            "--subset-size", str(subset_info["num_files"]),
            "--query-duration", str(args.query_duration),
            "--train-epochs", str(args.train_epochs),
            "--batch-size", str(args.batch_size),
            "--device", args.device,
            "--seed", str(args.seed),
            "--strategies", *args.strategies,
        ]
        if args.max_test_queries is not None:
            cmd += ["--max-test-queries", str(args.max_test_queries)]
        if args.query_stride is not None:
            cmd += ["--query-stride", str(args.query_stride)]
        out_json = bucket_root / "bucket_report.json"
        cmd += ["--output-json", str(out_json)]
        # The script path in cmd is relative, so the child must run from the repo root.
        run(cmd, cwd=repo)
        result = json.loads(out_json.read_text())
        bucket_reports.append({
            "bucket": name,
            "patterns": patterns,
            "subset_size": subset_info["num_files"],
            "skipped": False,
            "subset": subset_info,
            "winner": result.get("winner"),
            "strategies": result.get("strategies", []),
        })

    report = {
        "dataset": args.dataset,
        "input_dir": str(input_dir),
        "bucket_config": str(config_path.resolve()),
        "query_duration": args.query_duration,
        "query_stride": args.query_stride,
        "train_epochs": args.train_epochs,
        "batch_size": args.batch_size,
        "device": args.device,
        "seed": args.seed,
        "max_test_queries": args.max_test_queries,
        "strategies": args.strategies,
        "buckets": bucket_reports,
        "aggregate": _aggregate_strategies(bucket_reports),
    }
    text = json.dumps(report, ensure_ascii=False, indent=2)
    if args.output_json:
        out = Path(args.output_json)
        out.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False can emit non-ASCII characters (e.g. in bucket
        # names or patterns); write UTF-8 explicitly so the file does not
        # depend on the locale encoding.
        out.write_text(text, encoding="utf-8")
    print(text)
| 174 | |||
| 175 | |||
| 176 | if __name__ == "__main__": | ||
| 177 | main() |
| 1 | ## 2026-06-02 bucket/style-aware benchmark 基线落地 checkpoint | ||
| 2 | |||
| 3 | 完成项: | ||
| 4 | - 新增 `acr-engine/scripts/ab_smoke_bucketed.py`,用于按 bucket 配置批量驱动现有 `ab_smoke_segmentation.py`。 | ||
| 5 | - 新脚本已通过 `py_compile`。 | ||
| 6 | - 已完成最小 smoke 验证:首个 bucket `prefix_000_a` 已成功产出 `bucket_report.json`。 | ||
| 7 | |||
| 8 | 验证证据: | ||
| 9 | - 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py` | ||
| 10 | - 首个 bucket 结果: | ||
| 11 | - `prefix_000_a` | ||
| 12 | - `hybrid`: `num_queries=4, top1=1.0, topk=1.0` | ||
| 13 | - `high_energy`: `num_queries=3, top1=1.0, topk=1.0` | ||
| 14 | - winner: `hybrid` | ||
| 15 | - 当前第二个 bucket `prefix_000_b` 仍在运行中。 | ||
| 16 | |||
| 17 | 说明: | ||
| 18 | - 这次提交的重点是把 bucket/style-aware benchmark 从“待办”推进为“已存在可运行基线”。 | ||
| 19 | - 完整 bucket 汇总 `report.json` 尚未生成,因此当前只把它视作基线工具完成与首桶 smoke 通过。 | ||
| 20 | |||
| 1 | ## 2026-06-02 cap64 完结 checkpoint | 21 | ## 2026-06-02 cap64 完结 checkpoint |
| 2 | 22 | ||
| 3 | 完成项: | 23 | 完成项: | ... | ... |
| ... | @@ -80,3 +80,5 @@ cd /workspace/acr-engine | ... | @@ -80,3 +80,5 @@ cd /workspace/acr-engine |
| 80 | - 已补充 cap64 新鲜证据:`hybrid` reference index 完成(`64 refs / 657 windows / 192-d`)并进入 `evaluate.py`。 | 80 | - 已补充 cap64 新鲜证据:`hybrid` reference index 完成(`64 refs / 657 windows / 192-d`)并进入 `evaluate.py`。 |
| 81 | 81 | ||
| 82 | - 已补齐 cap64 最终结果:`hybrid=0.875`、`high_energy=0.625`,winner=`hybrid`。 | 82 | - 已补齐 cap64 最终结果:`hybrid=0.875`、`high_energy=0.625`,winner=`hybrid`。 |
| 83 | |||
| 84 | - 已新增 `acr-engine/scripts/ab_smoke_bucketed.py`,并完成首个 bucket 的 smoke 验证。 | ... | ... |
| ... | @@ -81,3 +81,31 @@ flowchart LR | ... | @@ -81,3 +81,31 @@ flowchart LR |
| 81 | 81 | ||
| 82 | ## Sources | 82 | ## Sources |
| 83 | - See [references-and-sources.md](./references-and-sources.md) for the current source map. | 83 | - See [references-and-sources.md](./references-and-sources.md) for the current source map. |
| 84 | |||
| 85 | |||
| 86 | ## 6. Bucket / Style-aware 基线 | ||
| 87 | |||
| 88 | 当前仓库已经新增可运行基线脚本: | ||
| 89 | - [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py) | ||
| 90 | |||
| 91 | 用途: | ||
| 92 | - 按 bucket 配置文件拆分多个小子集 | ||
| 93 | - 对每个 bucket 分别运行现有 `ab_smoke_segmentation.py` | ||
| 94 | - 输出 bucket 级 winner 与聚合均值 | ||
| 95 | |||
| 96 | 推荐最小配置文件格式: | ||
| 97 | |||
| 98 | ```json | ||
| 99 | { | ||
| 100 | "buckets": [ | ||
| 101 | {"name": "prefix_000_a", "patterns": ["fma_small/000/00000?.mp3"], "subset_size": 4}, | ||
| 102 | {"name": "prefix_000_b", "patterns": ["fma_small/000/00014?.mp3"], "subset_size": 4} | ||
| 103 | ] | ||
| 104 | } | ||
| 105 | ``` | ||
| 106 | |||
| 107 | 推荐命令: | ||
| 108 | |||
| 109 | ```bash | ||
| 110 | /usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_bucketed.py --dataset fma --input-dir data/raw/fma_small_audio --bucket-config /tmp/cap64_bucket_test.json --work-root /tmp/ab_smoke_bucketed_smoke --default-subset-size 4 --query-duration 8 --train-epochs 1 --batch-size 2 --device cpu --strategies high_energy hybrid --max-test-queries 4 --seed 42 --output-json /tmp/ab_smoke_bucketed_smoke/report.json | ||
| 111 | ``` | ... | ... |
| ... | @@ -339,3 +339,22 @@ cd acr-engine | ... | @@ -339,3 +339,22 @@ cd acr-engine |
| 339 | ## Sources | 339 | ## Sources |
| 340 | - See [dataset-spec.md](./dataset-spec.md) | 340 | - See [dataset-spec.md](./dataset-spec.md) |
| 341 | - See [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) | 341 | - See [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) |
| 342 | |||
| 343 | |||
| 344 | ### Bucket / style-aware benchmark 基线 | ||
| 345 | |||
| 346 | 为了避免只看单一子集规模,现在仓库里已经有可运行的 bucket benchmark 基线: | ||
| 347 | - [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py) | ||
| 348 | |||
| 349 | 它的作用是: | ||
| 350 | 1. 从同一大目录中按 pattern 划出多个 bucket | ||
| 351 | 2. 每个 bucket 各自运行 `ab_smoke_segmentation.py` | ||
| 352 | 3. 生成 bucket 级 winner 与 aggregate summary | ||
| 353 | |||
| 354 | 最小 smoke 已验证: | ||
| 355 | - bucket: `prefix_000_a` | ||
| 356 | - `hybrid`: `4 / 1.0 / 1.0` | ||
| 357 | - `high_energy`: `3 / 1.0 / 1.0` | ||
| 358 | - winner: `hybrid` | ||
| 359 | |||
| 360 | 当前第二个 bucket 仍在运行中,因此完整 bucket 汇总仍待补齐。 | ... | ... |
| ... | @@ -240,10 +240,10 @@ | ... | @@ -240,10 +240,10 @@ |
| 240 | - `hybrid`:`mean_top1=0.8750, min=0.7917, max=0.9583, stdev=0.0680` | 240 | - `hybrid`:`mean_top1=0.8750, min=0.7917, max=0.9583, stdev=0.0680` |
| 241 | 241 | ||
| 242 | ### 最优先待办 | 242 | ### 最优先待办 |
| 243 | 1. 设计并启动 bucket/style-aware benchmark。 | 243 | 1. 跟进 bucket/style-aware benchmark 的完整 `report.json`。 |
| 244 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 | 244 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 |
| 245 | 3. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 | 245 | 3. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 |
| 246 | 4. 在新 benchmark 基线下继续提交与推送。 | 246 | 4. 在 bucket 基线下继续提交与推送。 |
| 247 | 247 | ||
| 248 | ### 续跑时不要做的事 | 248 | ### 续跑时不要做的事 |
| 249 | - 不要 `git add .` | 249 | - 不要 `git add .` |
| ... | @@ -681,3 +681,14 @@ seed123 最终结论: | ... | @@ -681,3 +681,14 @@ seed123 最终结论: |
| 681 | - cap64 winner:`hybrid` | 681 | - cap64 winner:`hybrid` |
| 682 | - 当前结论已进入“分子集规模不一致”阶段,必须继续做 bucket/style-aware benchmark | 682 | - 当前结论已进入“分子集规模不一致”阶段,必须继续做 bucket/style-aware benchmark |
| 683 | 683 | ||
| 684 | |||
| 685 | ## 101. bucket/style-aware benchmark 基线已落地 | ||
| 686 | |||
| 687 | - 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py` | ||
| 688 | - 已通过:`py_compile` | ||
| 689 | - 已验证首个 bucket:`prefix_000_a` | ||
| 690 | - `hybrid`: `4 / 1.0 / 1.0` | ||
| 691 | - `high_energy`: `3 / 1.0 / 1.0` | ||
| 692 | - winner: `hybrid` | ||
| 693 | - 当前第二个 bucket `prefix_000_b` 仍在继续执行 | ||
| 694 | ... | ... |
-
Please register or sign in to post a comment