Commit c1a22cbb c1a22cbbe877d9aa8973c6f8e2f1cc513ff00edf by cnb.bofCdSsphPA

Promote bucket benchmarking from a plan to a runnable baseline

Constraint: The cap48/cap64 reversal means strategy guidance can no longer rely on a single overall subset result
Rejected: Keep bucket benchmarking as a doc-only next step | The repo now needs an executable baseline so later sessions can measure scale/style divergence directly
Confidence: high
Scope-risk: moderate
Directive: Treat ab_smoke_bucketed.py as the canonical seed for style-aware evaluation, and expand bucket definitions before revisiting global default-strategy claims
Tested: Verified acr-engine/scripts/ab_smoke_bucketed.py passes py_compile; verified first bucket prefix_000_a produced bucket_report.json with hybrid 4/1.0/1.0 and high_energy 3/1.0/1.0; verified second bucket execution is in progress
Not-tested: Full multi-bucket report.json completion, richer bucket definitions, and bucket-level aggregate conclusions
1 parent e49dc0b9
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
from statistics import mean
# Interpreter used by main() to launch the per-bucket benchmark subprocess.
# NOTE(review): this is a hard-coded absolute path to a miniconda install;
# confirm it exists before running on another machine.
PYTHON = "/usr/local/miniconda3/bin/python"
def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* with *cwd* as working directory and return its stdout.

    Standard output is captured and decoded as text; standard error is not
    redirected.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout
def collect_files(input_dir: Path, patterns: list[str], limit: int | None = None) -> list[Path]:
    """Return the unique ``.mp3`` files under *input_dir* matched by *patterns*.

    Patterns are applied in the order given, and the matches of each pattern
    are visited in sorted order.  Every match is resolved to an absolute path
    and de-duplicated on that resolved path, so a file reachable through
    several patterns is reported only once.  Non-files and files whose suffix
    is not ``.mp3`` (compared case-insensitively) are ignored.

    Args:
        input_dir: Directory the glob patterns are evaluated against.
        patterns: Glob patterns relative to *input_dir*.  A single string is
            accepted and treated as a one-element list.
        limit: Maximum number of files to return; ``None`` means no cap.  A
            value of zero or less yields an empty list.

    Returns:
        The resolved paths, in discovery order, at most *limit* of them.
    """
    if limit is not None and limit <= 0:
        # The cap below is only checked after an append, so without this
        # guard ``limit=0`` would still return one file.
        return []
    if isinstance(patterns, str):
        # Iterating a bare string would glob each character separately.
        patterns = [patterns]

    seen: set[Path] = set()
    files: list[Path] = []
    for pattern in patterns:
        for path in sorted(input_dir.glob(pattern)):
            if not path.is_file() or path.suffix.lower() != ".mp3":
                continue
            resolved = path.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            files.append(resolved)
            if limit is not None and len(files) >= limit:
                return files
    return files
def ensure_bucket_subset(input_dir: Path, bucket_dir: Path, patterns: list[str], limit: int | None) -> dict:
    """Materialise one bucket's files under *bucket_dir* and summarise them.

    The files selected by ``collect_files(input_dir, patterns, limit=limit)``
    are copied into *bucket_dir*, keeping their path relative to *input_dir*.
    A destination that already exists is left untouched, and files already
    present in *bucket_dir* that are not part of the selection are not
    removed.

    Args:
        input_dir: Directory the patterns are evaluated against.
        bucket_dir: Destination directory; created if missing.
        patterns: Glob patterns relative to *input_dir*.
        limit: Maximum number of files to select, or ``None`` for no cap.

    Returns:
        A dict with ``num_files`` (number of selected files, whether freshly
        copied or already present) and ``sample_files`` (up to five of the
        destination paths, as strings).
    """
    bucket_dir.mkdir(parents=True, exist_ok=True)
    # collect_files() hands back resolved paths, so the base passed to
    # relative_to() has to be resolved too; otherwise a relative or symlinked
    # input_dir makes relative_to() raise ValueError for every file.
    base_dir = input_dir.resolve()
    copied: list[str] = []
    for src in collect_files(input_dir, patterns, limit=limit):
        try:
            rel = src.relative_to(base_dir)
        except ValueError:
            # A symlinked match can resolve to a location outside input_dir;
            # keep the file by placing it directly under the bucket root.
            rel = Path(src.name)
        dst = bucket_dir / rel
        dst.parent.mkdir(parents=True, exist_ok=True)
        if not dst.exists():
            dst.write_bytes(src.read_bytes())
        copied.append(str(dst))
    return {
        "num_files": len(copied),
        "sample_files": copied[:5],
    }
def _load_bucket_specs(config_path: Path) -> list[dict]:
    """Read the bucket config file and normalise it to a list of bucket specs.

    Two layouts are accepted: ``{"buckets": [{name, patterns, subset_size?}]}``
    and the shorthand ``{bucket_name: [patterns]}``.

    Raises:
        ValueError: If the top-level JSON value is not an object.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale.
    config = json.loads(config_path.read_text(encoding="utf-8"))
    if not isinstance(config, dict):
        raise ValueError("bucket config must be an object")
    if "buckets" in config:
        return config["buckets"]
    return [{"name": name, "patterns": patterns} for name, patterns in config.items()]


def _build_benchmark_cmd(
    args: argparse.Namespace,
    subset_dir: Path,
    run_dir: Path,
    num_files: int,
    out_json: Path,
) -> list[str]:
    """Assemble the ``ab_smoke_segmentation.py`` command line for one bucket."""
    cmd = [
        PYTHON,
        "scripts/ab_smoke_segmentation.py",
        "--dataset", args.dataset,
        "--input-dir", str(subset_dir),
        "--work-root", str(run_dir),
        "--subset-size", str(num_files),
        "--query-duration", str(args.query_duration),
        "--train-epochs", str(args.train_epochs),
        "--batch-size", str(args.batch_size),
        "--device", args.device,
        "--seed", str(args.seed),
        "--strategies", *args.strategies,
    ]
    # Optional flags are forwarded only when the user supplied them.
    if args.max_test_queries is not None:
        cmd += ["--max-test-queries", str(args.max_test_queries)]
    if args.query_stride is not None:
        cmd += ["--query-stride", str(args.query_stride)]
    cmd += ["--output-json", str(out_json)]
    return cmd


def _aggregate_strategies(bucket_reports: list[dict]) -> dict[str, dict]:
    """Average top1/topk/num_queries per strategy over the non-skipped buckets."""
    collected: dict[str, dict[str, list[float]]] = {}
    for bucket in bucket_reports:
        if bucket.get("skipped"):
            continue
        for row in bucket["strategies"]:
            series = collected.setdefault(row["strategy"], {"top1": [], "topk": [], "num_queries": []})
            series["top1"].append(row["top1"])
            series["topk"].append(row["topk"])
            series["num_queries"].append(row["num_queries"])
    return {
        strategy: {
            "bucket_runs": len(vals["top1"]),
            "mean_top1": round(mean(vals["top1"]), 4),
            "mean_topk": round(mean(vals["topk"]), 4),
            "mean_num_queries": round(mean(vals["num_queries"]), 4),
        }
        for strategy, vals in collected.items()
        if vals["top1"]  # mean() raises on an empty sequence
    }


def main() -> None:
    """Run the segmentation smoke benchmark once per configured bucket.

    For every bucket in ``--bucket-config`` the matching files are copied to
    ``<work-root>/<bucket>/bucket_input``, ``scripts/ab_smoke_segmentation.py``
    is executed on that subset (with the repository root, i.e. the parent of
    this file's directory, as working directory), and the per-bucket
    ``bucket_report.json`` it writes is read back.  Buckets with fewer than
    ``--min-files`` files are recorded as skipped and not executed.

    The combined report (run parameters, per-bucket results and per-strategy
    means) is printed to stdout as JSON and, when ``--output-json`` is given,
    also written to that file as UTF-8.

    Raises:
        ValueError: If the bucket config is not a JSON object.
        subprocess.CalledProcessError: If a bucket's benchmark run exits with
            a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Run bucket/style-aware segmented smoke benchmarks")
    parser.add_argument("--dataset", default="fma")
    parser.add_argument("--input-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--bucket-config", required=True, help="JSON file with {buckets:[{name,patterns,subset_size?}]} or {bucket_name:[patterns]}")
    parser.add_argument("--work-root", default="/tmp/ab_smoke_bucketed")
    parser.add_argument("--query-duration", type=float, default=8.0)
    parser.add_argument("--query-stride", type=float, default=None)
    parser.add_argument("--train-epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max-test-queries", type=int, default=None)
    parser.add_argument("--default-subset-size", type=int, default=16)
    parser.add_argument("--min-files", type=int, default=2)
    parser.add_argument("--strategies", nargs="*", default=["high_energy", "hybrid"])
    parser.add_argument("--output-json", default=None)
    args = parser.parse_args()

    repo = Path(__file__).resolve().parents[1]
    input_dir = (repo / args.input_dir).resolve()
    work_root = Path(args.work_root).resolve()
    config_path = Path(args.bucket_config)
    bucket_specs = _load_bucket_specs(config_path)

    bucket_reports: list[dict] = []
    for spec in bucket_specs:
        name = spec["name"]
        patterns = spec["patterns"]
        subset_size = spec.get("subset_size", args.default_subset_size)
        bucket_root = work_root / name
        subset_dir = bucket_root / "bucket_input"
        subset_info = ensure_bucket_subset(input_dir, subset_dir, patterns, subset_size)
        num_files = subset_info["num_files"]

        if num_files < args.min_files:
            bucket_reports.append({
                "bucket": name,
                "patterns": patterns,
                "subset_size": num_files,
                "skipped": True,
                "reason": f"num_files<{args.min_files}",
                "subset": subset_info,
            })
            continue

        out_json = bucket_root / "bucket_report.json"
        cmd = _build_benchmark_cmd(args, subset_dir, bucket_root / "run", num_files, out_json)
        run(cmd, cwd=repo)
        # The child script wrote this file; keep the default encoding so the
        # read matches whatever that process used on this machine.
        result = json.loads(out_json.read_text())
        bucket_reports.append({
            "bucket": name,
            "patterns": patterns,
            "subset_size": num_files,
            "skipped": False,
            "subset": subset_info,
            "winner": result.get("winner"),
            "strategies": result.get("strategies", []),
        })

    report = {
        "dataset": args.dataset,
        "input_dir": str(input_dir),
        "bucket_config": str(config_path.resolve()),
        "query_duration": args.query_duration,
        "query_stride": args.query_stride,
        "train_epochs": args.train_epochs,
        "batch_size": args.batch_size,
        "device": args.device,
        "seed": args.seed,
        "max_test_queries": args.max_test_queries,
        "strategies": args.strategies,
        "buckets": bucket_reports,
        "aggregate": _aggregate_strategies(bucket_reports),
    }
    text = json.dumps(report, ensure_ascii=False, indent=2)
    if args.output_json:
        out = Path(args.output_json)
        out.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False can emit non-ASCII characters, so pin the file
        # encoding instead of relying on the locale default.
        out.write_text(text, encoding="utf-8")
    print(text)
# Run the benchmark driver only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
## 2026-06-02 bucket/style-aware benchmark 基线落地 checkpoint
完成项:
- 新增 `acr-engine/scripts/ab_smoke_bucketed.py`,用于按 bucket 配置批量驱动现有 `ab_smoke_segmentation.py`
- 新脚本已通过 `py_compile`
- 已完成最小 smoke 验证:首个 bucket `prefix_000_a` 已成功产出 `bucket_report.json`
验证证据:
- 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py`
- 首个 bucket 结果:
- `prefix_000_a`
- `hybrid`: `num_queries=4, top1=1.0, topk=1.0`
- `high_energy`: `num_queries=3, top1=1.0, topk=1.0`
- winner: `hybrid`
- 当前第二个 bucket `prefix_000_b` 仍在运行中。
说明:
- 这次提交的重点是把 bucket/style-aware benchmark 从“待办”推进为“已存在可运行基线”。
- 完整 bucket 汇总 `report.json` 尚未生成,因此当前只把它视作基线工具完成与首桶 smoke 通过。
## 2026-06-02 cap64 完结 checkpoint
完成项:
......
......@@ -80,3 +80,5 @@ cd /workspace/acr-engine
- 已补充 cap64 新鲜证据:`hybrid` reference index 完成(`64 refs / 657 windows / 192-d`)并进入 `evaluate.py`
- 已补齐 cap64 最终结果:`hybrid=0.875`,`high_energy=0.625`,winner=`hybrid`
- 已新增 `acr-engine/scripts/ab_smoke_bucketed.py`,并完成首个 bucket 的 smoke 验证。
......
......@@ -81,3 +81,31 @@ flowchart LR
## Sources
- See [references-and-sources.md](./references-and-sources.md) for the current source map.
## 6. Bucket / Style-aware 基线
当前仓库已经新增可运行基线脚本:
- [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py)
用途:
- 按 bucket 配置文件拆分多个小子集
- 对每个 bucket 分别运行现有 `ab_smoke_segmentation.py`
- 输出 bucket 级 winner 与聚合均值
推荐最小配置文件格式:
```json
{
"buckets": [
{"name": "prefix_000_a", "patterns": ["fma_small/000/00000?.mp3"], "subset_size": 4},
{"name": "prefix_000_b", "patterns": ["fma_small/000/00014?.mp3"], "subset_size": 4}
]
}
```
推荐命令:
```bash
/usr/local/miniconda3/bin/python acr-engine/scripts/ab_smoke_bucketed.py --dataset fma --input-dir data/raw/fma_small_audio --bucket-config /tmp/cap64_bucket_test.json --work-root /tmp/ab_smoke_bucketed_smoke --default-subset-size 4 --query-duration 8 --train-epochs 1 --batch-size 2 --device cpu --strategies high_energy hybrid --max-test-queries 4 --seed 42 --output-json /tmp/ab_smoke_bucketed_smoke/report.json
```
......
......@@ -339,3 +339,22 @@ cd acr-engine
## Sources
- See [dataset-spec.md](./dataset-spec.md)
- See [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md)
### Bucket / style-aware benchmark 基线
为了避免只看单一子集规模,现在仓库里已经有可运行的 bucket benchmark 基线:
- [../acr-engine/scripts/ab_smoke_bucketed.py](../acr-engine/scripts/ab_smoke_bucketed.py)
它的作用是:
1. 从同一大目录中按 pattern 划出多个 bucket
2. 每个 bucket 各自运行 `ab_smoke_segmentation.py`
3. 生成 bucket 级 winner 与 aggregate summary
最小 smoke 已验证:
- bucket: `prefix_000_a`
- `hybrid`: `4 / 1.0 / 1.0`
- `high_energy`: `3 / 1.0 / 1.0`
- winner: `hybrid`
当前第二个 bucket 仍在运行中,因此完整 bucket 汇总仍待补齐。
......
......@@ -240,10 +240,10 @@
- `hybrid`:`mean_top1=0.8750, min=0.7917, max=0.9583, stdev=0.0680`
### 最优先待办
1. 设计并启动 bucket/style-aware benchmark
1. 跟进 bucket/style-aware benchmark 的完整 `report.json`
2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。
3. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。
4. 在新 benchmark 基线下继续提交与推送。
4. 在 bucket 基线下继续提交与推送。
### 续跑时不要做的事
- 不要 `git add .`
......@@ -681,3 +681,14 @@ seed123 最终结论:
- cap64 winner:`hybrid`
- 当前结论已进入“分子集规模不一致”阶段,必须继续做 bucket/style-aware benchmark
## 101. bucket/style-aware benchmark 基线已落地
- 新脚本:`acr-engine/scripts/ab_smoke_bucketed.py`
- 已通过:`py_compile`
- 已验证首个 bucket:`prefix_000_a`
- `hybrid`: `4 / 1.0 / 1.0`
- `high_energy`: `3 / 1.0 / 1.0`
- winner: `hybrid`
- 当前第二个 bucket `prefix_000_b` 仍在继续执行
......