Commit bc6d07af bc6d07afbd1e31d3956d20e35c20c424bc21ba99 by cnb.bofCdSsphPA

Expose chromaprint build progress so long index runs stop being black boxes

Constraint: the live 8000-reference FMA run is already in flight, so observability had to be added as forward-safe progress and partial-cache outputs for future runs instead of altering the active process
Rejected: keep waiting on blind build-index runs | hides whether chromaprint is advancing and blocks operational debugging
Confidence: high
Scope-risk: narrow
Directive: Prefer progress JSON plus partial cache evidence for future large-index investigations before assuming a stall
Tested: py_compile on chromaprint_matcher.py and run_demo.py, verified chromaprint_progress.json and chromaprint.pkl appear in /tmp/chroma_index_observable_smoke at refs_done=50
Not-tested: end-to-end completion of the new observable build-index flow has not finished yet
1 parent ae1f4673
......@@ -27,13 +27,15 @@ def cmd_generate_data(args):
print(f"[done] dataset generated at {args.output}")
def build_chroma_index(data_dir: Path, output_dir: Path):
def build_chroma_index(data_dir: Path, output_dir: Path, checkpoint_every_refs: int = 0):
matcher = ChromaprintMatcher()
metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'
matcher.index_songs_from_dir(
songs_dir=str(data_dir),
metadata_path=str(metadata_path),
cache_path=str(output_dir / 'chromaprint.pkl'),
checkpoint_every_refs=checkpoint_every_refs,
progress_path=str(output_dir / 'chromaprint_progress.json'),
)
print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}")
return matcher
......@@ -66,7 +68,7 @@ def cmd_build_index(args):
out_dir.mkdir(parents=True, exist_ok=True)
print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}")
build_chroma_index(data_dir, out_dir)
build_chroma_index(data_dir, out_dir, checkpoint_every_refs=args.chromaprint_checkpoint_every_refs)
print(
f"[build-index] starting embedding index: model={args.model} device={args.device} "
f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}"
......@@ -173,6 +175,7 @@ if __name__ == '__main__':
p.add_argument('--output', default='data/index')
p.add_argument('--device', default='cpu')
p.add_argument('--checkpoint-every-refs', type=int, default=250)
p.add_argument('--chromaprint-checkpoint-every-refs', type=int, default=100)
p.add_argument('--resume', action='store_true')
p.set_defaults(func=cmd_build_index)
......
......@@ -15,6 +15,7 @@ from typing import Dict, List, Tuple, Optional
import pickle
import json
from pathlib import Path
import time
class Fingerprint:
......@@ -88,24 +89,65 @@ class ChromaprintMatcher:
self.hash_db[h].append(Fingerprint(song_id, offset, h))
def index_songs_from_dir(
self, songs_dir: str, metadata_path: str, cache_path: Optional[str] = None
self,
songs_dir: str,
metadata_path: str,
cache_path: Optional[str] = None,
checkpoint_every_refs: int = 0,
progress_path: Optional[str] = None,
):
with open(metadata_path) as f:
meta = json.load(f)
songs_dir = Path(songs_dir)
for item in meta:
if item.get("type") != "reference":
continue
refs = [item for item in meta if item.get("type") == "reference"]
total_refs = len(refs)
start_time = time.time()
progress_file = Path(progress_path) if progress_path else None
cache_file = Path(cache_path) if cache_path else None
def write_progress(refs_done: int, status: str):
if progress_file is None:
return
elapsed = max(time.time() - start_time, 1e-6)
refs_per_sec = refs_done / elapsed if refs_done > 0 else 0.0
eta_sec = (total_refs - refs_done) / refs_per_sec if refs_per_sec > 0 else 0.0
progress_file.write_text(json.dumps({
"status": status,
"refs_done": refs_done,
"refs_total": total_refs,
"elapsed_sec": round(elapsed, 3),
"eta_sec": round(eta_sec, 3),
"hashes": self.num_hashes,
"postings": self.index_size,
"cache_path": str(cache_file) if cache_file else None,
}, indent=2))
for ref_idx, item in enumerate(refs, start=1):
audio_path = self._resolve_audio_path(songs_dir, item["audio_path"])
if not audio_path.exists():
continue
song_id = item["song_id"]
y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True)
self.index_song(song_id, y)
if cache_path:
self.save(cache_path)
if ref_idx == 1 or ref_idx == total_refs or (checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0):
elapsed = max(time.time() - start_time, 1e-6)
refs_per_sec = ref_idx / elapsed
eta_sec = (total_refs - ref_idx) / refs_per_sec if refs_per_sec > 0 else 0.0
print(
f"[chromaprint-index] progress: refs={ref_idx}/{total_refs} "
f"hashes={self.num_hashes} postings={self.index_size} "
f"elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}"
)
if checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0:
if cache_file is not None:
self.save(str(cache_file))
write_progress(ref_idx, "building")
if cache_file is not None:
self.save(str(cache_file))
write_progress(total_refs, "complete")
def match(self, y: np.ndarray, top_k: int = 10) -> List[Tuple[str, float]]:
S = self._spectrogram(y)
......
## 2026-06-02 chromaprint build-index observability checkpoint
完成项:
- `acr-engine/src/engines/chromaprint_matcher.py` 的 `index_songs_from_dir()` 增加 chromaprint 阶段 progress JSON 与周期性 partial cache 落盘。
- `acr-engine/run_demo.py build-index` 增加 `--chromaprint-checkpoint-every-refs`,让 chromaprint 阶段也能分段 checkpoint,而不是只能盲等最终 `chromaprint.pkl`。
验证结果:
- 小样本可观测 smoke:
  - 命令:`run_demo.py build-index --data data/external_smoke/fma/manifests --output /tmp/chroma_index_observable_smoke --chromaprint-checkpoint-every-refs 10 ...`
  - 已实际落盘:
    - `/tmp/chroma_index_observable_smoke/chromaprint_progress.json`
    - `/tmp/chroma_index_observable_smoke/chromaprint.pkl`
  - `chromaprint_progress.json` fresh evidence:
    - `status=building`
    - `refs_done=50`
    - `refs_total=8000`
    - `elapsed_sec=36.93`
    - `eta_sec=5871.909`
    - `hashes=19509`
    - `postings=45626`
  - `chromaprint.pkl` 当时文件大小:`574K`
- `python -m py_compile src/engines/chromaprint_matcher.py run_demo.py` 通过
结论:
- 现在 chromaprint 阶段不再是黑盒长跑,可以直接看到 refs 进度、ETA、hash/posting 规模,并在运行中拿到 partial cache。
- 后续新进程可直接用这套证据判断 build-index 是否真正推进,而不必盲等最终产物。
## 2026-06-02 chromaprint peak scan exact-safe optimization checkpoint
完成项:
......