Expose chromaprint build progress so long index runs stop being black boxes
Constraint: the live 8000-reference FMA run is already in flight, so observability had to be added as forward-safe progress and partial-cache outputs for future runs instead of altering the active process
Rejected: keep waiting on blind build-index runs | hides whether chromaprint is advancing and blocks operational debugging
Confidence: high
Scope-risk: narrow
Directive: Prefer progress JSON plus partial cache evidence for future large-index investigations before assuming a stall
Tested: py_compile on chromaprint_matcher.py and run_demo.py; verified chromaprint_progress.json and chromaprint.pkl appear in /tmp/chroma_index_observable_smoke at refs_done=50
Not-tested: the new observable build-index flow has not yet run to completion end-to-end
Showing 3 changed files with 81 additions and 9 deletions
| ... | @@ -27,13 +27,15 @@ def cmd_generate_data(args): | ... | @@ -27,13 +27,15 @@ def cmd_generate_data(args): |
| 27 | print(f"[done] dataset generated at {args.output}") | 27 | print(f"[done] dataset generated at {args.output}") |
| 28 | 28 | ||
| 29 | 29 | ||
| 30 | def build_chroma_index(data_dir: Path, output_dir: Path): | 30 | def build_chroma_index(data_dir: Path, output_dir: Path, checkpoint_every_refs: int = 0): |
| 31 | matcher = ChromaprintMatcher() | 31 | matcher = ChromaprintMatcher() |
| 32 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' | 32 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' |
| 33 | matcher.index_songs_from_dir( | 33 | matcher.index_songs_from_dir( |
| 34 | songs_dir=str(data_dir), | 34 | songs_dir=str(data_dir), |
| 35 | metadata_path=str(metadata_path), | 35 | metadata_path=str(metadata_path), |
| 36 | cache_path=str(output_dir / 'chromaprint.pkl'), | 36 | cache_path=str(output_dir / 'chromaprint.pkl'), |
| 37 | checkpoint_every_refs=checkpoint_every_refs, | ||
| 38 | progress_path=str(output_dir / 'chromaprint_progress.json'), | ||
| 37 | ) | 39 | ) |
| 38 | print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") | 40 | print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") |
| 39 | return matcher | 41 | return matcher |
| ... | @@ -66,7 +68,7 @@ def cmd_build_index(args): | ... | @@ -66,7 +68,7 @@ def cmd_build_index(args): |
| 66 | out_dir.mkdir(parents=True, exist_ok=True) | 68 | out_dir.mkdir(parents=True, exist_ok=True) |
| 67 | 69 | ||
| 68 | print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") | 70 | print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") |
| 69 | build_chroma_index(data_dir, out_dir) | 71 | build_chroma_index(data_dir, out_dir, checkpoint_every_refs=args.chromaprint_checkpoint_every_refs) |
| 70 | print( | 72 | print( |
| 71 | f"[build-index] starting embedding index: model={args.model} device={args.device} " | 73 | f"[build-index] starting embedding index: model={args.model} device={args.device} " |
| 72 | f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}" | 74 | f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}" |
| ... | @@ -173,6 +175,7 @@ if __name__ == '__main__': | ... | @@ -173,6 +175,7 @@ if __name__ == '__main__': |
| 173 | p.add_argument('--output', default='data/index') | 175 | p.add_argument('--output', default='data/index') |
| 174 | p.add_argument('--device', default='cpu') | 176 | p.add_argument('--device', default='cpu') |
| 175 | p.add_argument('--checkpoint-every-refs', type=int, default=250) | 177 | p.add_argument('--checkpoint-every-refs', type=int, default=250) |
| 178 | p.add_argument('--chromaprint-checkpoint-every-refs', type=int, default=100) | ||
| 176 | p.add_argument('--resume', action='store_true') | 179 | p.add_argument('--resume', action='store_true') |
| 177 | p.set_defaults(func=cmd_build_index) | 180 | p.set_defaults(func=cmd_build_index) |
| 178 | 181 | ... | ... |
| ... | @@ -15,6 +15,7 @@ from typing import Dict, List, Tuple, Optional | ... | @@ -15,6 +15,7 @@ from typing import Dict, List, Tuple, Optional |
| 15 | import pickle | 15 | import pickle |
| 16 | import json | 16 | import json |
| 17 | from pathlib import Path | 17 | from pathlib import Path |
| 18 | import time | ||
| 18 | 19 | ||
| 19 | 20 | ||
| 20 | class Fingerprint: | 21 | class Fingerprint: |
| ... | @@ -88,24 +89,65 @@ class ChromaprintMatcher: | ... | @@ -88,24 +89,65 @@ class ChromaprintMatcher: |
| 88 | self.hash_db[h].append(Fingerprint(song_id, offset, h)) | 89 | self.hash_db[h].append(Fingerprint(song_id, offset, h)) |
| 89 | 90 | ||
| 90 | def index_songs_from_dir( | 91 | def index_songs_from_dir( |
| 91 | self, songs_dir: str, metadata_path: str, cache_path: Optional[str] = None | 92 | self, |
| 93 | songs_dir: str, | ||
| 94 | metadata_path: str, | ||
| 95 | cache_path: Optional[str] = None, | ||
| 96 | checkpoint_every_refs: int = 0, | ||
| 97 | progress_path: Optional[str] = None, | ||
| 92 | ): | 98 | ): |
| 93 | with open(metadata_path) as f: | 99 | with open(metadata_path) as f: |
| 94 | meta = json.load(f) | 100 | meta = json.load(f) |
| 95 | 101 | ||
| 96 | songs_dir = Path(songs_dir) | 102 | songs_dir = Path(songs_dir) |
| 97 | for item in meta: | 103 | refs = [item for item in meta if item.get("type") == "reference"] |
| 98 | if item.get("type") != "reference": | 104 | total_refs = len(refs) |
| 99 | continue | 105 | start_time = time.time() |
| 106 | |||
| 107 | progress_file = Path(progress_path) if progress_path else None | ||
| 108 | cache_file = Path(cache_path) if cache_path else None | ||
| 109 | |||
| 110 | def write_progress(refs_done: int, status: str): | ||
| 111 | if progress_file is None: | ||
| 112 | return | ||
| 113 | elapsed = max(time.time() - start_time, 1e-6) | ||
| 114 | refs_per_sec = refs_done / elapsed if refs_done > 0 else 0.0 | ||
| 115 | eta_sec = (total_refs - refs_done) / refs_per_sec if refs_per_sec > 0 else 0.0 | ||
| 116 | progress_file.write_text(json.dumps({ | ||
| 117 | "status": status, | ||
| 118 | "refs_done": refs_done, | ||
| 119 | "refs_total": total_refs, | ||
| 120 | "elapsed_sec": round(elapsed, 3), | ||
| 121 | "eta_sec": round(eta_sec, 3), | ||
| 122 | "hashes": self.num_hashes, | ||
| 123 | "postings": self.index_size, | ||
| 124 | "cache_path": str(cache_file) if cache_file else None, | ||
| 125 | }, indent=2)) | ||
| 126 | |||
| 127 | for ref_idx, item in enumerate(refs, start=1): | ||
| 100 | audio_path = self._resolve_audio_path(songs_dir, item["audio_path"]) | 128 | audio_path = self._resolve_audio_path(songs_dir, item["audio_path"]) |
| 101 | if not audio_path.exists(): | 129 | if not audio_path.exists(): |
| 102 | continue | 130 | continue |
| 103 | song_id = item["song_id"] | 131 | song_id = item["song_id"] |
| 104 | y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) | 132 | y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) |
| 105 | self.index_song(song_id, y) | 133 | self.index_song(song_id, y) |
| 106 | 134 | if ref_idx == 1 or ref_idx == total_refs or (checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0): | |
| 107 | if cache_path: | 135 | elapsed = max(time.time() - start_time, 1e-6) |
| 108 | self.save(cache_path) | 136 | refs_per_sec = ref_idx / elapsed |
| 137 | eta_sec = (total_refs - ref_idx) / refs_per_sec if refs_per_sec > 0 else 0.0 | ||
| 138 | print( | ||
| 139 | f"[chromaprint-index] progress: refs={ref_idx}/{total_refs} " | ||
| 140 | f"hashes={self.num_hashes} postings={self.index_size} " | ||
| 141 | f"elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}" | ||
| 142 | ) | ||
| 143 | if checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0: | ||
| 144 | if cache_file is not None: | ||
| 145 | self.save(str(cache_file)) | ||
| 146 | write_progress(ref_idx, "building") | ||
| 147 | |||
| 148 | if cache_file is not None: | ||
| 149 | self.save(str(cache_file)) | ||
| 150 | write_progress(total_refs, "complete") | ||
| 109 | 151 | ||
| 110 | def match(self, y: np.ndarray, top_k: int = 10) -> List[Tuple[str, float]]: | 152 | def match(self, y: np.ndarray, top_k: int = 10) -> List[Tuple[str, float]]: |
| 111 | S = self._spectrogram(y) | 153 | S = self._spectrogram(y) | ... | ... |
| 1 | ## 2026-06-02 chromaprint build-index observability checkpoint | ||
| 2 | |||
| 3 | 完成项: | ||
| 4 | - 为 `acr-engine/src/engines/chromaprint_matcher.py` 的 `index_songs_from_dir()` 增加 chromaprint 阶段 progress JSON 与周期性 partial cache 落盘。 | ||
| 5 | - 为 `acr-engine/run_demo.py build-index` 增加 `--chromaprint-checkpoint-every-refs`,让 chromaprint 阶段也能分段 checkpoint,而不是只能盲等最终 `chromaprint.pkl`。 | ||
| 6 | |||
| 7 | 验证结果: | ||
| 8 | - 小样本可观测 smoke: | ||
| 9 | - 命令:`run_demo.py build-index --data data/external_smoke/fma/manifests --output /tmp/chroma_index_observable_smoke --chromaprint-checkpoint-every-refs 10 ...` | ||
| 10 | - 已实际落盘: | ||
| 11 | - `/tmp/chroma_index_observable_smoke/chromaprint_progress.json` | ||
| 12 | - `/tmp/chroma_index_observable_smoke/chromaprint.pkl` | ||
| 13 | - `chromaprint_progress.json` fresh evidence: | ||
| 14 | - `status=building` | ||
| 15 | - `refs_done=50` | ||
| 16 | - `refs_total=8000` | ||
| 17 | - `elapsed_sec=36.93` | ||
| 18 | - `eta_sec=5871.909` | ||
| 19 | - `hashes=19509` | ||
| 20 | - `postings=45626` | ||
| 21 | - `chromaprint.pkl` 当时文件大小:`574K` | ||
| 22 | - `python -m py_compile src/engines/chromaprint_matcher.py run_demo.py` 通过 | ||
| 23 | |||
| 24 | 结论: | ||
| 25 | - 现在 chromaprint 阶段不再是黑盒长跑,可以直接看到 refs 进度、ETA、hash/posting 规模,并在运行中拿到 partial cache。 | ||
| 26 | - 后续新进程可直接用这套证据判断 build-index 是否真正推进,而不必盲等最终产物。 | ||
| 27 | |||
| 1 | ## 2026-06-02 chromaprint peak scan exact-safe optimization checkpoint | 28 | ## 2026-06-02 chromaprint peak scan exact-safe optimization checkpoint |
| 2 | 29 | ||
| 3 | 完成项: | 30 | 完成项: | ... | ... |
-
Please register or sign in to post a comment