Commit bc6d07af bc6d07afbd1e31d3956d20e35c20c424bc21ba99 by cnb.bofCdSsphPA

Expose chromaprint build progress so long index runs stop being black boxes

Constraint: the live 8000-reference FMA run is already in flight, so observability had to be added as forward-safe progress and partial-cache outputs for future runs instead of altering the active process
Rejected: keep waiting on blind build-index runs | hides whether chromaprint is advancing and blocks operational debugging
Confidence: high
Scope-risk: narrow
Directive: Prefer progress JSON plus partial cache evidence for future large-index investigations before assuming a stall
Tested: py_compile on chromaprint_matcher.py and run_demo.py; verified that chromaprint_progress.json and chromaprint.pkl appear in /tmp/chroma_index_observable_smoke at refs_done=50
Not-tested: the new observable build-index flow has not yet been run end-to-end to completion
1 parent ae1f4673
...@@ -27,13 +27,15 @@ def cmd_generate_data(args): ...@@ -27,13 +27,15 @@ def cmd_generate_data(args):
27 print(f"[done] dataset generated at {args.output}") 27 print(f"[done] dataset generated at {args.output}")
28 28
29 29
30 def build_chroma_index(data_dir: Path, output_dir: Path): 30 def build_chroma_index(data_dir: Path, output_dir: Path, checkpoint_every_refs: int = 0):
31 matcher = ChromaprintMatcher() 31 matcher = ChromaprintMatcher()
32 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' 32 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'
33 matcher.index_songs_from_dir( 33 matcher.index_songs_from_dir(
34 songs_dir=str(data_dir), 34 songs_dir=str(data_dir),
35 metadata_path=str(metadata_path), 35 metadata_path=str(metadata_path),
36 cache_path=str(output_dir / 'chromaprint.pkl'), 36 cache_path=str(output_dir / 'chromaprint.pkl'),
37 checkpoint_every_refs=checkpoint_every_refs,
38 progress_path=str(output_dir / 'chromaprint_progress.json'),
37 ) 39 )
38 print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") 40 print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}")
39 return matcher 41 return matcher
...@@ -66,7 +68,7 @@ def cmd_build_index(args): ...@@ -66,7 +68,7 @@ def cmd_build_index(args):
66 out_dir.mkdir(parents=True, exist_ok=True) 68 out_dir.mkdir(parents=True, exist_ok=True)
67 69
68 print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") 70 print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}")
69 build_chroma_index(data_dir, out_dir) 71 build_chroma_index(data_dir, out_dir, checkpoint_every_refs=args.chromaprint_checkpoint_every_refs)
70 print( 72 print(
71 f"[build-index] starting embedding index: model={args.model} device={args.device} " 73 f"[build-index] starting embedding index: model={args.model} device={args.device} "
72 f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}" 74 f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}"
...@@ -173,6 +175,7 @@ if __name__ == '__main__': ...@@ -173,6 +175,7 @@ if __name__ == '__main__':
173 p.add_argument('--output', default='data/index') 175 p.add_argument('--output', default='data/index')
174 p.add_argument('--device', default='cpu') 176 p.add_argument('--device', default='cpu')
175 p.add_argument('--checkpoint-every-refs', type=int, default=250) 177 p.add_argument('--checkpoint-every-refs', type=int, default=250)
178 p.add_argument('--chromaprint-checkpoint-every-refs', type=int, default=100)
176 p.add_argument('--resume', action='store_true') 179 p.add_argument('--resume', action='store_true')
177 p.set_defaults(func=cmd_build_index) 180 p.set_defaults(func=cmd_build_index)
178 181
......
...@@ -15,6 +15,7 @@ from typing import Dict, List, Tuple, Optional ...@@ -15,6 +15,7 @@ from typing import Dict, List, Tuple, Optional
15 import pickle 15 import pickle
16 import json 16 import json
17 from pathlib import Path 17 from pathlib import Path
18 import time
18 19
19 20
20 class Fingerprint: 21 class Fingerprint:
...@@ -88,24 +89,65 @@ class ChromaprintMatcher: ...@@ -88,24 +89,65 @@ class ChromaprintMatcher:
88 self.hash_db[h].append(Fingerprint(song_id, offset, h)) 89 self.hash_db[h].append(Fingerprint(song_id, offset, h))
89 90
90 def index_songs_from_dir( 91 def index_songs_from_dir(
91 self, songs_dir: str, metadata_path: str, cache_path: Optional[str] = None 92 self,
93 songs_dir: str,
94 metadata_path: str,
95 cache_path: Optional[str] = None,
96 checkpoint_every_refs: int = 0,
97 progress_path: Optional[str] = None,
92 ): 98 ):
93 with open(metadata_path) as f: 99 with open(metadata_path) as f:
94 meta = json.load(f) 100 meta = json.load(f)
95 101
96 songs_dir = Path(songs_dir) 102 songs_dir = Path(songs_dir)
97 for item in meta: 103 refs = [item for item in meta if item.get("type") == "reference"]
98 if item.get("type") != "reference": 104 total_refs = len(refs)
99 continue 105 start_time = time.time()
106
107 progress_file = Path(progress_path) if progress_path else None
108 cache_file = Path(cache_path) if cache_path else None
109
110 def write_progress(refs_done: int, status: str):
111 if progress_file is None:
112 return
113 elapsed = max(time.time() - start_time, 1e-6)
114 refs_per_sec = refs_done / elapsed if refs_done > 0 else 0.0
115 eta_sec = (total_refs - refs_done) / refs_per_sec if refs_per_sec > 0 else 0.0
116 progress_file.write_text(json.dumps({
117 "status": status,
118 "refs_done": refs_done,
119 "refs_total": total_refs,
120 "elapsed_sec": round(elapsed, 3),
121 "eta_sec": round(eta_sec, 3),
122 "hashes": self.num_hashes,
123 "postings": self.index_size,
124 "cache_path": str(cache_file) if cache_file else None,
125 }, indent=2))
126
127 for ref_idx, item in enumerate(refs, start=1):
100 audio_path = self._resolve_audio_path(songs_dir, item["audio_path"]) 128 audio_path = self._resolve_audio_path(songs_dir, item["audio_path"])
101 if not audio_path.exists(): 129 if not audio_path.exists():
102 continue 130 continue
103 song_id = item["song_id"] 131 song_id = item["song_id"]
104 y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) 132 y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True)
105 self.index_song(song_id, y) 133 self.index_song(song_id, y)
106 134 if ref_idx == 1 or ref_idx == total_refs or (checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0):
107 if cache_path: 135 elapsed = max(time.time() - start_time, 1e-6)
108 self.save(cache_path) 136 refs_per_sec = ref_idx / elapsed
137 eta_sec = (total_refs - ref_idx) / refs_per_sec if refs_per_sec > 0 else 0.0
138 print(
139 f"[chromaprint-index] progress: refs={ref_idx}/{total_refs} "
140 f"hashes={self.num_hashes} postings={self.index_size} "
141 f"elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}"
142 )
143 if checkpoint_every_refs > 0 and ref_idx % checkpoint_every_refs == 0:
144 if cache_file is not None:
145 self.save(str(cache_file))
146 write_progress(ref_idx, "building")
147
148 if cache_file is not None:
149 self.save(str(cache_file))
150 write_progress(total_refs, "complete")
109 151
110 def match(self, y: np.ndarray, top_k: int = 10) -> List[Tuple[str, float]]: 152 def match(self, y: np.ndarray, top_k: int = 10) -> List[Tuple[str, float]]:
111 S = self._spectrogram(y) 153 S = self._spectrogram(y)
......
1 ## 2026-06-02 chromaprint build-index observability checkpoint
2
3 完成项:
4 - `acr-engine/src/engines/chromaprint_matcher.py` 的 `index_songs_from_dir()` 增加 chromaprint 阶段 progress JSON 与周期性 partial cache 落盘。
5 - `acr-engine/run_demo.py build-index` 增加 `--chromaprint-checkpoint-every-refs`,让 chromaprint 阶段也能分段 checkpoint,而不是只能盲等最终 `chromaprint.pkl`。
6
7 验证结果:
8 - 小样本可观测 smoke:
9 - 命令:`run_demo.py build-index --data data/external_smoke/fma/manifests --output /tmp/chroma_index_observable_smoke --chromaprint-checkpoint-every-refs 10 ...`
10 - 已实际落盘:
11 - `/tmp/chroma_index_observable_smoke/chromaprint_progress.json`
12 - `/tmp/chroma_index_observable_smoke/chromaprint.pkl`
13 - `chromaprint_progress.json` fresh evidence:
14 - `status=building`
15 - `refs_done=50`
16 - `refs_total=8000`
17 - `elapsed_sec=36.93`
18 - `eta_sec=5871.909`
19 - `hashes=19509`
20 - `postings=45626`
21 - `chromaprint.pkl` 当时文件大小:`574K`
22 - `python -m py_compile src/engines/chromaprint_matcher.py run_demo.py` 通过
23
24 结论:
25 - 现在 chromaprint 阶段不再是黑盒长跑,可以直接看到 refs 进度、ETA、hash/posting 规模,并在运行中拿到 partial cache。
26 - 后续新进程可直接用这套证据判断 build-index 是否真正推进,而不必盲等最终产物。
27
1 ## 2026-06-02 chromaprint peak scan exact-safe optimization checkpoint 28 ## 2026-06-02 chromaprint peak scan exact-safe optimization checkpoint
2 29
3 完成项: 30 完成项:
......