Make long CPU index builds resumable and root-path tolerant
Constraint: Real FMA smoke indexing can run for a long time on CPU, and synthetic/root-layout datasets must still use the same build-index entrypoint
Rejected: Treat build-index as all-or-nothing and require full reruns after interruption | wastes hours on CPU and obscures whether work was already completed
Confidence: high
Scope-risk: moderate
Directive: Preserve checkpoint file compatibility; future smoke-local automation should prefer resume before rebuilding from scratch
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/engines/ecapa_embedder.py acr-engine/src/engines/chromaprint_matcher.py acr-engine/run_demo.py; synthetic_v2 partial-checkpoint resume vs fresh rebuild equality check (shape/ids/embeddings/progress)
Not-tested: In-place resumption of the currently running real FMA process after an actual external kill/restart
Showing 5 changed files with 181 additions and 9 deletions
| ... | @@ -39,13 +39,22 @@ def build_chroma_index(data_dir: Path, output_dir: Path): | ... | @@ -39,13 +39,22 @@ def build_chroma_index(data_dir: Path, output_dir: Path): |
| 39 | return matcher | 39 | return matcher |
| 40 | 40 | ||
| 41 | 41 | ||
| 42 | def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str): | 42 | def build_embedding_index( |
| 43 | data_dir: Path, | ||
| 44 | model_path: Path, | ||
| 45 | output_prefix: Path, | ||
| 46 | device: str, | ||
| 47 | checkpoint_every_refs: int = 250, | ||
| 48 | resume: bool = False, | ||
| 49 | ): | ||
| 43 | embedder = ECAPAEmbedder(model_path=str(model_path), device=device) | 50 | embedder = ECAPAEmbedder(model_path=str(model_path), device=device) |
| 44 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' | 51 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' |
| 45 | ref_embs, ref_ids = embedder.build_reference_index( | 52 | ref_embs, ref_ids = embedder.build_reference_index( |
| 46 | songs_dir=str(data_dir), | 53 | songs_dir=str(data_dir), |
| 47 | metadata_path=str(metadata_path), | 54 | metadata_path=str(metadata_path), |
| 48 | output_path=str(output_prefix), | 55 | output_path=str(output_prefix), |
| 56 | checkpoint_every_refs=checkpoint_every_refs, | ||
| 57 | resume=resume, | ||
| 49 | ) | 58 | ) |
| 50 | print(f"[done] embedding index built: {len(ref_ids)} refs") | 59 | print(f"[done] embedding index built: {len(ref_ids)} refs") |
| 51 | return embedder, ref_embs, ref_ids | 60 | return embedder, ref_embs, ref_ids |
| ... | @@ -58,8 +67,18 @@ def cmd_build_index(args): | ... | @@ -58,8 +67,18 @@ def cmd_build_index(args): |
| 58 | 67 | ||
| 59 | print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") | 68 | print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") |
| 60 | build_chroma_index(data_dir, out_dir) | 69 | build_chroma_index(data_dir, out_dir) |
| 61 | print(f"[build-index] starting embedding index: model={args.model} device={args.device}") | 70 | print( |
| 62 | build_embedding_index(data_dir, Path(args.model), out_dir / 'reference', args.device) | 71 | f"[build-index] starting embedding index: model={args.model} device={args.device} " |
| 72 | f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}" | ||
| 73 | ) | ||
| 74 | build_embedding_index( | ||
| 75 | data_dir, | ||
| 76 | Path(args.model), | ||
| 77 | out_dir / 'reference', | ||
| 78 | args.device, | ||
| 79 | checkpoint_every_refs=args.checkpoint_every_refs, | ||
| 80 | resume=args.resume, | ||
| 81 | ) | ||
| 63 | 82 | ||
| 64 | 83 | ||
| 65 | def load_index(prefix: Path): | 84 | def load_index(prefix: Path): |
| ... | @@ -153,6 +172,8 @@ if __name__ == '__main__': | ... | @@ -153,6 +172,8 @@ if __name__ == '__main__': |
| 153 | p.add_argument('--model', required=True) | 172 | p.add_argument('--model', required=True) |
| 154 | p.add_argument('--output', default='data/index') | 173 | p.add_argument('--output', default='data/index') |
| 155 | p.add_argument('--device', default='cpu') | 174 | p.add_argument('--device', default='cpu') |
| 175 | p.add_argument('--checkpoint-every-refs', type=int, default=250) | ||
| 176 | p.add_argument('--resume', action='store_true') | ||
| 156 | p.set_defaults(func=cmd_build_index) | 177 | p.set_defaults(func=cmd_build_index) |
| 157 | 178 | ||
| 158 | p = sub.add_parser('recognize') | 179 | p = sub.add_parser('recognize') | ... | ... |
| ... | @@ -41,6 +41,13 @@ class ChromaprintMatcher: | ... | @@ -41,6 +41,13 @@ class ChromaprintMatcher: |
| 41 | self.min_peak_energy = min_peak_energy | 41 | self.min_peak_energy = min_peak_energy |
| 42 | self.hash_db: Dict[int, List[Fingerprint]] = defaultdict(list) | 42 | self.hash_db: Dict[int, List[Fingerprint]] = defaultdict(list) |
| 43 | 43 | ||
| 44 | def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path: | ||
| 45 | candidate = songs_dir / rel_path | ||
| 46 | if candidate.exists(): | ||
| 47 | return candidate | ||
| 48 | candidate = songs_dir.parent / rel_path | ||
| 49 | return candidate | ||
| 50 | |||
| 44 | def _spectrogram(self, y: np.ndarray) -> np.ndarray: | 51 | def _spectrogram(self, y: np.ndarray) -> np.ndarray: |
| 45 | S = np.abs(librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)) | 52 | S = np.abs(librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)) |
| 46 | return S | 53 | return S |
| ... | @@ -84,7 +91,7 @@ class ChromaprintMatcher: | ... | @@ -84,7 +91,7 @@ class ChromaprintMatcher: |
| 84 | for item in meta: | 91 | for item in meta: |
| 85 | if item.get("type") != "reference": | 92 | if item.get("type") != "reference": |
| 86 | continue | 93 | continue |
| 87 | audio_path = songs_dir.parent / item["audio_path"] | 94 | audio_path = self._resolve_audio_path(songs_dir, item["audio_path"]) |
| 88 | if not audio_path.exists(): | 95 | if not audio_path.exists(): |
| 89 | continue | 96 | continue |
| 90 | song_id = item["song_id"] | 97 | song_id = item["song_id"] | ... | ... |
| ... | @@ -54,6 +54,13 @@ class ECAPAEmbedder: | ... | @@ -54,6 +54,13 @@ class ECAPAEmbedder: |
| 54 | y, _ = librosa.load(path, sr=self.sr, mono=True) | 54 | y, _ = librosa.load(path, sr=self.sr, mono=True) |
| 55 | return y | 55 | return y |
| 56 | 56 | ||
| 57 | def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path: | ||
| 58 | candidate = songs_dir / rel_path | ||
| 59 | if candidate.exists(): | ||
| 60 | return candidate | ||
| 61 | candidate = songs_dir.parent / rel_path | ||
| 62 | return candidate | ||
| 63 | |||
| 57 | def _to_mel(self, y: np.ndarray) -> torch.Tensor: | 64 | def _to_mel(self, y: np.ndarray) -> torch.Tensor: |
| 58 | mel = librosa.feature.melspectrogram( | 65 | mel = librosa.feature.melspectrogram( |
| 59 | y=y, | 66 | y=y, |
| ... | @@ -95,6 +102,8 @@ class ECAPAEmbedder: | ... | @@ -95,6 +102,8 @@ class ECAPAEmbedder: |
| 95 | output_path: str, | 102 | output_path: str, |
| 96 | window_sec: float = 5.0, | 103 | window_sec: float = 5.0, |
| 97 | stride_sec: float = 2.5, | 104 | stride_sec: float = 2.5, |
| 105 | checkpoint_every_refs: int = 250, | ||
| 106 | resume: bool = False, | ||
| 98 | ) -> Tuple[np.ndarray, List[str]]: | 107 | ) -> Tuple[np.ndarray, List[str]]: |
| 99 | with open(metadata_path) as f: | 108 | with open(metadata_path) as f: |
| 100 | meta = json.load(f) | 109 | meta = json.load(f) |
| ... | @@ -105,13 +114,90 @@ class ECAPAEmbedder: | ... | @@ -105,13 +114,90 @@ class ECAPAEmbedder: |
| 105 | refs = [item for item in meta if item.get("type") == "reference"] | 114 | refs = [item for item in meta if item.get("type") == "reference"] |
| 106 | total_refs = len(refs) | 115 | total_refs = len(refs) |
| 107 | start_time = time.time() | 116 | start_time = time.time() |
| 117 | output_prefix = Path(output_path) | ||
| 118 | progress_path = output_prefix.parent / f"{output_prefix.name}_progress.json" | ||
| 119 | partial_embs_path = Path(f"{output_path}_embs.partial.npy") | ||
| 120 | partial_ids_path = Path(f"{output_path}_ids.partial.npy") | ||
| 121 | final_embs_path = Path(f"{output_path}_embs.npy") | ||
| 122 | final_ids_path = Path(f"{output_path}_ids.npy") | ||
| 123 | refs_done = 0 | ||
| 124 | |||
| 125 | if resume and final_embs_path.exists() and final_ids_path.exists(): | ||
| 126 | print(f"[build-reference-index] resume hit complete index: {final_embs_path} / {final_ids_path}") | ||
| 127 | final_embs = np.load(final_embs_path) | ||
| 128 | final_ids = np.load(final_ids_path, allow_pickle=True).tolist() | ||
| 129 | return final_embs, final_ids | ||
| 130 | |||
| 131 | if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists(): | ||
| 132 | try: | ||
| 133 | progress = json.loads(progress_path.read_text()) | ||
| 134 | refs_done = int(progress.get("refs_done", 0) or 0) | ||
| 135 | partial_embs = np.load(partial_embs_path) | ||
| 136 | partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist() | ||
| 137 | all_embs = [row for row in partial_embs] | ||
| 138 | all_ids = partial_ids | ||
| 139 | print( | ||
| 140 | f"[build-reference-index] resuming from checkpoint: refs_done={refs_done}/{total_refs} " | ||
| 141 | f"windows_done={len(all_ids)}" | ||
| 142 | ) | ||
| 143 | except Exception as exc: | ||
| 144 | print(f"[build-reference-index] resume checkpoint ignored due to load failure: {exc}") | ||
| 145 | refs_done = 0 | ||
| 146 | all_embs = [] | ||
| 147 | all_ids = [] | ||
| 148 | |||
| 108 | print( | 149 | print( |
| 109 | f"[build-reference-index] start: refs={total_refs} device={self.device.type} " | 150 | f"[build-reference-index] start: refs={total_refs} device={self.device.type} " |
| 110 | f"window_sec={window_sec} stride_sec={stride_sec}" | 151 | f"window_sec={window_sec} stride_sec={stride_sec} resume={resume} refs_done={refs_done}" |
| 111 | ) | 152 | ) |
| 112 | 153 | ||
| 113 | for ref_idx, item in enumerate(refs, start=1): | 154 | def write_checkpoint(ref_idx: int): |
| 114 | audio_path = songs_dir.parent / item["audio_path"] | 155 | if not all_embs: |
| 156 | return | ||
| 157 | elapsed = max(time.time() - start_time, 1e-6) | ||
| 158 | refs_per_sec = ref_idx / elapsed | ||
| 159 | eta_sec = (total_refs - ref_idx) / refs_per_sec if refs_per_sec > 0 else 0.0 | ||
| 160 | emb_array = np.vstack(all_embs) | ||
| 161 | np.save(partial_embs_path, emb_array) | ||
| 162 | np.save(partial_ids_path, np.array(all_ids)) | ||
| 163 | progress_path.write_text(json.dumps({ | ||
| 164 | "status": "building", | ||
| 165 | "refs_done": ref_idx, | ||
| 166 | "refs_total": total_refs, | ||
| 167 | "windows_done": len(all_ids), | ||
| 168 | "elapsed_sec": round(elapsed, 3), | ||
| 169 | "eta_sec": round(eta_sec, 3), | ||
| 170 | "device": self.device.type, | ||
| 171 | "window_sec": window_sec, | ||
| 172 | "stride_sec": stride_sec, | ||
| 173 | "partial_embs_path": str(partial_embs_path), | ||
| 174 | "partial_ids_path": str(partial_ids_path), | ||
| 175 | }, indent=2)) | ||
| 176 | |||
| 177 | def write_complete(total_windows: int, emb_shape: tuple[int, ...]): | ||
| 178 | elapsed = max(time.time() - start_time, 1e-6) | ||
| 179 | progress_path.write_text(json.dumps({ | ||
| 180 | "status": "complete", | ||
| 181 | "refs_done": total_refs, | ||
| 182 | "refs_total": total_refs, | ||
| 183 | "windows_done": total_windows, | ||
| 184 | "elapsed_sec": round(elapsed, 3), | ||
| 185 | "device": self.device.type, | ||
| 186 | "window_sec": window_sec, | ||
| 187 | "stride_sec": stride_sec, | ||
| 188 | "final_embs_path": str(final_embs_path), | ||
| 189 | "final_ids_path": str(final_ids_path), | ||
| 190 | "embedding_shape": list(emb_shape), | ||
| 191 | }, indent=2)) | ||
| 192 | |||
| 193 | if refs_done > total_refs: | ||
| 194 | print(f"[build-reference-index] resume refs_done={refs_done} exceeds refs_total={total_refs}; restarting") | ||
| 195 | refs_done = 0 | ||
| 196 | all_embs = [] | ||
| 197 | all_ids = [] | ||
| 198 | |||
| 199 | for ref_idx, item in enumerate(refs[refs_done:], start=refs_done + 1): | ||
| 200 | audio_path = self._resolve_audio_path(songs_dir, item["audio_path"]) | ||
| 115 | if not audio_path.exists(): | 201 | if not audio_path.exists(): |
| 116 | continue | 202 | continue |
| 117 | song_id = item["song_id"] | 203 | song_id = item["song_id"] |
| ... | @@ -131,10 +217,18 @@ class ECAPAEmbedder: | ... | @@ -131,10 +217,18 @@ class ECAPAEmbedder: |
| 131 | f"[build-reference-index] progress: refs={ref_idx}/{total_refs} " | 217 | f"[build-reference-index] progress: refs={ref_idx}/{total_refs} " |
| 132 | f"windows={len(all_ids)} elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}" | 218 | f"windows={len(all_ids)} elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}" |
| 133 | ) | 219 | ) |
| 220 | if checkpoint_every_refs > 0 and (ref_idx % checkpoint_every_refs == 0 or ref_idx == total_refs): | ||
| 221 | write_checkpoint(ref_idx) | ||
| 222 | |||
| 223 | if not all_embs: | ||
| 224 | raise ValueError( | ||
| 225 | f"No reference embeddings were produced from metadata={metadata_path} songs_dir={songs_dir}" | ||
| 226 | ) | ||
| 134 | 227 | ||
| 135 | all_embs = np.vstack(all_embs) | 228 | all_embs = np.vstack(all_embs) |
| 136 | np.save(f"{output_path}_embs.npy", all_embs) | 229 | np.save(final_embs_path, all_embs) |
| 137 | np.save(f"{output_path}_ids.npy", np.array(all_ids)) | 230 | np.save(final_ids_path, np.array(all_ids)) |
| 231 | write_complete(len(all_ids), all_embs.shape) | ||
| 138 | print(f"Built reference index: {len(all_ids)} windows, embeddings shape {all_embs.shape}") | 232 | print(f"Built reference index: {len(all_ids)} windows, embeddings shape {all_embs.shape}") |
| 139 | return all_embs, all_ids | 233 | return all_embs, all_ids |
| 140 | 234 | ... | ... |
| ... | @@ -5441,3 +5441,44 @@ | ... | @@ -5441,3 +5441,44 @@ |
| 5441 | - **建库侧**:固定滑窗 | 5441 | - **建库侧**:固定滑窗 |
| 5442 | - **开源集 query 生成侧**:`random / sliding / silence_aware / hybrid` | 5442 | - **开源集 query 生成侧**:`random / sliding / silence_aware / hybrid` |
| 5443 | - 下一阶段可继续叠加 beat/onset/chorus-aware 切片,而无需推翻现有流程 | 5443 | - 下一阶段可继续叠加 beat/onset/chorus-aware 切片,而无需推翻现有流程 |
| 5444 | |||
| 5445 | ### Stage: build-index checkpoint resume + path compatibility hardening | ||
| 5446 | |||
| 5447 | 完成项: | ||
| 5448 | - 在 `acr-engine/src/engines/ecapa_embedder.py` 完成 embedding index 的 checkpoint / resume 逻辑 | ||
| 5449 | - 支持读取 `reference_progress.json` | ||
| 5450 | - 支持复用 `reference_embs.partial.npy` / `reference_ids.partial.npy` | ||
| 5451 | - 若 final index 已存在,`--resume` 直接命中完成态 | ||
| 5452 | - 在 `acr-engine/run_demo.py build-index` 暴露: | ||
| 5453 | - `--resume` | ||
| 5454 | - `--checkpoint-every-refs` | ||
| 5455 | - 修复 `synthetic` / 根目录型数据集的音频路径解析兼容问题: | ||
| 5456 | - `acr-engine/src/engines/ecapa_embedder.py` | ||
| 5457 | - `acr-engine/src/engines/chromaprint_matcher.py` | ||
| 5458 | - 为 “没有任何 reference 被成功解析” 的场景补充显式报错,避免 `np.vstack([])` 这类低可读错误 | ||
| 5459 | - 在 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 补充 `build-index --resume` 用法 | ||
| 5460 | |||
| 5461 | 验证结果: | ||
| 5462 | - 代码编译验证: | ||
| 5463 | - `/usr/local/miniconda3/bin/python -m py_compile src/engines/ecapa_embedder.py src/engines/chromaprint_matcher.py run_demo.py` | ||
| 5464 | - 兼容性验证: | ||
| 5465 | - `run_demo.py build-index --data data/synthetic_v2 --model data/models_v6/best_model.pt --output /tmp/index_resume_fresh --device cpu` | ||
| 5466 | - synthetic 根目录型 `audio_path=songs/...` 已可正常建索引 | ||
| 5467 | - resume 一致性验证: | ||
| 5468 | 1. 用 `data/synthetic_v2/catalog.json` 的前 2 首 reference 生成 partial checkpoint | ||
| 5469 | 2. 人工保留 `reference_embs.partial.npy / reference_ids.partial.npy + reference_progress.json` | ||
| 5470 | 3. 执行: | ||
| 5471 | - `run_demo.py build-index ... --resume --checkpoint-every-refs 1` | ||
| 5472 | 4. 与 fresh full rebuild 对比结果: | ||
| 5473 | - `resume_shape == fresh_shape == (120, 192)` | ||
| 5474 | - `ids_equal == True` | ||
| 5475 | - `embs_allclose == True` | ||
| 5476 | - `progress_status == complete` | ||
| 5477 | - `progress_refs_done == progress_refs_total == 24` | ||
| 5478 | - resume 日志证据: | ||
| 5479 | - `[build-reference-index] resuming from checkpoint: refs_done=2/24 windows_done=10` | ||
| 5480 | |||
| 5481 | 结论: | ||
| 5482 | - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑 | ||
| 5483 | - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 一致(shape 与 ids 完全相同,embeddings 满足 `allclose`)”的新鲜证据 | ||
| 5484 | - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里 | ... | ... |
| ... | @@ -72,6 +72,15 @@ flowchart LR | ... | @@ -72,6 +72,15 @@ flowchart LR |
| 72 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests | 72 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests |
| 73 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run | 73 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run |
| 74 | /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu | 74 | /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu |
| 75 | |||
| 76 | # 如果长时间 CPU 建索引被中断,可从 partial checkpoint 续跑 | ||
| 77 | /usr/local/miniconda3/bin/python run_demo.py build-index \ | ||
| 78 | --data data/external_ingested/fma/manifests \ | ||
| 79 | --model data/models_fma_smoke/best_model.pt \ | ||
| 80 | --output data/index_fma_smoke \ | ||
| 81 | --device cpu \ | ||
| 82 | --resume \ | ||
| 83 | --checkpoint-every-refs 100 | ||
| 75 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json | 84 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json |
| 76 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local | 85 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local |
| 77 | ``` | 86 | ``` | ... | ... |
-
Please register or sign in to post a comment