Commit e45896b7 e45896b757062f9b8d7c1b55efa06e99a2d21236 by cnb.bofCdSsphPA

Make long CPU index builds resumable and root-path tolerant

Constraint: Real FMA smoke indexing can run for a long time on CPU, and synthetic/root-layout datasets must still use the same build-index entrypoint
Rejected: Treat build-index as all-or-nothing and require full reruns after interruption | wastes hours on CPU and obscures whether work was already completed
Confidence: high
Scope-risk: moderate
Directive: Preserve checkpoint file compatibility; future smoke-local automation should prefer resume before rebuilding from scratch
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/engines/ecapa_embedder.py acr-engine/src/engines/chromaprint_matcher.py acr-engine/run_demo.py; synthetic_v2 partial-checkpoint resume vs fresh rebuild equality check (shape/ids/embeddings/progress)
Not-tested: In-place resumption of the currently running real FMA process after an actual external kill/restart
1 parent 90e252b8
...@@ -39,13 +39,22 @@ def build_chroma_index(data_dir: Path, output_dir: Path): ...@@ -39,13 +39,22 @@ def build_chroma_index(data_dir: Path, output_dir: Path):
39 return matcher 39 return matcher
40 40
41 41
42 def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str): 42 def build_embedding_index(
43 data_dir: Path,
44 model_path: Path,
45 output_prefix: Path,
46 device: str,
47 checkpoint_every_refs: int = 250,
48 resume: bool = False,
49 ):
43 embedder = ECAPAEmbedder(model_path=str(model_path), device=device) 50 embedder = ECAPAEmbedder(model_path=str(model_path), device=device)
44 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' 51 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'
45 ref_embs, ref_ids = embedder.build_reference_index( 52 ref_embs, ref_ids = embedder.build_reference_index(
46 songs_dir=str(data_dir), 53 songs_dir=str(data_dir),
47 metadata_path=str(metadata_path), 54 metadata_path=str(metadata_path),
48 output_path=str(output_prefix), 55 output_path=str(output_prefix),
56 checkpoint_every_refs=checkpoint_every_refs,
57 resume=resume,
49 ) 58 )
50 print(f"[done] embedding index built: {len(ref_ids)} refs") 59 print(f"[done] embedding index built: {len(ref_ids)} refs")
51 return embedder, ref_embs, ref_ids 60 return embedder, ref_embs, ref_ids
...@@ -58,8 +67,18 @@ def cmd_build_index(args): ...@@ -58,8 +67,18 @@ def cmd_build_index(args):
58 67
59 print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}") 68 print(f"[build-index] starting chromaprint index: data={data_dir} output={out_dir}")
60 build_chroma_index(data_dir, out_dir) 69 build_chroma_index(data_dir, out_dir)
61 print(f"[build-index] starting embedding index: model={args.model} device={args.device}") 70 print(
62 build_embedding_index(data_dir, Path(args.model), out_dir / 'reference', args.device) 71 f"[build-index] starting embedding index: model={args.model} device={args.device} "
72 f"resume={args.resume} checkpoint_every_refs={args.checkpoint_every_refs}"
73 )
74 build_embedding_index(
75 data_dir,
76 Path(args.model),
77 out_dir / 'reference',
78 args.device,
79 checkpoint_every_refs=args.checkpoint_every_refs,
80 resume=args.resume,
81 )
63 82
64 83
65 def load_index(prefix: Path): 84 def load_index(prefix: Path):
...@@ -153,6 +172,8 @@ if __name__ == '__main__': ...@@ -153,6 +172,8 @@ if __name__ == '__main__':
153 p.add_argument('--model', required=True) 172 p.add_argument('--model', required=True)
154 p.add_argument('--output', default='data/index') 173 p.add_argument('--output', default='data/index')
155 p.add_argument('--device', default='cpu') 174 p.add_argument('--device', default='cpu')
175 p.add_argument('--checkpoint-every-refs', type=int, default=250)
176 p.add_argument('--resume', action='store_true')
156 p.set_defaults(func=cmd_build_index) 177 p.set_defaults(func=cmd_build_index)
157 178
158 p = sub.add_parser('recognize') 179 p = sub.add_parser('recognize')
......
...@@ -41,6 +41,13 @@ class ChromaprintMatcher: ...@@ -41,6 +41,13 @@ class ChromaprintMatcher:
41 self.min_peak_energy = min_peak_energy 41 self.min_peak_energy = min_peak_energy
42 self.hash_db: Dict[int, List[Fingerprint]] = defaultdict(list) 42 self.hash_db: Dict[int, List[Fingerprint]] = defaultdict(list)
43 43
44 def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path:
45 candidate = songs_dir / rel_path
46 if candidate.exists():
47 return candidate
48 candidate = songs_dir.parent / rel_path
49 return candidate
50
44 def _spectrogram(self, y: np.ndarray) -> np.ndarray: 51 def _spectrogram(self, y: np.ndarray) -> np.ndarray:
45 S = np.abs(librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)) 52 S = np.abs(librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length))
46 return S 53 return S
...@@ -84,7 +91,7 @@ class ChromaprintMatcher: ...@@ -84,7 +91,7 @@ class ChromaprintMatcher:
84 for item in meta: 91 for item in meta:
85 if item.get("type") != "reference": 92 if item.get("type") != "reference":
86 continue 93 continue
87 audio_path = songs_dir.parent / item["audio_path"] 94 audio_path = self._resolve_audio_path(songs_dir, item["audio_path"])
88 if not audio_path.exists(): 95 if not audio_path.exists():
89 continue 96 continue
90 song_id = item["song_id"] 97 song_id = item["song_id"]
......
...@@ -54,6 +54,13 @@ class ECAPAEmbedder: ...@@ -54,6 +54,13 @@ class ECAPAEmbedder:
54 y, _ = librosa.load(path, sr=self.sr, mono=True) 54 y, _ = librosa.load(path, sr=self.sr, mono=True)
55 return y 55 return y
56 56
57 def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path:
58 candidate = songs_dir / rel_path
59 if candidate.exists():
60 return candidate
61 candidate = songs_dir.parent / rel_path
62 return candidate
63
57 def _to_mel(self, y: np.ndarray) -> torch.Tensor: 64 def _to_mel(self, y: np.ndarray) -> torch.Tensor:
58 mel = librosa.feature.melspectrogram( 65 mel = librosa.feature.melspectrogram(
59 y=y, 66 y=y,
...@@ -95,6 +102,8 @@ class ECAPAEmbedder: ...@@ -95,6 +102,8 @@ class ECAPAEmbedder:
95 output_path: str, 102 output_path: str,
96 window_sec: float = 5.0, 103 window_sec: float = 5.0,
97 stride_sec: float = 2.5, 104 stride_sec: float = 2.5,
105 checkpoint_every_refs: int = 250,
106 resume: bool = False,
98 ) -> Tuple[np.ndarray, List[str]]: 107 ) -> Tuple[np.ndarray, List[str]]:
99 with open(metadata_path) as f: 108 with open(metadata_path) as f:
100 meta = json.load(f) 109 meta = json.load(f)
...@@ -105,13 +114,90 @@ class ECAPAEmbedder: ...@@ -105,13 +114,90 @@ class ECAPAEmbedder:
105 refs = [item for item in meta if item.get("type") == "reference"] 114 refs = [item for item in meta if item.get("type") == "reference"]
106 total_refs = len(refs) 115 total_refs = len(refs)
107 start_time = time.time() 116 start_time = time.time()
117 output_prefix = Path(output_path)
118 progress_path = output_prefix.parent / f"{output_prefix.name}_progress.json"
119 partial_embs_path = Path(f"{output_path}_embs.partial.npy")
120 partial_ids_path = Path(f"{output_path}_ids.partial.npy")
121 final_embs_path = Path(f"{output_path}_embs.npy")
122 final_ids_path = Path(f"{output_path}_ids.npy")
123 refs_done = 0
124
125 if resume and final_embs_path.exists() and final_ids_path.exists():
126 print(f"[build-reference-index] resume hit complete index: {final_embs_path} / {final_ids_path}")
127 final_embs = np.load(final_embs_path)
128 final_ids = np.load(final_ids_path, allow_pickle=True).tolist()
129 return final_embs, final_ids
130
131 if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists():
132 try:
133 progress = json.loads(progress_path.read_text())
134 refs_done = int(progress.get("refs_done", 0) or 0)
135 partial_embs = np.load(partial_embs_path)
136 partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist()
137 all_embs = [row for row in partial_embs]
138 all_ids = partial_ids
139 print(
140 f"[build-reference-index] resuming from checkpoint: refs_done={refs_done}/{total_refs} "
141 f"windows_done={len(all_ids)}"
142 )
143 except Exception as exc:
144 print(f"[build-reference-index] resume checkpoint ignored due to load failure: {exc}")
145 refs_done = 0
146 all_embs = []
147 all_ids = []
148
108 print( 149 print(
109 f"[build-reference-index] start: refs={total_refs} device={self.device.type} " 150 f"[build-reference-index] start: refs={total_refs} device={self.device.type} "
110 f"window_sec={window_sec} stride_sec={stride_sec}" 151 f"window_sec={window_sec} stride_sec={stride_sec} resume={resume} refs_done={refs_done}"
111 ) 152 )
112 153
113 for ref_idx, item in enumerate(refs, start=1): 154 def write_checkpoint(ref_idx: int):
114 audio_path = songs_dir.parent / item["audio_path"] 155 if not all_embs:
156 return
157 elapsed = max(time.time() - start_time, 1e-6)
158 refs_per_sec = ref_idx / elapsed
159 eta_sec = (total_refs - ref_idx) / refs_per_sec if refs_per_sec > 0 else 0.0
160 emb_array = np.vstack(all_embs)
161 np.save(partial_embs_path, emb_array)
162 np.save(partial_ids_path, np.array(all_ids))
163 progress_path.write_text(json.dumps({
164 "status": "building",
165 "refs_done": ref_idx,
166 "refs_total": total_refs,
167 "windows_done": len(all_ids),
168 "elapsed_sec": round(elapsed, 3),
169 "eta_sec": round(eta_sec, 3),
170 "device": self.device.type,
171 "window_sec": window_sec,
172 "stride_sec": stride_sec,
173 "partial_embs_path": str(partial_embs_path),
174 "partial_ids_path": str(partial_ids_path),
175 }, indent=2))
176
177 def write_complete(total_windows: int, emb_shape: tuple[int, ...]):
178 elapsed = max(time.time() - start_time, 1e-6)
179 progress_path.write_text(json.dumps({
180 "status": "complete",
181 "refs_done": total_refs,
182 "refs_total": total_refs,
183 "windows_done": total_windows,
184 "elapsed_sec": round(elapsed, 3),
185 "device": self.device.type,
186 "window_sec": window_sec,
187 "stride_sec": stride_sec,
188 "final_embs_path": str(final_embs_path),
189 "final_ids_path": str(final_ids_path),
190 "embedding_shape": list(emb_shape),
191 }, indent=2))
192
193 if refs_done > total_refs:
194 print(f"[build-reference-index] resume refs_done={refs_done} exceeds refs_total={total_refs}; restarting")
195 refs_done = 0
196 all_embs = []
197 all_ids = []
198
199 for ref_idx, item in enumerate(refs[refs_done:], start=refs_done + 1):
200 audio_path = self._resolve_audio_path(songs_dir, item["audio_path"])
115 if not audio_path.exists(): 201 if not audio_path.exists():
116 continue 202 continue
117 song_id = item["song_id"] 203 song_id = item["song_id"]
...@@ -131,10 +217,18 @@ class ECAPAEmbedder: ...@@ -131,10 +217,18 @@ class ECAPAEmbedder:
131 f"[build-reference-index] progress: refs={ref_idx}/{total_refs} " 217 f"[build-reference-index] progress: refs={ref_idx}/{total_refs} "
132 f"windows={len(all_ids)} elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}" 218 f"windows={len(all_ids)} elapsed_sec={elapsed:.1f} eta_sec={eta_sec:.1f}"
133 ) 219 )
220 if checkpoint_every_refs > 0 and (ref_idx % checkpoint_every_refs == 0 or ref_idx == total_refs):
221 write_checkpoint(ref_idx)
222
223 if not all_embs:
224 raise ValueError(
225 f"No reference embeddings were produced from metadata={metadata_path} songs_dir={songs_dir}"
226 )
134 227
135 all_embs = np.vstack(all_embs) 228 all_embs = np.vstack(all_embs)
136 np.save(f"{output_path}_embs.npy", all_embs) 229 np.save(final_embs_path, all_embs)
137 np.save(f"{output_path}_ids.npy", np.array(all_ids)) 230 np.save(final_ids_path, np.array(all_ids))
231 write_complete(len(all_ids), all_embs.shape)
138 print(f"Built reference index: {len(all_ids)} windows, embeddings shape {all_embs.shape}") 232 print(f"Built reference index: {len(all_ids)} windows, embeddings shape {all_embs.shape}")
139 return all_embs, all_ids 233 return all_embs, all_ids
140 234
......
...@@ -5441,3 +5441,44 @@ ...@@ -5441,3 +5441,44 @@
5441 - **建库侧**:固定滑窗 5441 - **建库侧**:固定滑窗
5442 - **开源集 query 生成侧**:`random / sliding / silence_aware / hybrid` 5442 - **开源集 query 生成侧**:`random / sliding / silence_aware / hybrid`
5443 - 下一阶段可继续叠加 beat/onset/chorus-aware 切片,而无需推翻现有流程 5443 - 下一阶段可继续叠加 beat/onset/chorus-aware 切片,而无需推翻现有流程
5444
5445 ### Stage: build-index checkpoint resume + path compatibility hardening
5446
5447 完成项:
5448 - 在 `acr-engine/src/engines/ecapa_embedder.py` 完成 embedding index 的 checkpoint / resume 逻辑
5449 - 支持读取 `reference_progress.json`
5450 - 支持复用 `reference_embs.partial.npy` / `reference_ids.partial.npy`
5451 - 若 final index 已存在,`--resume` 直接命中完成态
5452 - 在 `acr-engine/run_demo.py build-index` 暴露:
5453 - `--resume`
5454 - `--checkpoint-every-refs`
5455 - 修复 `synthetic` / 根目录型数据集的音频路径解析兼容问题:
5456 - `acr-engine/src/engines/ecapa_embedder.py`
5457 - `acr-engine/src/engines/chromaprint_matcher.py`
5458 - 为 “没有任何 reference 被成功解析” 的场景补充显式报错,避免 `np.vstack([])` 这类低可读错误
5459 - 在 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 补充 `build-index --resume` 用法
5460
5461 验证结果:
5462 - 代码编译验证:
5463 - `/usr/local/miniconda3/bin/python -m py_compile src/engines/ecapa_embedder.py src/engines/chromaprint_matcher.py run_demo.py`
5464 - 兼容性验证:
5465 - `run_demo.py build-index --data data/synthetic_v2 --model data/models_v6/best_model.pt --output /tmp/index_resume_fresh --device cpu`
5466 - synthetic 根目录型 `audio_path=songs/...` 已可正常建索引
5467 - resume 一致性验证:
5468 1. 用 `data/synthetic_v2/catalog.json` 的前 2 首 reference 生成 partial checkpoint
5469 2. 人工保留 `reference_embs.partial.npy / reference_ids.partial.npy + reference_progress.json`
5470 3. 执行:
5471 - `run_demo.py build-index ... --resume --checkpoint-every-refs 1`
5472 4. 与 fresh full rebuild 对比结果:
5473 - `resume_shape == fresh_shape == (120, 192)`
5474 - `ids_equal == True`
5475 - `embs_allclose == True`
5476 - `progress_status == complete`
5477 - `progress_refs_done == progress_refs_total == 24`
5478 - resume 日志证据:
5479 - `[build-reference-index] resuming from checkpoint: refs_done=2/24 windows_done=10`
5480
5481 结论:
5482 - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑
5483 - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 完全一致”的新鲜证据
5484 - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里
......
...@@ -72,6 +72,15 @@ flowchart LR ...@@ -72,6 +72,15 @@ flowchart LR
72 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests 72 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests
73 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run 73 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run
74 /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu 74 /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu
75
76 # 如果长时间 CPU 建索引被中断,可从 partial checkpoint 续跑
77 /usr/local/miniconda3/bin/python run_demo.py build-index \
78 --data data/external_ingested/fma/manifests \
79 --model data/models_fma_smoke/best_model.pt \
80 --output data/index_fma_smoke \
81 --device cpu \
82 --resume \
83 --checkpoint-every-refs 100
75 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json 84 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json
76 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local 85 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local
77 ``` 86 ```
......