Resume smoke indexing safely without mixing model generations
Constraint: smoke-local must recover long CPU index builds automatically, but partial embeddings from an older model must never contaminate a newly trained index
Rejected: Always reuse any existing partial checkpoint | can silently blend embeddings from different model generations into one index
Confidence: high
Scope-risk: moderate
Directive: Keep model-signature checks on all future index resume paths; auto-resume should fall back to clean rebuild on any signature mismatch
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/engines/ecapa_embedder.py acr-engine/src/data/external_adapters.py acr-engine/run_demo.py; same-model partial checkpoint resume vs fresh rebuild equality; mismatched-model checkpoint rejection and clean rebuild equality
Not-tested: Reattaching the currently running real FMA smoke process after an external interruption
Showing 4 changed files with 75 additions and 0 deletions
| ... | @@ -372,6 +372,7 @@ def smoke_local_dataset( | ... | @@ -372,6 +372,7 @@ def smoke_local_dataset( |
| 372 | query_strategy: str, | 372 | query_strategy: str, |
| 373 | segment_strategy: str, | 373 | segment_strategy: str, |
| 374 | silence_top_db: int, | 374 | silence_top_db: int, |
| 375 | index_checkpoint_every_refs: int, | ||
| 375 | seed: int, | 376 | seed: int, |
| 376 | train_epochs: int, | 377 | train_epochs: int, |
| 377 | batch_size: int, | 378 | batch_size: int, |
| ... | @@ -432,6 +433,8 @@ def smoke_local_dataset( | ... | @@ -432,6 +433,8 @@ def smoke_local_dataset( |
| 432 | "--model", str(model_dir / "best_model.pt"), | 433 | "--model", str(model_dir / "best_model.pt"), |
| 433 | "--output", str(index_dir), | 434 | "--output", str(index_dir), |
| 434 | "--device", resolved_device, | 435 | "--device", resolved_device, |
| 436 | "--resume", | ||
| 437 | "--checkpoint-every-refs", str(index_checkpoint_every_refs), | ||
| 435 | ], check=True) | 438 | ], check=True) |
| 436 | 439 | ||
| 437 | report_dir.mkdir(parents=True, exist_ok=True) | 440 | report_dir.mkdir(parents=True, exist_ok=True) |
| ... | @@ -461,6 +464,8 @@ def smoke_local_dataset( | ... | @@ -461,6 +464,8 @@ def smoke_local_dataset( |
| 461 | config["data"]["manifest_query_stride"] = query_stride | 464 | config["data"]["manifest_query_stride"] = query_stride |
| 462 | config["data"]["manifest_query_strategy"] = query_strategy | 465 | config["data"]["manifest_query_strategy"] = query_strategy |
| 463 | config["data"]["silence_top_db"] = silence_top_db | 466 | config["data"]["silence_top_db"] = silence_top_db |
| 467 | config["run"]["index_checkpoint_every_refs"] = index_checkpoint_every_refs | ||
| 468 | config["run"]["index_resume_enabled"] = True | ||
| 464 | config["run"]["train_segment_strategy"] = segment_strategy | 469 | config["run"]["train_segment_strategy"] = segment_strategy |
| 465 | report_dir.mkdir(parents=True, exist_ok=True) | 470 | report_dir.mkdir(parents=True, exist_ok=True) |
| 466 | config_path.write_text(json.dumps(config, indent=2)) | 471 | config_path.write_text(json.dumps(config, indent=2)) |
| ... | @@ -546,6 +551,7 @@ def main(): | ... | @@ -546,6 +551,7 @@ def main(): |
| 546 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | 551 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") |
| 547 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") | 552 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") |
| 548 | p.add_argument("--silence-top-db", type=int, default=30) | 553 | p.add_argument("--silence-top-db", type=int, default=30) |
| 554 | p.add_argument("--index-checkpoint-every-refs", type=int, default=100) | ||
| 549 | p.add_argument("--seed", type=int, default=42) | 555 | p.add_argument("--seed", type=int, default=42) |
| 550 | p.add_argument("--train-epochs", type=int, default=1) | 556 | p.add_argument("--train-epochs", type=int, default=1) |
| 551 | p.add_argument("--batch-size", type=int, default=2) | 557 | p.add_argument("--batch-size", type=int, default=2) |
| ... | @@ -605,6 +611,7 @@ def main(): | ... | @@ -605,6 +611,7 @@ def main(): |
| 605 | query_strategy=args.query_strategy, | 611 | query_strategy=args.query_strategy, |
| 606 | segment_strategy=args.segment_strategy, | 612 | segment_strategy=args.segment_strategy, |
| 607 | silence_top_db=args.silence_top_db, | 613 | silence_top_db=args.silence_top_db, |
| 614 | index_checkpoint_every_refs=args.index_checkpoint_every_refs, | ||
| 608 | seed=args.seed, | 615 | seed=args.seed, |
| 609 | train_epochs=args.train_epochs, | 616 | train_epochs=args.train_epochs, |
| 610 | batch_size=args.batch_size, | 617 | batch_size=args.batch_size, | ... | ... |
| ... | @@ -19,10 +19,12 @@ class ECAPAEmbedder: | ... | @@ -19,10 +19,12 @@ class ECAPAEmbedder: |
| 19 | hop_length: int = 160, | 19 | hop_length: int = 160, |
| 20 | ): | 20 | ): |
| 21 | self.device = torch.device(device) | 21 | self.device = torch.device(device) |
| 22 | self.model_path = Path(model_path) | ||
| 22 | self.sr = sr | 23 | self.sr = sr |
| 23 | self.n_mels = n_mels | 24 | self.n_mels = n_mels |
| 24 | self.n_fft = n_fft | 25 | self.n_fft = n_fft |
| 25 | self.hop_length = hop_length | 26 | self.hop_length = hop_length |
| 27 | self.model_signature = self._build_model_signature(self.model_path) | ||
| 26 | 28 | ||
| 27 | from src.models.ecapa_tdnn import ECAPA_ACR | 29 | from src.models.ecapa_tdnn import ECAPA_ACR |
| 28 | 30 | ||
| ... | @@ -54,6 +56,14 @@ class ECAPAEmbedder: | ... | @@ -54,6 +56,14 @@ class ECAPAEmbedder: |
| 54 | y, _ = librosa.load(path, sr=self.sr, mono=True) | 56 | y, _ = librosa.load(path, sr=self.sr, mono=True) |
| 55 | return y | 57 | return y |
| 56 | 58 | ||
| 59 | def _build_model_signature(self, model_path: Path) -> dict: | ||
| 60 | stat = model_path.stat() | ||
| 61 | return { | ||
| 62 | "path": str(model_path), | ||
| 63 | "size_bytes": int(stat.st_size), | ||
| 64 | "mtime_ns": int(stat.st_mtime_ns), | ||
| 65 | } | ||
| 66 | |||
| 57 | def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path: | 67 | def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path: |
| 58 | candidate = songs_dir / rel_path | 68 | candidate = songs_dir / rel_path |
| 59 | if candidate.exists(): | 69 | if candidate.exists(): |
| ... | @@ -131,6 +141,11 @@ class ECAPAEmbedder: | ... | @@ -131,6 +141,11 @@ class ECAPAEmbedder: |
| 131 | if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists(): | 141 | if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists(): |
| 132 | try: | 142 | try: |
| 133 | progress = json.loads(progress_path.read_text()) | 143 | progress = json.loads(progress_path.read_text()) |
| 144 | progress_sig = progress.get("model_signature") | ||
| 145 | if progress_sig and progress_sig != self.model_signature: | ||
| 146 | raise ValueError( | ||
| 147 | f"model signature mismatch: checkpoint={progress_sig} current={self.model_signature}" | ||
| 148 | ) | ||
| 134 | refs_done = int(progress.get("refs_done", 0) or 0) | 149 | refs_done = int(progress.get("refs_done", 0) or 0) |
| 135 | partial_embs = np.load(partial_embs_path) | 150 | partial_embs = np.load(partial_embs_path) |
| 136 | partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist() | 151 | partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist() |
| ... | @@ -145,6 +160,12 @@ class ECAPAEmbedder: | ... | @@ -145,6 +160,12 @@ class ECAPAEmbedder: |
| 145 | refs_done = 0 | 160 | refs_done = 0 |
| 146 | all_embs = [] | 161 | all_embs = [] |
| 147 | all_ids = [] | 162 | all_ids = [] |
| 163 | for stale_path in (partial_embs_path, partial_ids_path): | ||
| 164 | try: | ||
| 165 | if stale_path.exists(): | ||
| 166 | stale_path.unlink() | ||
| 167 | except OSError: | ||
| 168 | pass | ||
| 148 | 169 | ||
| 149 | print( | 170 | print( |
| 150 | f"[build-reference-index] start: refs={total_refs} device={self.device.type} " | 171 | f"[build-reference-index] start: refs={total_refs} device={self.device.type} " |
| ... | @@ -170,6 +191,7 @@ class ECAPAEmbedder: | ... | @@ -170,6 +191,7 @@ class ECAPAEmbedder: |
| 170 | "device": self.device.type, | 191 | "device": self.device.type, |
| 171 | "window_sec": window_sec, | 192 | "window_sec": window_sec, |
| 172 | "stride_sec": stride_sec, | 193 | "stride_sec": stride_sec, |
| 194 | "model_signature": self.model_signature, | ||
| 173 | "partial_embs_path": str(partial_embs_path), | 195 | "partial_embs_path": str(partial_embs_path), |
| 174 | "partial_ids_path": str(partial_ids_path), | 196 | "partial_ids_path": str(partial_ids_path), |
| 175 | }, indent=2)) | 197 | }, indent=2)) |
| ... | @@ -185,6 +207,7 @@ class ECAPAEmbedder: | ... | @@ -185,6 +207,7 @@ class ECAPAEmbedder: |
| 185 | "device": self.device.type, | 207 | "device": self.device.type, |
| 186 | "window_sec": window_sec, | 208 | "window_sec": window_sec, |
| 187 | "stride_sec": stride_sec, | 209 | "stride_sec": stride_sec, |
| 210 | "model_signature": self.model_signature, | ||
| 188 | "final_embs_path": str(final_embs_path), | 211 | "final_embs_path": str(final_embs_path), |
| 189 | "final_ids_path": str(final_ids_path), | 212 | "final_ids_path": str(final_ids_path), |
| 190 | "embedding_shape": list(emb_shape), | 213 | "embedding_shape": list(emb_shape), | ... | ... |
| ... | @@ -5482,3 +5482,43 @@ | ... | @@ -5482,3 +5482,43 @@ |
| 5482 | - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑 | 5482 | - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑 |
| 5483 | - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 完全一致”的新鲜证据 | 5483 | - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 完全一致”的新鲜证据 |
| 5484 | - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里 | 5484 | - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里 |
| 5485 | |||
| 5486 | ### Stage: smoke-local auto resume + model-signature safety gate | ||
| 5487 | |||
| 5488 | 完成项: | ||
| 5489 | - 在 `acr-engine/src/engines/ecapa_embedder.py` 为 index checkpoint 增加 `model_signature` | ||
| 5490 | - `path` | ||
| 5491 | - `size_bytes` | ||
| 5492 | - `mtime_ns` | ||
| 5493 | - 恢复时如果 checkpoint 的 `model_signature` 与当前 `best_model.pt` 不一致: | ||
| 5494 | - 自动拒绝旧 checkpoint | ||
| 5495 | - 清理旧 partial 文件 | ||
| 5496 | - 从 0 重建 embedding index | ||
| 5497 | - 在 `acr-engine/src/data/external_adapters.py` 的 `smoke-local` 中默认启用: | ||
| 5498 | - `run_demo.py build-index --resume` | ||
| 5499 | - `--checkpoint-every-refs` | ||
| 5500 | - 在 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 补充模型签名护栏说明 | ||
| 5501 | |||
| 5502 | 验证结果: | ||
| 5503 | - 编译验证: | ||
| 5504 | - `/usr/local/miniconda3/bin/python -m py_compile src/engines/ecapa_embedder.py src/data/external_adapters.py run_demo.py` | ||
| 5505 | - 同模型恢复验证(`models_v6 -> models_v6`): | ||
| 5506 | - 人工构造前 `2` 首 reference 的 partial checkpoint | ||
| 5507 | - 日志出现: | ||
| 5508 | - `[build-reference-index] resuming from checkpoint: refs_done=2/24 windows_done=10` | ||
| 5509 | - 与 fresh rebuild 对比: | ||
| 5510 | - `same_final_ids_equal == True` | ||
| 5511 | - `same_final_embs_equal == True` | ||
| 5512 | - `same_progress_status == complete` | ||
| 5513 | - 异模型拒绝恢复验证(`models_v6 partial -> models_v5 current`): | ||
| 5514 | - 日志出现: | ||
| 5515 | - `resume checkpoint ignored due to load failure: model signature mismatch` | ||
| 5516 | - 随后从 0 重建: | ||
| 5517 | - `start: refs=24 ... resume=True refs_done=0` | ||
| 5518 | - 与 `models_v5` fresh rebuild 对比: | ||
| 5519 | - `mismatch_final_ids_equal == True` | ||
| 5520 | - `mismatch_final_embs_equal == True` | ||
| 5521 | |||
| 5522 | 结论: | ||
| 5523 | - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力 | ||
| 5524 | - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index | ... | ... |
| ... | @@ -81,6 +81,11 @@ flowchart LR | ... | @@ -81,6 +81,11 @@ flowchart LR |
| 81 | --device cpu \ | 81 | --device cpu \ |
| 82 | --resume \ | 82 | --resume \ |
| 83 | --checkpoint-every-refs 100 | 83 | --checkpoint-every-refs 100 |
| 84 | |||
| 85 | 说明: | ||
| 86 | - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume` | ||
| 87 | - checkpoint 会记录 `model_signature` | ||
| 88 | - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 不是同一个模型,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding | ||
| 84 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json | 89 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json |
| 85 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local | 90 | /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local |
| 86 | ``` | 91 | ``` | ... | ... |
-
Please register or sign in to post a comment