Commit 4ceaa995 4ceaa995820cda86de67bbdb1881c26b0d142f46 by cnb.bofCdSsphPA

Resume smoke indexing safely without mixing model generations

Constraint: smoke-local must recover long CPU index builds automatically, but partial embeddings from an older model must never contaminate a newly trained index
Rejected: Always reuse any existing partial checkpoint | can silently blend embeddings from different model generations into one index
Confidence: high
Scope-risk: moderate
Directive: Keep model-signature checks on all future index resume paths; auto-resume should fall back to clean rebuild on any signature mismatch
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/engines/ecapa_embedder.py acr-engine/src/data/external_adapters.py acr-engine/run_demo.py; same-model partial checkpoint resume vs fresh rebuild equality; mismatched-model checkpoint rejection and clean rebuild equality
Not-tested: Reattaching the currently running real FMA smoke process after an external interruption
1 parent e45896b7
...@@ -372,6 +372,7 @@ def smoke_local_dataset( ...@@ -372,6 +372,7 @@ def smoke_local_dataset(
372 query_strategy: str, 372 query_strategy: str,
373 segment_strategy: str, 373 segment_strategy: str,
374 silence_top_db: int, 374 silence_top_db: int,
375 index_checkpoint_every_refs: int,
375 seed: int, 376 seed: int,
376 train_epochs: int, 377 train_epochs: int,
377 batch_size: int, 378 batch_size: int,
...@@ -432,6 +433,8 @@ def smoke_local_dataset( ...@@ -432,6 +433,8 @@ def smoke_local_dataset(
432 "--model", str(model_dir / "best_model.pt"), 433 "--model", str(model_dir / "best_model.pt"),
433 "--output", str(index_dir), 434 "--output", str(index_dir),
434 "--device", resolved_device, 435 "--device", resolved_device,
436 "--resume",
437 "--checkpoint-every-refs", str(index_checkpoint_every_refs),
435 ], check=True) 438 ], check=True)
436 439
437 report_dir.mkdir(parents=True, exist_ok=True) 440 report_dir.mkdir(parents=True, exist_ok=True)
...@@ -461,6 +464,8 @@ def smoke_local_dataset( ...@@ -461,6 +464,8 @@ def smoke_local_dataset(
461 config["data"]["manifest_query_stride"] = query_stride 464 config["data"]["manifest_query_stride"] = query_stride
462 config["data"]["manifest_query_strategy"] = query_strategy 465 config["data"]["manifest_query_strategy"] = query_strategy
463 config["data"]["silence_top_db"] = silence_top_db 466 config["data"]["silence_top_db"] = silence_top_db
467 config["run"]["index_checkpoint_every_refs"] = index_checkpoint_every_refs
468 config["run"]["index_resume_enabled"] = True
464 config["run"]["train_segment_strategy"] = segment_strategy 469 config["run"]["train_segment_strategy"] = segment_strategy
465 report_dir.mkdir(parents=True, exist_ok=True) 470 report_dir.mkdir(parents=True, exist_ok=True)
466 config_path.write_text(json.dumps(config, indent=2)) 471 config_path.write_text(json.dumps(config, indent=2))
...@@ -546,6 +551,7 @@ def main(): ...@@ -546,6 +551,7 @@ def main():
546 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") 551 p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random")
547 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") 552 p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random")
548 p.add_argument("--silence-top-db", type=int, default=30) 553 p.add_argument("--silence-top-db", type=int, default=30)
554 p.add_argument("--index-checkpoint-every-refs", type=int, default=100)
549 p.add_argument("--seed", type=int, default=42) 555 p.add_argument("--seed", type=int, default=42)
550 p.add_argument("--train-epochs", type=int, default=1) 556 p.add_argument("--train-epochs", type=int, default=1)
551 p.add_argument("--batch-size", type=int, default=2) 557 p.add_argument("--batch-size", type=int, default=2)
...@@ -605,6 +611,7 @@ def main(): ...@@ -605,6 +611,7 @@ def main():
605 query_strategy=args.query_strategy, 611 query_strategy=args.query_strategy,
606 segment_strategy=args.segment_strategy, 612 segment_strategy=args.segment_strategy,
607 silence_top_db=args.silence_top_db, 613 silence_top_db=args.silence_top_db,
614 index_checkpoint_every_refs=args.index_checkpoint_every_refs,
608 seed=args.seed, 615 seed=args.seed,
609 train_epochs=args.train_epochs, 616 train_epochs=args.train_epochs,
610 batch_size=args.batch_size, 617 batch_size=args.batch_size,
......
...@@ -19,10 +19,12 @@ class ECAPAEmbedder: ...@@ -19,10 +19,12 @@ class ECAPAEmbedder:
19 hop_length: int = 160, 19 hop_length: int = 160,
20 ): 20 ):
21 self.device = torch.device(device) 21 self.device = torch.device(device)
22 self.model_path = Path(model_path)
22 self.sr = sr 23 self.sr = sr
23 self.n_mels = n_mels 24 self.n_mels = n_mels
24 self.n_fft = n_fft 25 self.n_fft = n_fft
25 self.hop_length = hop_length 26 self.hop_length = hop_length
27 self.model_signature = self._build_model_signature(self.model_path)
26 28
27 from src.models.ecapa_tdnn import ECAPA_ACR 29 from src.models.ecapa_tdnn import ECAPA_ACR
28 30
...@@ -54,6 +56,14 @@ class ECAPAEmbedder: ...@@ -54,6 +56,14 @@ class ECAPAEmbedder:
54 y, _ = librosa.load(path, sr=self.sr, mono=True) 56 y, _ = librosa.load(path, sr=self.sr, mono=True)
55 return y 57 return y
56 58
59 def _build_model_signature(self, model_path: Path) -> dict:
60 stat = model_path.stat()
61 return {
62 "path": str(model_path),
63 "size_bytes": int(stat.st_size),
64 "mtime_ns": int(stat.st_mtime_ns),
65 }
66
57 def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path: 67 def _resolve_audio_path(self, songs_dir: Path, rel_path: str) -> Path:
58 candidate = songs_dir / rel_path 68 candidate = songs_dir / rel_path
59 if candidate.exists(): 69 if candidate.exists():
...@@ -131,6 +141,11 @@ class ECAPAEmbedder: ...@@ -131,6 +141,11 @@ class ECAPAEmbedder:
131 if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists(): 141 if resume and progress_path.exists() and partial_embs_path.exists() and partial_ids_path.exists():
132 try: 142 try:
133 progress = json.loads(progress_path.read_text()) 143 progress = json.loads(progress_path.read_text())
144 progress_sig = progress.get("model_signature")
145 if progress_sig and progress_sig != self.model_signature:
146 raise ValueError(
147 f"model signature mismatch: checkpoint={progress_sig} current={self.model_signature}"
148 )
134 refs_done = int(progress.get("refs_done", 0) or 0) 149 refs_done = int(progress.get("refs_done", 0) or 0)
135 partial_embs = np.load(partial_embs_path) 150 partial_embs = np.load(partial_embs_path)
136 partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist() 151 partial_ids = np.load(partial_ids_path, allow_pickle=True).tolist()
...@@ -145,6 +160,12 @@ class ECAPAEmbedder: ...@@ -145,6 +160,12 @@ class ECAPAEmbedder:
145 refs_done = 0 160 refs_done = 0
146 all_embs = [] 161 all_embs = []
147 all_ids = [] 162 all_ids = []
163 for stale_path in (partial_embs_path, partial_ids_path):
164 try:
165 if stale_path.exists():
166 stale_path.unlink()
167 except OSError:
168 pass
148 169
149 print( 170 print(
150 f"[build-reference-index] start: refs={total_refs} device={self.device.type} " 171 f"[build-reference-index] start: refs={total_refs} device={self.device.type} "
...@@ -170,6 +191,7 @@ class ECAPAEmbedder: ...@@ -170,6 +191,7 @@ class ECAPAEmbedder:
170 "device": self.device.type, 191 "device": self.device.type,
171 "window_sec": window_sec, 192 "window_sec": window_sec,
172 "stride_sec": stride_sec, 193 "stride_sec": stride_sec,
194 "model_signature": self.model_signature,
173 "partial_embs_path": str(partial_embs_path), 195 "partial_embs_path": str(partial_embs_path),
174 "partial_ids_path": str(partial_ids_path), 196 "partial_ids_path": str(partial_ids_path),
175 }, indent=2)) 197 }, indent=2))
...@@ -185,6 +207,7 @@ class ECAPAEmbedder: ...@@ -185,6 +207,7 @@ class ECAPAEmbedder:
185 "device": self.device.type, 207 "device": self.device.type,
186 "window_sec": window_sec, 208 "window_sec": window_sec,
187 "stride_sec": stride_sec, 209 "stride_sec": stride_sec,
210 "model_signature": self.model_signature,
188 "final_embs_path": str(final_embs_path), 211 "final_embs_path": str(final_embs_path),
189 "final_ids_path": str(final_ids_path), 212 "final_ids_path": str(final_ids_path),
190 "embedding_shape": list(emb_shape), 213 "embedding_shape": list(emb_shape),
......
...@@ -5482,3 +5482,43 @@ ...@@ -5482,3 +5482,43 @@
5482 - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑 5482 - 现在 CPU 长时间 `build-index` 任务即使中断,也可以从 partial checkpoint 续跑
5483 - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 完全一致”的新鲜证据 5483 - 该恢复逻辑已经拿到“恢复结果与 fresh rebuild 完全一致”的新鲜证据
5484 - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里 5484 - 下一步可以把这套 resume 能力进一步接到 `smoke-local` 的自动恢复策略里
5485
5486 ### Stage: smoke-local auto resume + model-signature safety gate
5487
5488 完成项:
5489 - 在 `acr-engine/src/engines/ecapa_embedder.py` 中为 index checkpoint 增加 `model_signature`
5490 - `path`
5491 - `size_bytes`
5492 - `mtime_ns`
5493 - 恢复时如果 checkpoint 的 `model_signature` 与当前 `best_model.pt` 不一致:
5494 - 自动拒绝旧 checkpoint
5495 - 清理旧 partial 文件
5496 - 从 0 重建 embedding index
5497 - 在 `acr-engine/src/data/external_adapters.py` 的 `smoke-local` 中默认启用:
5498 - `run_demo.py build-index --resume`
5499 - `--checkpoint-every-refs`
5500 - 在 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 中补充模型签名护栏说明
5501
5502 验证结果:
5503 - 编译验证:
5504 - `/usr/local/miniconda3/bin/python -m py_compile src/engines/ecapa_embedder.py src/data/external_adapters.py run_demo.py`
5505 - 同模型恢复验证(`models_v6 -> models_v6`):
5506 - 人工构造前 `2` 首 reference 的 partial checkpoint
5507 - 日志出现:
5508 - `[build-reference-index] resuming from checkpoint: refs_done=2/24 windows_done=10`
5509 - 与 fresh rebuild 对比:
5510 - `same_final_ids_equal == True`
5511 - `same_final_embs_equal == True`
5512 - `same_progress_status == complete`
5513 - 异模型拒绝恢复验证(`models_v6 partial -> models_v5 current`):
5514 - 日志出现:
5515 - `resume checkpoint ignored due to load failure: model signature mismatch`
5516 - 随后从 0 重建:
5517 - `start: refs=24 ... resume=True refs_done=0`
5518 - 与 `models_v5` fresh rebuild 对比:
5519 - `mismatch_final_ids_equal == True`
5520 - `mismatch_final_embs_equal == True`
5521
5522 结论:
5523 - `smoke-local` 现在已经具备“可恢复,但不会错误复用旧模型 embedding”的安全自动恢复能力
5524 - 这对真实 FMA 这类 CPU 长时任务尤其重要:重启可续跑,换模型不会串污染 index
......
...@@ -81,6 +81,11 @@ flowchart LR ...@@ -81,6 +81,11 @@ flowchart LR
81 --device cpu \ 81 --device cpu \
82 --resume \ 82 --resume \
83 --checkpoint-every-refs 100 83 --checkpoint-every-refs 100
84
85 # 说明:
86 # - `smoke-local` 现在内部默认也会为 `build-index` 打开 `--resume`
87 # - checkpoint 会记录 `model_signature`(`path` / `size_bytes` / `mtime_ns`)
88 # - 如果这次训练出的 `best_model.pt` 与旧 partial checkpoint 记录的模型签名不一致,恢复会被自动拒绝并从 0 重建,避免混入不同模型的 embedding
84 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json 89 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json
85 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local 90 /usr/local/miniconda3/bin/python scripts/generate_artifacts.py --eval-json reports/fma-smoke/eval.json --config-json reports/fma-smoke/config.json --output-dir reports/fma-smoke --model-version fma-smoke --data-version fma_local
86 ``` 91 ```
......