Commit f0c82687 f0c826879d8435a60e80241dff3bfe19778648ce by cnb.bofCdSsphPA

Prove the Phase-1 registry bootstrap is idempotent

Constraint: Ralph follow-up work must keep producing audit-ready evidence and a pushed trail for the next session
Rejected: Assume the new bootstrap script is safe to rerun without proof | Duplicate feature-set inserts would erode trust in the PostgreSQL bootstrap path
Confidence: high
Scope-risk: narrow
Directive: Re-run registry bootstrap in-place before future extraction jobs and treat count drift as a regression signal
Tested: /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_model_registry_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_registry_bootstrap_report.json (run twice); /usr/local/miniconda3/bin/python -m py_compile scripts/bootstrap_phase1_model_registry_live.py; git diff --check -- acr-engine/scripts/bootstrap_phase1_model_registry_live.py acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Actual downstream MERT/MuQ extraction after bootstrap, missing business sample mount recovery
1 parent fef8f438
1 {
2 "run1_counts": {
3 "model_registry": 5,
4 "feature_set_registry": 6,
5 "reference_set_registry": 2
6 },
7 "run2_counts": {
8 "model_registry": 5,
9 "feature_set_registry": 6,
10 "reference_set_registry": 2
11 },
12 "run2_model_operations": [
13 "updated",
14 "updated",
15 "updated",
16 "updated"
17 ],
18 "run2_feature_operations": [
19 "reused",
20 "reused",
21 "reused",
22 "reused",
23 "reused"
24 ],
25 "run2_reference_set_operation": "updated",
26 "idempotent": true
27 }
...\ No newline at end of file ...\ No newline at end of file
...@@ -6,25 +6,29 @@ ...@@ -6,25 +6,29 @@
6 "model_id": 2, 6 "model_id": 2,
7 "model_name": "chromaprint", 7 "model_name": "chromaprint",
8 "model_version": "v1", 8 "model_version": "v1",
9 "output_embedding_dim": null 9 "output_embedding_dim": null,
10 "operation": "updated"
10 }, 11 },
11 { 12 {
12 "model_id": 3, 13 "model_id": 3,
13 "model_name": "mert", 14 "model_name": "mert",
14 "model_version": "v1-95m", 15 "model_version": "v1-95m",
15 "output_embedding_dim": 768 16 "output_embedding_dim": 768,
17 "operation": "updated"
16 }, 18 },
17 { 19 {
18 "model_id": 4, 20 "model_id": 4,
19 "model_name": "muq", 21 "model_name": "muq",
20 "model_version": "large-msd-iter", 22 "model_version": "large-msd-iter",
21 "output_embedding_dim": 768 23 "output_embedding_dim": 768,
24 "operation": "updated"
22 }, 25 },
23 { 26 {
24 "model_id": 5, 27 "model_id": 5,
25 "model_name": "ecapa", 28 "model_name": "ecapa",
26 "model_version": "acr-baseline-v1", 29 "model_version": "acr-baseline-v1",
27 "output_embedding_dim": 192 30 "output_embedding_dim": 192,
31 "operation": "updated"
28 } 32 }
29 ], 33 ],
30 "feature_sets": [ 34 "feature_sets": [
...@@ -36,7 +40,8 @@ ...@@ -36,7 +40,8 @@
36 "window_sec": 5.0, 40 "window_sec": 5.0,
37 "hop_sec": 2.5, 41 "hop_sec": 2.5,
38 "embedding_dim": null, 42 "embedding_dim": null,
39 "distance_metric": "hamming" 43 "distance_metric": "hamming",
44 "operation": "reused"
40 }, 45 },
41 { 46 {
42 "feature_set_id": 3, 47 "feature_set_id": 3,
...@@ -46,7 +51,8 @@ ...@@ -46,7 +51,8 @@
46 "window_sec": 5.0, 51 "window_sec": 5.0,
47 "hop_sec": 2.5, 52 "hop_sec": 2.5,
48 "embedding_dim": 768, 53 "embedding_dim": 768,
49 "distance_metric": "cosine" 54 "distance_metric": "cosine",
55 "operation": "reused"
50 }, 56 },
51 { 57 {
52 "feature_set_id": 4, 58 "feature_set_id": 4,
...@@ -56,7 +62,8 @@ ...@@ -56,7 +62,8 @@
56 "window_sec": 10.0, 62 "window_sec": 10.0,
57 "hop_sec": 5.0, 63 "hop_sec": 5.0,
58 "embedding_dim": 768, 64 "embedding_dim": 768,
59 "distance_metric": "cosine" 65 "distance_metric": "cosine",
66 "operation": "reused"
60 }, 67 },
61 { 68 {
62 "feature_set_id": 5, 69 "feature_set_id": 5,
...@@ -66,7 +73,8 @@ ...@@ -66,7 +73,8 @@
66 "window_sec": 5.0, 73 "window_sec": 5.0,
67 "hop_sec": 2.5, 74 "hop_sec": 2.5,
68 "embedding_dim": 768, 75 "embedding_dim": 768,
69 "distance_metric": "cosine" 76 "distance_metric": "cosine",
77 "operation": "reused"
70 }, 78 },
71 { 79 {
72 "feature_set_id": 6, 80 "feature_set_id": 6,
...@@ -76,13 +84,15 @@ ...@@ -76,13 +84,15 @@
76 "window_sec": 5.0, 84 "window_sec": 5.0,
77 "hop_sec": 2.5, 85 "hop_sec": 2.5,
78 "embedding_dim": 192, 86 "embedding_dim": 192,
79 "distance_metric": "cosine" 87 "distance_metric": "cosine",
88 "operation": "reused"
80 } 89 }
81 ], 90 ],
82 "reference_set": { 91 "reference_set": {
83 "reference_set_id": 2, 92 "reference_set_id": 2,
84 "set_name": "phase1_hot_reference_v1", 93 "set_name": "phase1_hot_reference_v1",
85 "encoder_scope": "chromaprint-v1 / mert-v1-95m / muq-large-msd-iter" 94 "encoder_scope": "chromaprint-v1 / mert-v1-95m / muq-large-msd-iter",
95 "operation": "updated"
86 }, 96 },
87 "counts": { 97 "counts": {
88 "model_registry": 5, 98 "model_registry": 5,
......
...@@ -207,7 +207,11 @@ REFERENCE_SET = { ...@@ -207,7 +207,11 @@ REFERENCE_SET = {
207 } 207 }
208 208
209 209
210 def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> int: 210 def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> tuple[int, str]:
211 existing = conn.execute(
212 'SELECT model_id FROM model_registry WHERE model_name = %s AND model_version = %s',
213 (model['model_name'], model['model_version']),
214 ).fetchone()
211 row = conn.execute( 215 row = conn.execute(
212 """ 216 """
213 INSERT INTO model_registry ( 217 INSERT INTO model_registry (
...@@ -242,10 +246,10 @@ def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> int: ...@@ -242,10 +246,10 @@ def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> int:
242 """, 246 """,
243 {**model, 'metadata_json': json.dumps(model['metadata_json'])}, 247 {**model, 'metadata_json': json.dumps(model['metadata_json'])},
244 ).fetchone() 248 ).fetchone()
245 return int(row[0]) 249 return int(row[0]), ('updated' if existing else 'inserted')
246 250
247 251
248 def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[str, Any]) -> int: 252 def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[str, Any]) -> tuple[int, str]:
249 existing = conn.execute( 253 existing = conn.execute(
250 """ 254 """
251 SELECT feature_set_id 255 SELECT feature_set_id
...@@ -283,7 +287,7 @@ def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[st ...@@ -283,7 +287,7 @@ def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[st
283 "UPDATE feature_set_registry SET config_json = %s::jsonb, status = %s, updated_at = NOW() WHERE feature_set_id = %s", 287 "UPDATE feature_set_registry SET config_json = %s::jsonb, status = %s, updated_at = NOW() WHERE feature_set_id = %s",
284 (json.dumps(feature['config_json']), feature['status'], existing[0]), 288 (json.dumps(feature['config_json']), feature['status'], existing[0]),
285 ) 289 )
286 return int(existing[0]) 290 return int(existing[0]), 'reused'
287 291
288 row = conn.execute( 292 row = conn.execute(
289 """ 293 """
...@@ -318,10 +322,14 @@ def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[st ...@@ -318,10 +322,14 @@ def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[st
318 feature['status'], 322 feature['status'],
319 ), 323 ),
320 ).fetchone() 324 ).fetchone()
321 return int(row[0]) 325 return int(row[0]), 'inserted'
322 326
323 327
324 def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> int: 328 def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> tuple[int, str]:
329 existing = conn.execute(
330 'SELECT reference_set_id FROM reference_set_registry WHERE set_name = %s',
331 (payload['set_name'],),
332 ).fetchone()
325 row = conn.execute( 333 row = conn.execute(
326 """ 334 """
327 INSERT INTO reference_set_registry (set_name, description, encoder_scope, status, metadata_json) 335 INSERT INTO reference_set_registry (set_name, description, encoder_scope, status, metadata_json)
...@@ -343,7 +351,7 @@ def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> i ...@@ -343,7 +351,7 @@ def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> i
343 json.dumps(payload['metadata_json']), 351 json.dumps(payload['metadata_json']),
344 ), 352 ),
345 ).fetchone() 353 ).fetchone()
346 return int(row[0]) 354 return int(row[0]), ('updated' if existing else 'inserted')
347 355
348 356
349 def main() -> None: 357 def main() -> None:
...@@ -365,18 +373,19 @@ def main() -> None: ...@@ -365,18 +373,19 @@ def main() -> None:
365 conn.execute(f'SET search_path TO {args.schema}, public;') 373 conn.execute(f'SET search_path TO {args.schema}, public;')
366 model_ids: dict[tuple[str, str], int] = {} 374 model_ids: dict[tuple[str, str], int] = {}
367 for model in MODELS: 375 for model in MODELS:
368 model_id = upsert_model(conn, model) 376 model_id, operation = upsert_model(conn, model)
369 model_ids[(model['model_name'], model['model_version'])] = model_id 377 model_ids[(model['model_name'], model['model_version'])] = model_id
370 summary['models'].append({ 378 summary['models'].append({
371 'model_id': model_id, 379 'model_id': model_id,
372 'model_name': model['model_name'], 380 'model_name': model['model_name'],
373 'model_version': model['model_version'], 381 'model_version': model['model_version'],
374 'output_embedding_dim': model['output_embedding_dim'], 382 'output_embedding_dim': model['output_embedding_dim'],
383 'operation': operation,
375 }) 384 })
376 385
377 for feature in FEATURE_SETS: 386 for feature in FEATURE_SETS:
378 model_id = model_ids[(feature['model_name'], feature['model_version'])] 387 model_id = model_ids[(feature['model_name'], feature['model_version'])]
379 feature_set_id = ensure_feature_set(conn, model_id, feature) 388 feature_set_id, operation = ensure_feature_set(conn, model_id, feature)
380 summary['feature_sets'].append({ 389 summary['feature_sets'].append({
381 'feature_set_id': feature_set_id, 390 'feature_set_id': feature_set_id,
382 'model_name': feature['model_name'], 391 'model_name': feature['model_name'],
...@@ -386,13 +395,15 @@ def main() -> None: ...@@ -386,13 +395,15 @@ def main() -> None:
386 'hop_sec': feature['hop_sec'], 395 'hop_sec': feature['hop_sec'],
387 'embedding_dim': feature['embedding_dim'], 396 'embedding_dim': feature['embedding_dim'],
388 'distance_metric': feature['distance_metric'], 397 'distance_metric': feature['distance_metric'],
398 'operation': operation,
389 }) 399 })
390 400
391 reference_set_id = upsert_reference_set(conn, REFERENCE_SET) 401 reference_set_id, operation = upsert_reference_set(conn, REFERENCE_SET)
392 summary['reference_set'] = { 402 summary['reference_set'] = {
393 'reference_set_id': reference_set_id, 403 'reference_set_id': reference_set_id,
394 'set_name': REFERENCE_SET['set_name'], 404 'set_name': REFERENCE_SET['set_name'],
395 'encoder_scope': REFERENCE_SET['encoder_scope'], 405 'encoder_scope': REFERENCE_SET['encoder_scope'],
406 'operation': operation,
396 } 407 }
397 summary['counts'] = { 408 summary['counts'] = {
398 'model_registry': int(conn.execute('SELECT count(*) FROM model_registry;').fetchone()[0]), 409 'model_registry': int(conn.execute('SELECT count(*) FROM model_registry;').fetchone()[0]),
......
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py` 在 `acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。
3 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 4 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。
4 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。 5 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。
5 - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md) 与 `acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile` 与 `diff --check` 通过的机械验证事实。 6 - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md) 与 `acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile` 与 `diff --check` 通过的机械验证事实。
......
...@@ -272,6 +272,7 @@ cd /workspace/acr-engine ...@@ -272,6 +272,7 @@ cd /workspace/acr-engine
272 ### 8.3 当前产物 272 ### 8.3 当前产物
273 273
274 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` 274 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`
275 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json`
275 276
276 其中 `phase1_registry_bootstrap_report.json` 已经记录了: 277 其中 `phase1_registry_bootstrap_report.json` 已经记录了:
277 - model_id 278 - model_id
...@@ -280,3 +281,22 @@ cd /workspace/acr-engine ...@@ -280,3 +281,22 @@ cd /workspace/acr-engine
280 - 最终表计数 281 - 最终表计数
281 282
282 因此,下次 session 不需要再从 SQL 片段手工执行开始,而可以直接从 live bootstrap 脚本接上。 283 因此,下次 session 不需要再从 SQL 片段手工执行开始,而可以直接从 live bootstrap 脚本接上。
284
285 ### 8.4 幂等性验证(已做)
286
287 同一套命令在 `acr_test` schema 上连续执行两次后,已经拿到真实幂等性证据:
288
289 | 项目 | 第 1 次 | 第 2 次 |
290 |---|---:|---:|
291 | `model_registry` | `5` | `5` |
292 | `feature_set_registry` | `6` | `6` |
293 | `reference_set_registry` | `2` | `2` |
294
295 第二次执行时:
296 - `models` 全部表现为 `updated`
297 - `feature_sets` 全部表现为 `reused`
298 - `reference_set` 表现为 `updated`
299
300 结论:
301
302 > 当前 bootstrap 脚本可重复执行:重复运行只会更新或复用已有记录,不会向 Phase-1 registry 重复插入数据。
......
...@@ -66,6 +66,7 @@ ...@@ -66,6 +66,7 @@
66 | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` | 66 | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` |
67 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` | 67 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` |
68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | 68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` |
69 | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` |
69 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | 70 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` |
70 71
71 --- 72 ---
......
...@@ -182,6 +182,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -182,6 +182,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
182 - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据 182 - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据
183 - 机械校验已补齐:`live_pgvector_music20_eval.py` 的 `py_compile` 通过,相关变更 `diff --check` 通过 183 - 机械校验已补齐:`live_pgvector_music20_eval.py` 的 `py_compile` 通过,相关变更 `diff --check` 通过
184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` 184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1`
185 - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变
185 186
186 ### 未验证 / 仍是缺口 187 ### 未验证 / 仍是缺口
187 - **未实际跑 MERT / MuQ encoder-only 特征抽取** 188 - **未实际跑 MERT / MuQ encoder-only 特征抽取**
......