Commit fef8f438 fef8f4387d95be5d4a017ba55150b6fa7463f1f6 by cnb.bofCdSsphPA

Bootstrap the Phase-1 model registry on live PostgreSQL

Constraint: Continue the Ralph loop without waiting on missing business sample mounts, while still leaving a push-ready implementation and documentation trail
Rejected: Keep Phase-1 registry setup as static SQL snippets only | It slows live validation and leaves no machine-checkable bootstrap path
Confidence: high
Scope-risk: narrow
Directive: Treat model_registry/feature_set_registry/reference_set_registry as the mandatory entrypoint before any future MERT/MuQ extraction jobs
Tested: /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_model_registry_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_registry_bootstrap_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/bootstrap_phase1_model_registry_live.py; git diff --check -- acr-engine/scripts/bootstrap_phase1_model_registry_live.py acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Actual MERT/MuQ embedding extraction, hard-case type_8/type_16 live queries, multi-recording/cover-lane retrieval
1 parent ea51b9c1
1 {
2 "schema": "acr_test",
3 "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2",
4 "models": [
5 {
6 "model_id": 2,
7 "model_name": "chromaprint",
8 "model_version": "v1",
9 "output_embedding_dim": null
10 },
11 {
12 "model_id": 3,
13 "model_name": "mert",
14 "model_version": "v1-95m",
15 "output_embedding_dim": 768
16 },
17 {
18 "model_id": 4,
19 "model_name": "muq",
20 "model_version": "large-msd-iter",
21 "output_embedding_dim": 768
22 },
23 {
24 "model_id": 5,
25 "model_name": "ecapa",
26 "model_version": "acr-baseline-v1",
27 "output_embedding_dim": 192
28 }
29 ],
30 "feature_sets": [
31 {
32 "feature_set_id": 2,
33 "model_name": "chromaprint",
34 "model_version": "v1",
35 "feature_name": "fingerprint_asset",
36 "window_sec": 5.0,
37 "hop_sec": 2.5,
38 "embedding_dim": null,
39 "distance_metric": "hamming"
40 },
41 {
42 "feature_set_id": 3,
43 "model_name": "mert",
44 "model_version": "v1-95m",
45 "feature_name": "semantic_embedding",
46 "window_sec": 5.0,
47 "hop_sec": 2.5,
48 "embedding_dim": 768,
49 "distance_metric": "cosine"
50 },
51 {
52 "feature_set_id": 4,
53 "model_name": "mert",
54 "model_version": "v1-95m",
55 "feature_name": "semantic_embedding",
56 "window_sec": 10.0,
57 "hop_sec": 5.0,
58 "embedding_dim": 768,
59 "distance_metric": "cosine"
60 },
61 {
62 "feature_set_id": 5,
63 "model_name": "muq",
64 "model_version": "large-msd-iter",
65 "feature_name": "semantic_embedding",
66 "window_sec": 5.0,
67 "hop_sec": 2.5,
68 "embedding_dim": 768,
69 "distance_metric": "cosine"
70 },
71 {
72 "feature_set_id": 6,
73 "model_name": "ecapa",
74 "model_version": "acr-baseline-v1",
75 "feature_name": "semantic_embedding",
76 "window_sec": 5.0,
77 "hop_sec": 2.5,
78 "embedding_dim": 192,
79 "distance_metric": "cosine"
80 }
81 ],
82 "reference_set": {
83 "reference_set_id": 2,
84 "set_name": "phase1_hot_reference_v1",
85 "encoder_scope": "chromaprint-v1 / mert-v1-95m / muq-large-msd-iter"
86 },
87 "counts": {
88 "model_registry": 5,
89 "feature_set_registry": 6,
90 "reference_set_registry": 2
91 }
92 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 from pathlib import Path
7 from typing import Any
8
9 import psycopg
10
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Report location used when --output is not given on the command line.
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'phase1_registry_bootstrap_report.json',
)
13
# Model rows to register. Each dict supplies the named parameters that
# upsert_model() binds into its INSERT ... ON CONFLICT (model_name, model_version)
# statement; metadata_json is serialized to JSON text by upsert_model() before
# it is bound.
MODELS = [
    # chromaprint v1: metadata marks it as the 'exact' lane; no embedding dimension.
    {
        'model_name': 'chromaprint',
        'model_family': 'fingerprint',
        'model_version': 'v1',
        'model_source': 'acoustid',
        'model_uri': 'https://acoustid.org/chromaprint',
        'license_name': 'lgpl-2.1',
        'input_modality': 'audio',
        'input_sample_rate': 16000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': None,
        'pooling_supported': ['none'],
        'layer_selection_supported': False,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'exact',
            'phase': 'phase1',
            'note': 'exact fingerprint lane baseline',
        },
    },
    # mert v1-95m: 'semantic' lane, role 'primary_baseline'.
    {
        'model_name': 'mert',
        'model_family': 'music_ssl',
        'model_version': 'v1-95m',
        'model_source': 'github',
        'model_uri': 'https://github.com/yizhilll/MERT',
        'license_name': 'apache-2.0',
        'input_modality': 'audio',
        'input_sample_rate': 24000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': 768,
        'pooling_supported': ['mean', 'cls'],
        'layer_selection_supported': True,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'primary_baseline',
            'phase': 'phase1',
        },
    },
    # muq large-msd-iter: 'semantic' lane, role 'challenger'.
    {
        'model_name': 'muq',
        'model_family': 'music_ssl',
        'model_version': 'large-msd-iter',
        'model_source': 'github',
        'model_uri': 'https://github.com/tencent-ailab/MuQ',
        'license_name': 'apache-2.0',
        'input_modality': 'audio',
        'input_sample_rate': 24000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': 768,
        'pooling_supported': ['mean', 'cls'],
        'layer_selection_supported': True,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'challenger',
            'phase': 'phase1',
        },
    },
    # ecapa acr-baseline-v1: 'semantic' lane, role 'historical_baseline';
    # local source, so there is no model_uri.
    {
        'model_name': 'ecapa',
        'model_family': 'speech_derived',
        'model_version': 'acr-baseline-v1',
        'model_source': 'local',
        'model_uri': None,
        'license_name': 'internal-eval',
        'input_modality': 'audio',
        'input_sample_rate': 16000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': 192,
        'pooling_supported': ['mean'],
        'layer_selection_supported': False,
        'is_trainable': True,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'historical_baseline',
            'phase': 'phase1',
        },
    },
]
104
# Feature-set rows to register. 'model_name' / 'model_version' are not written
# by ensure_feature_set(); main() uses the pair to look up the model_id that
# upsert_model() returned. config_json is serialized to JSON text before binding.
FEATURE_SETS = [
    # chromaprint v1: asset-level fingerprint compared with the 'hamming' metric.
    {
        'model_name': 'chromaprint',
        'model_version': 'v1',
        'feature_name': 'fingerprint_asset',
        'feature_level': 'asset',
        'extraction_granularity': 'full_asset',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': None,
        'pooling_strategy': 'none',
        'layer_selection': 'na',
        'normalize_l2': False,
        'distance_metric': 'hamming',
        'quantization_type': 'fingerprint_hash',
        'feature_schema_version': 'v1',
        'config_json': {'lane': 'exact', 'index_target': 'audio_fingerprint'},
        'status': 'active',
    },
    # mert v1-95m, 5.0 s window / 2.5 s hop.
    {
        'model_name': 'mert',
        'model_version': 'v1-95m',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'primary_semantic_baseline'},
        'status': 'active',
    },
    # mert v1-95m, 10.0 s window / 5.0 s hop; differs from the entry above only
    # in window/hop and the config_json role.
    {
        'model_name': 'mert',
        'model_version': 'v1-95m',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 10.0,
        'hop_sec': 5.0,
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'long_context_validation'},
        'status': 'active',
    },
    # muq large-msd-iter, 5.0 s window / 2.5 s hop.
    {
        'model_name': 'muq',
        'model_version': 'large-msd-iter',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'semantic_challenger'},
        'status': 'active',
    },
    # ecapa acr-baseline-v1, 5.0 s window / 2.5 s hop, 192-dimensional.
    {
        'model_name': 'ecapa',
        'model_version': 'acr-baseline-v1',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': 192,
        'pooling_strategy': 'mean',
        'layer_selection': 'na',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'historical_baseline'},
        'status': 'active',
    },
]
197
# The single reference-set row to register. upsert_reference_set() keys the
# upsert on 'set_name' and serializes metadata_json to JSON text before binding.
REFERENCE_SET = {
    'set_name': 'phase1_hot_reference_v1',
    'description': 'Phase-1 hot reference set bootstrap for MERT/MuQ/Chromaprint lanes',
    'encoder_scope': 'chromaprint-v1 / mert-v1-95m / muq-large-msd-iter',
    'status': 'active',
    'metadata_json': {
        'phase': 'phase1',
        'purpose': 'registry_bootstrap',
    },
}
208
209
def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> int:
    """Insert or refresh one ``model_registry`` row and return its ``model_id``.

    The row is keyed on ``(model_name, model_version)``. When that pair already
    exists, every other column is overwritten with the supplied values and
    ``updated_at`` is set to ``NOW()``.

    Args:
        conn: Open connection on which the statement is executed.
        model: Mapping holding every named parameter of the statement.
            ``metadata_json`` is given as a Python object and is serialized
            to JSON text here; the caller's mapping is not modified.

    Returns:
        The ``model_id`` reported by the ``RETURNING`` clause.
    """
    statement = """
        INSERT INTO model_registry (
            model_name, model_family, model_version, model_source, model_uri,
            license_name, input_modality, input_sample_rate, input_channel_mode,
            default_window_sec, default_hop_sec, output_embedding_dim,
            pooling_supported, layer_selection_supported, is_trainable, metadata_json
        ) VALUES (
            %(model_name)s, %(model_family)s, %(model_version)s, %(model_source)s, %(model_uri)s,
            %(license_name)s, %(input_modality)s, %(input_sample_rate)s, %(input_channel_mode)s,
            %(default_window_sec)s, %(default_hop_sec)s, %(output_embedding_dim)s,
            %(pooling_supported)s, %(layer_selection_supported)s, %(is_trainable)s, %(metadata_json)s::jsonb
        )
        ON CONFLICT (model_name, model_version)
        DO UPDATE SET
            model_family = EXCLUDED.model_family,
            model_source = EXCLUDED.model_source,
            model_uri = EXCLUDED.model_uri,
            license_name = EXCLUDED.license_name,
            input_modality = EXCLUDED.input_modality,
            input_sample_rate = EXCLUDED.input_sample_rate,
            input_channel_mode = EXCLUDED.input_channel_mode,
            default_window_sec = EXCLUDED.default_window_sec,
            default_hop_sec = EXCLUDED.default_hop_sec,
            output_embedding_dim = EXCLUDED.output_embedding_dim,
            pooling_supported = EXCLUDED.pooling_supported,
            layer_selection_supported = EXCLUDED.layer_selection_supported,
            is_trainable = EXCLUDED.is_trainable,
            metadata_json = EXCLUDED.metadata_json,
            updated_at = NOW()
        RETURNING model_id;
        """
    # Work on a copy so the caller keeps the un-serialized metadata object.
    bound = dict(model)
    bound['metadata_json'] = json.dumps(model['metadata_json'])
    returned = conn.execute(statement, bound).fetchone()
    return int(returned[0])
246
247
def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[str, Any]) -> int:
    """Return the ``feature_set_id`` for *feature*, creating the row if needed.

    A row counts as the same feature set when model, name, level, granularity,
    window, hop, embedding dimension, pooling, layer selection, L2
    normalization, distance metric and schema version all match (NULLs are
    compared through ``coalesce``). ``quantization_type`` is not part of the
    lookup.

    When a match exists only ``config_json``, ``status`` and ``updated_at``
    are refreshed; otherwise a new row is inserted.

    Args:
        conn: Open connection on which the statements are executed.
        model_id: ``model_registry`` id that owns the feature set.
        feature: Mapping with the feature-set fields; ``config_json`` is a
            Python object that is serialized to JSON text here.

    Returns:
        The id of the matched or newly inserted row.
    """
    # The first eleven bound values are the same for the lookup and the insert.
    shared_fields = (
        'feature_name',
        'feature_level',
        'extraction_granularity',
        'window_sec',
        'hop_sec',
        'embedding_dim',
        'pooling_strategy',
        'layer_selection',
        'normalize_l2',
        'distance_metric',
    )
    shared_params = (model_id, *(feature[name] for name in shared_fields))

    lookup_sql = """
        SELECT feature_set_id
        FROM feature_set_registry
        WHERE model_id = %s
          AND feature_name = %s
          AND feature_level = %s
          AND extraction_granularity = %s
          AND coalesce(window_sec, -1) = coalesce(%s, -1)
          AND coalesce(hop_sec, -1) = coalesce(%s, -1)
          AND coalesce(embedding_dim, -1) = coalesce(%s, -1)
          AND coalesce(pooling_strategy, '') = coalesce(%s, '')
          AND coalesce(layer_selection, '') = coalesce(%s, '')
          AND normalize_l2 = %s
          AND distance_metric = %s
          AND coalesce(feature_schema_version, '') = coalesce(%s, '');
        """
    match = conn.execute(
        lookup_sql,
        shared_params + (feature['feature_schema_version'],),
    ).fetchone()

    if not match:
        insert_sql = """
            INSERT INTO feature_set_registry (
                model_id, feature_name, feature_level, extraction_granularity,
                window_sec, hop_sec, embedding_dim, pooling_strategy, layer_selection,
                normalize_l2, distance_metric, quantization_type, feature_schema_version,
                config_json, status
            ) VALUES (
                %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s,
                %s::jsonb, %s
            )
            RETURNING feature_set_id;
            """
        inserted = conn.execute(
            insert_sql,
            shared_params
            + (
                feature['quantization_type'],
                feature['feature_schema_version'],
                json.dumps(feature['config_json']),
                feature['status'],
            ),
        ).fetchone()
        return int(inserted[0])

    # Row already present: refresh only its mutable attributes.
    feature_set_id = match[0]
    conn.execute(
        "UPDATE feature_set_registry SET config_json = %s::jsonb, status = %s, updated_at = NOW() WHERE feature_set_id = %s",
        (json.dumps(feature['config_json']), feature['status'], feature_set_id),
    )
    return int(feature_set_id)
322
323
def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> int:
    """Insert or refresh one ``reference_set_registry`` row and return its id.

    The row is keyed on ``set_name``. On conflict the description, encoder
    scope, status and metadata are overwritten and ``updated_at`` is set to
    ``NOW()``.

    Args:
        conn: Open connection on which the statement is executed.
        payload: Mapping with ``set_name``, ``description``, ``encoder_scope``,
            ``status`` and ``metadata_json`` (a Python object that is
            serialized to JSON text here).

    Returns:
        The ``reference_set_id`` reported by the ``RETURNING`` clause.
    """
    statement = """
        INSERT INTO reference_set_registry (set_name, description, encoder_scope, status, metadata_json)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (set_name)
        DO UPDATE SET
            description = EXCLUDED.description,
            encoder_scope = EXCLUDED.encoder_scope,
            status = EXCLUDED.status,
            metadata_json = EXCLUDED.metadata_json,
            updated_at = NOW()
        RETURNING reference_set_id;
        """
    text_columns = ('set_name', 'description', 'encoder_scope', 'status')
    values = tuple(payload[column] for column in text_columns)
    values += (json.dumps(payload['metadata_json']),)
    returned = conn.execute(statement, values).fetchone()
    return int(returned[0])
347
348
def _redact_dsn(dsn: str) -> str:
    """Return *dsn* with any password replaced by ``***``.

    Handles URI-style strings (``scheme://user:password@host/db`` and a
    ``password=`` query parameter) as well as ``key=value`` connection
    strings. A string without a password is returned unchanged; one that
    cannot be parsed is replaced wholesale by ``***``.
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(dsn)
    except ValueError:
        # Unparseable input: never risk echoing a secret into the report.
        return '***'

    if parts.scheme and parts.netloc:
        netloc = parts.netloc
        if parts.password is not None:
            userinfo, _, hostinfo = netloc.rpartition('@')
            username = userinfo.split(':', 1)[0]
            netloc = f'{username}:***@{hostinfo}'
        query = re.sub(r'(?i)((?:^|&)[^&=]*password=)[^&]*', r'\1***', parts.query)
        return urlunsplit(parts._replace(netloc=netloc, query=query))

    # key=value form: the value is either single-quoted or a run of non-blanks.
    return re.sub(r"(?i)(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)


def main() -> None:
    """Bootstrap the Phase-1 registries on a live PostgreSQL and write a report.

    Command-line options:
        --dsn     PostgreSQL connection string (required).
        --schema  Schema placed first on ``search_path`` (default ``acr_test``).
                  The value is quoted as a single SQL identifier, so it names
                  exactly one schema and is case-sensitive.
        --output  Path of the JSON report (default ``DEFAULT_OUTPUT``).

    Every entry of ``MODELS`` and ``FEATURE_SETS`` plus ``REFERENCE_SET`` is
    written through the upsert/ensure helpers. The returned ids and the final
    row counts of the three registry tables are collected into a summary that
    is written to ``--output`` and printed to stdout.

    Raises:
        KeyError: If a feature set names a (model_name, model_version) pair
            that is absent from ``MODELS``.
    """
    # Function-scope import: only this function composes SQL identifiers.
    from psycopg import sql

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    summary: dict[str, Any] = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report names the right server.
        'dsn_redacted': _redact_dsn(args.dsn),
        'models': [],
        'feature_sets': [],
        'reference_set': None,
    }

    with psycopg.connect(args.dsn, autocommit=True) as conn:
        # Compose the schema as an identifier instead of interpolating raw text.
        conn.execute(
            sql.SQL('SET search_path TO {}, public').format(sql.Identifier(args.schema))
        )

        model_ids: dict[tuple[str, str], int] = {}
        for model in MODELS:
            model_id = upsert_model(conn, model)
            model_ids[(model['model_name'], model['model_version'])] = model_id
            summary['models'].append({
                'model_id': model_id,
                'model_name': model['model_name'],
                'model_version': model['model_version'],
                'output_embedding_dim': model['output_embedding_dim'],
            })

        for feature in FEATURE_SETS:
            # Feature sets refer to their model by name/version; map to the id.
            model_id = model_ids[(feature['model_name'], feature['model_version'])]
            feature_set_id = ensure_feature_set(conn, model_id, feature)
            summary['feature_sets'].append({
                'feature_set_id': feature_set_id,
                'model_name': feature['model_name'],
                'model_version': feature['model_version'],
                'feature_name': feature['feature_name'],
                'window_sec': feature['window_sec'],
                'hop_sec': feature['hop_sec'],
                'embedding_dim': feature['embedding_dim'],
                'distance_metric': feature['distance_metric'],
            })

        reference_set_id = upsert_reference_set(conn, REFERENCE_SET)
        summary['reference_set'] = {
            'reference_set_id': reference_set_id,
            'set_name': REFERENCE_SET['set_name'],
            'encoder_scope': REFERENCE_SET['encoder_scope'],
        }
        # Whole-table counts, which may include rows this script did not write.
        summary['counts'] = {
            'model_registry': int(conn.execute('SELECT count(*) FROM model_registry;').fetchone()[0]),
            'feature_set_registry': int(conn.execute('SELECT count(*) FROM feature_set_registry;').fetchone()[0]),
            'reference_set_registry': int(conn.execute('SELECT count(*) FROM reference_set_registry;').fetchone()[0]),
        }

    rendered = json.dumps(summary, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)
407
408
409 if __name__ == '__main__':
410 main()
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。
3 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。 4 - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。
4 - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md)`acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile``diff --check` 通过的机械验证事实。 5 - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md)`acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile``diff --check` 通过的机械验证事实。
5 - 新增 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md),补齐 `acr_pg_schema_v2.sql` 的真实落库样例、`pgvector` live 检索验证、lineage trigger 负例测试,以及当前召回/混淆结果解读。 6 - 新增 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md),补齐 `acr_pg_schema_v2.sql` 的真实落库样例、`pgvector` live 检索验证、lineage trigger 负例测试,以及当前召回/混淆结果解读。
......
...@@ -216,3 +216,67 @@ flowchart TD ...@@ -216,3 +216,67 @@ flowchart TD
216 6. `phase1_hot_reference_v1` 216 6. `phase1_hot_reference_v1`
217 217
218 这样数据、模型、索引三条线就都有了稳定入口。 218 这样数据、模型、索引三条线就都有了稳定入口。
219
220 ---
221
222 ## 8. live PostgreSQL bootstrap 脚本
223
224 为了避免每次手工执行 SQL,本仓库现在提供了一个可直接连 PostgreSQL 的 live bootstrap 脚本:
225
226 - `acr-engine/scripts/bootstrap_phase1_model_registry_live.py`
227
228 用途:
229 - 向目标 schema 写入 `model_registry`
230 - 写入 `feature_set_registry`
231 - 写入 `reference_set_registry`
232 - 采用 **幂等式 upsert / ensure** 方式,适合重复执行
233
234 ### 8.1 执行命令
235
236 ```bash
237 cd /workspace/acr-engine
238 /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_model_registry_live.py \
239 --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \
240 --schema acr_test \
241 --output data/pgvector_eval/music20/phase1_registry_bootstrap_report.json
242 ```
243
244 ### 8.2 当前已验证结果(acr_test)
245
246 本轮已在 `acr_test` schema 上真实执行,写入结果如下:
247
248 | 对象 | 数量 |
249 |---|---:|
250 | `model_registry` | `5` |
251 | `feature_set_registry` | `6` |
252 | `reference_set_registry` | `2` |
253
254 其中新增的 Phase-1 对象包含:
255
256 #### models
257 - `chromaprint v1`
258 - `mert v1-95m`
259 - `muq large-msd-iter`
260 - `ecapa acr-baseline-v1`
261
262 #### feature sets
263 - `chromaprint fingerprint_asset`
264 - `mert semantic_embedding 5s/2.5s`
265 - `mert semantic_embedding 10s/5s`
266 - `muq semantic_embedding 5s/2.5s`
267 - `ecapa semantic_embedding 5s/2.5s`
268
269 #### reference set
270 - `phase1_hot_reference_v1`
271
272 ### 8.3 当前产物
273
274 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`
275
276 这个文件已经记录了:
277 - model_id
278 - feature_set_id
279 - reference_set_id
280 - 最终表计数
281
282 因此,下次 session 不需要再从 SQL 片段手工执行开始,而可以直接从 live bootstrap 脚本接上。
......
...@@ -62,8 +62,10 @@ ...@@ -62,8 +62,10 @@
62 |---|---| 62 |---|---|
63 | 推荐 DDL | `acr-engine/sql/acr_pg_schema_v2.sql` | 63 | 推荐 DDL | `acr-engine/sql/acr_pg_schema_v2.sql` |
64 | live 测试脚本 | `acr-engine/scripts/live_pgvector_music20_eval.py` | 64 | live 测试脚本 | `acr-engine/scripts/live_pgvector_music20_eval.py` |
65 | registry bootstrap 脚本 | `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` |
65 | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` | 66 | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` |
66 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` | 67 | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` |
68 | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` |
67 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | 69 | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` |
68 70
69 --- 71 ---
...@@ -379,6 +381,23 @@ flowchart LR ...@@ -379,6 +381,23 @@ flowchart LR
379 381
380 ## 推荐的下一步 382 ## 推荐的下一步
381 383
384 ### 本轮新增:Phase-1 registry 已可 live bootstrap
385
386 除了 live 检索脚本外,本轮还新增了:
387
388 - `acr-engine/scripts/bootstrap_phase1_model_registry_live.py`
389
390 它已经在 `acr_test` schema 上真实写入了:
391 - `chromaprint`
392 - `mert`
393 - `muq`
394 - `ecapa`
395 - 对应 feature sets
396 - `phase1_hot_reference_v1`
397
398 对应 live 报告:
399 - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`
400
382 ### 路线 1:继续做 PostgreSQL 工程化 401 ### 路线 1:继续做 PostgreSQL 工程化
383 402
384 1.`live_pgvector_music20_eval.py` 泛化成: 403 1.`live_pgvector_music20_eval.py` 泛化成:
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
24 - SOTA 演进路径已明确:**Phase-1 先走 encoder-only** 24 - SOTA 演进路径已明确:**Phase-1 先走 encoder-only**
25 - PostgreSQL 主数据与特征注册 DDL 已落地为推荐版 schema 25 - PostgreSQL 主数据与特征注册 DDL 已落地为推荐版 schema
26 - Phase-1 实施 checklist 和 model/feature/reference set 初始化手册已补齐 26 - Phase-1 实施 checklist 和 model/feature/reference set 初始化手册已补齐
27 - `acr_test` schema 上已经真实完成 Phase-1 `model_registry / feature_set_registry / reference_set_registry` bootstrap 验证
27 28
28 当前最重要的下一步不是继续写方案,而是: 29 当前最重要的下一步不是继续写方案,而是:
29 30
...@@ -180,6 +181,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql ...@@ -180,6 +181,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
180 - 代码已推送远端 181 - 代码已推送远端
181 - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据 182 - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据
182 - 机械校验已补齐:`live_pgvector_music20_eval.py``py_compile` 通过,相关变更 `diff --check` 通过 183 - 机械校验已补齐:`live_pgvector_music20_eval.py``py_compile` 通过,相关变更 `diff --check` 通过
184 - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1`
183 185
184 ### 未验证 / 仍是缺口 186 ### 未验证 / 仍是缺口
185 - **未实际跑 MERT / MuQ encoder-only 特征抽取** 187 - **未实际跑 MERT / MuQ encoder-only 特征抽取**
......