Bootstrap the Phase-1 model registry on live PostgreSQL
Constraint: Continue the Ralph loop without waiting on missing business sample mounts, while still leaving a push-ready implementation and documentation trail Rejected: Keep Phase-1 registry setup as static SQL snippets only | It slows live validation and leaves no machine-checkable bootstrap path Confidence: high Scope-risk: narrow Directive: Treat model_registry/feature_set_registry/reference_set_registry as the mandatory entrypoint before any future MERT/MuQ extraction jobs Tested: /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_model_registry_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --output data/pgvector_eval/music20/phase1_registry_bootstrap_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/bootstrap_phase1_model_registry_live.py; git diff --check -- acr-engine/scripts/bootstrap_phase1_model_registry_live.py acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md Not-tested: Actual MERT/MuQ embedding extraction, hard-case type_8/type_16 live queries, multi-recording/cover-lane retrieval
Showing
6 changed files
with
588 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "models": [ | ||
| 5 | { | ||
| 6 | "model_id": 2, | ||
| 7 | "model_name": "chromaprint", | ||
| 8 | "model_version": "v1", | ||
| 9 | "output_embedding_dim": null | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "model_id": 3, | ||
| 13 | "model_name": "mert", | ||
| 14 | "model_version": "v1-95m", | ||
| 15 | "output_embedding_dim": 768 | ||
| 16 | }, | ||
| 17 | { | ||
| 18 | "model_id": 4, | ||
| 19 | "model_name": "muq", | ||
| 20 | "model_version": "large-msd-iter", | ||
| 21 | "output_embedding_dim": 768 | ||
| 22 | }, | ||
| 23 | { | ||
| 24 | "model_id": 5, | ||
| 25 | "model_name": "ecapa", | ||
| 26 | "model_version": "acr-baseline-v1", | ||
| 27 | "output_embedding_dim": 192 | ||
| 28 | } | ||
| 29 | ], | ||
| 30 | "feature_sets": [ | ||
| 31 | { | ||
| 32 | "feature_set_id": 2, | ||
| 33 | "model_name": "chromaprint", | ||
| 34 | "model_version": "v1", | ||
| 35 | "feature_name": "fingerprint_asset", | ||
| 36 | "window_sec": 5.0, | ||
| 37 | "hop_sec": 2.5, | ||
| 38 | "embedding_dim": null, | ||
| 39 | "distance_metric": "hamming" | ||
| 40 | }, | ||
| 41 | { | ||
| 42 | "feature_set_id": 3, | ||
| 43 | "model_name": "mert", | ||
| 44 | "model_version": "v1-95m", | ||
| 45 | "feature_name": "semantic_embedding", | ||
| 46 | "window_sec": 5.0, | ||
| 47 | "hop_sec": 2.5, | ||
| 48 | "embedding_dim": 768, | ||
| 49 | "distance_metric": "cosine" | ||
| 50 | }, | ||
| 51 | { | ||
| 52 | "feature_set_id": 4, | ||
| 53 | "model_name": "mert", | ||
| 54 | "model_version": "v1-95m", | ||
| 55 | "feature_name": "semantic_embedding", | ||
| 56 | "window_sec": 10.0, | ||
| 57 | "hop_sec": 5.0, | ||
| 58 | "embedding_dim": 768, | ||
| 59 | "distance_metric": "cosine" | ||
| 60 | }, | ||
| 61 | { | ||
| 62 | "feature_set_id": 5, | ||
| 63 | "model_name": "muq", | ||
| 64 | "model_version": "large-msd-iter", | ||
| 65 | "feature_name": "semantic_embedding", | ||
| 66 | "window_sec": 5.0, | ||
| 67 | "hop_sec": 2.5, | ||
| 68 | "embedding_dim": 768, | ||
| 69 | "distance_metric": "cosine" | ||
| 70 | }, | ||
| 71 | { | ||
| 72 | "feature_set_id": 6, | ||
| 73 | "model_name": "ecapa", | ||
| 74 | "model_version": "acr-baseline-v1", | ||
| 75 | "feature_name": "semantic_embedding", | ||
| 76 | "window_sec": 5.0, | ||
| 77 | "hop_sec": 2.5, | ||
| 78 | "embedding_dim": 192, | ||
| 79 | "distance_metric": "cosine" | ||
| 80 | } | ||
| 81 | ], | ||
| 82 | "reference_set": { | ||
| 83 | "reference_set_id": 2, | ||
| 84 | "set_name": "phase1_hot_reference_v1", | ||
| 85 | "encoder_scope": "chromaprint-v1 / mert-v1-95m / muq-large-msd-iter" | ||
| 86 | }, | ||
| 87 | "counts": { | ||
| 88 | "model_registry": 5, | ||
| 89 | "feature_set_registry": 6, | ||
| 90 | "reference_set_registry": 2 | ||
| 91 | } | ||
| 92 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | from typing import Any | ||
| 8 | |||
| 9 | import psycopg | ||
| 10 | |||
# Directory one level above the directory that contains this script
# (parents[1] of the resolved script path).
ROOT = Path(__file__).resolve().parents[1]
# Default path of the JSON report; used as the default value of --output.
DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_registry_bootstrap_report.json'
| 13 | |||
# Rows to upsert into model_registry.
# Each dict is passed as named query parameters, so its keys must match the
# %(name)s placeholders of the INSERT statement in upsert_model.
# (model_name, model_version) is the ON CONFLICT key of that statement.
MODELS = [
    {
        'model_name': 'chromaprint',
        'model_family': 'fingerprint',
        'model_version': 'v1',
        'model_source': 'acoustid',
        'model_uri': 'https://acoustid.org/chromaprint',
        'license_name': 'lgpl-2.1',
        'input_modality': 'audio',
        'input_sample_rate': 16000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        # Fingerprint lane: no dense embedding, hence no dimension.
        'output_embedding_dim': None,
        'pooling_supported': ['none'],
        'layer_selection_supported': False,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'exact',
            'phase': 'phase1',
            'note': 'exact fingerprint lane baseline',
        },
    },
    {
        'model_name': 'mert',
        'model_family': 'music_ssl',
        'model_version': 'v1-95m',
        'model_source': 'github',
        'model_uri': 'https://github.com/yizhilll/MERT',
        'license_name': 'apache-2.0',
        'input_modality': 'audio',
        'input_sample_rate': 24000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': 768,
        'pooling_supported': ['mean', 'cls'],
        'layer_selection_supported': True,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'primary_baseline',
            'phase': 'phase1',
        },
    },
    {
        'model_name': 'muq',
        'model_family': 'music_ssl',
        'model_version': 'large-msd-iter',
        'model_source': 'github',
        'model_uri': 'https://github.com/tencent-ailab/MuQ',
        'license_name': 'apache-2.0',
        'input_modality': 'audio',
        'input_sample_rate': 24000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        # NOTE(review): the "large" MuQ checkpoint may emit 1024-dim hidden
        # states rather than 768 — confirm against the actual checkpoint
        # before any vector column or index is sized from this value.
        'output_embedding_dim': 768,
        'pooling_supported': ['mean', 'cls'],
        'layer_selection_supported': True,
        'is_trainable': False,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'challenger',
            'phase': 'phase1',
        },
    },
    {
        'model_name': 'ecapa',
        'model_family': 'speech_derived',
        'model_version': 'acr-baseline-v1',
        'model_source': 'local',
        # Local model: there is no public URI to record.
        'model_uri': None,
        'license_name': 'internal-eval',
        'input_modality': 'audio',
        'input_sample_rate': 16000,
        'input_channel_mode': 'mono',
        'default_window_sec': 5.0,
        'default_hop_sec': 2.5,
        'output_embedding_dim': 192,
        'pooling_supported': ['mean'],
        'layer_selection_supported': False,
        'is_trainable': True,
        'metadata_json': {
            'lane': 'semantic',
            'role': 'historical_baseline',
            'phase': 'phase1',
        },
    },
]
| 104 | |||
# Rows to ensure in feature_set_registry.
# 'model_name' / 'model_version' are only used by main() to resolve the
# model_id returned by upsert_model; the remaining keys are read by
# ensure_feature_set. Every (model_name, model_version) pair listed here must
# therefore also appear in MODELS, otherwise main() raises KeyError.
FEATURE_SETS = [
    {
        'model_name': 'chromaprint',
        'model_version': 'v1',
        'feature_name': 'fingerprint_asset',
        'feature_level': 'asset',
        'extraction_granularity': 'full_asset',
        # NOTE(review): this entry is declared as whole-asset extraction yet
        # still carries window/hop values — confirm they are meaningful here.
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': None,
        'pooling_strategy': 'none',
        'layer_selection': 'na',
        'normalize_l2': False,
        'distance_metric': 'hamming',
        'quantization_type': 'fingerprint_hash',
        'feature_schema_version': 'v1',
        'config_json': {'lane': 'exact', 'index_target': 'audio_fingerprint'},
        'status': 'active',
    },
    {
        'model_name': 'mert',
        'model_version': 'v1-95m',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'primary_semantic_baseline'},
        'status': 'active',
    },
    {
        # Same model and feature as the previous entry, but with a 10 s window
        # and 5 s hop; it differs from that entry only in window/hop and role.
        'model_name': 'mert',
        'model_version': 'v1-95m',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 10.0,
        'hop_sec': 5.0,
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'long_context_validation'},
        'status': 'active',
    },
    {
        'model_name': 'muq',
        'model_version': 'large-msd-iter',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        # NOTE(review): keep in sync with the model's real output dimension —
        # confirm 768 for the MuQ "large" checkpoint.
        'embedding_dim': 768,
        'pooling_strategy': 'mean',
        'layer_selection': 'final',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'semantic_challenger'},
        'status': 'active',
    },
    {
        'model_name': 'ecapa',
        'model_version': 'acr-baseline-v1',
        'feature_name': 'semantic_embedding',
        'feature_level': 'window',
        'extraction_granularity': 'sliding_window',
        'window_sec': 5.0,
        'hop_sec': 2.5,
        'embedding_dim': 192,
        'pooling_strategy': 'mean',
        'layer_selection': 'na',
        'normalize_l2': True,
        'distance_metric': 'cosine',
        'quantization_type': None,
        'feature_schema_version': 'v1',
        'config_json': {'role': 'historical_baseline'},
        'status': 'active',
    },
]
| 197 | |||
# Single row to upsert into reference_set_registry; 'set_name' is the
# ON CONFLICT key used by upsert_reference_set.
REFERENCE_SET = {
    'set_name': 'phase1_hot_reference_v1',
    'description': 'Phase-1 hot reference set bootstrap for MERT/MuQ/Chromaprint lanes',
    # NOTE(review): the scope lists chromaprint / mert / muq but not ecapa,
    # although ecapa is registered in MODELS — confirm this is intentional.
    'encoder_scope': 'chromaprint-v1 / mert-v1-95m / muq-large-msd-iter',
    'status': 'active',
    'metadata_json': {
        'phase': 'phase1',
        'purpose': 'registry_bootstrap',
    },
}
| 208 | |||
| 209 | |||
def upsert_model(conn: psycopg.Connection, model: dict[str, Any]) -> int:
    """Insert or refresh one ``model_registry`` row and return its ``model_id``.

    The row is keyed on ``(model_name, model_version)``. When that pair
    already exists, every other column is overwritten with the supplied
    values and ``updated_at`` is set to ``NOW()``.

    Args:
        conn: Open psycopg connection used to execute the statement.
        model: Mapping whose keys match the named placeholders below;
            ``metadata_json`` is a JSON-serialisable object.

    Returns:
        The ``model_id`` reported by the ``RETURNING`` clause.
    """
    # The jsonb column needs a JSON string; every other value passes through
    # unchanged. A copy is used so the caller's dict is not mutated.
    bound_values = dict(model)
    bound_values['metadata_json'] = json.dumps(model['metadata_json'])

    statement = """
        INSERT INTO model_registry (
            model_name, model_family, model_version, model_source, model_uri,
            license_name, input_modality, input_sample_rate, input_channel_mode,
            default_window_sec, default_hop_sec, output_embedding_dim,
            pooling_supported, layer_selection_supported, is_trainable, metadata_json
        ) VALUES (
            %(model_name)s, %(model_family)s, %(model_version)s, %(model_source)s, %(model_uri)s,
            %(license_name)s, %(input_modality)s, %(input_sample_rate)s, %(input_channel_mode)s,
            %(default_window_sec)s, %(default_hop_sec)s, %(output_embedding_dim)s,
            %(pooling_supported)s, %(layer_selection_supported)s, %(is_trainable)s, %(metadata_json)s::jsonb
        )
        ON CONFLICT (model_name, model_version)
        DO UPDATE SET
            model_family = EXCLUDED.model_family,
            model_source = EXCLUDED.model_source,
            model_uri = EXCLUDED.model_uri,
            license_name = EXCLUDED.license_name,
            input_modality = EXCLUDED.input_modality,
            input_sample_rate = EXCLUDED.input_sample_rate,
            input_channel_mode = EXCLUDED.input_channel_mode,
            default_window_sec = EXCLUDED.default_window_sec,
            default_hop_sec = EXCLUDED.default_hop_sec,
            output_embedding_dim = EXCLUDED.output_embedding_dim,
            pooling_supported = EXCLUDED.pooling_supported,
            layer_selection_supported = EXCLUDED.layer_selection_supported,
            is_trainable = EXCLUDED.is_trainable,
            metadata_json = EXCLUDED.metadata_json,
            updated_at = NOW()
        RETURNING model_id;
        """

    returned = conn.execute(statement, bound_values).fetchone()
    return int(returned[0])
| 246 | |||
| 247 | |||
def ensure_feature_set(conn: psycopg.Connection, model_id: int, feature: dict[str, Any]) -> int:
    """Return the id of the feature set described by ``feature``, creating it if absent.

    A row is considered the same feature set when ``model_id`` and the
    identity columns (feature name/level, granularity, window/hop, embedding
    dimension, pooling, layer selection, L2 normalisation, distance metric and
    schema version) all match; NULLs are compared as equal through
    ``coalesce``. The non-identity columns ``quantization_type``,
    ``config_json`` and ``status`` are refreshed on an existing row, so
    re-running the bootstrap converges the row to the declared definition.

    Args:
        conn: Open psycopg connection used to execute the statements.
        model_id: ``model_registry.model_id`` that owns the feature set.
        feature: Feature-set definition; ``config_json`` is a
            JSON-serialisable object.

    Returns:
        The ``feature_set_id`` of the existing or newly inserted row.
    """
    # Columns that identify a feature set; shared by the lookup and the insert
    # so the two statements cannot drift apart.
    identity = {
        'model_id': model_id,
        'feature_name': feature['feature_name'],
        'feature_level': feature['feature_level'],
        'extraction_granularity': feature['extraction_granularity'],
        'window_sec': feature['window_sec'],
        'hop_sec': feature['hop_sec'],
        'embedding_dim': feature['embedding_dim'],
        'pooling_strategy': feature['pooling_strategy'],
        'layer_selection': feature['layer_selection'],
        'normalize_l2': feature['normalize_l2'],
        'distance_metric': feature['distance_metric'],
        'feature_schema_version': feature['feature_schema_version'],
    }
    # Columns that may legitimately change between runs of the bootstrap.
    mutable = {
        'quantization_type': feature['quantization_type'],
        'config_json': json.dumps(feature['config_json']),
        'status': feature['status'],
    }

    existing = conn.execute(
        """
        SELECT feature_set_id
        FROM feature_set_registry
        WHERE model_id = %(model_id)s
          AND feature_name = %(feature_name)s
          AND feature_level = %(feature_level)s
          AND extraction_granularity = %(extraction_granularity)s
          AND coalesce(window_sec, -1) = coalesce(%(window_sec)s, -1)
          AND coalesce(hop_sec, -1) = coalesce(%(hop_sec)s, -1)
          AND coalesce(embedding_dim, -1) = coalesce(%(embedding_dim)s, -1)
          AND coalesce(pooling_strategy, '') = coalesce(%(pooling_strategy)s, '')
          AND coalesce(layer_selection, '') = coalesce(%(layer_selection)s, '')
          AND normalize_l2 = %(normalize_l2)s
          AND distance_metric = %(distance_metric)s
          AND coalesce(feature_schema_version, '') = coalesce(%(feature_schema_version)s, '');
        """,
        identity,
    ).fetchone()

    if existing:
        feature_set_id = int(existing[0])
        # quantization_type is written on insert but is not part of the lookup
        # key, so it has to be refreshed here as well; otherwise a changed
        # definition would never reach an already-registered row.
        conn.execute(
            """
            UPDATE feature_set_registry
            SET quantization_type = %(quantization_type)s,
                config_json = %(config_json)s::jsonb,
                status = %(status)s,
                updated_at = NOW()
            WHERE feature_set_id = %(feature_set_id)s
            """,
            {**mutable, 'feature_set_id': feature_set_id},
        )
        return feature_set_id

    row = conn.execute(
        """
        INSERT INTO feature_set_registry (
            model_id, feature_name, feature_level, extraction_granularity,
            window_sec, hop_sec, embedding_dim, pooling_strategy, layer_selection,
            normalize_l2, distance_metric, quantization_type, feature_schema_version,
            config_json, status
        ) VALUES (
            %(model_id)s, %(feature_name)s, %(feature_level)s, %(extraction_granularity)s,
            %(window_sec)s, %(hop_sec)s, %(embedding_dim)s, %(pooling_strategy)s, %(layer_selection)s,
            %(normalize_l2)s, %(distance_metric)s, %(quantization_type)s, %(feature_schema_version)s,
            %(config_json)s::jsonb, %(status)s
        )
        RETURNING feature_set_id;
        """,
        {**identity, **mutable},
    ).fetchone()
    return int(row[0])
| 322 | |||
| 323 | |||
def upsert_reference_set(conn: psycopg.Connection, payload: dict[str, Any]) -> int:
    """Insert or refresh one ``reference_set_registry`` row and return its id.

    The row is keyed on ``set_name``. When the name already exists, the
    description, encoder scope, status and metadata are overwritten and
    ``updated_at`` is set to ``NOW()``.

    Args:
        conn: Open psycopg connection used to execute the statement.
        payload: Mapping with ``set_name``, ``description``,
            ``encoder_scope``, ``status`` and a JSON-serialisable
            ``metadata_json``.

    Returns:
        The ``reference_set_id`` reported by the ``RETURNING`` clause.
    """
    statement = """
        INSERT INTO reference_set_registry (set_name, description, encoder_scope, status, metadata_json)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (set_name)
        DO UPDATE SET
            description = EXCLUDED.description,
            encoder_scope = EXCLUDED.encoder_scope,
            status = EXCLUDED.status,
            metadata_json = EXCLUDED.metadata_json,
            updated_at = NOW()
        RETURNING reference_set_id;
        """

    # Positional parameters, in the column order of the INSERT above.
    plain_columns = ('set_name', 'description', 'encoder_scope', 'status')
    bound_values = [payload[column] for column in plain_columns]
    bound_values.append(json.dumps(payload['metadata_json']))

    returned = conn.execute(statement, tuple(bound_values)).fetchone()
    return int(returned[0])
| 347 | |||
| 348 | |||
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles URI DSNs (``scheme://user:password@host/db?password=...``) and
    keyword/value DSNs (``host=... password=...``). A DSN without a password
    is returned unchanged.
    """
    import re

    scheme, separator, rest = dsn.partition('://')
    if not separator:
        # Keyword/value form; the value may be single-quoted.
        return re.sub(r"(?i)(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)

    # URI form: the authority runs up to the first '/', '?' or '#'.
    authority = re.match(r'[^/?#]*', rest).group(0)
    tail = rest[len(authority):]
    # rpartition so that an '@' inside the password does not leak its suffix.
    userinfo, at_sign, hosts = authority.rpartition('@')
    if at_sign and ':' in userinfo:
        user = userinfo.split(':', 1)[0]
        authority = f'{user}:***@{hosts}'
    # Passwords can also be supplied as query parameters.
    tail = re.sub(r'(?i)(password=)[^&#]*', r'\1***', tail)
    return f'{scheme}://{authority}{tail}'


def main() -> None:
    """Bootstrap the Phase-1 registries and write a JSON report.

    Command-line options:
        --dsn:    PostgreSQL connection string (required).
        --schema: Schema placed first on the search_path (default ``acr_test``).
        --output: Path of the JSON report (default ``DEFAULT_OUTPUT``).

    All rows from ``MODELS``, ``FEATURE_SETS`` and ``REFERENCE_SET`` are
    written inside a single transaction, so a failure part-way through leaves
    the registries untouched. The report contains the returned ids, the final
    row counts of the three tables and the DSN with its password masked; it is
    written to ``--output`` and echoed to stdout.
    """
    # Imported here because the module top level only imports the psycopg
    # package itself.
    from psycopg import sql

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    summary: dict[str, Any] = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report stays truthful
        # for any target database while never persisting the password.
        'dsn_redacted': _redact_dsn(args.dsn),
        'models': [],
        'feature_sets': [],
        'reference_set': None,
    }

    with psycopg.connect(args.dsn, autocommit=True) as conn:
        # SET cannot take bound parameters; compose the identifier client-side
        # so the schema name is quoted instead of spliced into the SQL text.
        # Quoting makes the name case-sensitive, exactly as stored in PostgreSQL.
        conn.execute(
            sql.SQL('SET search_path TO {}, public').format(sql.Identifier(args.schema))
        )

        # One transaction for the whole bootstrap: either every registry row
        # is written or none is.
        with conn.transaction():
            model_ids: dict[tuple[str, str], int] = {}
            for model in MODELS:
                model_id = upsert_model(conn, model)
                model_ids[(model['model_name'], model['model_version'])] = model_id
                summary['models'].append({
                    'model_id': model_id,
                    'model_name': model['model_name'],
                    'model_version': model['model_version'],
                    'output_embedding_dim': model['output_embedding_dim'],
                })

            for feature in FEATURE_SETS:
                model_id = model_ids[(feature['model_name'], feature['model_version'])]
                feature_set_id = ensure_feature_set(conn, model_id, feature)
                summary['feature_sets'].append({
                    'feature_set_id': feature_set_id,
                    'model_name': feature['model_name'],
                    'model_version': feature['model_version'],
                    'feature_name': feature['feature_name'],
                    'window_sec': feature['window_sec'],
                    'hop_sec': feature['hop_sec'],
                    'embedding_dim': feature['embedding_dim'],
                    'distance_metric': feature['distance_metric'],
                })

            reference_set_id = upsert_reference_set(conn, REFERENCE_SET)
            summary['reference_set'] = {
                'reference_set_id': reference_set_id,
                'set_name': REFERENCE_SET['set_name'],
                'encoder_scope': REFERENCE_SET['encoder_scope'],
            }

            # Whole-table totals: they include rows that existed before this
            # run, not only the rows written above.
            summary['counts'] = {
                table: int(
                    conn.execute(
                        sql.SQL('SELECT count(*) FROM {}').format(sql.Identifier(table))
                    ).fetchone()[0]
                )
                for table in ('model_registry', 'feature_set_registry', 'reference_set_registry')
            }

    report = json.dumps(summary, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report, encoding='utf-8')
    print(report)


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 | ||
| 3 | - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。 | 4 | - 补充文档阻塞事实:当前容器里缺少 `/workspace/downloads`,因此本轮无法直接从业务样本目录继续生成 `type_8 / type_16` 的 live PostgreSQL query JSONL;已把该环境前提写入 handoff 与 PostgreSQL 样例文档。 |
| 4 | - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md) 与 `acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile` 与 `diff --check` 通过的机械验证事实。 | 5 | - 更新 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md) 与 `acr-engine/scripts/live_pgvector_music20_eval.py`,把 lineage 负例验证从单条 `audio_window` 扩展到 `recording` / `audio_window` / `audio_embedding` 三类核心 trigger,并已重跑 live pgvector 报告确认检索指标不变;同时补充 `py_compile` 与 `diff --check` 通过的机械验证事实。 |
| 5 | - 新增 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md),补齐 `acr_pg_schema_v2.sql` 的真实落库样例、`pgvector` live 检索验证、lineage trigger 负例测试,以及当前召回/混淆结果解读。 | 6 | - 新增 [PostgreSQL 落库样例与 live 测试链路](./postgres_db_schema_samples.md),补齐 `acr_pg_schema_v2.sql` 的真实落库样例、`pgvector` live 检索验证、lineage trigger 负例测试,以及当前召回/混淆结果解读。 | ... | ... |
| ... | @@ -216,3 +216,67 @@ flowchart TD | ... | @@ -216,3 +216,67 @@ flowchart TD |
| 216 | 6. `phase1_hot_reference_v1` | 216 | 6. `phase1_hot_reference_v1` |
| 217 | 217 | ||
| 218 | 这样数据、模型、索引三条线就都有了稳定入口。 | 218 | 这样数据、模型、索引三条线就都有了稳定入口。 |
| 219 | |||
| 220 | --- | ||
| 221 | |||
| 222 | ## 8. live PostgreSQL bootstrap 脚本 | ||
| 223 | |||
| 224 | 为了避免每次手工执行 SQL,本仓库现在提供了一个可直接连 PostgreSQL 的 live bootstrap 脚本: | ||
| 225 | |||
| 226 | - `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` | ||
| 227 | |||
| 228 | 用途: | ||
| 229 | - 向目标 schema 写入 `model_registry` | ||
| 230 | - 写入 `feature_set_registry` | ||
| 231 | - 写入 `reference_set_registry` | ||
| 232 | - 采用 **幂等式 upsert / ensure** 方式,适合重复执行 | ||
| 233 | |||
| 234 | ### 8.1 执行命令 | ||
| 235 | |||
| 236 | ```bash | ||
| 237 | cd /workspace/acr-engine | ||
| 238 | /usr/local/miniconda3/bin/python scripts/bootstrap_phase1_model_registry_live.py \ | ||
| 239 | --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \ | ||
| 240 | --schema acr_test \ | ||
| 241 | --output data/pgvector_eval/music20/phase1_registry_bootstrap_report.json | ||
| 242 | ``` | ||
| 243 | |||
| 244 | ### 8.2 当前已验证结果(acr_test) | ||
| 245 | |||
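| 246 | 本轮已在 `acr_test` schema 上真实执行;执行后各表的总行数如下(该数字来自 `SELECT count(*)` 的全表计数,包含执行前已存在的记录,因此大于本脚本写入的 4 个 model / 5 组 feature set / 1 个 reference set): | ||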
| 247 | |||
| 248 | | 对象 | 数量 | | ||
| 249 | |---|---:| | ||
| 250 | | `model_registry` | `5` | | ||
| 251 | | `feature_set_registry` | `6` | | ||
| 252 | | `reference_set_registry` | `2` | | ||
| 253 | |||
| 254 | 其中新增的 Phase-1 对象包含: | ||
| 255 | |||
| 256 | #### models | ||
| 257 | - `chromaprint v1` | ||
| 258 | - `mert v1-95m` | ||
| 259 | - `muq large-msd-iter` | ||
| 260 | - `ecapa acr-baseline-v1` | ||
| 261 | |||
| 262 | #### feature sets | ||
| 263 | - `chromaprint fingerprint_asset` | ||
| 264 | - `mert semantic_embedding 5s/2.5s` | ||
| 265 | - `mert semantic_embedding 10s/5s` | ||
| 266 | - `muq semantic_embedding 5s/2.5s` | ||
| 267 | - `ecapa semantic_embedding 5s/2.5s` | ||
| 268 | |||
| 269 | #### reference set | ||
| 270 | - `phase1_hot_reference_v1` | ||
| 271 | |||
| 272 | ### 8.3 当前产物 | ||
| 273 | |||
| 274 | - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | ||
| 275 | |||
| 276 | 这个文件已经记录了: | ||
| 277 | - model_id | ||
| 278 | - feature_set_id | ||
| 279 | - reference_set_id | ||
| 280 | - 最终表计数 | ||
| 281 | |||
| 282 | 因此,下次 session 不需要再从 SQL 片段手工执行开始,而可以直接从 live bootstrap 脚本接上。 | ... | ... |
| ... | @@ -62,8 +62,10 @@ | ... | @@ -62,8 +62,10 @@ |
| 62 | |---|---| | 62 | |---|---| |
| 63 | | 推荐 DDL | `acr-engine/sql/acr_pg_schema_v2.sql` | | 63 | | 推荐 DDL | `acr-engine/sql/acr_pg_schema_v2.sql` | |
| 64 | | live 测试脚本 | `acr-engine/scripts/live_pgvector_music20_eval.py` | | 64 | | live 测试脚本 | `acr-engine/scripts/live_pgvector_music20_eval.py` | |
| 65 | | registry bootstrap 脚本 | `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` | | ||
| 65 | | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` | | 66 | | live 报告 | `acr-engine/data/pgvector_eval/music20/live_pgvector_report.json` | |
| 66 | | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` | | 67 | | FAISS 对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report_fresh.json` | |
| 68 | | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | | ||
| 67 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | | 69 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | |
| 68 | 70 | ||
| 69 | --- | 71 | --- |
| ... | @@ -379,6 +381,23 @@ flowchart LR | ... | @@ -379,6 +381,23 @@ flowchart LR |
| 379 | 381 | ||
| 380 | ## 推荐的下一步 | 382 | ## 推荐的下一步 |
| 381 | 383 | ||
| 384 | ### 本轮新增:Phase-1 registry 已可 live bootstrap | ||
| 385 | |||
| 386 | 除了 live 检索脚本外,本轮还新增了: | ||
| 387 | |||
| 388 | - `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` | ||
| 389 | |||
| 390 | 它已经在 `acr_test` schema 上真实写入了: | ||
| 391 | - `chromaprint` | ||
| 392 | - `mert` | ||
| 393 | - `muq` | ||
| 394 | - `ecapa` | ||
| 395 | - 对应 feature sets | ||
| 396 | - `phase1_hot_reference_v1` | ||
| 397 | |||
| 398 | 对应 live 报告: | ||
| 399 | - `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | ||
| 400 | |||
| 382 | ### 路线 1:继续做 PostgreSQL 工程化 | 401 | ### 路线 1:继续做 PostgreSQL 工程化 |
| 383 | 402 | ||
| 384 | 1. 把 `live_pgvector_music20_eval.py` 泛化成: | 403 | 1. 把 `live_pgvector_music20_eval.py` 泛化成: | ... | ... |
| ... | @@ -24,6 +24,7 @@ | ... | @@ -24,6 +24,7 @@ |
| 24 | - SOTA 演进路径已明确:**Phase-1 先走 encoder-only** | 24 | - SOTA 演进路径已明确:**Phase-1 先走 encoder-only** |
| 25 | - PostgreSQL 主数据与特征注册 DDL 已落地为推荐版 schema | 25 | - PostgreSQL 主数据与特征注册 DDL 已落地为推荐版 schema |
| 26 | - Phase-1 实施 checklist 和 model/feature/reference set 初始化手册已补齐 | 26 | - Phase-1 实施 checklist 和 model/feature/reference set 初始化手册已补齐 |
| 27 | - `acr_test` schema 上已经真实完成 Phase-1 `model_registry / feature_set_registry / reference_set_registry` bootstrap 验证 | ||
| 27 | 28 | ||
| 28 | 当前最重要的下一步不是继续写方案,而是: | 29 | 当前最重要的下一步不是继续写方案,而是: |
| 29 | 30 | ||
| ... | @@ -180,6 +181,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -180,6 +181,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 180 | - 代码已推送远端 | 181 | - 代码已推送远端 |
| 181 | - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据 | 182 | - PostgreSQL `acr_test` live 路径已再次验证:`recording` / `audio_window` / `audio_embedding` 三类 lineage trigger 均有真实负例证据 |
| 182 | - 机械校验已补齐:`live_pgvector_music20_eval.py` 的 `py_compile` 通过,相关变更 `diff --check` 通过 | 183 | - 机械校验已补齐:`live_pgvector_music20_eval.py` 的 `py_compile` 通过,相关变更 `diff --check` 通过 |
| 184 | - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` | ||
| 183 | 185 | ||
| 184 | ### 未验证 / 仍是缺口 | 186 | ### 未验证 / 仍是缺口 |
| 185 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | 187 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | ... | ... |
-
Please register or sign in to post a comment