Prove asset-level embedding upserts against live PostgreSQL
Constraint: The schema already declared asset-level idempotency, but without live evidence future work could mistake it for an unverified design note.
Rejected: Rely on DDL inspection alone | It would not prove duplicate inserts are blocked and upserts reuse the same embedding row.
Confidence: high
Scope-risk: narrow
Directive: Keep asset-level writer implementations aligned with the verified ON CONFLICT (feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL contract.
Tested: /usr/local/miniconda3/bin/python -m py_compile scripts/validate_audio_embedding_asset_upsert_live.py; git diff --check; /usr/local/miniconda3/bin/python scripts/validate_audio_embedding_asset_upsert_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_asset_upsert_test --output data/pgvector_eval/music20/audio_embedding_asset_upsert_live_report.json
Not-tested: No production semantic writer uses the asset-level contract yet; this commit validates the DB contract, not an end-to-end extractor.
Showing
6 changed files
with
394 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_asset_upsert_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "seed_ids": { | ||
| 5 | "model_id": 1, | ||
| 6 | "feature_set_id": 1, | ||
| 7 | "canonical_song_id": 1, | ||
| 8 | "work_id": 1, | ||
| 9 | "recording_id": 1, | ||
| 10 | "asset_id": 1 | ||
| 11 | }, | ||
| 12 | "first_insert_embedding_id": 1, | ||
| 13 | "duplicate_insert_guard": { | ||
| 14 | "passed": true, | ||
| 15 | "error_type": "UniqueViolation", | ||
| 16 | "message": "duplicate key value violates unique constraint \"uq_audio_embedding_feature_asset\"" | ||
| 17 | }, | ||
| 18 | "upsert_embedding_id": 1, | ||
| 19 | "same_embedding_id_reused": true, | ||
| 20 | "counts": { | ||
| 21 | "audio_embedding": 1, | ||
| 22 | "audio_embedding_vector_192": 1 | ||
| 23 | }, | ||
| 24 | "final_state": { | ||
| 25 | "embedding_id": 1, | ||
| 26 | "asset_id": 1, | ||
| 27 | "window_id": null, | ||
| 28 | "checksum": "checksum-v2", | ||
| 29 | "embedding_uri": "inline://asset-probe-upsert", | ||
| 30 | "metadata_json": { | ||
| 31 | "probe": "asset_level_upsert_v2" | ||
| 32 | }, | ||
| 33 | "vector_literal": "[0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2]" | ||
| 34 | }, | ||
| 35 | "passed": true | ||
| 36 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | import sys | ||
| 8 | from typing import Any | ||
| 9 | |||
| 10 | import psycopg | ||
| 11 | |||
# Two levels up from this file's resolved path (parents[1]); used as the base
# for the import path tweak and the default file locations below.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT on sys.path (once) before the project import that follows;
# presumably this is what lets ``workers`` resolve when the script is run
# directly -- confirm against the repository layout.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from workers._job_common import validate_schema

# Defaults for the --schema-sql and --output command-line options.
DEFAULT_SCHEMA_SQL = ROOT / 'sql' / 'acr_pg_schema_v2.sql'
DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'audio_embedding_asset_upsert_live_report.json'
| 20 | |||
| 21 | |||
def vec_literal(vec: list[float]) -> str:
    """Render ``vec`` as a bracketed, comma-separated text literal.

    Every component is formatted with exactly ten decimal places and no
    spaces, e.g. ``[0.1000000000,0.2000000000]``. Callers in this script
    cast the result with ``::vector``.
    """
    components = [format(value, '.10f') for value in vec]
    return '[{}]'.format(','.join(components))
| 24 | |||
| 25 | |||
def reset_schema(conn: psycopg.Connection, schema: str) -> None:
    """Drop and recreate ``schema``, then put it first on the search_path.

    The name is passed through ``validate_schema`` and the returned value is
    interpolated directly into the statements.
    NOTE(review): presumably ``validate_schema`` rejects names that are unsafe
    to interpolate -- confirm against ``workers._job_common``.
    """
    safe_name = validate_schema(schema)
    statements = (
        f'DROP SCHEMA IF EXISTS {safe_name} CASCADE;',
        f'CREATE SCHEMA {safe_name};',
        f'SET search_path TO {safe_name}, public;',
    )
    for statement in statements:
        conn.execute(statement)
| 31 | |||
| 32 | |||
def apply_schema(conn: psycopg.Connection, schema_sql: Path) -> None:
    """Read the SQL file at ``schema_sql`` as UTF-8 and execute it in one call."""
    ddl_text = schema_sql.read_text(encoding='utf-8')
    conn.execute(ddl_text)
| 35 | |||
| 36 | |||
def seed_minimal_graph(conn: psycopg.Connection) -> dict[str, int]:
    """Insert one row into each table the embedding probe depends on.

    Rows are created in dependency order: ``model_registry``,
    ``feature_set_registry``, ``canonical_song``, ``work``, ``recording`` and
    ``recording_asset``. Each later insert is given the keys returned by the
    earlier ones.

    Returns:
        The six generated primary keys, converted to ``int``, keyed by their
        column names.
    """

    def insert_returning_key(sql: str, params: tuple | None = None) -> Any:
        # Each statement ends in RETURNING <pk>; take the first column of the
        # single row that comes back.
        return conn.execute(sql, params).fetchone()[0]

    keys: dict[str, Any] = {}

    keys['model_id'] = insert_returning_key(
        """
        INSERT INTO model_registry (
            model_name, model_family, model_version, model_source, model_uri,
            license_name, input_sample_rate, default_window_sec, default_hop_sec,
            output_embedding_dim, pooling_supported, metadata_json
        ) VALUES (
            'asset_level_probe', 'probe', 'v1', 'live-test',
            'scripts/validate_audio_embedding_asset_upsert_live.py', 'internal-eval',
            16000, 5.0, 2.5, 192, ARRAY['none'], '{}'::jsonb
        )
        RETURNING model_id;
        """
    )

    keys['feature_set_id'] = insert_returning_key(
        """
        INSERT INTO feature_set_registry (
            model_id, feature_name, feature_level, extraction_granularity,
            window_sec, hop_sec, embedding_dim, pooling_strategy, layer_selection,
            normalize_l2, distance_metric, quantization_type, feature_schema_version,
            config_json, status
        ) VALUES (
            %s, 'semantic_embedding', 'asset', 'whole_asset',
            5.0, 2.5, 192, 'none', 'na', TRUE, 'cosine', NULL, 'v1',
            '{"probe":"asset_level_upsert"}'::jsonb, 'active'
        )
        RETURNING feature_set_id;
        """,
        (keys['model_id'],),
    )

    keys['canonical_song_id'] = insert_returning_key(
        """
        INSERT INTO canonical_song (biz_song_code, title, rights_status, metadata_json)
        VALUES ('asset-probe-song', 'Asset Probe Song', 'protected', '{}'::jsonb)
        RETURNING canonical_song_id;
        """
    )

    keys['work_id'] = insert_returning_key(
        """
        INSERT INTO work (canonical_song_id, work_code, work_title, metadata_json)
        VALUES (%s, 'asset-probe-work', 'Asset Probe Work', '{}'::jsonb)
        RETURNING work_id;
        """,
        (keys['canonical_song_id'],),
    )

    keys['recording_id'] = insert_returning_key(
        """
        INSERT INTO recording (
            work_id, canonical_song_id, recording_code, recording_title,
            version_type, is_reference, duration_sec, metadata_json
        ) VALUES (%s, %s, 'asset-probe-rec', 'Asset Probe Recording', 'master_reference', TRUE, 5.0, '{}'::jsonb)
        RETURNING recording_id;
        """,
        (keys['work_id'], keys['canonical_song_id']),
    )

    keys['asset_id'] = insert_returning_key(
        """
        INSERT INTO recording_asset (
            recording_id, asset_role, storage_uri, storage_scheme, file_ext,
            mime_type, sample_rate, channels, codec_name, duration_sec,
            normalized_storage_uri, ingest_status, metadata_json
        ) VALUES (
            %s, 'reference_audio', '/tmp/asset-probe.wav', 'file', 'wav',
            'audio/wav', 16000, 1, 'pcm_s16le', 5.0,
            '/tmp/asset-probe.wav', 'ready', '{}'::jsonb
        )
        RETURNING asset_id;
        """,
        (keys['recording_id'],),
    )

    # Insertion order above fixes the key order of the returned mapping.
    return {name: int(value) for name, value in keys.items()}
| 116 | |||
| 117 | |||
def insert_asset_embedding(conn: psycopg.Connection, ids: dict[str, int], *, checksum: str, metadata: dict[str, Any], vec: list[float]) -> int:
    """Plain-insert one asset-level embedding (``window_id`` NULL) and its vector.

    Args:
        conn: Open connection whose search_path already points at the probe schema.
        ids: Seed keys; ``feature_set_id``, ``asset_id``, ``recording_id``,
            ``work_id`` and ``canonical_song_id`` are read.
        checksum: Value stored in ``audio_embedding.checksum``.
        metadata: Serialized to JSON and stored in ``metadata_json``.
        vec: Components written to ``audio_embedding_vector_192``.

    Returns:
        The ``embedding_id`` generated for the new ``audio_embedding`` row.
    """
    header_sql = """
        INSERT INTO audio_embedding (
            feature_set_id, extraction_job_id, asset_id, window_id, recording_id, work_id,
            canonical_song_id, embedding_storage_mode, embedding_uri, vector_norm, checksum,
            is_indexed, metadata_json
        ) VALUES (
            %s, NULL, %s, NULL, %s, %s,
            %s, 'pgvector_inline_192', 'inline://asset-probe', 1.0, %s,
            TRUE, %s::jsonb
        )
        RETURNING embedding_id;
        """
    header_params = (
        ids['feature_set_id'],
        ids['asset_id'],
        ids['recording_id'],
        ids['work_id'],
        ids['canonical_song_id'],
        checksum,
        json.dumps(metadata, ensure_ascii=False),
    )
    new_row = conn.execute(header_sql, header_params).fetchone()
    embedding_id = new_row[0]

    # No ON CONFLICT here: this is the deliberately non-idempotent first write.
    vector_sql = 'INSERT INTO audio_embedding_vector_192 (embedding_id, embedding) VALUES (%s, %s::vector);'
    conn.execute(vector_sql, (embedding_id, vec_literal(vec)))
    return int(embedding_id)
| 147 | |||
| 148 | |||
def expect_duplicate_insert_failure(conn: psycopg.Connection, ids: dict[str, int]) -> dict[str, Any]:
    """Attempt a second plain insert for the same asset and report the outcome.

    The insert runs inside ``conn.transaction()`` so that a failure is
    contained to that block and the connection stays usable afterwards.

    Args:
        conn: Open connection on which the first asset-level row already exists.
        ids: Seed keys; ``feature_set_id``, ``asset_id``, ``recording_id``,
            ``work_id`` and ``canonical_song_id`` are read.

    Returns:
        A report dict. When the insert raises, it contains ``passed`` (True
        only if the error text names ``uq_audio_embedding_feature_asset``),
        ``error_type`` and ``message`` (first line of the error text, or an
        empty string when the error has no text). When the insert succeeds it
        contains ``passed: False`` and an explanatory ``note``.
    """
    try:
        with conn.transaction():
            conn.execute(
                """
                INSERT INTO audio_embedding (
                    feature_set_id, extraction_job_id, asset_id, window_id, recording_id, work_id,
                    canonical_song_id, embedding_storage_mode, embedding_uri, vector_norm, checksum,
                    is_indexed, metadata_json
                ) VALUES (
                    %s, NULL, %s, NULL, %s, %s,
                    %s, 'pgvector_inline_192', 'inline://asset-probe-duplicate', 1.0, 'dup-checksum',
                    TRUE, '{"probe":"duplicate_insert"}'::jsonb
                );
                """,
                (
                    ids['feature_set_id'],
                    ids['asset_id'],
                    ids['recording_id'],
                    ids['work_id'],
                    ids['canonical_song_id'],
                ),
            )
    except Exception as exc:  # noqa: BLE001
        # Any failure is recorded in the report rather than propagated; only a
        # failure that names the expected constraint counts as a pass.
        error_text = str(exc)
        lines = error_text.splitlines()
        return {
            'passed': 'uq_audio_embedding_feature_asset' in error_text,
            'error_type': type(exc).__name__,
            # An exception with an empty message has no first line; report ''
            # instead of raising IndexError from inside the handler.
            'message': lines[0] if lines else '',
        }
    return {'passed': False, 'note': 'duplicate asset-level insert unexpectedly succeeded'}
| 179 | |||
| 180 | |||
def upsert_asset_embedding(conn: psycopg.Connection, ids: dict[str, int], *, checksum: str, metadata: dict[str, Any], vec: list[float]) -> int:
    """Insert-or-update the asset-level embedding row and its vector.

    The header statement targets the conflict on ``(feature_set_id, asset_id)``
    with the predicate ``window_id IS NULL AND asset_id IS NOT NULL``. On
    conflict it overwrites ``checksum``, ``embedding_uri``, ``metadata_json``,
    ``is_indexed`` and ``vector_norm``. The vector row is then upserted on
    ``embedding_id``.

    Args:
        conn: Open connection whose search_path already points at the probe schema.
        ids: Seed keys; ``feature_set_id``, ``asset_id``, ``recording_id``,
            ``work_id`` and ``canonical_song_id`` are read.
        checksum: New value for ``audio_embedding.checksum``.
        metadata: Serialized to JSON and stored in ``metadata_json``.
        vec: Components written to ``audio_embedding_vector_192``.

    Returns:
        The ``embedding_id`` returned by the header statement.
    """
    header_sql = """
        INSERT INTO audio_embedding (
            feature_set_id, extraction_job_id, asset_id, window_id, recording_id, work_id,
            canonical_song_id, embedding_storage_mode, embedding_uri, vector_norm, checksum,
            is_indexed, metadata_json
        ) VALUES (
            %s, NULL, %s, NULL, %s, %s,
            %s, 'pgvector_inline_192', 'inline://asset-probe-upsert', 1.0, %s,
            TRUE, %s::jsonb
        )
        ON CONFLICT (feature_set_id, asset_id)
        WHERE window_id IS NULL AND asset_id IS NOT NULL
        DO UPDATE SET
            checksum = EXCLUDED.checksum,
            embedding_uri = EXCLUDED.embedding_uri,
            metadata_json = EXCLUDED.metadata_json,
            is_indexed = EXCLUDED.is_indexed,
            vector_norm = EXCLUDED.vector_norm
        RETURNING embedding_id;
        """
    vector_sql = """
        INSERT INTO audio_embedding_vector_192 (embedding_id, embedding)
        VALUES (%s, %s::vector)
        ON CONFLICT (embedding_id)
        DO UPDATE SET embedding = EXCLUDED.embedding;
        """

    header_params = (
        ids['feature_set_id'],
        ids['asset_id'],
        ids['recording_id'],
        ids['work_id'],
        ids['canonical_song_id'],
        checksum,
        json.dumps(metadata, ensure_ascii=False),
    )
    returned_row = conn.execute(header_sql, header_params).fetchone()
    embedding_id = returned_row[0]

    conn.execute(vector_sql, (embedding_id, vec_literal(vec)))
    return int(embedding_id)
| 223 | |||
| 224 | |||
def fetch_final_state(conn: psycopg.Connection, embedding_id: int) -> dict[str, Any]:
    """Read back the embedding header joined with its vector row.

    Args:
        conn: Open connection whose search_path already points at the probe schema.
        embedding_id: Key of the ``audio_embedding`` row to fetch.

    Returns:
        A dict with ``embedding_id``, ``asset_id``, ``window_id``, ``checksum``,
        ``embedding_uri``, ``metadata_json`` (``{}`` when the stored value is
        falsy) and ``vector_literal`` (the vector cast to text).

    Raises:
        LookupError: If the inner join yields no row, i.e. either the header
            or its ``audio_embedding_vector_192`` row is missing.
    """
    row = conn.execute(
        """
        SELECT ae.embedding_id, ae.asset_id, ae.window_id, ae.checksum, ae.embedding_uri, ae.metadata_json,
               aev.embedding::text
        FROM audio_embedding ae
        JOIN audio_embedding_vector_192 aev ON aev.embedding_id = ae.embedding_id
        WHERE ae.embedding_id = %s;
        """,
        (embedding_id,),
    ).fetchone()
    if row is None:
        # Without this guard the indexing below fails with an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise LookupError(
            f'no audio_embedding row joined with audio_embedding_vector_192 for embedding_id={embedding_id}'
        )
    return {
        'embedding_id': int(row[0]),
        'asset_id': int(row[1]),
        'window_id': row[2],
        'checksum': row[3],
        'embedding_uri': row[4],
        'metadata_json': row[5] or {},
        'vector_literal': row[6],
    }
| 245 | |||
| 246 | |||
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with any password replaced by ``***``.

    Handles URI-style connection strings (password in the userinfo part or in
    a ``password=`` query parameter) and keyword/value connection strings
    (``password=...``, optionally single-quoted).
    """
    import re  # local import so this block does not depend on a file-level import

    if '://' in dsn:
        masked = re.sub(r'^([A-Za-z][A-Za-z0-9+.\-]*://[^:/@\s]*):[^@/\s]*@', r'\1:***@', dsn)
        return re.sub(r'([?&]password=)[^&\s]*', r'\1***', masked)
    return re.sub(r"(\bpassword\s*=\s*)(?:'(?:\\.|[^'\\])*'|\S+)", r'\1***', dsn)


def main() -> None:
    """Run the asset-level upsert probe against a live database and write a report.

    Steps: recreate the probe schema, apply the schema SQL, seed the minimal
    row graph, insert a first embedding, confirm a duplicate plain insert is
    rejected, upsert a second version, then read back counts and final state.
    The JSON report is written to ``--output`` and printed to stdout.

    Raises:
        SystemExit: With status 1, after the report has been written, when any
            check failed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_asset_upsert_test')
    ap.add_argument('--schema-sql', default=str(DEFAULT_SCHEMA_SQL))
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    initial_vec = [0.1] * 192
    updated_vec = [0.2] * 192

    payload: dict[str, Any] = {
        'schema': args.schema,
        # Derived from the DSN actually used, so the report names the real
        # target while never containing the password.
        'dsn_redacted': _redact_dsn(args.dsn),
    }
    with psycopg.connect(args.dsn, autocommit=True) as conn:
        reset_schema(conn, args.schema)
        apply_schema(conn, Path(args.schema_sql))
        ids = seed_minimal_graph(conn)
        payload['seed_ids'] = ids

        first_embedding_id = insert_asset_embedding(
            conn,
            ids,
            checksum='checksum-v1',
            metadata={'probe': 'asset_level_insert_v1'},
            vec=initial_vec,
        )
        payload['first_insert_embedding_id'] = first_embedding_id
        payload['duplicate_insert_guard'] = expect_duplicate_insert_failure(conn, ids)

        upsert_embedding_id = upsert_asset_embedding(
            conn,
            ids,
            checksum='checksum-v2',
            metadata={'probe': 'asset_level_upsert_v2'},
            vec=updated_vec,
        )
        payload['upsert_embedding_id'] = upsert_embedding_id
        payload['same_embedding_id_reused'] = first_embedding_id == upsert_embedding_id
        payload['counts'] = {
            'audio_embedding': int(conn.execute('SELECT count(*) FROM audio_embedding;').fetchone()[0]),
            'audio_embedding_vector_192': int(conn.execute('SELECT count(*) FROM audio_embedding_vector_192;').fetchone()[0]),
        }
        payload['final_state'] = fetch_final_state(conn, upsert_embedding_id)

        # The stored vector must be the second version too; otherwise a vector
        # upsert that silently kept the old values would still pass.
        stored_vec = json.loads(payload['final_state']['vector_literal'])
        payload['vector_updated'] = len(stored_vec) == len(updated_vec) and all(
            abs(stored - expected) <= 1e-6
            for stored, expected in zip(stored_vec, updated_vec)
        )

        payload['passed'] = all(
            (
                bool(payload['duplicate_insert_guard'].get('passed')),
                payload['same_embedding_id_reused'],
                payload['counts']['audio_embedding'] == 1,
                payload['counts']['audio_embedding_vector_192'] == 1,
                payload['final_state']['checksum'] == 'checksum-v2',
                payload['final_state']['metadata_json'].get('probe') == 'asset_level_upsert_v2',
                payload['vector_updated'],
            )
        )

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    report_text = json.dumps(payload, ensure_ascii=False, indent=2)
    out.write_text(report_text, encoding='utf-8')
    print(report_text)

    # Signal failure to shells and CI only after the report is on disk.
    if not payload['passed']:
        raise SystemExit(1)


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `scripts/validate_audio_embedding_asset_upsert_live.py` 与 `audio_embedding_asset_upsert_live_report.json`,在隔离 schema `acr_asset_upsert_test` 上真实验证 `uq_audio_embedding_feature_asset`:重复普通 insert 会触发 `UniqueViolation`,而 `ON CONFLICT ... DO UPDATE` 会复用同一 `embedding_id`,最终 `audio_embedding/audio_embedding_vector_192` 行数都保持为 `1`。 | ||
| 3 | - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 | 4 | - 新增 `scripts/run_phase1_embedding_preflight_matrix_live.py` 与 `phase1_embedding_preflight_matrix_report.json`,对 `mert / muq / ecapa` 四条 semantic jobs 做了统一 live preflight 矩阵验证;结果表明 4 条 job 全都稳定落到 `preflight_failed`,且 blocker 已收敛为 `/workspace/downloads` 未挂载与语义模型 runtime 缺失,而不是单条 job 的偶发异常。 |
| 4 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 | 5 | - 更新 `run_embedding_job.py`,把 semantic lane 从“只有 dry-run”推进到“真实 scope 读取 + vector table 校验 + runtime 依赖校验 + 缺音频校验 + PostgreSQL failed 落账”的 preflight write contract;当前 live `mert` job 会把 `unreadable_audio_assets` 与 `model_runtime_unavailable` 同时写入 `feature_extraction_job.metadata_json`,不再只停留在纸面设计。 |
| 5 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 | 6 | - 给 `audio_embedding` 补上 `UNIQUE(feature_set_id, window_id) WHERE window_id IS NOT NULL` 与 `UNIQUE(feature_set_id, asset_id) WHERE window_id IS NULL AND asset_id IS NOT NULL` 两条幂等唯一键,为后续真实 `MERT / MuQ / ECAPA` upsert 落库固定主键策略。 | ... | ... |
| ... | @@ -343,6 +343,17 @@ MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: | ... | @@ -343,6 +343,17 @@ MERT 5s/2.5s job (`extraction_job_id=2`) 在 `acr_test` 上已经真实验证: |
| 343 | 343 | ||
| 344 | 而不需要先查再写。 | 344 | 而不需要先查再写。 |
| 345 | 345 | ||
| 346 | 当前这两条唯一键里,asset-level 路径也已经有 live 证据: | ||
| 347 | |||
| 348 | - `scripts/validate_audio_embedding_asset_upsert_live.py` | ||
| 349 | - `audio_embedding_asset_upsert_live_report.json` | ||
| 350 | |||
| 351 | 已验证: | ||
| 352 | |||
| 353 | - 重复 `INSERT` 会被 `uq_audio_embedding_feature_asset` 拒绝 | ||
| 354 | - `ON CONFLICT ... DO UPDATE` 会复用同一个 `embedding_id` | ||
| 355 | - `audio_embedding` / `audio_embedding_vector_192` 行数都保持为 `1` | ||
| 356 | |||
| 346 | ### 下一步替换点 | 357 | ### 下一步替换点 |
| 347 | 358 | ||
| 348 | 当 runtime 与音频挂载到位后,只需要把 guarded failure path 替换成真实 inference: | 359 | 当 runtime 与音频挂载到位后,只需要把 guarded failure path 替换成真实 inference: | ... | ... |
| ... | @@ -774,3 +774,40 @@ cd /workspace/acr-engine | ... | @@ -774,3 +774,40 @@ cd /workspace/acr-engine |
| 774 | - 当前真正阻塞 Phase-1 encoder-only 落地的是: | 774 | - 当前真正阻塞 Phase-1 encoder-only 落地的是: |
| 775 | 1. `/workspace/downloads` 音频挂载 | 775 | 1. `/workspace/downloads` 音频挂载 |
| 776 | 2. 模型 runtime 依赖安装 | 776 | 2. 模型 runtime 依赖安装 |
| 777 | |||
| 778 | |||
| 779 | ## 新增:asset-level embedding upsert live 验证 | ||
| 780 | |||
| 781 | 为了把 `uq_audio_embedding_feature_asset` 从“DDL 声明”推进到“真实证据”,本轮新增: | ||
| 782 | |||
| 783 | - `acr-engine/scripts/validate_audio_embedding_asset_upsert_live.py` | ||
| 784 | - `acr-engine/data/pgvector_eval/music20/audio_embedding_asset_upsert_live_report.json` | ||
| 785 | |||
| 786 | ### 验证动作 | ||
| 787 | |||
| 788 | 脚本会在隔离 schema `acr_asset_upsert_test` 中: | ||
| 789 | |||
| 790 | 1. 落最小主数据图:`song -> work -> recording -> asset` | ||
| 791 | 2. 插入第一条 `window_id IS NULL` 的 asset-level embedding | ||
| 792 | 3. 再做一次普通重复 `INSERT` | ||
| 793 | 4. 预期被 `uq_audio_embedding_feature_asset` 拒绝 | ||
| 794 | 5. 再做一次 `ON CONFLICT ... DO UPDATE` | ||
| 795 | 6. 验证最终仍只有 `1` 条 `audio_embedding` 与 `1` 条 `audio_embedding_vector_192` | ||
| 796 | |||
| 797 | ### 当前结果 | ||
| 798 | |||
| 799 | | 项 | 结果 | | ||
| 800 | |---|---| | ||
| 801 | | 首次 `embedding_id` | `1` | | ||
| 802 | | 重复普通 `INSERT` | `UniqueViolation` | | ||
| 803 | | 唯一键名 | `uq_audio_embedding_feature_asset` | | ||
| 804 | | upsert 后 `embedding_id` | `1` | | ||
| 805 | | `same_embedding_id_reused` | `true` | | ||
| 806 | | `audio_embedding` 行数 | `1` | | ||
| 807 | | `audio_embedding_vector_192` 行数 | `1` | | ||
| 808 | | 最终 `checksum` | `checksum-v2` | | ||
| 809 | |||
| 810 | 结论: | ||
| 811 | |||
| 812 | - asset-level 唯一键不是“纸面存在”,而是已经在 live PostgreSQL 上真实生效 | ||
| 813 | - 后续如果补 asset-level semantic writer,可以直接沿用同一个 `ON CONFLICT (feature_set_id, asset_id) ...` 合同 | ... | ... |
| ... | @@ -192,6 +192,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -192,6 +192,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed | 192 | - semantic lane 也已完成 live failure contract:`run_embedding_job.py` 现在会同时暴露 `unreadable_audio_assets` 与 `model_runtime_unavailable`,而不是把失败伪装成 completed |
| 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 | 193 | - `audio_embedding` 已补上 window / asset 双路唯一键,后续真实 encoder 只需替换 inference adapter 即可复用同一 upsert 合同 |
| 194 | - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 | 194 | - `scripts/run_phase1_embedding_preflight_matrix_live.py` 已跑通,4 条 semantic jobs(mert/muq/ecapa)在 `acr_test` 上都被稳定标记为 `preflight_failed`;当前共性 blocker 已收敛为 `/workspace/downloads` 缺失 + 语义模型 runtime 缺失 |
| 195 | - `scripts/validate_audio_embedding_asset_upsert_live.py` 已在隔离 schema `acr_asset_upsert_test` 上验证 `uq_audio_embedding_feature_asset`:重复 insert 会被唯一键拒绝,upsert 会复用同一 `embedding_id`,说明 asset-level 幂等键也已有真实证据 | ||
| 195 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` | 196 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 196 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | 197 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` |
| 197 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | 198 | - exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed` | ... | ... |
-
Please register or sign in to post a comment