Bootstrap the fused song-centric schema with repeatable live seed data
Constraint: Keep all new initialization logic on top of the current 4-table song-centric schema and validate it against the user PostgreSQL instead of synthetic-only assumptions. Rejected: Stop at one-row smoke evidence | It does not prove the schema is practical for repeated Phase-1 bootstrap workflows. Confidence: high Scope-risk: narrow Directive: Use bootstrap_songcentric_phase1_live.py as the default seed/bootstrap path when demonstrating or validating the fused schema on live PostgreSQL. Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/bootstrap_songcentric_phase1_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files Not-tested: large-batch bootstrap and conflict handling under concurrent writers
Showing
4 changed files
with
277 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_songcentric_test", | ||
| 3 | "songs": [ | ||
| 4 | { | ||
| 5 | "song_id": 2, | ||
| 6 | "asset_id": 3, | ||
| 7 | "window_id": 4, | ||
| 8 | "fingerprint_feature_id": 3, | ||
| 9 | "embedding_feature_id": 4, | ||
| 10 | "membership_id": 2 | ||
| 11 | }, | ||
| 12 | { | ||
| 13 | "song_id": 3, | ||
| 14 | "asset_id": 5, | ||
| 15 | "window_id": 6, | ||
| 16 | "fingerprint_feature_id": 5, | ||
| 17 | "embedding_feature_id": 6, | ||
| 18 | "membership_id": 3 | ||
| 19 | } | ||
| 20 | ], | ||
| 21 | "counts": { | ||
| 22 | "media_entity": 3, | ||
| 23 | "audio_object": 6, | ||
| 24 | "feature_fact": 6, | ||
| 25 | "set_membership": 3 | ||
| 26 | }, | ||
| 27 | "lineage_sample": { | ||
| 28 | "feature_type": "embedding", | ||
| 29 | "model_name": "mert", | ||
| 30 | "window_id": 2, | ||
| 31 | "asset_id": 1, | ||
| 32 | "song_id": 1, | ||
| 33 | "title": "Smoke Song" | ||
| 34 | } | ||
| 35 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | |||
| 8 | import psycopg | ||
| 9 | from psycopg.rows import dict_row | ||
| 10 | |||
| 11 | |||
def quote_ident(name: str) -> str:
    """Return *name* wrapped in double quotes for use as a SQL identifier.

    Every double quote already present in *name* is doubled, so the name
    cannot terminate the quoted identifier early.
    """
    escaped = name.replace('"', '""')
    return f'"{escaped}"'
| 14 | |||
| 15 | |||
def ensure_song(cur, biz_key: str, title: str, artist_name: str) -> int:
    """Return the ``entity_id`` of the song identified by *biz_key*, creating it if absent.

    The song is looked up in ``media_entity`` by ``entity_type = 'song'`` and
    ``biz_key``. When a row already exists its id is returned unchanged;
    *title* and *artist_name* are only used when a new row is inserted.

    Args:
        cur: Cursor whose ``execute()`` returns an object with ``fetchone()``
            and whose rows can be indexed by column name.
        biz_key: Business key of the song.
        title: Title stored on a newly inserted row.
        artist_name: Artist name stored on a newly inserted row.

    Returns:
        The ``entity_id`` of the existing or newly inserted row.
    """
    find_song = """
        select entity_id from media_entity
        where entity_type = 'song' and biz_key = %s
    """
    add_song = """
        insert into media_entity (entity_type, biz_key, title, artist_name)
        values ('song', %s, %s, %s)
        returning entity_id
    """
    # NOTE(review): lookup and insert are two separate statements; whether this
    # is safe under concurrent writers depends on constraints not visible here.
    song = cur.execute(find_song, (biz_key,)).fetchone()
    if not song:
        song = cur.execute(add_song, (biz_key, title, artist_name)).fetchone()
    return song['entity_id']
| 34 | |||
| 35 | |||
def ensure_asset(
    cur,
    song_id: int,
    source_type: str,
    storage_uri: str,
    checksum: str,
    duration_ms: int,
    *,
    storage_scheme: str = 'file',
    codec: str = 'wav',
    sample_rate: int = 16000,
    channels: int = 1,
) -> int:
    """Return the ``object_id`` of an asset for *song_id*, creating it if absent.

    An asset is looked up in ``audio_object`` by ``object_type = 'asset'``,
    ``song_id`` and ``checksum``. When a row already exists its id is returned
    unchanged and none of the other arguments are applied to it.

    Args:
        cur: Cursor whose ``execute()`` returns an object with ``fetchone()``
            and whose rows can be indexed by column name.
        song_id: Id of the owning song.
        source_type: Value stored in the ``source_type`` column.
        storage_uri: Location of the audio file.
        checksum: Checksum used, together with *song_id*, to find an existing asset.
        duration_ms: Asset duration in milliseconds.
        storage_scheme: Value for the ``storage_scheme`` column; defaults to
            ``'file'``, the value that used to be hard-coded.
        codec: Value for the ``codec`` column; defaults to ``'wav'``.
        sample_rate: Value for the ``sample_rate`` column; defaults to ``16000``.
        channels: Value for the ``channels`` column; defaults to ``1``.

    Returns:
        The ``object_id`` of the existing or newly inserted asset row.
    """
    # NOTE(review): lookup and insert are two separate statements; whether this
    # is safe under concurrent writers depends on constraints not visible here.
    row = cur.execute(
        """
        select object_id from audio_object
        where object_type = 'asset' and song_id = %s and checksum = %s
        """,
        (song_id, checksum),
    ).fetchone()
    if row:
        return row['object_id']
    # The audio format fields are bound as parameters (instead of SQL literals)
    # so callers can seed assets that are not 16 kHz mono WAV files on disk.
    return cur.execute(
        """
        insert into audio_object (
            object_type, song_id, source_type, storage_uri, storage_scheme,
            checksum, codec, sample_rate, channels, duration_ms
        ) values (
            'asset', %s, %s, %s, %s, %s, %s, %s, %s, %s
        ) returning object_id
        """,
        (
            song_id, source_type, storage_uri, storage_scheme,
            checksum, codec, sample_rate, channels, duration_ms,
        ),
    ).fetchone()['object_id']
| 57 | |||
| 58 | |||
def ensure_window(cur, song_id: int, asset_id: int, start_ms: int, end_ms: int) -> int:
    """Return the ``object_id`` of a window under *asset_id*, creating it if absent.

    A window is looked up in ``audio_object`` by ``object_type = 'window'``,
    its parent asset and its exact ``start_ms`` / ``end_ms`` bounds. A newly
    inserted window stores ``end_ms - start_ms`` as its ``duration_ms``.

    Args:
        cur: Cursor whose ``execute()`` returns an object with ``fetchone()``
            and whose rows can be indexed by column name.
        song_id: Id of the owning song, stored on a newly inserted row.
        asset_id: Id of the parent asset (``parent_object_id``).
        start_ms: Window start offset in milliseconds.
        end_ms: Window end offset in milliseconds.

    Returns:
        The ``object_id`` of the existing or newly inserted window row.
    """
    find_window = """
        select object_id from audio_object
        where object_type = 'window' and parent_object_id = %s and start_ms = %s and end_ms = %s
    """
    add_window = """
        insert into audio_object (
            object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
        ) values ('window', %s, %s, %s, %s, %s)
        returning object_id
    """
    window = cur.execute(find_window, (asset_id, start_ms, end_ms)).fetchone()
    if not window:
        span_ms = end_ms - start_ms
        window = cur.execute(
            add_window,
            (song_id, asset_id, start_ms, end_ms, span_ms),
        ).fetchone()
    return window['object_id']
| 78 | |||
| 79 | |||
def ensure_feature(cur, feature_type: str, object_id: int, song_id: int, model_name: str, model_version: str,
                   feature_set_name: str, payload: dict) -> int:
    """Return the ``feature_id`` of a feature row for *object_id*, creating it if absent.

    A feature is looked up in ``feature_fact`` by object, model name, model
    version, feature set name and feature type. When no row exists, one is
    inserted using the columns that match *feature_type*.

    Args:
        cur: Cursor whose ``execute()`` returns an object with ``fetchone()``
            and whose rows can be indexed by column name.
        feature_type: ``'fingerprint'`` selects the fingerprint insert; any
            other value takes the embedding insert.
        object_id: Id of the audio object the feature describes.
        song_id: Id of the owning song, stored on a newly inserted row.
        model_name: Name of the model that produced the feature.
        model_version: Version of that model.
        feature_set_name: Name of the feature set.
        payload: Type-specific values. The fingerprint insert reads
            ``fingerprint_value`` and ``checksum``; the embedding insert reads
            ``embedding_dim``, ``embedding_uri``, ``vector_table_name`` and
            ``checksum``. An optional ``metadata_json`` entry (default ``{}``)
            is serialized with ``json.dumps``.

    Returns:
        The ``feature_id`` of the existing or newly inserted row.

    Raises:
        KeyError: If a new row must be inserted and *payload* lacks a key
            required by the selected insert.
    """
    existing = cur.execute(
        """
        select feature_id from feature_fact
        where object_id = %s and model_name = %s and model_version = %s
          and feature_set_name = %s and feature_type = %s
        """,
        (object_id, model_name, model_version, feature_set_name, feature_type),
    ).fetchone()
    if existing:
        return existing['feature_id']

    # Columns shared by both insert statements, in statement order.
    shared = (feature_type, object_id, song_id, model_name, model_version, feature_set_name)

    if feature_type == 'fingerprint':
        insert_sql = """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, fingerprint_value, checksum, metadata_json
            ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
            returning feature_id
        """
        values = shared + (
            payload['fingerprint_value'],
            payload['checksum'],
            json.dumps(payload.get('metadata_json', {})),
        )
    else:
        # Every non-fingerprint feature type is stored with the embedding columns.
        insert_sql = """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, embedding_dim, embedding_uri, vector_table_name, checksum, metadata_json
            ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
            returning feature_id
        """
        values = shared + (
            payload['embedding_dim'],
            payload['embedding_uri'],
            payload['vector_table_name'],
            payload['checksum'],
            json.dumps(payload.get('metadata_json', {})),
        )

    created = cur.execute(insert_sql, values).fetchone()
    return created['feature_id']
| 120 | |||
| 121 | |||
def ensure_membership(cur, set_type: str, set_name: str, member_type: str, member_id: int, song_id: int, priority: int) -> int:
    """Return the ``membership_id`` linking a member to a named set, creating it if absent.

    A membership is looked up in ``set_membership`` by set type, set name,
    member type and member id. When a row already exists its id is returned
    unchanged; *song_id* and *priority* are only used for a new row.

    Args:
        cur: Cursor whose ``execute()`` returns an object with ``fetchone()``
            and whose rows can be indexed by column name.
        set_type: Type of the set.
        set_name: Name of the set.
        member_type: Type of the member being added.
        member_id: Id of the member being added.
        song_id: Id of the owning song, stored on a newly inserted row.
        priority: Priority stored on a newly inserted row.

    Returns:
        The ``membership_id`` of the existing or newly inserted row.
    """
    find_membership = """
        select membership_id from set_membership
        where set_type = %s and set_name = %s and member_type = %s and member_id = %s
    """
    add_membership = """
        insert into set_membership (set_type, set_name, member_type, member_id, song_id, priority)
        values (%s, %s, %s, %s, %s, %s)
        returning membership_id
    """
    identity = (set_type, set_name, member_type, member_id)
    membership = cur.execute(find_membership, identity).fetchone()
    if not membership:
        membership = cur.execute(add_membership, identity + (song_id, priority)).fetchone()
    return membership['membership_id']
| 140 | |||
| 141 | |||
def main() -> int:
    """Seed two demo songs into the song-centric schema and write a JSON report.

    For each seed song the script ensures a song, an asset, one window, a
    fingerprint feature, an embedding feature and a reference-set membership.
    It then records the row counts of the four tables and one lineage sample
    (the embedding feature with the lowest ``feature_id``, joined back to its
    window, asset and song). The report is written to disk and printed.

    Command-line arguments:
        --dsn: PostgreSQL connection string (required).
        --schema: Schema placed first on the ``search_path``.
        --output: Report path, joined onto ``/workspace``.

    Returns:
        ``0`` once the report has been written and printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dsn', required=True)
    parser.add_argument('--schema', default='acr_songcentric_test')
    parser.add_argument('--output', default='acr-engine/data/pgvector_eval/music20/songcentric_phase1_bootstrap_report.json')
    args = parser.parse_args()

    output_path = Path('/workspace') / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    qschema = quote_ident(args.schema)

    report = {'schema': args.schema, 'songs': []}
    songs = [
        {'biz_key': 'song-10001', 'title': 'Song 10001', 'artist_name': 'Artist A'},
        {'biz_key': 'song-10002', 'title': 'Song 10002', 'artist_name': 'Artist B'},
    ]

    with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            # The schema name is interpolated into the statement text, so it is
            # quoted as an identifier first.
            cur.execute(f'set search_path to {qschema}, public')
            for idx, song in enumerate(songs, start=1):
                song_id = ensure_song(cur, **song)
                asset_id = ensure_asset(
                    cur, song_id, 'official', f'/workspace/downloads/{song["biz_key"]}/master.wav',
                    f'sha256:{song["biz_key"]}', 180000 + idx * 1000,
                )
                window_id = ensure_window(cur, song_id, asset_id, 30000, 35000)
                fingerprint_id = ensure_feature(
                    cur, 'fingerprint', window_id, song_id,
                    'chromaprint', 'phase1', 'chromaprint_5s',
                    {'fingerprint_value': f'fp-{song["biz_key"]}', 'checksum': f'fpchk-{song["biz_key"]}', 'metadata_json': {'lane': 'exact'}},
                )
                embedding_id = ensure_feature(
                    cur, 'embedding', window_id, song_id,
                    'mert', 'v1-95m', 'mert_5s_hop2.5_meanpool',
                    {
                        'embedding_dim': 768,
                        'embedding_uri': f's3://bucket/{song["biz_key"]}/win0001.npy',
                        'vector_table_name': 'audio_embedding_vector_768',
                        'checksum': f'embchk-{song["biz_key"]}',
                        'metadata_json': {'lane': 'semantic'},
                    },
                )
                membership_id = ensure_membership(cur, 'reference_set', 'phase1_hot_reference_v1', 'asset', asset_id, song_id, 100)
                report['songs'].append({
                    'song_id': song_id,
                    'asset_id': asset_id,
                    'window_id': window_id,
                    'fingerprint_feature_id': fingerprint_id,
                    'embedding_feature_id': embedding_id,
                    'membership_id': membership_id,
                })

            # Table names come from this fixed list, never from user input.
            counts = {}
            for table in ['media_entity', 'audio_object', 'feature_fact', 'set_membership']:
                counts[table] = cur.execute(f'select count(*) as c from {table}').fetchone()['c']
            report['counts'] = counts

            # The sample is the oldest embedding row in the schema, which is not
            # necessarily one of the rows ensured above.
            report['lineage_sample'] = cur.execute(
                """
                select ff.feature_type,
                       ff.model_name,
                       win.object_id as window_id,
                       ast.object_id as asset_id,
                       song.entity_id as song_id,
                       song.title
                from feature_fact ff
                join audio_object win on win.object_id = ff.object_id and win.object_type = 'window'
                join audio_object ast on ast.object_id = win.parent_object_id and ast.object_type = 'asset'
                join media_entity song on song.entity_id = ff.song_id and song.entity_type = 'song'
                where ff.feature_type = 'embedding'
                order by ff.feature_id asc
                limit 1
                """
            ).fetchone()
        conn.commit()

    # The report is serialized with ensure_ascii=False, so it may contain
    # non-ASCII characters (e.g. song titles). Write it as UTF-8 explicitly
    # instead of relying on the platform's locale encoding.
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    output_path.write_text(rendered, encoding='utf-8')
    print(rendered)
    return 0
| 222 | |||
| 223 | |||
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `acr-engine/scripts/bootstrap_songcentric_phase1_live.py`,把当前 4 表 schema 从“单条 smoke 写入”推进到“可重复 Phase-1 bootstrap”;并准备对 `acr_songcentric_test` 做 fresh live 初始化验证。 | ||
| 4 | |||
| 3 | - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。 | 5 | - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。 |
| 4 | 6 | ||
| 5 | - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 | 7 | - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 | ... | ... |
| ... | @@ -237,6 +237,21 @@ sequenceDiagram | ... | @@ -237,6 +237,21 @@ sequenceDiagram |
| 237 | ING->>DB: insert set_membership(reference_set) | 237 | ING->>DB: insert set_membership(reference_set) |
| 238 | ``` | 238 | ``` |
| 239 | 239 | ||
| 240 | |||
| 241 | ### 4.4 Phase-1 bootstrap 流程 | ||
| 242 | |||
| 243 | ```mermaid | ||
| 244 | flowchart TD | ||
| 245 | A[bootstrap_songcentric_phase1_live.py] --> B[media_entity song x N] | ||
| 246 | B --> C[audio_object asset x N] | ||
| 247 | C --> D[audio_object window x N] | ||
| 248 | D --> E1[feature_fact fingerprint x N] | ||
| 249 | D --> E2[feature_fact embedding x N] | ||
| 250 | C --> F[set_membership reference_set x N] | ||
| 251 | ``` | ||
| 252 | |||
| 253 | 当前 live bootstrap 脚本:[`acr-engine/scripts/bootstrap_songcentric_phase1_live.py`](../acr-engine/scripts/bootstrap_songcentric_phase1_live.py) | ||
| 254 | |||
| 240 | --- | 255 | --- |
| 241 | 256 | ||
| 242 | ## 5. 最常用 SQL 样例 | 257 | ## 5. 最常用 SQL 样例 | ... | ... |
-
Please register or sign in to post a comment