Commit 5e43f28b 5e43f28b8371048aad36a7919b16fb7afe8ead55 by cnb.bofCdSsphPA

Bootstrap the fused song-centric schema with repeatable live seed data

Constraint: Keep all new initialization logic on top of the current 4-table song-centric schema and validate it against the user PostgreSQL instead of synthetic-only assumptions.
Rejected: Stop at one-row smoke evidence | It does not prove the schema is practical for repeated Phase-1 bootstrap workflows.
Confidence: high
Scope-risk: narrow
Directive: Use bootstrap_songcentric_phase1_live.py as the default seed/bootstrap path when demonstrating or validating the fused schema on live PostgreSQL.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/bootstrap_songcentric_phase1_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: large-batch bootstrap and conflict handling under concurrent writers
1 parent 3ce36679
1 {
2 "schema": "acr_songcentric_test",
3 "songs": [
4 {
5 "song_id": 2,
6 "asset_id": 3,
7 "window_id": 4,
8 "fingerprint_feature_id": 3,
9 "embedding_feature_id": 4,
10 "membership_id": 2
11 },
12 {
13 "song_id": 3,
14 "asset_id": 5,
15 "window_id": 6,
16 "fingerprint_feature_id": 5,
17 "embedding_feature_id": 6,
18 "membership_id": 3
19 }
20 ],
21 "counts": {
22 "media_entity": 3,
23 "audio_object": 6,
24 "feature_fact": 6,
25 "set_membership": 3
26 },
27 "lineage_sample": {
28 "feature_type": "embedding",
29 "model_name": "mert",
30 "window_id": 2,
31 "asset_id": 1,
32 "song_id": 1,
33 "title": "Smoke Song"
34 }
35 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 from pathlib import Path
7
8 import psycopg
9 from psycopg.rows import dict_row
10
11
def quote_ident(name: str) -> str:
    """Return *name* wrapped in double quotes for use as an SQL identifier.

    Any double quote already present in *name* is doubled, so the result can
    be interpolated into a statement as one quoted identifier.
    """
    escaped = name.replace('"', '""')
    return f'"{escaped}"'
14
15
def ensure_song(cur, biz_key: str, title: str, artist_name: str) -> int:
    """Return the ``entity_id`` of the song row for *biz_key*, inserting it if absent.

    Args:
        cur: Cursor-like object whose ``execute(sql, params)`` returns an
            object with ``fetchone()`` yielding mapping-style rows (or a
            falsy value when nothing matched).
        biz_key: Business key used to look the song up in ``media_entity``.
        title: Title stored when a new row has to be inserted.
        artist_name: Artist name stored when a new row has to be inserted.

    Returns:
        The ``entity_id`` of the existing or newly inserted row.
    """
    lookup_sql = """
        select entity_id from media_entity
        where entity_type = 'song' and biz_key = %s
        """
    existing = cur.execute(lookup_sql, (biz_key,)).fetchone()
    if existing:
        return existing['entity_id']

    # Lookup and insert are two separate statements; nothing in this function
    # guards against another writer inserting the same key in between.
    insert_sql = """
        insert into media_entity (entity_type, biz_key, title, artist_name)
        values ('song', %s, %s, %s)
        returning entity_id
        """
    created = cur.execute(insert_sql, (biz_key, title, artist_name)).fetchone()
    return created['entity_id']
34
35
def ensure_asset(cur, song_id: int, source_type: str, storage_uri: str, checksum: str, duration_ms: int) -> int:
    """Return the ``object_id`` of the asset for (*song_id*, *checksum*), inserting it if absent.

    Args:
        cur: Cursor-like object whose ``execute(sql, params)`` returns an
            object with ``fetchone()`` yielding mapping-style rows.
        song_id: Owning song; part of the lookup key.
        source_type: Stored on insert only.
        storage_uri: Stored on insert only.
        checksum: Asset checksum; part of the lookup key.
        duration_ms: Stored on insert only.

    Returns:
        The ``object_id`` of the existing or newly inserted ``audio_object``
        row of type ``'asset'``.
    """
    find_sql = """
        select object_id from audio_object
        where object_type = 'asset' and song_id = %s and checksum = %s
        """
    hit = cur.execute(find_sql, (song_id, checksum)).fetchone()
    if hit:
        return hit['object_id']

    # Storage scheme, codec, sample rate and channel count are fixed literals
    # in the statement itself; only the five values below are parameterised.
    create_sql = """
        insert into audio_object (
            object_type, song_id, source_type, storage_uri, storage_scheme,
            checksum, codec, sample_rate, channels, duration_ms
        ) values (
            'asset', %s, %s, %s, 'file', %s, 'wav', 16000, 1, %s
        ) returning object_id
        """
    values = (song_id, source_type, storage_uri, checksum, duration_ms)
    inserted = cur.execute(create_sql, values).fetchone()
    return inserted['object_id']
57
58
def ensure_window(cur, song_id: int, asset_id: int, start_ms: int, end_ms: int) -> int:
    """Return the ``object_id`` of the window ``[start_ms, end_ms]`` under *asset_id*, inserting it if absent.

    Args:
        cur: Cursor-like object whose ``execute(sql, params)`` returns an
            object with ``fetchone()`` yielding mapping-style rows.
        song_id: Stored on insert only; not part of the lookup key.
        asset_id: Parent asset (``parent_object_id``); part of the lookup key.
        start_ms: Window start; part of the lookup key.
        end_ms: Window end; part of the lookup key.

    Returns:
        The ``object_id`` of the existing or newly inserted ``audio_object``
        row of type ``'window'``.
    """
    find_sql = """
        select object_id from audio_object
        where object_type = 'window' and parent_object_id = %s and start_ms = %s and end_ms = %s
        """
    hit = cur.execute(find_sql, (asset_id, start_ms, end_ms)).fetchone()
    if hit:
        return hit['object_id']

    # The stored duration is derived from the bounds rather than passed in.
    span_ms = end_ms - start_ms
    create_sql = """
        insert into audio_object (
            object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
        ) values ('window', %s, %s, %s, %s, %s)
        returning object_id
        """
    values = (song_id, asset_id, start_ms, end_ms, span_ms)
    inserted = cur.execute(create_sql, values).fetchone()
    return inserted['object_id']
78
79
def ensure_feature(cur, feature_type: str, object_id: int, song_id: int, model_name: str, model_version: str,
                   feature_set_name: str, payload: dict) -> int:
    """Return the ``feature_id`` for the given object/model/feature-set, inserting it if absent.

    Args:
        cur: Cursor-like object whose ``execute(sql, params)`` returns an
            object with ``fetchone()`` yielding mapping-style rows.
        feature_type: ``'fingerprint'`` selects the fingerprint insert; any
            other value takes the embedding insert.
        object_id: Audio object the feature belongs to; part of the lookup key.
        song_id: Stored on insert only.
        model_name: Part of the lookup key.
        model_version: Part of the lookup key.
        feature_set_name: Part of the lookup key.
        payload: Type-specific values. Fingerprints read ``fingerprint_value``
            and ``checksum``; the embedding path reads ``embedding_dim``,
            ``embedding_uri``, ``vector_table_name`` and ``checksum``. The
            optional ``metadata_json`` (default ``{}``) is serialised to JSON.

    Returns:
        The ``feature_id`` of the existing or newly inserted ``feature_fact`` row.

    Raises:
        KeyError: If a key required by the chosen path is missing from *payload*.
    """
    find_sql = """
        select feature_id from feature_fact
        where object_id = %s and model_name = %s and model_version = %s
          and feature_set_name = %s and feature_type = %s
        """
    lookup_key = (object_id, model_name, model_version, feature_set_name, feature_type)
    hit = cur.execute(find_sql, lookup_key).fetchone()
    if hit:
        return hit['feature_id']

    # Columns shared by both insert variants, in statement order.
    shared = (feature_type, object_id, song_id, model_name, model_version, feature_set_name)

    if feature_type == 'fingerprint':
        create_sql = """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, fingerprint_value, checksum, metadata_json
            ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
            returning feature_id
            """
        specific = (payload['fingerprint_value'], payload['checksum'])
    else:
        create_sql = """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, embedding_dim, embedding_uri, vector_table_name, checksum, metadata_json
            ) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
            returning feature_id
            """
        specific = (
            payload['embedding_dim'],
            payload['embedding_uri'],
            payload['vector_table_name'],
            payload['checksum'],
        )

    # Metadata is sent as JSON text; the statement casts it with ::jsonb.
    metadata_text = json.dumps(payload.get('metadata_json', {}))
    inserted = cur.execute(create_sql, shared + specific + (metadata_text,)).fetchone()
    return inserted['feature_id']
120
121
def ensure_membership(cur, set_type: str, set_name: str, member_type: str, member_id: int, song_id: int, priority: int) -> int:
    """Return the ``membership_id`` linking a member to a named set, inserting it if absent.

    Args:
        cur: Cursor-like object whose ``execute(sql, params)`` returns an
            object with ``fetchone()`` yielding mapping-style rows.
        set_type: Part of the lookup key.
        set_name: Part of the lookup key.
        member_type: Part of the lookup key.
        member_id: Part of the lookup key.
        song_id: Stored on insert only.
        priority: Stored on insert only; an existing row keeps its priority.

    Returns:
        The ``membership_id`` of the existing or newly inserted
        ``set_membership`` row.
    """
    find_sql = """
        select membership_id from set_membership
        where set_type = %s and set_name = %s and member_type = %s and member_id = %s
        """
    lookup_key = (set_type, set_name, member_type, member_id)
    hit = cur.execute(find_sql, lookup_key).fetchone()
    if hit:
        return hit['membership_id']

    create_sql = """
        insert into set_membership (set_type, set_name, member_type, member_id, song_id, priority)
        values (%s, %s, %s, %s, %s, %s)
        returning membership_id
        """
    values = lookup_key + (song_id, priority)
    inserted = cur.execute(create_sql, values).fetchone()
    return inserted['membership_id']
140
141
def main() -> int:
    """Seed the song-centric schema with two sample songs and emit a JSON report.

    Command-line options:
        --dsn     PostgreSQL connection string (required).
        --schema  Schema placed first on ``search_path`` (default
                  ``acr_songcentric_test``).
        --output  Report path, resolved against ``/workspace``.

    For each sample song the script ensures one song, one asset, one 5 s
    window, a fingerprint feature, an embedding feature and a reference-set
    membership, reusing rows that already exist. It then records table-wide
    row counts and one lineage sample, commits, writes the report to
    ``--output`` as UTF-8 and prints it to stdout.

    Returns:
        ``0`` once the report has been written; database and filesystem
        errors propagate as exceptions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dsn', required=True)
    parser.add_argument('--schema', default='acr_songcentric_test')
    parser.add_argument('--output', default='acr-engine/data/pgvector_eval/music20/songcentric_phase1_bootstrap_report.json')
    args = parser.parse_args()

    # pathlib drops the '/workspace' prefix when --output is itself absolute.
    output_path = Path('/workspace') / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # The schema name is interpolated into SQL below, so quote it as an identifier.
    qschema = quote_ident(args.schema)

    report = {'schema': args.schema, 'songs': []}
    songs = [
        {'biz_key': 'song-10001', 'title': 'Song 10001', 'artist_name': 'Artist A'},
        {'biz_key': 'song-10002', 'title': 'Song 10002', 'artist_name': 'Artist B'},
    ]

    with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(f'set search_path to {qschema}, public')
            for idx, song in enumerate(songs, start=1):
                song_id = ensure_song(cur, **song)
                asset_id = ensure_asset(
                    cur, song_id, 'official', f'/workspace/downloads/{song["biz_key"]}/master.wav',
                    f'sha256:{song["biz_key"]}', 180000 + idx * 1000,
                )
                window_id = ensure_window(cur, song_id, asset_id, 30000, 35000)
                fingerprint_id = ensure_feature(
                    cur, 'fingerprint', window_id, song_id,
                    'chromaprint', 'phase1', 'chromaprint_5s',
                    {'fingerprint_value': f'fp-{song["biz_key"]}', 'checksum': f'fpchk-{song["biz_key"]}', 'metadata_json': {'lane': 'exact'}},
                )
                embedding_id = ensure_feature(
                    cur, 'embedding', window_id, song_id,
                    'mert', 'v1-95m', 'mert_5s_hop2.5_meanpool',
                    {
                        'embedding_dim': 768,
                        'embedding_uri': f's3://bucket/{song["biz_key"]}/win0001.npy',
                        'vector_table_name': 'audio_embedding_vector_768',
                        'checksum': f'embchk-{song["biz_key"]}',
                        'metadata_json': {'lane': 'semantic'},
                    },
                )
                membership_id = ensure_membership(cur, 'reference_set', 'phase1_hot_reference_v1', 'asset', asset_id, song_id, 100)
                report['songs'].append({
                    'song_id': song_id,
                    'asset_id': asset_id,
                    'window_id': window_id,
                    'fingerprint_feature_id': fingerprint_id,
                    'embedding_feature_id': embedding_id,
                    'membership_id': membership_id,
                })

            # Counts cover every row in each table, not only the rows ensured above.
            report['counts'] = {
                table: cur.execute(f'select count(*) as c from {table}').fetchone()['c']
                for table in ('media_entity', 'audio_object', 'feature_fact', 'set_membership')
            }

            # The sample is the embedding with the lowest feature_id in the
            # schema, so it may belong to a row that predates this run.
            report['lineage_sample'] = cur.execute(
                """
                select ff.feature_type,
                       ff.model_name,
                       win.object_id as window_id,
                       ast.object_id as asset_id,
                       song.entity_id as song_id,
                       song.title
                from feature_fact ff
                join audio_object win on win.object_id = ff.object_id and win.object_type = 'window'
                join audio_object ast on ast.object_id = win.parent_object_id and ast.object_type = 'asset'
                join media_entity song on song.entity_id = ff.song_id and song.entity_type = 'song'
                where ff.feature_type = 'embedding'
                order by ff.feature_id asc
                limit 1
                """
            ).fetchone()
        conn.commit()

    # Serialise once. ensure_ascii=False can emit non-ASCII text (e.g. song
    # titles), so the file encoding is pinned to UTF-8 instead of being left
    # to the platform locale.
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    output_path.write_text(rendered, encoding='utf-8')
    print(rendered)
    return 0
222
223
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/bootstrap_songcentric_phase1_live.py`,把当前 4 表 schema 从“单条 smoke 写入”推进到“可重复 Phase-1 bootstrap”;并准备对 `acr_songcentric_test` 做 fresh live 初始化验证。
4
3 - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。 5 - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。
4 6
5 - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 7 - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。
......
...@@ -237,6 +237,21 @@ sequenceDiagram ...@@ -237,6 +237,21 @@ sequenceDiagram
237 ING->>DB: insert set_membership(reference_set) 237 ING->>DB: insert set_membership(reference_set)
238 ``` 238 ```
239 239
240
241 ### 4.4 Phase-1 bootstrap 流程
242
243 ```mermaid
244 flowchart TD
245 A[bootstrap_songcentric_phase1_live.py] --> B[media_entity song x N]
246 B --> C[audio_object asset x N]
247 C --> D[audio_object window x N]
248 D --> E1[feature_fact fingerprint x N]
249 D --> E2[feature_fact embedding x N]
250 C --> F[set_membership reference_set x N]
251 ```
252
253 当前 live bootstrap 脚本:[`acr-engine/scripts/bootstrap_songcentric_phase1_live.py`](../acr-engine/scripts/bootstrap_songcentric_phase1_live.py)
254
240 --- 255 ---
241 256
242 ## 5. 最常用 SQL 样例 257 ## 5. 最常用 SQL 样例
......