Commit d04a6e65 d04a6e6538f3619e8a778fc6f518135d6d245ae8 by cnb.bofCdSsphPA

Close the song-centric import loop with feature_fact ingestion

Constraint: Complete the current manifest-to-PostgreSQL onboarding path on the 4-table fused schema without reintroducing any split-table storage path.
Rejected: Keep feature generation outside the manifest import workflow for Phase-1 | It leaves the current onboarding path incomplete and harder to validate end-to-end.
Confidence: high
Scope-risk: narrow
Directive: Treat windows[].features[] in song-centric manifests as the default batch path for writing fingerprint and embedding rows into feature_fact.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/import_songcentric_manifest_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test --manifest acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl; repeated the import and verified counts remained media_entity=7, audio_object=15, feature_fact=9, set_membership=7; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: automatic feature extraction from raw audio during import; large-scale concurrent manifest ingest
1 parent ba387bf0
1 {
2 "schema": "acr_songcentric_test",
3 "manifest": "acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl",
4 "imported": [
5 {
6 "song_id": 6,
7 "asset_id": 12,
8 "window_ids": [
9 13
10 ],
11 "feature_ids": [
12 7,
13 8
14 ],
15 "membership_ids": [
16 6
17 ]
18 },
19 {
20 "song_id": 7,
21 "asset_id": 14,
22 "window_ids": [
23 15
24 ],
25 "feature_ids": [
26 9
27 ],
28 "membership_ids": [
29 7
30 ]
31 }
32 ],
33 "counts": {
34 "media_entity": 7,
35 "audio_object": 15,
36 "feature_fact": 9,
37 "set_membership": 7
38 },
39 "window_lineage_sample": {
40 "window_id": 15,
41 "asset_id": 14,
42 "song_id": 7,
43 "title": "Feature Manifest Song 2",
44 "start_ms": 5000,
45 "end_ms": 10000
46 },
47 "feature_lineage_sample": {
48 "feature_type": "embedding",
49 "model_name": "muq",
50 "model_version": "large-msd-iter",
51 "feature_set_name": "muq_5s_hop2.5_meanpool",
52 "window_id": 15,
53 "song_id": 7,
54 "title": "Feature Manifest Song 2"
55 }
56 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "schema": "acr_songcentric_test",
3 "manifest": "acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl",
4 "imported": [
5 {
6 "song_id": 6,
7 "asset_id": 12,
8 "window_ids": [
9 13
10 ],
11 "feature_ids": [
12 7,
13 8
14 ],
15 "membership_ids": [
16 6
17 ]
18 },
19 {
20 "song_id": 7,
21 "asset_id": 14,
22 "window_ids": [
23 15
24 ],
25 "feature_ids": [
26 9
27 ],
28 "membership_ids": [
29 7
30 ]
31 }
32 ],
33 "counts": {
34 "media_entity": 7,
35 "audio_object": 15,
36 "feature_fact": 9,
37 "set_membership": 7
38 },
39 "window_lineage_sample": {
40 "window_id": 15,
41 "asset_id": 14,
42 "song_id": 7,
43 "title": "Feature Manifest Song 2",
44 "start_ms": 5000,
45 "end_ms": 10000
46 },
47 "feature_lineage_sample": {
48 "feature_type": "embedding",
49 "model_name": "muq",
50 "model_version": "large-msd-iter",
51 "feature_set_name": "muq_5s_hop2.5_meanpool",
52 "window_id": 15,
53 "song_id": 7,
54 "title": "Feature Manifest Song 2"
55 }
56 }
...\ No newline at end of file ...\ No newline at end of file
1 {"song":{"biz_key":"song-30001","title":"Feature Manifest Song 1","artist_name":"Feature Artist 1"},"asset":{"source_type":"official","storage_uri":"/workspace/downloads/song-30001/master.wav","storage_scheme":"file","checksum":"sha256:feature-song-30001","codec":"wav","sample_rate":16000,"channels":1,"duration_ms":205000},"windows":[{"start_ms":10000,"end_ms":15000,"features":[{"feature_type":"fingerprint","model_name":"chromaprint","model_version":"phase1","feature_set_name":"chromaprint_5s","fingerprint_value":"fp-song-30001-w1","checksum":"fpchk-song-30001-w1"},{"feature_type":"embedding","model_name":"mert","model_version":"v1-95m","feature_set_name":"mert_5s_hop2.5_meanpool","embedding_dim":768,"embedding_uri":"s3://bucket/song-30001/win1.npy","vector_table_name":"audio_embedding_vector_768","checksum":"embchk-song-30001-w1"}]}],"memberships":[{"set_type":"reference_set","set_name":"phase1_hot_reference_v1","member_type":"asset","priority":100}]}
2 {"song":{"biz_key":"song-30002","title":"Feature Manifest Song 2","artist_name":"Feature Artist 2"},"asset":{"source_type":"ugc","storage_uri":"/workspace/downloads/song-30002/clip.wav","storage_scheme":"file","checksum":"sha256:feature-song-30002","codec":"wav","sample_rate":16000,"channels":1,"duration_ms":98000},"windows":[{"start_ms":5000,"end_ms":10000,"features":[{"feature_type":"embedding","model_name":"muq","model_version":"large-msd-iter","feature_set_name":"muq_5s_hop2.5_meanpool","embedding_dim":768,"embedding_uri":"s3://bucket/song-30002/win1.npy","vector_table_name":"audio_embedding_vector_768","checksum":"embchk-song-30002-w1"}]}],"memberships":[{"set_type":"eval_set","set_name":"phase1_eval_v1","member_type":"asset","priority":50}]}
...@@ -47,8 +47,15 @@ def ensure_asset(cur, song_id: int, asset: dict) -> int: ...@@ -47,8 +47,15 @@ def ensure_asset(cur, song_id: int, asset: dict) -> int:
47 ) values ('asset',%s,%s,%s,%s,%s,%s,%s,%s,%s) returning object_id 47 ) values ('asset',%s,%s,%s,%s,%s,%s,%s,%s,%s) returning object_id
48 """, 48 """,
49 ( 49 (
50 song_id, asset.get('source_type'), asset.get('storage_uri'), asset.get('storage_scheme'), 50 song_id,
51 asset.get('checksum'), asset.get('codec'), asset.get('sample_rate'), asset.get('channels'), asset.get('duration_ms'), 51 asset.get('source_type'),
52 asset.get('storage_uri'),
53 asset.get('storage_scheme'),
54 asset.get('checksum'),
55 asset.get('codec'),
56 asset.get('sample_rate'),
57 asset.get('channels'),
58 asset.get('duration_ms'),
52 ), 59 ),
53 ).fetchone()['object_id'] 60 ).fetchone()['object_id']
54 61
...@@ -66,16 +73,71 @@ def ensure_window(cur, song_id: int, asset_id: int, win: dict) -> int: ...@@ -66,16 +73,71 @@ def ensure_window(cur, song_id: int, asset_id: int, win: dict) -> int:
66 ).fetchone()['object_id'] 73 ).fetchone()['object_id']
67 74
68 75
69 def ensure_membership(cur, m: dict, member_id: int, song_id: int) -> int: 76 def ensure_feature(cur, feature: dict, object_id: int, song_id: int) -> int:
77 row = cur.execute(
78 "select feature_id from feature_fact where object_id=%s and model_name=%s and model_version=%s and feature_set_name=%s and feature_type=%s",
79 (object_id, feature['model_name'], feature['model_version'], feature['feature_set_name'], feature['feature_type']),
80 ).fetchone()
81 if row:
82 return row['feature_id']
83
84 if feature['feature_type'] == 'fingerprint':
85 return cur.execute(
86 """
87 insert into feature_fact (
88 feature_type, object_id, song_id, model_name, model_version,
89 feature_set_name, fingerprint_value, checksum, metadata_json
90 ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)
91 returning feature_id
92 """,
93 (
94 feature['feature_type'],
95 object_id,
96 song_id,
97 feature['model_name'],
98 feature['model_version'],
99 feature['feature_set_name'],
100 feature['fingerprint_value'],
101 feature.get('checksum'),
102 json.dumps(feature.get('metadata_json', {})),
103 ),
104 ).fetchone()['feature_id']
105
106 return cur.execute(
107 """
108 insert into feature_fact (
109 feature_type, object_id, song_id, model_name, model_version,
110 feature_set_name, feature_schema_ver, embedding_dim, embedding_uri, vector_table_name, checksum, metadata_json
111 ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)
112 returning feature_id
113 """,
114 (
115 feature['feature_type'],
116 object_id,
117 song_id,
118 feature['model_name'],
119 feature['model_version'],
120 feature['feature_set_name'],
121 feature.get('feature_schema_ver', 'v1'),
122 feature.get('embedding_dim'),
123 feature.get('embedding_uri'),
124 feature.get('vector_table_name'),
125 feature.get('checksum'),
126 json.dumps(feature.get('metadata_json', {})),
127 ),
128 ).fetchone()['feature_id']
129
130
131 def ensure_membership(cur, membership: dict, member_id: int, song_id: int) -> int:
70 row = cur.execute( 132 row = cur.execute(
71 "select membership_id from set_membership where set_type=%s and set_name=%s and member_type=%s and member_id=%s", 133 "select membership_id from set_membership where set_type=%s and set_name=%s and member_type=%s and member_id=%s",
72 (m['set_type'], m['set_name'], m['member_type'], member_id), 134 (membership['set_type'], membership['set_name'], membership['member_type'], member_id),
73 ).fetchone() 135 ).fetchone()
74 if row: 136 if row:
75 return row['membership_id'] 137 return row['membership_id']
76 return cur.execute( 138 return cur.execute(
77 "insert into set_membership (set_type,set_name,member_type,member_id,song_id,priority) values (%s,%s,%s,%s,%s,%s) returning membership_id", 139 "insert into set_membership (set_type,set_name,member_type,member_id,song_id,priority) values (%s,%s,%s,%s,%s,%s) returning membership_id",
78 (m['set_type'], m['set_name'], m['member_type'], member_id, song_id, m.get('priority', 100)), 140 (membership['set_type'], membership['set_name'], membership['member_type'], member_id, song_id, membership.get('priority', 100)),
79 ).fetchone()['membership_id'] 141 ).fetchone()['membership_id']
80 142
81 143
...@@ -92,11 +154,7 @@ def main() -> int: ...@@ -92,11 +154,7 @@ def main() -> int:
92 output_path.parent.mkdir(parents=True, exist_ok=True) 154 output_path.parent.mkdir(parents=True, exist_ok=True)
93 qschema = quote_ident(args.schema) 155 qschema = quote_ident(args.schema)
94 156
95 report = { 157 report = {'schema': args.schema, 'manifest': str(manifest_path), 'imported': []}
96 'schema': args.schema,
97 'manifest': str(manifest_path),
98 'imported': [],
99 }
100 158
101 with psycopg.connect(args.dsn, row_factory=dict_row) as conn: 159 with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
102 with conn.cursor() as cur: 160 with conn.cursor() as cur:
...@@ -104,17 +162,26 @@ def main() -> int: ...@@ -104,17 +162,26 @@ def main() -> int:
104 for row in load_jsonl(manifest_path): 162 for row in load_jsonl(manifest_path):
105 song_id = ensure_song(cur, row['song']) 163 song_id = ensure_song(cur, row['song'])
106 asset_id = ensure_asset(cur, song_id, row['asset']) 164 asset_id = ensure_asset(cur, song_id, row['asset'])
107 window_ids = [ensure_window(cur, song_id, asset_id, w) for w in row.get('windows', [])] 165 window_ids = []
166 feature_ids = []
167 for w in row.get('windows', []):
168 window_id = ensure_window(cur, song_id, asset_id, w)
169 window_ids.append(window_id)
170 for feature in w.get('features', []):
171 feature_ids.append(ensure_feature(cur, feature, window_id, song_id))
108 membership_ids = [] 172 membership_ids = []
109 for m in row.get('memberships', []): 173 for m in row.get('memberships', []):
110 member_id = asset_id if m['member_type'] == 'asset' else song_id 174 member_id = asset_id if m['member_type'] == 'asset' else song_id
111 membership_ids.append(ensure_membership(cur, m, member_id, song_id)) 175 membership_ids.append(ensure_membership(cur, m, member_id, song_id))
112 report['imported'].append({ 176 report['imported'].append(
113 'song_id': song_id, 177 {
114 'asset_id': asset_id, 178 'song_id': song_id,
115 'window_ids': window_ids, 179 'asset_id': asset_id,
116 'membership_ids': membership_ids, 180 'window_ids': window_ids,
117 }) 181 'feature_ids': feature_ids,
182 'membership_ids': membership_ids,
183 }
184 )
118 185
119 counts = {} 186 counts = {}
120 for table in ['media_entity', 'audio_object', 'feature_fact', 'set_membership']: 187 for table in ['media_entity', 'audio_object', 'feature_fact', 'set_membership']:
...@@ -136,6 +203,22 @@ def main() -> int: ...@@ -136,6 +203,22 @@ def main() -> int:
136 limit 1 203 limit 1
137 """ 204 """
138 ).fetchone() 205 ).fetchone()
206 report['feature_lineage_sample'] = cur.execute(
207 """
208 select ff.feature_type,
209 ff.model_name,
210 ff.model_version,
211 ff.feature_set_name,
212 win.object_id as window_id,
213 song.entity_id as song_id,
214 song.title
215 from feature_fact ff
216 join audio_object win on win.object_id = ff.object_id and win.object_type='window'
217 join media_entity song on song.entity_id = ff.song_id and song.entity_type='song'
218 order by ff.feature_id desc
219 limit 1
220 """
221 ).fetchone()
139 conn.commit() 222 conn.commit()
140 223
141 output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2)) 224 output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2))
......
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并用 `songcentric_feature_manifest_sample.jsonl` 在 live PostgreSQL 上验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环与幂等性。
4
5 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并新增 `songcentric_feature_manifest_sample.jsonl` 用于验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环。
6
3 - 新增 `acr-engine/scripts/import_songcentric_manifest_live.py` 与样例 manifest `acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl`,把当前 4 表 schema 推进到“可从 JSONL manifest 批量导入 song/asset/window/set_membership”的阶段。 7 - 新增 `acr-engine/scripts/import_songcentric_manifest_live.py` 与样例 manifest `acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl`,把当前 4 表 schema 推进到“可从 JSONL manifest 批量导入 song/asset/window/set_membership”的阶段。
4 8
5 - 新增 `acr-engine/scripts/bootstrap_songcentric_phase1_live.py`,把当前 4 表 schema 从“单条 smoke 写入”推进到“可重复 Phase-1 bootstrap”;并准备对 `acr_songcentric_test` 做 fresh live 初始化验证。 9 - 新增 `acr-engine/scripts/bootstrap_songcentric_phase1_live.py`,把当前 4 表 schema 从“单条 smoke 写入”推进到“可重复 Phase-1 bootstrap”;并准备对 `acr_songcentric_test` 做 fresh live 初始化验证。
......
...@@ -261,12 +261,15 @@ flowchart TD ...@@ -261,12 +261,15 @@ flowchart TD
261 B --> C[media_entity song] 261 B --> C[media_entity song]
262 B --> D[audio_object asset] 262 B --> D[audio_object asset]
263 B --> E[audio_object window x N] 263 B --> E[audio_object window x N]
264 B --> F[set_membership] 264 B --> F[feature_fact]
265 B --> G[set_membership]
265 ``` 266 ```
266 267
267 当前样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl) 268 当前样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_manifest_sample.jsonl)
268 当前导入脚本:[`acr-engine/scripts/import_songcentric_manifest_live.py`](../acr-engine/scripts/import_songcentric_manifest_live.py) 269 当前导入脚本:[`acr-engine/scripts/import_songcentric_manifest_live.py`](../acr-engine/scripts/import_songcentric_manifest_live.py)
269 270
271 当前带 feature 样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl)
272
270 --- 273 ---
271 274
272 ## 5. 最常用 SQL 样例 275 ## 5. 最常用 SQL 样例
......