Commit 3ce36679 3ce36679764cc26e67d1b4f375e96487b6a84c8d by cnb.bofCdSsphPA

Prove the fused song-centric ACR schema on live PostgreSQL

Constraint: Stay within the current 4-table song-centric model and validate it against the user-provided PostgreSQL before treating it as the active schema candidate.
Rejected: Leave the fused model as docs-only guidance | Without a runnable SQL file and smoke evidence, downstream implementation would still be ambiguous.
Confidence: high
Scope-risk: narrow
Directive: Prefer acr_pg_schema_songcentric_v1.sql for new schema experiments tied to the current song-centric design; do not revive the older split-table model for Phase-1 by default.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/smoke_songcentric_schema_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: high-volume bulk ingest on the fused schema
1 parent fe416ec9
1 {
2 "schema": "acr_songcentric_test",
3 "sql_path": "acr-engine/sql/acr_pg_schema_songcentric_v1.sql",
4 "inserted": {
5 "song_id": 1,
6 "asset_id": 1,
7 "window_id": 2,
8 "fingerprint_feature_id": 1,
9 "embedding_feature_id": 2,
10 "membership_id": 1
11 },
12 "counts": {
13 "media_entity": 1,
14 "audio_object": 2,
15 "feature_fact": 2,
16 "set_membership": 1
17 },
18 "embedding_lineage": {
19 "feature_id": 2,
20 "feature_type": "embedding",
21 "model_name": "mert",
22 "model_version": "v1-95m",
23 "feature_set_name": "mert_5s_hop2.5_meanpool",
24 "window_id": 2,
25 "asset_id": 1,
26 "song_id": 1,
27 "title": "Smoke Song",
28 "artist_name": "Smoke Artist"
29 }
30 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 from pathlib import Path
7
8 import psycopg
9 from psycopg.rows import dict_row
10
11
def quote_ident(name: str) -> str:
    """Return *name* wrapped in double quotes for use as a SQL identifier.

    Any double quote inside *name* is doubled so it cannot terminate the
    quoted identifier early.
    """
    escaped = name.replace('"', '""')
    return f'"{escaped}"'
14
15
def main() -> int:
    """Run the song-centric schema smoke check against a live PostgreSQL.

    The target schema is dropped and recreated, the DDL file is applied, one
    song / asset / window / fingerprint / embedding / membership row is
    inserted, and the embedding's lineage is read back through
    feature_fact -> window -> asset -> song.  The collected ids, per-table
    row counts and the lineage row are written to ``--output`` as JSON and
    echoed to stdout.

    Returns:
        0 when the lineage query returned a row, 1 when it returned nothing
        (the report is still written so the failure can be inspected).

    Raises:
        OSError: if the SQL file cannot be read or the report cannot be
            written.
        psycopg.Error: on connection or SQL failures.
    """
    import sys  # local import: only needed for the failure message below

    parser = argparse.ArgumentParser()
    parser.add_argument('--dsn', required=True)
    parser.add_argument('--schema', default='acr_songcentric_test')
    parser.add_argument('--sql', default='acr-engine/sql/acr_pg_schema_songcentric_v1.sql')
    parser.add_argument('--output', default='acr-engine/data/pgvector_eval/music20/songcentric_schema_smoke_report.json')
    parser.add_argument(
        '--root',
        default='/workspace',
        help='directory that relative --sql / --output paths are resolved against',
    )
    args = parser.parse_args()

    root = Path(args.root)
    sql_path = root / args.sql
    output_path = root / args.output
    schema = args.schema
    qschema = quote_ident(schema)

    # Read the DDL up front so a missing file fails before the database is
    # touched; decode explicitly instead of relying on the locale.
    sql_text = sql_path.read_text(encoding='utf-8')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # An absolute --sql outside the root cannot be made relative; report the
    # path as-is in that case instead of raising ValueError.
    try:
        reported_sql_path = str(sql_path.relative_to(root))
    except ValueError:
        reported_sql_path = str(sql_path)

    report: dict = {'schema': schema, 'sql_path': reported_sql_path}

    with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
        # Start from a clean schema so repeated runs are independent.
        conn.execute(f'drop schema if exists {qschema} cascade')
        conn.execute(f'create schema {qschema}')
        conn.execute(f'set search_path to {qschema}, public')
        conn.execute(sql_text)

        song_id = conn.execute(
            """
            insert into media_entity (entity_type, biz_key, title, artist_name)
            values ('song', 'song-9001', 'Smoke Song', 'Smoke Artist')
            returning entity_id
            """
        ).fetchone()['entity_id']

        asset_id = conn.execute(
            """
            insert into audio_object (
                object_type, song_id, source_type, storage_uri, storage_scheme,
                checksum, codec, sample_rate, channels, duration_ms
            ) values (
                'asset', %s, 'official', 's3://bucket/smoke-song.wav', 's3',
                'sha256:smoke-asset', 'wav', 44100, 2, 180000
            ) returning object_id
            """,
            (song_id,),
        ).fetchone()['object_id']

        window_id = conn.execute(
            """
            insert into audio_object (
                object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
            ) values ('window', %s, %s, 30000, 35000, 5000)
            returning object_id
            """,
            (song_id, asset_id),
        ).fetchone()['object_id']

        fingerprint_id = conn.execute(
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, fingerprint_value
            ) values (
                'fingerprint', %s, %s, 'chromaprint', 'phase1', 'chromaprint_5s', 'fp-smoke'
            ) returning feature_id
            """,
            (window_id, song_id),
        ).fetchone()['feature_id']

        embedding_id = conn.execute(
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, embedding_dim, embedding_uri, vector_table_name
            ) values (
                'embedding', %s, %s, 'mert', 'v1-95m',
                'mert_5s_hop2.5_meanpool', 768, 's3://bucket/smoke-song-win.npy', 'audio_embedding_vector_768'
            ) returning feature_id
            """,
            (window_id, song_id),
        ).fetchone()['feature_id']

        membership_id = conn.execute(
            """
            insert into set_membership (
                set_type, set_name, member_type, member_id, song_id, priority
            ) values (
                'reference_set', 'phase1_hot_reference_v1', 'asset', %s, %s, 100
            ) returning membership_id
            """,
            (asset_id, song_id),
        ).fetchone()['membership_id']

        # Walk the embedding back to its window, asset and song; fetchone()
        # yields None if any hop of the chain is missing.
        lineage = conn.execute(
            """
            select ff.feature_id,
                   ff.feature_type,
                   ff.model_name,
                   ff.model_version,
                   ff.feature_set_name,
                   win.object_id as window_id,
                   ast.object_id as asset_id,
                   song.entity_id as song_id,
                   song.title,
                   song.artist_name
            from feature_fact ff
            join audio_object win
              on win.object_id = ff.object_id
             and win.object_type = 'window'
            join audio_object ast
              on ast.object_id = win.parent_object_id
             and ast.object_type = 'asset'
            join media_entity song
              on song.entity_id = ff.song_id
             and song.entity_type = 'song'
            where ff.feature_id = %s
            """,
            (embedding_id,),
        ).fetchone()

        # Table names come from this fixed tuple, never from user input.
        counts = {
            table: conn.execute(f'select count(*) as c from {table}').fetchone()['c']
            for table in ('media_entity', 'audio_object', 'feature_fact', 'set_membership')
        }

        report.update(
            inserted={
                'song_id': song_id,
                'asset_id': asset_id,
                'window_id': window_id,
                'fingerprint_feature_id': fingerprint_id,
                'embedding_feature_id': embedding_id,
                'membership_id': membership_id,
            },
            counts=counts,
            embedding_lineage=lineage,
        )
        conn.commit()

    # ensure_ascii=False may emit non-ASCII text, so pin the file encoding.
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    output_path.write_text(rendered, encoding='utf-8')
    print(rendered)

    if lineage is None:
        # A smoke check that cannot resolve the lineage must not report success.
        print(
            'smoke check failed: embedding lineage query returned no row',
            file=sys.stderr,
        )
        return 1
    return 0
153
154
if __name__ == '__main__':
    # Use main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
-- Install the pgcrypto extension (no-op when it is already present).
-- NOTE(review): none of the DDL visible in this file calls a pgcrypto
-- function; confirm the extension is still required.
CREATE EXTENSION IF NOT EXISTS pgcrypto;
2
-- media_entity: one row per entity of type 'song', 'work' or 'recording'.
CREATE TABLE IF NOT EXISTS media_entity (
    entity_id        BIGSERIAL   PRIMARY KEY,
    entity_type      TEXT        NOT NULL CHECK (entity_type IN ('song', 'work', 'recording')),
    -- Optional self-references into this same table (see the FKs below).
    root_song_id     BIGINT,
    parent_entity_id BIGINT,
    biz_key          TEXT,
    title            TEXT        NOT NULL,
    artist_name      TEXT,
    entity_status    TEXT        NOT NULL DEFAULT 'active',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT fk_media_entity_root_song
        FOREIGN KEY (root_song_id) REFERENCES media_entity (entity_id),
    CONSTRAINT fk_media_entity_parent
        FOREIGN KEY (parent_entity_id) REFERENCES media_entity (entity_id)
);
20
-- A biz_key, when present, is unique within its entity_type.
-- (Despite the "song" in the index name, the index spans every entity_type.)
CREATE UNIQUE INDEX IF NOT EXISTS uq_media_entity_song_biz_key
    ON media_entity (entity_type, biz_key)
    WHERE biz_key IS NOT NULL;

-- Lookup of entities by their root song.
CREATE INDEX IF NOT EXISTS idx_media_entity_root_song
    ON media_entity (root_song_id);
27
-- audio_object: stores both whole assets and windows cut from an asset.
-- A window row points at its parent row through parent_object_id; an asset
-- row has no parent.
CREATE TABLE IF NOT EXISTS audio_object (
    object_id        BIGSERIAL   PRIMARY KEY,
    object_type      TEXT        NOT NULL CHECK (object_type IN ('asset', 'window')),
    song_id          BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    parent_object_id BIGINT      REFERENCES audio_object (object_id),
    source_type      TEXT,
    storage_uri      TEXT,
    storage_scheme   TEXT,
    checksum         TEXT,
    codec            TEXT,
    sample_rate      INTEGER,
    channels         INTEGER,
    duration_ms      INTEGER,
    start_ms         INTEGER,
    end_ms           INTEGER,
    object_status    TEXT        NOT NULL DEFAULT 'ready',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT ck_audio_object_window_parent
        CHECK (
            (object_type = 'asset' AND parent_object_id IS NULL)
            OR (object_type = 'window' AND parent_object_id IS NOT NULL)
        ),
    -- A window must carry a well-formed [start_ms, end_ms) range.  Without
    -- this, NULL bounds would slip past uq_audio_object_window_range
    -- (NULLs are distinct in a unique index) and inverted or empty ranges
    -- would be accepted.
    CONSTRAINT ck_audio_object_window_range
        CHECK (
            object_type <> 'window'
            OR (
                start_ms IS NOT NULL
                AND end_ms IS NOT NULL
                AND start_ms >= 0
                AND end_ms > start_ms
            )
        )
);
53
-- Lookup of a song's objects, optionally narrowed by object_type.
CREATE INDEX IF NOT EXISTS idx_audio_object_song_type
    ON audio_object (song_id, object_type);

-- Lookup of the rows that hang off a given parent object.
CREATE INDEX IF NOT EXISTS idx_audio_object_parent
    ON audio_object (parent_object_id);

-- Within one song, an asset checksum (when present) may appear only once.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_asset_checksum
    ON audio_object (song_id, checksum)
    WHERE object_type = 'asset' AND checksum IS NOT NULL;

-- Under one parent, a given (start_ms, end_ms) window may appear only once.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_window_range
    ON audio_object (parent_object_id, start_ms, end_ms)
    WHERE object_type = 'window';
67
-- feature_fact: one row per fingerprint or embedding computed for an
-- audio_object, tagged with the model and feature set that produced it.
CREATE TABLE IF NOT EXISTS feature_fact (
    feature_id         BIGSERIAL   PRIMARY KEY,
    feature_type       TEXT        NOT NULL CHECK (feature_type IN ('fingerprint', 'embedding')),
    object_id          BIGINT      NOT NULL REFERENCES audio_object (object_id),
    -- NOTE(review): song_id is stored here as well as on audio_object and
    -- nothing in this DDL forces the two to agree — confirm writers keep
    -- them in sync.
    song_id            BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    model_name         TEXT        NOT NULL,
    model_version      TEXT        NOT NULL,
    feature_set_name   TEXT        NOT NULL,
    feature_schema_ver TEXT        NOT NULL DEFAULT 'v1',
    embedding_dim      INTEGER,
    fingerprint_value  TEXT,
    embedding_uri      TEXT,
    vector_table_name  TEXT,
    checksum           TEXT,
    feature_status     TEXT        NOT NULL DEFAULT 'ready',
    metadata_json      JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- A fingerprint row needs its value; an embedding row needs at least one
    -- of embedding_uri / vector_table_name.
    CONSTRAINT ck_feature_payload
        CHECK (
            (feature_type = 'fingerprint' AND fingerprint_value IS NOT NULL)
            OR (
                feature_type = 'embedding'
                AND (embedding_uri IS NOT NULL OR vector_table_name IS NOT NULL)
            )
        )
);
92
-- Lookup of an object's features, optionally narrowed by feature_type.
CREATE INDEX IF NOT EXISTS idx_feature_fact_object_type
    ON feature_fact (object_id, feature_type);

-- Lookup of a song's features, optionally narrowed by feature_type.
CREATE INDEX IF NOT EXISTS idx_feature_fact_song_type
    ON feature_fact (song_id, feature_type);

-- At most one embedding per (object, model, version, feature set).
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_embedding
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'embedding';

-- At most one fingerprint per (object, model, version, feature set).
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_fingerprint
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'fingerprint';
106
-- set_membership: assigns a member (song, asset, window or feature) to a
-- named reference / eval / hot set.
CREATE TABLE IF NOT EXISTS set_membership (
    membership_id BIGSERIAL   PRIMARY KEY,
    set_type      TEXT        NOT NULL CHECK (set_type IN ('reference_set', 'eval_set', 'hot_set')),
    set_name      TEXT        NOT NULL,
    member_type   TEXT        NOT NULL CHECK (member_type IN ('song', 'asset', 'window', 'feature')),
    -- member_id has no foreign key; presumably member_type tells which
    -- table it points into — confirm against the writers.
    member_id     BIGINT      NOT NULL,
    song_id       BIGINT      REFERENCES media_entity (entity_id),
    is_active     BOOLEAN     NOT NULL DEFAULT TRUE,
    priority      INTEGER     NOT NULL DEFAULT 100,
    metadata_json JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now()
);
120
-- A member may be listed at most once in a given (set_type, set_name).
CREATE UNIQUE INDEX IF NOT EXISTS uq_set_membership_unique
    ON set_membership (set_type, set_name, member_type, member_id);

-- Lookup of a set's members by active flag and priority.
CREATE INDEX IF NOT EXISTS idx_set_membership_set_lookup
    ON set_membership (set_type, set_name, is_active, priority);
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。
4
3 - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 5 - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。
4 6
5 -`docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)` 7 -`docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
2 2
3 > 更新:2026-06-04 3 > 更新:2026-06-04
4 > 目标:把当前 **song-centric + 融合优先** 设计落成一版可以直接评审和继续实现的 PostgreSQL DDL 草案。 4 > 目标:把当前 **song-centric + 融合优先** 设计落成一版可以直接评审和继续实现的 PostgreSQL DDL 草案。
5 > SQL 文件:[`acr-engine/sql/acr_pg_schema_songcentric_v1.sql`](../acr-engine/sql/acr_pg_schema_songcentric_v1.sql)
6 > live smoke:[`acr-engine/scripts/smoke_songcentric_schema_live.py`](../acr-engine/scripts/smoke_songcentric_schema_live.py)
5 7
6 --- 8 ---
7 9
......