Commit 3ce36679 3ce36679764cc26e67d1b4f375e96487b6a84c8d by cnb.bofCdSsphPA

Prove the fused song-centric ACR schema on live PostgreSQL

Constraint: Stay within the current 4-table song-centric model and validate it against the user-provided PostgreSQL before treating it as the active schema candidate.
Rejected: Leave the fused model as docs-only guidance | Without a runnable SQL file and smoke evidence, downstream implementation would still be ambiguous.
Confidence: high
Scope-risk: narrow
Directive: Prefer acr_pg_schema_songcentric_v1.sql for new schema experiments tied to the current song-centric design; do not revive the older split-table model for Phase-1 by default.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/smoke_songcentric_schema_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: high-volume bulk ingest on the fused schema
1 parent fe416ec9
{
"schema": "acr_songcentric_test",
"sql_path": "acr-engine/sql/acr_pg_schema_songcentric_v1.sql",
"inserted": {
"song_id": 1,
"asset_id": 1,
"window_id": 2,
"fingerprint_feature_id": 1,
"embedding_feature_id": 2,
"membership_id": 1
},
"counts": {
"media_entity": 1,
"audio_object": 2,
"feature_fact": 2,
"set_membership": 1
},
"embedding_lineage": {
"feature_id": 2,
"feature_type": "embedding",
"model_name": "mert",
"model_version": "v1-95m",
"feature_set_name": "mert_5s_hop2.5_meanpool",
"window_id": 2,
"asset_id": 1,
"song_id": 1,
"title": "Smoke Song",
"artist_name": "Smoke Artist"
}
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
from pathlib import Path
import psycopg
from psycopg.rows import dict_row
def quote_ident(name: str) -> str:
    """Return *name* wrapped in double quotes for use as a SQL identifier.

    Any double quote already present in *name* is doubled so that it cannot
    terminate the quoted identifier early.
    """
    escaped = name.replace('"', '""')
    return f'"{escaped}"'
def main() -> int:
    """Run the song-centric schema smoke test against a live PostgreSQL.

    The target schema (``--schema``) is dropped and recreated, the DDL file
    (``--sql``) is executed inside it, one song / asset / window /
    fingerprint / embedding / set-membership row is inserted, and the
    embedding's lineage is read back through the
    feature -> window -> asset -> song joins.  The collected ids, per-table
    row counts and the lineage row are written as JSON to ``--output`` and
    echoed to stdout.

    ``--sql`` and ``--output`` are resolved under ``/workspace``.

    Returns:
        0 when the lineage row was found, 1 when the lineage query returned
        no row (the report is still written in that case).
    """
    import sys  # local import: only needed for the failure message below

    parser = argparse.ArgumentParser()
    parser.add_argument('--dsn', required=True)
    parser.add_argument('--schema', default='acr_songcentric_test')
    parser.add_argument('--sql', default='acr-engine/sql/acr_pg_schema_songcentric_v1.sql')
    parser.add_argument('--output', default='acr-engine/data/pgvector_eval/music20/songcentric_schema_smoke_report.json')
    args = parser.parse_args()

    workspace = Path('/workspace')
    sql_path = workspace / args.sql
    output_path = workspace / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # An absolute --sql outside /workspace replaces the workspace prefix when
    # joined, so relative_to() would raise; report the full path in that case.
    try:
        reported_sql_path = str(sql_path.relative_to(workspace))
    except ValueError:
        reported_sql_path = str(sql_path)

    schema = args.schema
    qschema = quote_ident(schema)
    report: dict[str, object] = {'schema': schema, 'sql_path': reported_sql_path}

    with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
        # Start from an empty schema so the counts below reflect only this run.
        # NOTE: this is destructive for whatever --schema names.
        conn.execute(f'drop schema if exists {qschema} cascade')
        conn.execute(f'create schema {qschema}')
        conn.execute(f'set search_path to {qschema}, public')
        # Read the DDL explicitly as UTF-8 instead of the locale default.
        conn.execute(sql_path.read_text(encoding='utf-8'))

        song_id = conn.execute(
            """
            insert into media_entity (entity_type, biz_key, title, artist_name)
            values ('song', 'song-9001', 'Smoke Song', 'Smoke Artist')
            returning entity_id
            """
        ).fetchone()['entity_id']

        asset_id = conn.execute(
            """
            insert into audio_object (
                object_type, song_id, source_type, storage_uri, storage_scheme,
                checksum, codec, sample_rate, channels, duration_ms
            ) values (
                'asset', %s, 'official', 's3://bucket/smoke-song.wav', 's3',
                'sha256:smoke-asset', 'wav', 44100, 2, 180000
            ) returning object_id
            """,
            (song_id,),
        ).fetchone()['object_id']

        window_id = conn.execute(
            """
            insert into audio_object (
                object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
            ) values ('window', %s, %s, 30000, 35000, 5000)
            returning object_id
            """,
            (song_id, asset_id),
        ).fetchone()['object_id']

        fingerprint_id = conn.execute(
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, fingerprint_value
            ) values (
                'fingerprint', %s, %s, 'chromaprint', 'phase1', 'chromaprint_5s', 'fp-smoke'
            ) returning feature_id
            """,
            (window_id, song_id),
        ).fetchone()['feature_id']

        embedding_id = conn.execute(
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, embedding_dim, embedding_uri, vector_table_name
            ) values (
                'embedding', %s, %s, 'mert', 'v1-95m',
                'mert_5s_hop2.5_meanpool', 768, 's3://bucket/smoke-song-win.npy', 'audio_embedding_vector_768'
            ) returning feature_id
            """,
            (window_id, song_id),
        ).fetchone()['feature_id']

        membership_id = conn.execute(
            """
            insert into set_membership (
                set_type, set_name, member_type, member_id, song_id, priority
            ) values (
                'reference_set', 'phase1_hot_reference_v1', 'asset', %s, %s, 100
            ) returning membership_id
            """,
            (asset_id, song_id),
        ).fetchone()['membership_id']

        # Walk feature -> window -> asset -> song for the embedding row.
        # fetchone() yields None when any join in the chain does not match.
        lineage = conn.execute(
            """
            select ff.feature_id,
                   ff.feature_type,
                   ff.model_name,
                   ff.model_version,
                   ff.feature_set_name,
                   win.object_id as window_id,
                   ast.object_id as asset_id,
                   song.entity_id as song_id,
                   song.title,
                   song.artist_name
            from feature_fact ff
            join audio_object win
              on win.object_id = ff.object_id
             and win.object_type = 'window'
            join audio_object ast
              on ast.object_id = win.parent_object_id
             and ast.object_type = 'asset'
            join media_entity song
              on song.entity_id = ff.song_id
             and song.entity_type = 'song'
            where ff.feature_id = %s
            """,
            (embedding_id,),
        ).fetchone()

        # Table names come from this fixed list, never from user input.
        counts = {
            table: conn.execute(f'select count(*) as c from {table}').fetchone()['c']
            for table in ('media_entity', 'audio_object', 'feature_fact', 'set_membership')
        }

        report.update(
            inserted={
                'song_id': song_id,
                'asset_id': asset_id,
                'window_id': window_id,
                'fingerprint_feature_id': fingerprint_id,
                'embedding_feature_id': embedding_id,
                'membership_id': membership_id,
            },
            counts=counts,
            embedding_lineage=lineage,
        )
        conn.commit()

    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding;
    # the trailing newline keeps the committed report a well-formed text file.
    output_path.write_text(rendered + '\n', encoding='utf-8')
    print(rendered)

    if lineage is None:
        # The report is still written above so the failure can be inspected.
        print('smoke check failed: embedding lineage query returned no row', file=sys.stderr)
        return 1
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    exit_code = main()
    raise SystemExit(exit_code)
-- Make sure the pgcrypto extension is installed before the tables are created.
-- NOTE(review): no statement visible in this schema section calls a pgcrypto
-- function; confirm it is still required, since creating an extension may need
-- privileges the target role does not have.
create extension if not exists pgcrypto;
-- media_entity: one row per song / work / recording.
-- root_song_id and parent_entity_id are optional self-references.
-- NOTE(review): updated_at only receives its insert-time default here; no
-- trigger maintaining it is defined in this section.
CREATE TABLE IF NOT EXISTS media_entity (
    entity_id        BIGSERIAL   PRIMARY KEY,
    entity_type      TEXT        NOT NULL CHECK (entity_type IN ('song', 'work', 'recording')),
    root_song_id     BIGINT,
    parent_entity_id BIGINT,
    biz_key          TEXT,
    title            TEXT        NOT NULL,
    artist_name      TEXT,
    entity_status    TEXT        NOT NULL DEFAULT 'active',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT fk_media_entity_root_song
        FOREIGN KEY (root_song_id) REFERENCES media_entity (entity_id),
    CONSTRAINT fk_media_entity_parent
        FOREIGN KEY (parent_entity_id) REFERENCES media_entity (entity_id)
);

-- A business key, when present, is unique within its entity_type
-- (the index covers every entity_type, not only 'song').
CREATE UNIQUE INDEX IF NOT EXISTS uq_media_entity_song_biz_key
    ON media_entity (entity_type, biz_key)
    WHERE biz_key IS NOT NULL;

-- Lookup of entities by their root song.
CREATE INDEX IF NOT EXISTS idx_media_entity_root_song
    ON media_entity (root_song_id);
-- audio_object: assets and the windows cut from them share this table.
-- An 'asset' row has no parent; a 'window' row must point at a parent row
-- (enforced by ck_audio_object_window_parent).
-- NOTE(review): nothing here checks that a window's parent is an 'asset' or
-- that the window's song_id equals its parent's song_id.
CREATE TABLE IF NOT EXISTS audio_object (
    object_id        BIGSERIAL   PRIMARY KEY,
    object_type      TEXT        NOT NULL CHECK (object_type IN ('asset', 'window')),
    song_id          BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    parent_object_id BIGINT      REFERENCES audio_object (object_id),
    source_type      TEXT,
    storage_uri      TEXT,
    storage_scheme   TEXT,
    checksum         TEXT,
    codec            TEXT,
    sample_rate      INTEGER,
    channels         INTEGER,
    duration_ms      INTEGER,
    start_ms         INTEGER,
    end_ms           INTEGER,
    object_status    TEXT        NOT NULL DEFAULT 'ready',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT ck_audio_object_window_parent
        CHECK (
            (object_type = 'asset' AND parent_object_id IS NULL)
            OR (object_type = 'window' AND parent_object_id IS NOT NULL)
        )
);

-- Objects of one type belonging to a song.
CREATE INDEX IF NOT EXISTS idx_audio_object_song_type
    ON audio_object (song_id, object_type);

-- Children (windows) of a parent object.
CREATE INDEX IF NOT EXISTS idx_audio_object_parent
    ON audio_object (parent_object_id);

-- The same checksum may not be registered twice as an asset of one song.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_asset_checksum
    ON audio_object (song_id, checksum)
    WHERE object_type = 'asset' AND checksum IS NOT NULL;

-- One window per (parent, start, end) range.
-- NOTE(review): start_ms / end_ms are nullable, and a unique index treats
-- NULLs as distinct by default, so windows without a range are not deduplicated.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_window_range
    ON audio_object (parent_object_id, start_ms, end_ms)
    WHERE object_type = 'window';
-- feature_fact: one row per feature (fingerprint or embedding) computed for an
-- audio_object, tagged with the producing model, version and feature set.
-- ck_feature_payload requires fingerprint_value for fingerprints, and at least
-- one of embedding_uri / vector_table_name for embeddings.
CREATE TABLE IF NOT EXISTS feature_fact (
    feature_id         BIGSERIAL   PRIMARY KEY,
    feature_type       TEXT        NOT NULL CHECK (feature_type IN ('fingerprint', 'embedding')),
    object_id          BIGINT      NOT NULL REFERENCES audio_object (object_id),
    song_id            BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    model_name         TEXT        NOT NULL,
    model_version      TEXT        NOT NULL,
    feature_set_name   TEXT        NOT NULL,
    feature_schema_ver TEXT        NOT NULL DEFAULT 'v1',
    embedding_dim      INTEGER,
    fingerprint_value  TEXT,
    embedding_uri      TEXT,
    vector_table_name  TEXT,
    checksum           TEXT,
    feature_status     TEXT        NOT NULL DEFAULT 'ready',
    metadata_json      JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT ck_feature_payload
        CHECK (
            (feature_type = 'fingerprint' AND fingerprint_value IS NOT NULL)
            OR (feature_type = 'embedding' AND (embedding_uri IS NOT NULL OR vector_table_name IS NOT NULL))
        )
);

-- Features of one type attached to an object.
CREATE INDEX IF NOT EXISTS idx_feature_fact_object_type
    ON feature_fact (object_id, feature_type);

-- Features of one type attached to a song.
CREATE INDEX IF NOT EXISTS idx_feature_fact_song_type
    ON feature_fact (song_id, feature_type);

-- At most one embedding per (object, model, version, feature set).
-- NOTE(review): feature_schema_ver is not part of either uniqueness key below.
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_embedding
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'embedding';

-- At most one fingerprint per (object, model, version, feature set).
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_fingerprint
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'fingerprint';
-- set_membership: assigns a song / asset / window / feature to a named set.
-- member_id carries no foreign key; which table it refers to is implied by
-- member_type, so referential integrity for it is not enforced here.
CREATE TABLE IF NOT EXISTS set_membership (
    membership_id BIGSERIAL   PRIMARY KEY,
    set_type      TEXT        NOT NULL CHECK (set_type IN ('reference_set', 'eval_set', 'hot_set')),
    set_name      TEXT        NOT NULL,
    member_type   TEXT        NOT NULL CHECK (member_type IN ('song', 'asset', 'window', 'feature')),
    member_id     BIGINT      NOT NULL,
    song_id       BIGINT      REFERENCES media_entity (entity_id),
    is_active     BOOLEAN     NOT NULL DEFAULT true,
    priority      INTEGER     NOT NULL DEFAULT 100,
    metadata_json JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- A member appears at most once in a given set.
CREATE UNIQUE INDEX IF NOT EXISTS uq_set_membership_unique
    ON set_membership (set_type, set_name, member_type, member_id);

-- Listing a set's members filtered by is_active and ordered by priority.
CREATE INDEX IF NOT EXISTS idx_set_membership_set_lookup
    ON set_membership (set_type, set_name, is_active, priority);
## 2026-06-04
- 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。
- 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。
- 在 `docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`。
......
......@@ -2,6 +2,8 @@
> 更新:2026-06-04
> 目标:把当前 **song-centric + 融合优先** 设计落成一版可以直接评审和继续实现的 PostgreSQL DDL 草案。
> SQL 文件:[`acr-engine/sql/acr_pg_schema_songcentric_v1.sql`](../acr-engine/sql/acr_pg_schema_songcentric_v1.sql)
> live smoke:[`acr-engine/scripts/smoke_songcentric_schema_live.py`](../acr-engine/scripts/smoke_songcentric_schema_live.py)
---
......