Prove the fused song-centric ACR schema on live PostgreSQL
Constraint: Stay within the current 4-table song-centric model and validate it against the user-provided PostgreSQL before treating it as the active schema candidate.
Rejected: Leave the fused model as docs-only guidance | Without a runnable SQL file and smoke evidence, downstream implementation would still be ambiguous.
Confidence: high
Scope-risk: narrow
Directive: Prefer acr_pg_schema_songcentric_v1.sql for new schema experiments tied to the current song-centric design; do not revive the older split-table model for Phase-1 by default.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/smoke_songcentric_schema_live.py --dsn postgres://d2:d2pass@127.0.0.1:5432/d2 --schema acr_songcentric_test; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: high-volume bulk ingest on the fused schema
Showing
5 changed files
with
315 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_songcentric_test", | ||
| 3 | "sql_path": "acr-engine/sql/acr_pg_schema_songcentric_v1.sql", | ||
| 4 | "inserted": { | ||
| 5 | "song_id": 1, | ||
| 6 | "asset_id": 1, | ||
| 7 | "window_id": 2, | ||
| 8 | "fingerprint_feature_id": 1, | ||
| 9 | "embedding_feature_id": 2, | ||
| 10 | "membership_id": 1 | ||
| 11 | }, | ||
| 12 | "counts": { | ||
| 13 | "media_entity": 1, | ||
| 14 | "audio_object": 2, | ||
| 15 | "feature_fact": 2, | ||
| 16 | "set_membership": 1 | ||
| 17 | }, | ||
| 18 | "embedding_lineage": { | ||
| 19 | "feature_id": 2, | ||
| 20 | "feature_type": "embedding", | ||
| 21 | "model_name": "mert", | ||
| 22 | "model_version": "v1-95m", | ||
| 23 | "feature_set_name": "mert_5s_hop2.5_meanpool", | ||
| 24 | "window_id": 2, | ||
| 25 | "asset_id": 1, | ||
| 26 | "song_id": 1, | ||
| 27 | "title": "Smoke Song", | ||
| 28 | "artist_name": "Smoke Artist" | ||
| 29 | } | ||
| 30 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | |||
| 8 | import psycopg | ||
| 9 | from psycopg.rows import dict_row | ||
| 10 | |||
| 11 | |||
def quote_ident(name: str) -> str:
    """Return *name* wrapped in double quotes for use as a SQL identifier.

    Every double quote inside the name is doubled so it cannot close the
    quoted identifier early.
    """
    escaped = name.replace('"', '""')
    return f'"{escaped}"'
| 14 | |||
| 15 | |||
def _insert_returning(conn, query, params=None, *, column):
    """Run one ``insert ... returning`` statement and return *column* of the new row."""
    return conn.execute(query, params).fetchone()[column]


def main() -> int:
    """Run the live smoke test for the song-centric schema.

    Drops and recreates the target schema on the PostgreSQL instance given by
    ``--dsn``, applies the DDL file, inserts one song -> asset -> window chain
    with a fingerprint feature, an embedding feature and a set membership,
    reads the embedding lineage back through the join chain, and writes the
    result as a JSON report (also printed to stdout).

    Returns:
        0 when the lineage query returned a row, 1 when it returned nothing.

    Raises:
        FileNotFoundError: if the SQL file does not exist under the root.
    """
    import sys  # local import: only needed for the failure message below

    parser = argparse.ArgumentParser(
        description='Smoke-test the song-centric ACR schema against a live PostgreSQL.'
    )
    parser.add_argument('--dsn', required=True)
    parser.add_argument('--schema', default='acr_songcentric_test')
    parser.add_argument('--sql', default='acr-engine/sql/acr_pg_schema_songcentric_v1.sql')
    parser.add_argument('--output', default='acr-engine/data/pgvector_eval/music20/songcentric_schema_smoke_report.json')
    parser.add_argument(
        '--root',
        default='/workspace',
        help='directory that relative --sql / --output paths are resolved against',
    )
    args = parser.parse_args()

    root = Path(args.root)
    sql_path = root / args.sql
    output_path = root / args.output

    # Read the DDL before touching the database so a missing file fails fast,
    # before the existing schema has been dropped.
    ddl = sql_path.read_text(encoding='utf-8')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    schema = args.schema
    qschema = quote_ident(schema)

    # An absolute --sql outside the root cannot be made relative; report it as-is.
    try:
        sql_display = str(sql_path.relative_to(root))
    except ValueError:
        sql_display = str(sql_path)

    report: dict = {'schema': schema, 'sql_path': sql_display}

    with psycopg.connect(args.dsn, row_factory=dict_row) as conn:
        # Destructive by design: whatever lives in the target schema is removed
        # so every run starts from an empty schema.
        conn.execute(f'drop schema if exists {qschema} cascade')
        conn.execute(f'create schema {qschema}')
        conn.execute(f'set search_path to {qschema}, public')
        # No parameters are passed, so the whole DDL file goes out in one call.
        conn.execute(ddl)

        song_id = _insert_returning(
            conn,
            """
            insert into media_entity (entity_type, biz_key, title, artist_name)
            values ('song', 'song-9001', 'Smoke Song', 'Smoke Artist')
            returning entity_id
            """,
            column='entity_id',
        )

        asset_id = _insert_returning(
            conn,
            """
            insert into audio_object (
                object_type, song_id, source_type, storage_uri, storage_scheme,
                checksum, codec, sample_rate, channels, duration_ms
            ) values (
                'asset', %s, 'official', 's3://bucket/smoke-song.wav', 's3',
                'sha256:smoke-asset', 'wav', 44100, 2, 180000
            ) returning object_id
            """,
            (song_id,),
            column='object_id',
        )

        window_id = _insert_returning(
            conn,
            """
            insert into audio_object (
                object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
            ) values ('window', %s, %s, 30000, 35000, 5000)
            returning object_id
            """,
            (song_id, asset_id),
            column='object_id',
        )

        fingerprint_id = _insert_returning(
            conn,
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, fingerprint_value
            ) values (
                'fingerprint', %s, %s, 'chromaprint', 'phase1', 'chromaprint_5s', 'fp-smoke'
            ) returning feature_id
            """,
            (window_id, song_id),
            column='feature_id',
        )

        embedding_id = _insert_returning(
            conn,
            """
            insert into feature_fact (
                feature_type, object_id, song_id, model_name, model_version,
                feature_set_name, embedding_dim, embedding_uri, vector_table_name
            ) values (
                'embedding', %s, %s, 'mert', 'v1-95m',
                'mert_5s_hop2.5_meanpool', 768, 's3://bucket/smoke-song-win.npy', 'audio_embedding_vector_768'
            ) returning feature_id
            """,
            (window_id, song_id),
            column='feature_id',
        )

        membership_id = _insert_returning(
            conn,
            """
            insert into set_membership (
                set_type, set_name, member_type, member_id, song_id, priority
            ) values (
                'reference_set', 'phase1_hot_reference_v1', 'asset', %s, %s, 100
            ) returning membership_id
            """,
            (asset_id, song_id),
            column='membership_id',
        )

        # Walk feature -> window -> asset -> song; an empty result means the
        # lineage chain is broken somewhere.
        lineage = conn.execute(
            """
            select ff.feature_id,
                   ff.feature_type,
                   ff.model_name,
                   ff.model_version,
                   ff.feature_set_name,
                   win.object_id as window_id,
                   ast.object_id as asset_id,
                   song.entity_id as song_id,
                   song.title,
                   song.artist_name
            from feature_fact ff
            join audio_object win
              on win.object_id = ff.object_id
             and win.object_type = 'window'
            join audio_object ast
              on ast.object_id = win.parent_object_id
             and ast.object_type = 'asset'
            join media_entity song
              on song.entity_id = ff.song_id
             and song.entity_type = 'song'
            where ff.feature_id = %s
            """,
            (embedding_id,),
        ).fetchone()

        # Table names are fixed literals here, never user input.
        counts = {
            table: conn.execute(f'select count(*) as c from {table}').fetchone()['c']
            for table in ('media_entity', 'audio_object', 'feature_fact', 'set_membership')
        }

        report.update(
            inserted={
                'song_id': song_id,
                'asset_id': asset_id,
                'window_id': window_id,
                'fingerprint_feature_id': fingerprint_id,
                'embedding_feature_id': embedding_id,
                'membership_id': membership_id,
            },
            counts=counts,
            embedding_lineage=lineage,
        )
        conn.commit()

    # Serialise once; the report keeps non-ASCII text verbatim, so the file
    # must be written as UTF-8 regardless of the process locale.
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    output_path.write_text(rendered, encoding='utf-8')
    print(rendered)

    if lineage is None:
        print(
            f'smoke failed: no lineage row found for embedding feature_id={embedding_id}',
            file=sys.stderr,
        )
        return 1
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
-- Song-centric ACR schema: catalogue entities.
-- NOTE(review): nothing in the visible DDL calls a pgcrypto function;
-- confirm the extension is still required.
CREATE EXTENSION IF NOT EXISTS pgcrypto;

-- One row per catalogue entity (song, work or recording). Rows can point at
-- another media_entity row through root_song_id and parent_entity_id.
CREATE TABLE IF NOT EXISTS media_entity (
    entity_id        BIGSERIAL   PRIMARY KEY,
    entity_type      TEXT        NOT NULL CHECK (entity_type IN ('song', 'work', 'recording')),
    root_song_id     BIGINT,
    parent_entity_id BIGINT,
    biz_key          TEXT,
    title            TEXT        NOT NULL,
    artist_name      TEXT,
    entity_status    TEXT        NOT NULL DEFAULT 'active',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT fk_media_entity_root_song
        FOREIGN KEY (root_song_id) REFERENCES media_entity (entity_id),
    CONSTRAINT fk_media_entity_parent
        FOREIGN KEY (parent_entity_id) REFERENCES media_entity (entity_id)
);

-- A business key is unique per entity type; rows without a key are exempt.
-- The index covers every entity_type, not only 'song', despite its name.
CREATE UNIQUE INDEX IF NOT EXISTS uq_media_entity_song_biz_key
    ON media_entity (entity_type, biz_key)
    WHERE biz_key IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_media_entity_root_song
    ON media_entity (root_song_id);
| 27 | |||
-- Audio assets and the time windows cut from them, stored in one table.
-- An 'asset' row has no parent; a 'window' row must name its parent row and
-- carry a valid [start_ms, end_ms) range.
CREATE TABLE IF NOT EXISTS audio_object (
    object_id        BIGSERIAL   PRIMARY KEY,
    object_type      TEXT        NOT NULL CHECK (object_type IN ('asset', 'window')),
    song_id          BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    parent_object_id BIGINT      REFERENCES audio_object (object_id),
    source_type      TEXT,
    storage_uri      TEXT,
    storage_scheme   TEXT,
    checksum         TEXT,
    codec            TEXT,
    sample_rate      INTEGER,
    channels         INTEGER,
    duration_ms      INTEGER,
    start_ms         INTEGER,
    end_ms           INTEGER,
    object_status    TEXT        NOT NULL DEFAULT 'ready',
    metadata_json    JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    CONSTRAINT ck_audio_object_window_parent
        CHECK (
            (object_type = 'asset' AND parent_object_id IS NULL)
            OR (object_type = 'window' AND parent_object_id IS NOT NULL)
        ),
    -- Windows must have a concrete, forward-running range. Without this,
    -- NULL bounds would slip past uq_audio_object_window_range, because a
    -- unique index treats NULLs as distinct, and duplicate windows could pile
    -- up under one parent. Asset rows are not affected.
    CONSTRAINT ck_audio_object_window_range
        CHECK (
            object_type <> 'window'
            OR (
                start_ms IS NOT NULL
                AND end_ms IS NOT NULL
                AND start_ms >= 0
                AND end_ms > start_ms
            )
        )
);

CREATE INDEX IF NOT EXISTS idx_audio_object_song_type
    ON audio_object (song_id, object_type);

CREATE INDEX IF NOT EXISTS idx_audio_object_parent
    ON audio_object (parent_object_id);

-- The same checksum may not be registered twice as an asset of one song.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_asset_checksum
    ON audio_object (song_id, checksum)
    WHERE object_type = 'asset' AND checksum IS NOT NULL;

-- One window per (parent, start, end) range.
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_object_window_range
    ON audio_object (parent_object_id, start_ms, end_ms)
    WHERE object_type = 'window';
| 67 | |||
-- Features computed for an audio_object row: fingerprints and embeddings.
-- song_id is stored on the row in addition to object_id.
CREATE TABLE IF NOT EXISTS feature_fact (
    feature_id         BIGSERIAL   PRIMARY KEY,
    feature_type       TEXT        NOT NULL CHECK (feature_type IN ('fingerprint', 'embedding')),
    object_id          BIGINT      NOT NULL REFERENCES audio_object (object_id),
    song_id            BIGINT      NOT NULL REFERENCES media_entity (entity_id),
    model_name         TEXT        NOT NULL,
    model_version      TEXT        NOT NULL,
    feature_set_name   TEXT        NOT NULL,
    feature_schema_ver TEXT        NOT NULL DEFAULT 'v1',
    embedding_dim      INTEGER,
    fingerprint_value  TEXT,
    embedding_uri      TEXT,
    vector_table_name  TEXT,
    checksum           TEXT,
    feature_status     TEXT        NOT NULL DEFAULT 'ready',
    metadata_json      JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- A fingerprint must carry its value; an embedding must say where the
    -- vector lives (a URI, a vector table name, or both).
    CONSTRAINT ck_feature_payload
        CHECK (
            (feature_type = 'fingerprint' AND fingerprint_value IS NOT NULL)
            OR (
                feature_type = 'embedding'
                AND (embedding_uri IS NOT NULL OR vector_table_name IS NOT NULL)
            )
        )
);

CREATE INDEX IF NOT EXISTS idx_feature_fact_object_type
    ON feature_fact (object_id, feature_type);

CREATE INDEX IF NOT EXISTS idx_feature_fact_song_type
    ON feature_fact (song_id, feature_type);

-- At most one embedding per object / model / version / feature set.
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_embedding
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'embedding';

-- At most one fingerprint per object / model / version / feature set.
CREATE UNIQUE INDEX IF NOT EXISTS uq_feature_fact_fingerprint
    ON feature_fact (object_id, model_name, model_version, feature_set_name, feature_type)
    WHERE feature_type = 'fingerprint';
| 106 | |||
-- Membership of songs, assets, windows or features in named sets.
-- member_id carries no foreign key; presumably member_type tells which table
-- it points at -- verify against the writers of this table.
CREATE TABLE IF NOT EXISTS set_membership (
    membership_id BIGSERIAL   PRIMARY KEY,
    set_type      TEXT        NOT NULL CHECK (set_type IN ('reference_set', 'eval_set', 'hot_set')),
    set_name      TEXT        NOT NULL,
    member_type   TEXT        NOT NULL CHECK (member_type IN ('song', 'asset', 'window', 'feature')),
    member_id     BIGINT      NOT NULL,
    song_id       BIGINT      REFERENCES media_entity (entity_id),
    is_active     BOOLEAN     NOT NULL DEFAULT TRUE,
    priority      INTEGER     NOT NULL DEFAULT 100,
    metadata_json JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- A member can appear only once in a given named set.
CREATE UNIQUE INDEX IF NOT EXISTS uq_set_membership_unique
    ON set_membership (set_type, set_name, member_type, member_id);

CREATE INDEX IF NOT EXISTS idx_set_membership_set_lookup
    ON set_membership (set_type, set_name, is_active, priority);
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增正式 SQL 文件 `acr-engine/sql/acr_pg_schema_songcentric_v1.sql` 与 live smoke 脚本 `acr-engine/scripts/smoke_songcentric_schema_live.py`,把 4 张融合优先表从文档草案推进到可执行 schema,并准备在用户 PostgreSQL 上做 fresh 验证。 | ||
| 4 | |||
| 3 | - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 | 5 | - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 |
| 4 | 6 | ||
| 5 | - 在 `docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`。 | 7 | - 在 `docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`。 | ... | ... |
| ... | @@ -2,6 +2,8 @@ | ... | @@ -2,6 +2,8 @@ |
| 2 | 2 | ||
| 3 | > 更新:2026-06-04 | 3 | > 更新:2026-06-04 |
| 4 | > 目标:把当前 **song-centric + 融合优先** 设计落成一版可以直接评审和继续实现的 PostgreSQL DDL 草案。 | 4 | > 目标:把当前 **song-centric + 融合优先** 设计落成一版可以直接评审和继续实现的 PostgreSQL DDL 草案。 |
| 5 | > SQL 文件:[`acr-engine/sql/acr_pg_schema_songcentric_v1.sql`](../acr-engine/sql/acr_pg_schema_songcentric_v1.sql) | ||
| 6 | > live smoke:[`acr-engine/scripts/smoke_songcentric_schema_live.py`](../acr-engine/scripts/smoke_songcentric_schema_live.py) | ||
| 5 | 7 | ||
| 6 | --- | 8 | --- |
| 7 | 9 | ... | ... |
-
Please register or sign in to post a comment