postgres_db_schema_samples.md
28 KB
PostgreSQL Schema Samples / song-centric 4 表 DDL 与样例
更新:2026-06-04
SQL 文件:acr-engine/sql/acr_pg_schema_songcentric_v1.sql
1. 一页结论
当前默认物理模型:
media_entity -> audio_object -> feature_fact -> set_membership
当前默认逻辑语义:
song -> asset -> window -> fingerprint / embedding
其中:
-
audio_object统一承载原始音频和切片 -
feature_fact统一承载 exact/semantic 特征 -
set_membership统一承载 reference/eval/hot 集关系
2. 切片 / 模型 / feature 落在哪张表
| 对象 | 表 | 关键字段 | 示例 |
|---|---|---|---|
| song | media_entity |
entity_type='song' |
song_000001 |
| asset | audio_object |
object_type='asset' |
一首歌的原始 wav/mp3/flac |
| window | audio_object |
object_type='window' |
0-5000ms, 2500-7500ms
|
| fingerprint | feature_fact |
feature_type='fingerprint' |
chromaprint_matcher |
| embedding | feature_fact |
feature_type='embedding' |
MERT/MuQ/fallback vector |
| model | feature_fact |
model_name, model_version
|
chromaprint_matcher, mert-v1-95m, muq-large-msd-iter, local_wavehash_embed
|
| feature set | feature_fact |
feature_set_name, feature_schema_ver
|
mert_5s_hop2.5_v1 |
3. Phase-1 数据绑定一页图
flowchart LR
S[media_entity
song] --> A[audio_object
asset]
A --> W[audio_object
window]
W --> F1[feature_fact
chromaprint_matcher]
W --> F2[feature_fact
mert-v1-95m]
W --> F3[feature_fact
muq-large-msd-iter 计划]
关键绑定字段:
audio_object.song_id -> media_entity.entity_idwindow.parent_object_id -> asset.object_idfeature_fact.object_id -> window.object_idfeature_fact.song_id -> media_entity.entity_id
一句话:feature_fact 绑的是“具体 window”,不是抽象 song;但为了快速返回结果,又会把 song_id 冗余写进去。
4. DDL
3.1 media_entity
create table if not exists media_entity (
entity_id bigserial primary key,
entity_type text not null check (entity_type in ('song', 'work', 'recording')),
root_song_id bigint,
parent_entity_id bigint,
biz_key text,
title text not null,
artist_name text,
entity_status text not null default 'active',
metadata_json jsonb not null default '{}'::jsonb,
created_at timestamptz not null default now(),
updated_at timestamptz not null default now(),
constraint fk_media_entity_root_song
foreign key (root_song_id) references media_entity(entity_id),
constraint fk_media_entity_parent
foreign key (parent_entity_id) references media_entity(entity_id)
);
3.2 audio_object
create table if not exists audio_object (
object_id bigserial primary key,
object_type text not null check (object_type in ('asset', 'window')),
song_id bigint not null references media_entity(entity_id),
parent_object_id bigint references audio_object(object_id),
source_type text,
storage_uri text,
storage_scheme text,
checksum text,
codec text,
sample_rate integer,
channels integer,
duration_ms integer,
start_ms integer,
end_ms integer,
object_status text not null default 'ready',
metadata_json jsonb not null default '{}'::jsonb,
created_at timestamptz not null default now(),
updated_at timestamptz not null default now(),
constraint ck_audio_object_window_parent
check (
(object_type = 'asset' and parent_object_id is null)
or (object_type = 'window' and parent_object_id is not null)
)
);
3.3 feature_fact
create table if not exists feature_fact (
feature_id bigserial primary key,
feature_type text not null check (feature_type in ('fingerprint', 'embedding')),
object_id bigint not null references audio_object(object_id),
song_id bigint not null references media_entity(entity_id),
model_name text not null,
model_version text not null,
feature_set_name text not null,
feature_schema_ver text not null default 'v1',
embedding_dim integer,
fingerprint_value text,
embedding_uri text,
vector_table_name text,
checksum text,
feature_status text not null default 'ready',
metadata_json jsonb not null default '{}'::jsonb,
created_at timestamptz not null default now(),
updated_at timestamptz not null default now(),
constraint ck_feature_payload
check (
(feature_type = 'fingerprint' and fingerprint_value is not null)
or (feature_type = 'embedding' and (embedding_uri is not null or vector_table_name is not null))
)
);
3.4 set_membership
create table if not exists set_membership (
membership_id bigserial primary key,
set_type text not null check (set_type in ('reference_set', 'eval_set', 'hot_set')),
set_name text not null,
member_type text not null check (member_type in ('song', 'asset', 'window', 'feature')),
member_id bigint not null,
song_id bigint references media_entity(entity_id),
is_active boolean not null default true,
priority integer not null default 100,
metadata_json jsonb not null default '{}'::jsonb,
created_at timestamptz not null default now(),
updated_at timestamptz not null default now()
);
4. 典型写入流程图
4.1 表写入顺序
flowchart TD
A[insert song] --> B[insert asset]
B --> C[insert windows]
C --> D1[insert fingerprint facts]
C --> D2[insert embedding facts]
A --> E[insert set_membership]
B --> E
C --> E
4.2 查询回溯顺序
flowchart LR
A[query features] --> B[feature_fact]
B --> C[window]
C --> D[asset]
D --> E[song]
5. 导入前的 manifest 样例
当前主链导入前,推荐就把 feature 放到 windows[].features[] 里:
{
"song": {"biz_key": "song_alpha", "title": "song alpha", "artist_name": "artist a"},
"asset": {
"source_type": "official",
"storage_uri": "/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav",
"storage_scheme": "file",
"checksum": "path:/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav",
"codec": "wav",
"sample_rate": 16000,
"channels": 1,
"duration_ms": 8000
},
"windows": [
{
"start_ms": 0,
"end_ms": 5000,
"features": [
{
"feature_type": "fingerprint",
"model_name": "chromaprint_matcher",
"model_version": "phase1_local",
"feature_set_name": "chromaprint_matcher_5s",
"fingerprint_value": "dc0c731425f360787f462da693ff4a50"
},
{
"feature_type": "embedding",
"model_name": "mert-v1-95m",
"model_version": "hf-main",
"feature_set_name": "mert_5s_hop2.5_v1",
"feature_schema_ver": "v1",
"embedding_dim": 768,
"embedding_uri": "inline-mert://19c0162d3bdde235:0:5000",
"vector_table_name": "audio_embedding_vector_768_placeholder"
}
]
}
],
"memberships": [
{"set_type": "reference_set", "set_name": "phase1_hot_reference_v1", "member_type": "asset", "priority": 100}
]
}
这份 JSON 的含义非常直接:
-
song决定最终要回到哪个song_id -
asset决定原始音频文件是谁 -
windows[]决定切片边界 -
windows[].features[]决定每个切片已经由哪些模型编码过
6. 样例数据
6.1 写 song
insert into media_entity (
entity_type, biz_key, title, artist_name, metadata_json
) values (
'song', 'song_000001', 'Song Alpha', 'Artist A',
'{"source":"catalog_import","language":"zh"}'::jsonb
)
returning entity_id;
6.2 写 asset
insert into audio_object (
object_type, song_id, source_type, storage_uri, storage_scheme,
checksum, codec, sample_rate, channels, duration_ms, metadata_json
) values (
'asset', :song_id, 'catalog_master',
's3://bucket/song_alpha/clip1.wav', 's3',
'sha256:asset001', 'wav', 44100, 2, 183000,
'{"uploader":"pipeline_v1"}'::jsonb
)
returning object_id;
6.3 写 window
insert into audio_object (
object_type, song_id, parent_object_id,
start_ms, end_ms, duration_ms, metadata_json
) values
('window', :song_id, :asset_id, 0, 5000, 5000, '{"hop_ms":2500}'::jsonb),
('window', :song_id, :asset_id, 2500, 7500, 5000, '{"hop_ms":2500}'::jsonb)
returning object_id;
6.4 写 fingerprint
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
fingerprint_value, checksum, metadata_json
) values (
'fingerprint', :window_id, :song_id,
'chromaprint_matcher', 'phase1_local', 'chromaprint_matcher_5s', 'v1',
'AQAAE0mUaEkSZSo...', 'sha256:fp001',
'{"lane":"exact"}'::jsonb
);
6.5 写 embedding
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
embedding_dim, embedding_uri, vector_table_name, checksum, metadata_json
) values (
'embedding', :window_id, :song_id,
'mert-v1-95m', 'hf-main', 'mert_5s_hop2.5_v1', 'v1',
768, 's3://bucket/embeddings/song_alpha_win0001.npy', 'audio_embedding_vector_768',
'sha256:emb001', '{"lane":"semantic"}'::jsonb
);
6.6 写 set membership
insert into set_membership (
set_type, set_name, member_type, member_id, song_id, priority, metadata_json
) values
('reference_set', 'phase1_hot_reference_v1', 'song', :song_id, :song_id, 100, '{}'::jsonb),
('reference_set', 'phase1_hot_reference_v1', 'asset', :asset_id, :song_id, 100, '{}'::jsonb),
('reference_set', 'phase1_hot_reference_v1', 'window', :window_id, :song_id, 100, '{}'::jsonb);
7. 典型查询
6.1 查看某首歌有哪些 asset
select object_id, storage_uri, checksum, duration_ms
from audio_object
where song_id = :song_id
and object_type = 'asset'
order by object_id;
6.2 查看某个 asset 切了哪些 window
select object_id, start_ms, end_ms, duration_ms
from audio_object
where parent_object_id = :asset_id
and object_type = 'window'
order by start_ms;
6.3 查看某个 window 被哪些模型编码过
select feature_type, model_name, model_version, feature_set_name, embedding_dim,
fingerprint_value, embedding_uri, vector_table_name
from feature_fact
where object_id = :window_id
order by feature_type, model_name, model_version;
6.4 从 feature 回查 song
select ff.feature_id,
ff.feature_type,
ff.model_name,
w.object_id as window_id,
w.start_ms,
w.end_ms,
a.object_id as asset_id,
a.storage_uri,
s.entity_id as song_id,
s.title,
s.artist_name
from feature_fact ff
join audio_object w
on w.object_id = ff.object_id
and w.object_type = 'window'
join audio_object a
on a.object_id = w.parent_object_id
and a.object_type = 'asset'
join media_entity s
on s.entity_id = ff.song_id
where ff.feature_id = :feature_id;
6.5 查询 reference set 中的全部 window
select sm.set_name,
sm.member_id as window_id,
sm.song_id,
ao.parent_object_id as asset_id,
ao.start_ms,
ao.end_ms
from set_membership sm
join audio_object ao
on ao.object_id = sm.member_id
and sm.member_type = 'window'
where sm.set_type = 'reference_set'
and sm.set_name = 'phase1_hot_reference_v1'
and sm.is_active = true
order by sm.song_id, ao.start_ms;
7. 一个最小存储样例怎么理解
song(Song Alpha)
-> asset(clip1.wav)
-> window(0-5000ms)
-> fingerprint(chromaprint)
-> embedding(mert-v1-95m)
-> window(2500-7500ms)
-> fingerprint(chromaprint)
-> embedding(mert-v1-95m)
落表后意味着:
-
song在media_entity -
asset/window都在audio_object -
chromaprint/mert都在feature_fact - 它们是否属于 hot reference 在
set_membership
8. 设计意图总结
这套结构主要解决:
- 同一 song 下多个音频文件
- 同一 asset 下多个切片窗口
- 同一 window 被多个模型重复编码
- fingerprint / embedding 统一落库
- reference/eval/hot 统一集合治理
- 查询后快速归属到
song_id
9. 当前最该关注的后续演进点
- 保持 4 表主链不变
- 给 semantic lane 接真实
MERT/MuQadapter - 继续复用
feature_fact.model_name/model_version/feature_set_name做模型演进 - 必要时再补更重的 registry / vector table 治理
10. 相关文档
11. 在线检索回溯样例
11.1 从命中的 feature 回查 song
flowchart LR
A[feature_fact] --> B[window]
B --> C[asset]
C --> D[song]
11.2 典型在线查询 SQL
select ff.feature_id,
ff.feature_type,
ff.model_name,
ff.feature_set_name,
w.object_id as window_id,
w.start_ms,
w.end_ms,
a.object_id as asset_id,
a.storage_uri,
s.entity_id as song_id,
s.title,
s.artist_name
from feature_fact ff
join audio_object w
on w.object_id = ff.object_id
and w.object_type = 'window'
join audio_object a
on a.object_id = w.parent_object_id
and a.object_type = 'asset'
join media_entity s
on s.entity_id = ff.song_id
where ff.feature_id = :feature_id;
11.3 典型 song-level 聚合 SQL
select ff.song_id,
s.title,
s.artist_name,
count(*) as matched_windows
from feature_fact ff
join media_entity s
on s.entity_id = ff.song_id
where ff.feature_id = any(:matched_feature_ids)
group by ff.song_id, s.title, s.artist_name
order by matched_windows desc
limit 20;
12. exact + semantic 双通道融合样例
12.1 融合流程图
flowchart TD
A[exact candidates] --> C[song aggregation]
B[semantic candidates] --> C
C --> D[rerank]
D --> E[topK song_ids]
12.2 推荐的 Phase-1 融合口径
final_song_score =
0.55 * exact_score_norm
+ 0.35 * semantic_score_norm
+ 0.10 * coverage_score_norm
12.3 融合聚合 SQL 骨架
with matched as (
select ff.song_id,
ff.feature_type,
w.object_id as window_id,
w.parent_object_id as asset_id,
w.start_ms,
w.end_ms,
:score_map[ff.feature_id]::double precision as raw_score
from feature_fact ff
join audio_object w
on w.object_id = ff.object_id
and w.object_type = 'window'
where ff.feature_id = any(:matched_feature_ids)
)
select m.song_id,
s.title,
s.artist_name,
count(*) filter (where m.feature_type = 'fingerprint') as exact_hit_count,
count(*) filter (where m.feature_type = 'embedding') as semantic_hit_count,
max(raw_score) filter (where m.feature_type = 'fingerprint') as exact_best_score,
max(raw_score) filter (where m.feature_type = 'embedding') as semantic_best_score,
max(end_ms) - min(start_ms) as offset_coverage_ms
from matched m
join media_entity s
on s.entity_id = m.song_id
group by m.song_id, s.title, s.artist_name
order by coalesce(max(raw_score) filter (where m.feature_type = 'fingerprint'), 0) desc,
coalesce(max(raw_score) filter (where m.feature_type = 'embedding'), 0) desc,
offset_coverage_ms desc
limit 20;
13. 绑定关系与开源模型落库样例
13.1 最小绑定关系
media_entity(song)
-> audio_object(asset)
-> audio_object(window)
-> feature_fact(chromaprint)
-> feature_fact(mert-v1-95m)
-> feature_fact(muq-base)
13.2 具体样例
Step 1: song
insert into media_entity (
entity_type, biz_key, title, artist_name
) values (
'song', 'song_000123', 'Demo Song', 'Demo Artist'
)
returning entity_id;
Step 2: asset
insert into audio_object (
object_type, song_id, storage_uri, checksum, codec, sample_rate, channels, duration_ms
) values (
'asset', :song_id, 's3://bucket/demo_song/master.wav', 'sha256:asset-demo', 'wav', 44100, 2, 210000
)
returning object_id;
Step 3: window
insert into audio_object (
object_type, song_id, parent_object_id, start_ms, end_ms, duration_ms
) values (
'window', :song_id, :asset_id, 0, 5000, 5000
)
returning object_id;
Step 4: chromaprint fingerprint
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
fingerprint_value
) values (
'fingerprint', :window_id, :song_id,
'chromaprint_matcher', 'phase1_local', 'chromaprint_matcher_5s', 'v1',
'AQAAE0mUaEkSZSo...'
);
Step 5: MERT embedding
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
embedding_dim, embedding_uri, vector_table_name
) values (
'embedding', :window_id, :song_id,
'mert-v1-95m', 'hf-main', 'mert_5s_hop2.5_v1', 'v1',
768, 's3://bucket/emb/demo_song_win0001_mert.npy', 'audio_embedding_vector_768'
);
Step 6: MuQ embedding
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
embedding_dim, embedding_uri, vector_table_name
) values (
'embedding', :window_id, :song_id,
'muq-large-msd-iter', 'hf-main', 'muq_5s_hop2.5_v1', 'v1',
768, 's3://bucket/emb/demo_song_win0001_muq.npy', 'audio_embedding_vector_768'
);
Step 7: fallback embedding
insert into feature_fact (
feature_type, object_id, song_id,
model_name, model_version, feature_set_name, feature_schema_ver,
embedding_dim, embedding_uri, vector_table_name
) values (
'embedding', :window_id, :song_id,
'local_wavehash_embed', 'phase1_local', 'wavehash_5s_hop2.5_v1', 'v1',
8, 'file:///tmp/demo_song_win0001_wavehash.npy', 'audio_embedding_vector_8_placeholder'
);
13.3 查询某个 window 已经被哪些开源模型编码过
select object_id,
song_id,
feature_type,
model_name,
model_version,
feature_set_name,
embedding_dim,
fingerprint_value,
embedding_uri,
vector_table_name
from feature_fact
where object_id = :window_id
order by feature_type, model_name;
13.4 查询某个 song 当前有哪些模型特征
select ff.song_id,
ff.model_name,
ff.model_version,
ff.feature_type,
count(*) as feature_rows
from feature_fact ff
where ff.song_id = :song_id
group by ff.song_id, ff.model_name, ff.model_version, ff.feature_type
order by ff.feature_type, ff.model_name;
14. 一个真实绑定查询样例
下面这条 SQL 用来回答用户最关心的问题:
一条 feature 是怎么和 audio object 绑定,并最终回到
song_id的?
select ff.feature_id,
ff.feature_type,
ff.model_name,
ff.model_version,
ff.feature_set_name,
w.object_id as window_id,
w.start_ms,
w.end_ms,
a.object_id as asset_id,
a.storage_uri,
s.entity_id as song_id,
s.biz_key
from feature_fact ff
join audio_object w
on w.object_id = ff.object_id
and w.object_type = 'window'
join audio_object a
on a.object_id = w.parent_object_id
and a.object_type = 'asset'
join media_entity s
on s.entity_id = ff.song_id
where ff.feature_id = :feature_id;
你可以把它理解成 4 步:
- 从
feature_fact找到这条特征 - 用
object_id找到它绑定的window - 用
parent_object_id找到它所属的asset - 用
song_id找到最终归属的song
14.1 一个当前 live 的真实结果
当前 PostgreSQL acr_songcentric_test 中,feature_id = 34 的真实回溯结果是:
feature_id = 34
feature_type = embedding
model_name = mert-v1-95m
model_version = hf-main
feature_set_name = mert_5s_hop2.5_v1
window_id = 22
window_range = 1000-6000 ms
asset_id = 20
asset_uri = /workspace/acr-engine/data/songcentric_builder_smoke/song_beta/artist_b/clip2.wav
song_id = 9
song_biz_key = song_beta
这条 live 结果说明:
- 当前真实 semantic baseline 已经是
mert-v1-95m - 一条 embedding feature 可以被精确回溯到具体
window/asset/song - 这正是当前版权保护链路里“快速定位 song_id”的最小证据闭环
15. 一个完整的多 asset / 多 window / 多 model 样例
假设:
- 同一个
song_id = 1001 - 有 2 个音频文件:
master.wav、ugc_clip.mp3 - 每个 asset 切成 2 个 window
- 每个 window 都跑
chromaprint_matcher + mert-v1-95m + muq-large-msd-iter
14.1 逻辑结构
song(1001)
-> asset(2001, master.wav)
-> window(3001, 0-5000)
-> chromaprint_matcher
-> mert-v1-95m
-> muq-large-msd-iter
-> window(3002, 2500-7500)
-> chromaprint_matcher
-> mert-v1-95m
-> muq-large-msd-iter
-> asset(2002, ugc_clip.mp3)
-> window(3003, 10000-15000)
-> chromaprint_matcher
-> mert-v1-95m
-> muq-large-msd-iter
-> window(3004, 12500-17500)
-> chromaprint_matcher
-> mert-v1-95m
-> muq-large-msd-iter
14.2 会落成多少行
| 表 | 行数 | 说明 |
|---|---|---|
media_entity |
1 | 一个 song |
audio_object |
6 | 2 个 asset + 4 个 window |
feature_fact |
12 | 4 个 window × 3 个模型 |
set_membership |
视需要 | 可给 song/asset/window 挂 reference_set |
14.3 查询某个 song 的全量树状数据
select s.entity_id as song_id,
s.title,
a.object_id as asset_id,
a.storage_uri,
w.object_id as window_id,
w.start_ms,
w.end_ms,
ff.feature_type,
ff.model_name,
ff.model_version,
ff.feature_set_name
from media_entity s
join audio_object a
on a.song_id = s.entity_id
and a.object_type = 'asset'
join audio_object w
on w.parent_object_id = a.object_id
and w.object_type = 'window'
left join feature_fact ff
on ff.object_id = w.object_id
where s.entity_id = :song_id
order by a.object_id, w.start_ms, ff.feature_type, ff.model_name;
14.4 查询哪些 window 缺某个模型
这个 SQL 很适合做补算任务扫描,比如检查哪些 window 还没跑 muq-large-msd-iter:
select w.object_id as window_id,
w.song_id,
w.parent_object_id as asset_id,
w.start_ms,
w.end_ms
from audio_object w
where w.object_type = 'window'
and not exists (
select 1
from feature_fact ff
where ff.object_id = w.object_id
and ff.feature_type = 'embedding'
and ff.model_name = 'muq-large-msd-iter'
and ff.model_version = 'hf-main'
and ff.feature_set_name = 'muq_5s_hop2.5_v1'
)
order by w.song_id, w.parent_object_id, w.start_ms;
14.5 查询某个 asset 下每个 window 已经具备哪些模型
select w.object_id as window_id,
w.start_ms,
w.end_ms,
string_agg(ff.model_name || ':' || ff.feature_type, ', ' order by ff.model_name) as ready_features
from audio_object w
left join feature_fact ff
on ff.object_id = w.object_id
where w.object_type = 'window'
and w.parent_object_id = :asset_id
group by w.object_id, w.start_ms, w.end_ms
order by w.start_ms;
16. 批量入库与索引建设样例
15.1 推荐批量顺序
batch-1: media_entity(song)
batch-2: audio_object(asset)
batch-3: audio_object(window)
batch-4: feature_fact(chromaprint)
batch-5: feature_fact(mert-v1-95m)
batch-6: feature_fact(muq-large-msd-iter)
15.2 推荐补充索引
create index if not exists idx_feature_fact_model_lookup
on feature_fact(model_name, model_version, feature_set_name, feature_type, song_id);
15.3 主链完整性巡检
没有 window 的 asset
select a.object_id as asset_id, a.song_id, a.storage_uri
from audio_object a
where a.object_type = 'asset'
and not exists (
select 1
from audio_object w
where w.parent_object_id = a.object_id
and w.object_type = 'window'
);
没有 chromaprint 的 window
select w.object_id as window_id, w.song_id
from audio_object w
where w.object_type = 'window'
and not exists (
select 1
from feature_fact ff
where ff.object_id = w.object_id
and ff.feature_type = 'fingerprint'
and ff.model_name = 'chromaprint'
);
没有 MERT 的 window
select w.object_id as window_id, w.song_id
from audio_object w
where w.object_type = 'window'
and not exists (
select 1
from feature_fact ff
where ff.object_id = w.object_id
and ff.feature_type = 'embedding'
and ff.model_name = 'mert-v1-95m'
and ff.model_version = 'hf-main'
and ff.feature_set_name = 'mert_5s_hop2.5_v1'
);
15.4 冷热分层口径
hot_set -> 高频版权曲
reference_set -> 主 reference catalog
cold -> 长尾曲库,先保主链 + exact
16. vector table / embedding 文件存储样例
16.1 feature_fact 中怎么记录 embedding 位置
insert into feature_fact (
feature_type,
object_id,
song_id,
model_name,
model_version,
feature_set_name,
feature_schema_ver,
embedding_dim,
embedding_uri,
vector_table_name,
checksum,
metadata_json
) values (
'embedding',
:window_id,
:song_id,
'mert-v1-95m',
'hf-main',
'mert_5s_hop2.5_v1',
'v1',
768,
's3://acr-emb/phase1/mert-v1-95m/song_1001/asset_2001/window_3001_mert_5s_hop2.5_v1.npy',
'audio_embedding_vector_768',
'sha256:emb-demo-001',
'{"storage_tier":"hot"}'::jsonb
);
16.2 一个推荐的向量侧表样例
这里是逻辑样例,真实向量类型可按你的 pgvector 版本落地。
create table if not exists audio_embedding_vector_768 (
embedding_row_id bigserial primary key,
feature_id bigint not null references feature_fact(feature_id),
vector_dim integer not null default 768,
embedding_vector vector(768),
created_at timestamptz not null default now()
);
16.3 feature_fact 与 vector table 的关联查询
select ff.feature_id,
ff.song_id,
ff.object_id as window_id,
ff.model_name,
ff.feature_set_name,
ff.embedding_dim,
ff.embedding_uri,
ff.vector_table_name,
v.embedding_row_id
from feature_fact ff
left join audio_embedding_vector_768 v
on v.feature_id = ff.feature_id
where ff.feature_type = 'embedding'
and ff.model_name = 'mert-v1-95m'
and ff.feature_set_name = 'mert_5s_hop2.5_v1';
16.4 查哪些 embedding 还没进入 vector table
select ff.feature_id,
ff.song_id,
ff.object_id as window_id,
ff.embedding_uri
from feature_fact ff
left join audio_embedding_vector_768 v
on v.feature_id = ff.feature_id
where ff.feature_type = 'embedding'
and ff.embedding_dim = 768
and ff.vector_table_name = 'audio_embedding_vector_768'
and v.feature_id is null
order by ff.song_id, ff.object_id;
16.5 查哪些冷层 embedding 可以先不进热索引
select ff.feature_id,
ff.song_id,
ff.object_id,
ff.embedding_uri,
ff.metadata_json
from feature_fact ff
where ff.feature_type = 'embedding'
and coalesce(ff.metadata_json->>'storage_tier', 'cold') = 'cold'
and ff.vector_table_name is null
order by ff.song_id, ff.object_id;