acr_pg_schema_v2.sql
19.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
-- ACR PostgreSQL Schema V2
-- Purpose:
-- 1. Support canonical_song/work/recording/asset/window hierarchy
-- 2. Support encoder-first evolution via model_registry + feature_set_registry
-- 3. Support pgvector-backed hot reference sets without binding the entire system to one vector table
CREATE EXTENSION IF NOT EXISTS vector;
-- =========================================================
-- 1. Canonical business entities
-- =========================================================
CREATE TABLE IF NOT EXISTS canonical_song (
canonical_song_id BIGSERIAL PRIMARY KEY,
biz_song_code TEXT UNIQUE,
title TEXT NOT NULL,
title_norm TEXT,
primary_artist TEXT,
primary_artist_norm TEXT,
language_code TEXT,
rights_status TEXT,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS work (
work_id BIGSERIAL PRIMARY KEY,
canonical_song_id BIGINT NOT NULL REFERENCES canonical_song(canonical_song_id),
work_code TEXT UNIQUE,
work_title TEXT NOT NULL,
work_title_norm TEXT,
composer TEXT,
lyricist TEXT,
publisher TEXT,
iswc TEXT,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS recording (
recording_id BIGSERIAL PRIMARY KEY,
work_id BIGINT NOT NULL REFERENCES work(work_id),
canonical_song_id BIGINT NOT NULL REFERENCES canonical_song(canonical_song_id),
recording_code TEXT UNIQUE,
recording_title TEXT,
artist_name TEXT,
album_name TEXT,
version_type TEXT NOT NULL,
is_reference BOOLEAN NOT NULL DEFAULT FALSE,
reference_priority INTEGER NOT NULL DEFAULT 100,
release_date DATE,
isrc TEXT,
duration_sec NUMERIC(10,3),
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- =========================================================
-- 2. Assets and windows
-- =========================================================
CREATE TABLE IF NOT EXISTS recording_asset (
asset_id BIGSERIAL PRIMARY KEY,
recording_id BIGINT NOT NULL REFERENCES recording(recording_id),
asset_role TEXT NOT NULL,
storage_uri TEXT NOT NULL,
storage_scheme TEXT NOT NULL,
file_ext TEXT,
mime_type TEXT,
file_size_bytes BIGINT,
audio_sha256 TEXT,
sample_rate INTEGER,
channels INTEGER,
bit_rate_kbps INTEGER,
codec_name TEXT,
duration_sec NUMERIC(10,3),
loudness_lufs NUMERIC(8,3),
normalized_storage_uri TEXT,
ingest_status TEXT NOT NULL DEFAULT 'ready',
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS uq_recording_asset_sha256
ON recording_asset(audio_sha256)
WHERE audio_sha256 IS NOT NULL;
CREATE TABLE IF NOT EXISTS audio_window (
window_id BIGSERIAL PRIMARY KEY,
asset_id BIGINT NOT NULL REFERENCES recording_asset(asset_id),
recording_id BIGINT NOT NULL REFERENCES recording(recording_id),
work_id BIGINT NOT NULL REFERENCES work(work_id),
canonical_song_id BIGINT NOT NULL REFERENCES canonical_song(canonical_song_id),
window_index INTEGER NOT NULL,
start_sec NUMERIC(10,3) NOT NULL,
end_sec NUMERIC(10,3) NOT NULL,
duration_sec NUMERIC(10,3) NOT NULL,
segment_role TEXT NOT NULL DEFAULT 'reference',
segment_type TEXT,
quality_score NUMERIC(8,5),
active_for_index BOOLEAN NOT NULL DEFAULT TRUE,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_window_asset_idx
ON audio_window(asset_id, window_index);
-- =========================================================
-- 3. Model and feature registries
-- =========================================================
CREATE TABLE IF NOT EXISTS model_registry (
model_id BIGSERIAL PRIMARY KEY,
model_name TEXT NOT NULL,
model_family TEXT NOT NULL,
model_version TEXT NOT NULL,
model_source TEXT,
model_uri TEXT,
license_name TEXT,
input_modality TEXT NOT NULL DEFAULT 'audio',
input_sample_rate INTEGER,
input_channel_mode TEXT DEFAULT 'mono',
default_window_sec NUMERIC(10,3),
default_hop_sec NUMERIC(10,3),
output_embedding_dim INTEGER,
pooling_supported TEXT[],
layer_selection_supported BOOLEAN NOT NULL DEFAULT FALSE,
is_trainable BOOLEAN NOT NULL DEFAULT FALSE,
is_active BOOLEAN NOT NULL DEFAULT TRUE,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(model_name, model_version)
);
CREATE TABLE IF NOT EXISTS feature_set_registry (
feature_set_id BIGSERIAL PRIMARY KEY,
model_id BIGINT NOT NULL REFERENCES model_registry(model_id),
feature_name TEXT NOT NULL,
feature_level TEXT NOT NULL,
extraction_granularity TEXT NOT NULL,
window_sec NUMERIC(10,3),
hop_sec NUMERIC(10,3),
embedding_dim INTEGER,
pooling_strategy TEXT,
layer_selection TEXT,
normalize_l2 BOOLEAN NOT NULL DEFAULT TRUE,
distance_metric TEXT NOT NULL,
quantization_type TEXT,
feature_schema_version TEXT NOT NULL,
config_json JSONB NOT NULL DEFAULT '{}'::jsonb,
status TEXT NOT NULL DEFAULT 'active',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS feature_extraction_job (
extraction_job_id BIGSERIAL PRIMARY KEY,
feature_set_id BIGINT NOT NULL REFERENCES feature_set_registry(feature_set_id),
target_scope TEXT NOT NULL,
job_status TEXT NOT NULL DEFAULT 'pending',
shard_key TEXT,
input_count BIGINT,
output_count BIGINT,
started_at TIMESTAMPTZ,
finished_at TIMESTAMPTZ,
log_uri TEXT,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- =========================================================
-- 4. Feature facts
-- =========================================================
CREATE TABLE IF NOT EXISTS audio_embedding (
embedding_id BIGSERIAL PRIMARY KEY,
feature_set_id BIGINT NOT NULL REFERENCES feature_set_registry(feature_set_id),
extraction_job_id BIGINT REFERENCES feature_extraction_job(extraction_job_id),
asset_id BIGINT REFERENCES recording_asset(asset_id),
window_id BIGINT REFERENCES audio_window(window_id),
recording_id BIGINT NOT NULL REFERENCES recording(recording_id),
work_id BIGINT NOT NULL REFERENCES work(work_id),
canonical_song_id BIGINT NOT NULL REFERENCES canonical_song(canonical_song_id),
embedding_storage_mode TEXT NOT NULL,
embedding_uri TEXT,
vector_norm NUMERIC(12,6),
checksum TEXT,
is_indexed BOOLEAN NOT NULL DEFAULT FALSE,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT ck_audio_embedding_scope CHECK (asset_id IS NOT NULL OR window_id IS NOT NULL)
);
CREATE TABLE IF NOT EXISTS audio_embedding_vector_192 (
embedding_id BIGINT PRIMARY KEY REFERENCES audio_embedding(embedding_id) ON DELETE CASCADE,
embedding VECTOR(192) NOT NULL
);
CREATE TABLE IF NOT EXISTS audio_embedding_vector_768 (
embedding_id BIGINT PRIMARY KEY REFERENCES audio_embedding(embedding_id) ON DELETE CASCADE,
embedding VECTOR(768) NOT NULL
);
CREATE TABLE IF NOT EXISTS audio_fingerprint (
fingerprint_id BIGSERIAL PRIMARY KEY,
feature_set_id BIGINT NOT NULL REFERENCES feature_set_registry(feature_set_id),
asset_id BIGINT REFERENCES recording_asset(asset_id),
window_id BIGINT REFERENCES audio_window(window_id),
recording_id BIGINT NOT NULL REFERENCES recording(recording_id),
work_id BIGINT NOT NULL REFERENCES work(work_id),
canonical_song_id BIGINT NOT NULL REFERENCES canonical_song(canonical_song_id),
fingerprint_uri TEXT,
hash_count INTEGER,
is_indexed BOOLEAN NOT NULL DEFAULT FALSE,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_fingerprint_feature_asset
ON audio_fingerprint(feature_set_id, asset_id);
CREATE TABLE IF NOT EXISTS reference_set_registry (
reference_set_id BIGSERIAL PRIMARY KEY,
set_name TEXT NOT NULL UNIQUE,
description TEXT,
encoder_scope TEXT,
status TEXT NOT NULL DEFAULT 'active',
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS reference_set_member (
reference_set_id BIGINT NOT NULL REFERENCES reference_set_registry(reference_set_id) ON DELETE CASCADE,
recording_id BIGINT NOT NULL REFERENCES recording(recording_id) ON DELETE CASCADE,
member_role TEXT NOT NULL DEFAULT 'hot_reference',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY(reference_set_id, recording_id)
);
-- =========================================================
-- 4.5 Lineage invariants (recommended production hardening)
-- =========================================================
CREATE OR REPLACE FUNCTION check_recording_lineage()
RETURNS trigger AS $$
DECLARE
work_song_id BIGINT;
BEGIN
SELECT canonical_song_id INTO work_song_id
FROM work
WHERE work_id = NEW.work_id;
IF work_song_id IS NULL THEN
RAISE EXCEPTION 'Invalid work_id=% for recording', NEW.work_id;
END IF;
IF NEW.canonical_song_id <> work_song_id THEN
RAISE EXCEPTION 'recording.canonical_song_id % mismatches work.canonical_song_id %', NEW.canonical_song_id, work_song_id;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION check_audio_window_lineage()
RETURNS trigger AS $$
DECLARE
asset_recording_id BIGINT;
rec_work_id BIGINT;
rec_song_id BIGINT;
BEGIN
SELECT recording_id INTO asset_recording_id
FROM recording_asset
WHERE asset_id = NEW.asset_id;
SELECT work_id, canonical_song_id INTO rec_work_id, rec_song_id
FROM recording
WHERE recording_id = NEW.recording_id;
IF asset_recording_id IS NULL OR rec_work_id IS NULL THEN
RAISE EXCEPTION 'Invalid asset_id=% or recording_id=% for audio_window', NEW.asset_id, NEW.recording_id;
END IF;
IF NEW.recording_id <> asset_recording_id THEN
RAISE EXCEPTION 'audio_window.recording_id % mismatches recording_asset.recording_id %', NEW.recording_id, asset_recording_id;
END IF;
IF NEW.work_id <> rec_work_id OR NEW.canonical_song_id <> rec_song_id THEN
RAISE EXCEPTION 'audio_window lineage mismatch for recording_id=%', NEW.recording_id;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION check_audio_embedding_lineage()
RETURNS trigger AS $$
DECLARE
parent_recording_id BIGINT;
parent_work_id BIGINT;
parent_song_id BIGINT;
BEGIN
IF NEW.window_id IS NOT NULL THEN
SELECT recording_id, work_id, canonical_song_id
INTO parent_recording_id, parent_work_id, parent_song_id
FROM audio_window
WHERE window_id = NEW.window_id;
ELSE
SELECT r.recording_id, r.work_id, r.canonical_song_id
INTO parent_recording_id, parent_work_id, parent_song_id
FROM recording_asset ra
JOIN recording r ON r.recording_id = ra.recording_id
WHERE ra.asset_id = NEW.asset_id;
END IF;
IF parent_recording_id IS NULL THEN
RAISE EXCEPTION 'Invalid parent reference for audio_embedding';
END IF;
IF NEW.recording_id <> parent_recording_id OR NEW.work_id <> parent_work_id OR NEW.canonical_song_id <> parent_song_id THEN
RAISE EXCEPTION 'audio_embedding lineage mismatch';
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- =========================================================
-- 5. Retrieval index and results
-- =========================================================
CREATE TABLE IF NOT EXISTS retrieval_index_registry (
retrieval_index_id BIGSERIAL PRIMARY KEY,
feature_set_id BIGINT NOT NULL REFERENCES feature_set_registry(feature_set_id),
index_name TEXT NOT NULL,
index_backend TEXT NOT NULL,
index_type TEXT NOT NULL,
storage_uri TEXT,
shard_no INTEGER,
row_count BIGINT,
index_status TEXT NOT NULL DEFAULT 'active',
config_json JSONB NOT NULL DEFAULT '{}'::jsonb,
built_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS retrieval_candidate (
retrieval_candidate_id BIGSERIAL PRIMARY KEY,
query_id TEXT NOT NULL,
retrieval_index_id BIGINT REFERENCES retrieval_index_registry(retrieval_index_id),
feature_set_id BIGINT REFERENCES feature_set_registry(feature_set_id),
source_lane TEXT NOT NULL CHECK (source_lane IN ('fingerprint', 'semantic', 'cover', 'melody', 'fusion')),
candidate_level TEXT NOT NULL CHECK (candidate_level IN ('window', 'recording', 'work', 'canonical_song')),
candidate_id BIGINT NOT NULL,
evidence_window_id BIGINT REFERENCES audio_window(window_id),
raw_score NUMERIC(14,8),
normalized_score NUMERIC(14,8),
rank_no INTEGER,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TABLE IF NOT EXISTS match_decision (
match_decision_id BIGSERIAL PRIMARY KEY,
query_id TEXT NOT NULL,
canonical_song_id BIGINT REFERENCES canonical_song(canonical_song_id),
work_id BIGINT REFERENCES work(work_id),
recording_id BIGINT REFERENCES recording(recording_id),
decision_status TEXT NOT NULL,
decision_score NUMERIC(14,8),
decision_reason TEXT,
metadata_json JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE TRIGGER trg_recording_lineage
BEFORE INSERT OR UPDATE ON recording
FOR EACH ROW EXECUTE FUNCTION check_recording_lineage();
CREATE TRIGGER trg_audio_window_lineage
BEFORE INSERT OR UPDATE ON audio_window
FOR EACH ROW EXECUTE FUNCTION check_audio_window_lineage();
CREATE TRIGGER trg_audio_embedding_lineage
BEFORE INSERT OR UPDATE ON audio_embedding
FOR EACH ROW EXECUTE FUNCTION check_audio_embedding_lineage();
-- =========================================================
-- 6. Recommended indexes
-- =========================================================
CREATE INDEX IF NOT EXISTS idx_work_canonical_song_id
ON work(canonical_song_id);
CREATE INDEX IF NOT EXISTS idx_recording_work_id
ON recording(work_id);
CREATE INDEX IF NOT EXISTS idx_recording_canonical_song_id
ON recording(canonical_song_id);
CREATE INDEX IF NOT EXISTS idx_recording_reference
ON recording(is_reference, reference_priority);
CREATE INDEX IF NOT EXISTS idx_recording_asset_recording_id
ON recording_asset(recording_id);
CREATE INDEX IF NOT EXISTS idx_audio_window_asset_id
ON audio_window(asset_id);
CREATE INDEX IF NOT EXISTS idx_audio_window_recording_id
ON audio_window(recording_id);
CREATE INDEX IF NOT EXISTS idx_audio_window_canonical_song_id
ON audio_window(canonical_song_id);
CREATE INDEX IF NOT EXISTS idx_audio_window_active_for_index
ON audio_window(active_for_index);
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_window
ON audio_embedding(feature_set_id, window_id)
WHERE window_id IS NOT NULL;
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_embedding_feature_asset
ON audio_embedding(feature_set_id, asset_id)
WHERE window_id IS NULL AND asset_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_audio_embedding_feature_set_id
ON audio_embedding(feature_set_id);
CREATE INDEX IF NOT EXISTS idx_audio_embedding_window_id
ON audio_embedding(window_id);
CREATE INDEX IF NOT EXISTS idx_audio_embedding_recording_id
ON audio_embedding(recording_id);
CREATE INDEX IF NOT EXISTS idx_reference_set_member_recording_id
ON reference_set_member(recording_id);
CREATE INDEX IF NOT EXISTS idx_retrieval_candidate_query_id
ON retrieval_candidate(query_id);
CREATE INDEX IF NOT EXISTS idx_match_decision_query_id
ON match_decision(query_id);
-- Optional hot-set HNSW indexes for pgvector-backed online retrieval
CREATE INDEX IF NOT EXISTS idx_audio_embedding_vector_192_cos_hnsw
ON audio_embedding_vector_192 USING hnsw (embedding vector_cosine_ops);
CREATE INDEX IF NOT EXISTS idx_audio_embedding_vector_768_cos_hnsw
ON audio_embedding_vector_768 USING hnsw (embedding vector_cosine_ops);