Commit 0f75787b 0f75787baf94fc7a20755192c8d1cbbaf2d9404e by cnb.bofCdSsphPA

Build song-centric manifests directly from real audio directories

Constraint: Keep the current fused 4-table workflow while reducing manual JSONL authoring for onboarding real audio files into live PostgreSQL.
Rejected: Require hand-authored manifests as the only path into the song-centric importer | It slows real data onboarding and raises operator effort.
Confidence: high
Scope-risk: narrow
Directive: Prefer build_songcentric_manifest_from_directory.py -> import_songcentric_manifest_live.py as the default Phase-1 path for real file-directory onboarding.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/build_songcentric_manifest_from_directory.py on a real local wav smoke directory; imported the generated manifest into postgres://d2:d2pass@127.0.0.1:5432/d2 schema acr_songcentric_test; reran the import and verified counts remained media_entity=9, audio_object=22, feature_fact=9, set_membership=9; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: non-wav duration probing and very large directory trees
1 parent d04a6e65
{"song": {"biz_key": "song_alpha", "title": "song alpha", "artist_name": "artist a"}, "asset": {"source_type": "official", "storage_uri": "/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav", "storage_scheme": "file", "checksum": "path:/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav", "codec": "wav", "sample_rate": 16000, "channels": 1, "duration_ms": 8000}, "windows": [{"start_ms": 0, "end_ms": 5000}, {"start_ms": 2500, "end_ms": 7500}, {"start_ms": 3000, "end_ms": 8000}], "memberships": [{"set_type": "reference_set", "set_name": "phase1_hot_reference_v1", "member_type": "asset", "priority": 100}]}
{"song": {"biz_key": "song_beta", "title": "song beta", "artist_name": "artist b"}, "asset": {"source_type": "official", "storage_uri": "/workspace/acr-engine/data/songcentric_builder_smoke/song_beta/artist_b/clip2.wav", "storage_scheme": "file", "checksum": "path:/workspace/acr-engine/data/songcentric_builder_smoke/song_beta/artist_b/clip2.wav", "codec": "wav", "sample_rate": 16000, "channels": 1, "duration_ms": 6000}, "windows": [{"start_ms": 0, "end_ms": 5000}, {"start_ms": 1000, "end_ms": 6000}], "memberships": [{"set_type": "reference_set", "set_name": "phase1_hot_reference_v1", "member_type": "asset", "priority": 100}]}
{
"input_root": "/workspace/acr-engine/data/songcentric_builder_smoke",
"output": "/workspace/acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
"song_count": 2,
"asset_count": 2,
"window_count": 5,
"window_ms": 5000,
"stride_ms": 2500,
"set_name": "phase1_hot_reference_v1"
}
\ No newline at end of file
{
"schema": "acr_songcentric_test",
"manifest": "acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
"imported": [
{
"song_id": 8,
"asset_id": 16,
"window_ids": [
17,
18,
19
],
"feature_ids": [],
"membership_ids": [
8
]
},
{
"song_id": 9,
"asset_id": 20,
"window_ids": [
21,
22
],
"feature_ids": [],
"membership_ids": [
9
]
}
],
"counts": {
"media_entity": 9,
"audio_object": 22,
"feature_fact": 9,
"set_membership": 9
},
"window_lineage_sample": {
"window_id": 22,
"asset_id": 20,
"song_id": 9,
"title": "song beta",
"start_ms": 1000,
"end_ms": 6000
},
"feature_lineage_sample": {
"feature_type": "embedding",
"model_name": "muq",
"model_version": "large-msd-iter",
"feature_set_name": "muq_5s_hop2.5_meanpool",
"window_id": 15,
"song_id": 7,
"title": "Feature Manifest Song 2"
}
}
\ No newline at end of file
{
"schema": "acr_songcentric_test",
"manifest": "acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
"imported": [
{
"song_id": 8,
"asset_id": 16,
"window_ids": [
17,
18,
19
],
"feature_ids": [],
"membership_ids": [
8
]
},
{
"song_id": 9,
"asset_id": 20,
"window_ids": [
21,
22
],
"feature_ids": [],
"membership_ids": [
9
]
}
],
"counts": {
"media_entity": 9,
"audio_object": 22,
"feature_fact": 9,
"set_membership": 9
},
"window_lineage_sample": {
"window_id": 22,
"asset_id": 20,
"song_id": 9,
"title": "song beta",
"start_ms": 1000,
"end_ms": 6000
},
"feature_lineage_sample": {
"feature_type": "embedding",
"model_name": "muq",
"model_version": "large-msd-iter",
"feature_set_name": "muq_5s_hop2.5_meanpool",
"window_id": 15,
"song_id": 7,
"title": "Feature Manifest Song 2"
}
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import math
import wave
from pathlib import Path
# Lower-case file suffixes (with leading dot) that the directory scan treats as audio assets.
AUDIO_EXTS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a'}
def detect_duration_ms(path: Path) -> int | None:
    """Return the duration of a WAV file in whole milliseconds, if it can be read.

    Only files whose suffix is ``.wav`` (case-insensitive) are probed; every
    other suffix yields ``None``. Probing is best-effort: any failure to open
    or parse the file, or a non-positive frame rate, also yields ``None``.
    """
    if path.suffix.lower() != '.wav':
        return None
    try:
        with wave.open(str(path), 'rb') as reader:
            frame_total = reader.getnframes()
            frame_rate = reader.getframerate()
            if frame_rate <= 0:
                return None
            # Truncate toward zero so the reported duration never exceeds the audio.
            return int(frame_total * 1000 / frame_rate)
    except Exception:
        return None
def infer_song_meta(file_path: Path, root: Path) -> tuple[str, str, str]:
    """Derive ``(song_key, title, artist)`` from a file's location under *root*.

    * File directly under *root*: the file stem is the key, and the title is
      the stem with underscores replaced by spaces.
    * File nested one or more directories deep: the first directory is the key
      and (with underscores replaced by spaces) the title.
    * The second path component is used as the artist only when it is itself a
      directory level (three or more components); otherwise the artist is
      ``'unknown'``.

    Raises:
        ValueError: propagated from ``Path.relative_to`` when *file_path* is
            not located under *root*.
    """
    segments = file_path.relative_to(root).parts
    depth = len(segments)

    if depth < 2:
        base = file_path.stem
        return base, base.replace('_', ' '), 'unknown'

    top_dir = segments[0]
    if depth >= 3:
        artist_name = segments[1].replace('_', ' ')
    else:
        artist_name = 'unknown'
    return top_dir, top_dir.replace('_', ' '), artist_name
def build_windows(duration_ms: int | None, window_ms: int, stride_ms: int) -> list[dict]:
    """Split an asset into fixed-length analysis windows.

    Args:
        duration_ms: Asset duration in milliseconds, or ``None`` when unknown.
        window_ms: Length of each window in milliseconds; must be positive.
        stride_ms: Offset between consecutive window starts; must be positive.

    Returns:
        A list of ``{'start_ms': int, 'end_ms': int}`` dicts:

        * unknown duration -> one window ``[0, window_ms]``;
        * duration no longer than one window -> one window ``[0, duration_ms]``;
        * otherwise -> strided full-length windows, plus one tail window ending
          exactly at ``duration_ms`` when the stride grid does not reach the end.

    Raises:
        ValueError: if ``window_ms`` or ``stride_ms`` is not positive. A
            non-positive stride would otherwise never advance the window start
            and loop forever.
    """
    if window_ms <= 0:
        raise ValueError(f'window_ms must be positive, got {window_ms}')
    if stride_ms <= 0:
        raise ValueError(f'stride_ms must be positive, got {stride_ms}')

    if duration_ms is None:
        return [{'start_ms': 0, 'end_ms': window_ms}]
    if duration_ms <= window_ms:
        return [{'start_ms': 0, 'end_ms': duration_ms}]

    # Largest start offset at which a full-length window still fits.
    last_start = duration_ms - window_ms
    windows = [
        {'start_ms': start, 'end_ms': start + window_ms}
        for start in range(0, last_start + 1, stride_ms)
    ]
    # Cover the remaining tail with one extra full-length window aligned to the end.
    # With a positive stride this window can never duplicate an earlier one.
    if windows[-1]['end_ms'] < duration_ms:
        windows.append({'start_ms': last_start, 'end_ms': duration_ms})
    return windows
def _probe_wav_format(path: Path) -> tuple[int | None, int | None]:
    """Return ``(sample_rate, channels)`` read from a WAV file header.

    Best-effort, like the duration probe: any failure to open or parse the
    file yields ``(None, None)`` so one bad file does not abort the scan.
    """
    try:
        with wave.open(str(path), 'rb') as wf:
            return wf.getframerate(), wf.getnchannels()
    except Exception:
        return None, None


def main() -> int:
    """Scan an audio directory and write a song-centric JSONL manifest.

    Every file under ``--input-root`` whose suffix is in ``AUDIO_EXTS`` becomes
    one manifest row holding the inferred song metadata, the asset description,
    its analysis windows and a single set membership. A JSON summary report is
    printed to stdout and, when ``--report-output`` is given, also written to
    that path.

    Returns:
        ``0`` on success. Invalid arguments (non-positive window/stride, or an
        input root that is not a directory) end the process through
        ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description='Build a song-centric JSONL manifest from a directory of audio files.'
    )
    parser.add_argument('--input-root', required=True, help='directory to scan recursively')
    parser.add_argument('--output', required=True, help='path of the JSONL manifest to write')
    parser.add_argument('--window-ms', type=int, default=5000, help='window length in ms')
    parser.add_argument('--stride-ms', type=int, default=2500, help='window stride in ms')
    parser.add_argument('--set-type', default='reference_set')
    parser.add_argument('--set-name', default='phase1_hot_reference_v1')
    parser.add_argument('--source-type', default='official')
    parser.add_argument('--report-output', help='optional path for the JSON summary report')
    args = parser.parse_args()

    # Reject values that would make windowing loop forever or emit empty windows.
    if args.window_ms <= 0 or args.stride_ms <= 0:
        parser.error('--window-ms and --stride-ms must be positive integers')

    root = Path(args.input_root).resolve()
    # A missing root would otherwise silently produce an empty manifest.
    if not root.is_dir():
        parser.error(f'--input-root is not a directory: {root}')

    output = Path(args.output).resolve()
    output.parent.mkdir(parents=True, exist_ok=True)
    report_output = Path(args.report_output).resolve() if args.report_output else None
    if report_output:
        report_output.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    song_keys = set()
    # Sorted traversal keeps the manifest row order stable between runs.
    for path in sorted(root.rglob('*')):
        suffix = path.suffix.lower()
        if not path.is_file() or suffix not in AUDIO_EXTS:
            continue
        song_key, title, artist = infer_song_meta(path, root)
        song_keys.add(song_key)
        duration_ms = detect_duration_ms(path)
        # Report the real header values instead of assuming 16 kHz mono for every WAV.
        if suffix == '.wav':
            sample_rate, channels = _probe_wav_format(path)
        else:
            sample_rate, channels = None, None
        rows.append(
            {
                'song': {
                    'biz_key': song_key,
                    'title': title,
                    'artist_name': artist,
                },
                'asset': {
                    'source_type': args.source_type,
                    'storage_uri': str(path),
                    'storage_scheme': 'file',
                    # Path-derived identifier, not a content hash.
                    'checksum': f'path:{path}',
                    'codec': suffix.lstrip('.'),
                    'sample_rate': sample_rate,
                    'channels': channels,
                    'duration_ms': duration_ms,
                },
                'windows': build_windows(duration_ms, args.window_ms, args.stride_ms),
                'memberships': [
                    {
                        'set_type': args.set_type,
                        'set_name': args.set_name,
                        'member_type': 'asset',
                        'priority': 100,
                    }
                ],
            }
        )

    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # rather than depending on the platform locale.
    manifest_text = ''.join(json.dumps(row, ensure_ascii=False) + '\n' for row in rows)
    output.write_text(manifest_text, encoding='utf-8')

    report = {
        'input_root': str(root),
        'output': str(output),
        'song_count': len(song_keys),
        'asset_count': len(rows),
        'window_count': sum(len(row['windows']) for row in rows),
        'window_ms': args.window_ms,
        'stride_ms': args.stride_ms,
        'set_name': args.set_name,
    }
    report_text = json.dumps(report, ensure_ascii=False, indent=2)
    if report_output:
        report_output.write_text(report_text, encoding='utf-8')
    print(report_text)
    return 0
# Script entry point: the integer returned by main() becomes the process exit status.
if __name__ == '__main__':
    exit_status = main()
    raise SystemExit(exit_status)
## 2026-06-04
- 新增 `acr-engine/scripts/build_songcentric_manifest_from_directory.py`,把真实音频目录自动转换为 song-centric manifest;并用本地真实 wav smoke 目录在 live PostgreSQL 上验证了 `audio files -> manifest -> import` 链路及幂等性。
- 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并用 `songcentric_feature_manifest_sample.jsonl` 在 live PostgreSQL 上验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环与幂等性。
- 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并新增 `songcentric_feature_manifest_sample.jsonl` 用于验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环。
......
......@@ -270,6 +270,21 @@ flowchart TD
当前带 feature 样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl)
### 4.6 真实目录生成 manifest 流程
```mermaid
flowchart TD
A[real audio directory] --> B[build_songcentric_manifest_from_directory.py]
B --> C[songcentric_directory_manifest.jsonl]
C --> D[import_songcentric_manifest_live.py]
D --> E[media_entity]
D --> F[audio_object]
D --> G[set_membership]
```
当前目录 manifest 构建脚本:[`acr-engine/scripts/build_songcentric_manifest_from_directory.py`](../acr-engine/scripts/build_songcentric_manifest_from_directory.py)
---
## 5. 最常用 SQL 样例
......