Commit 0f75787b 0f75787baf94fc7a20755192c8d1cbbaf2d9404e by cnb.bofCdSsphPA

Build song-centric manifests directly from real audio directories

Constraint: Keep the current fused 4-table workflow while reducing manual JSONL authoring for onboarding real audio files into live PostgreSQL.
Rejected: Require hand-authored manifests as the only path into the song-centric importer | It slows real data onboarding and raises operator effort.
Confidence: high
Scope-risk: narrow
Directive: Prefer build_songcentric_manifest_from_directory.py -> import_songcentric_manifest_live.py as the default Phase-1 path for real file-directory onboarding.
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/build_songcentric_manifest_from_directory.py on a real local wav smoke directory; imported the generated manifest into postgres://d2:d2pass@127.0.0.1:5432/d2 schema acr_songcentric_test; reran the import and verified counts remained media_entity=9, audio_object=22, feature_fact=9, set_membership=9; git diff --check; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: non-wav duration probing and very large directory trees
1 parent d04a6e65
1 {"song": {"biz_key": "song_alpha", "title": "song alpha", "artist_name": "artist a"}, "asset": {"source_type": "official", "storage_uri": "/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav", "storage_scheme": "file", "checksum": "path:/workspace/acr-engine/data/songcentric_builder_smoke/song_alpha/artist_a/clip1.wav", "codec": "wav", "sample_rate": 16000, "channels": 1, "duration_ms": 8000}, "windows": [{"start_ms": 0, "end_ms": 5000}, {"start_ms": 2500, "end_ms": 7500}, {"start_ms": 3000, "end_ms": 8000}], "memberships": [{"set_type": "reference_set", "set_name": "phase1_hot_reference_v1", "member_type": "asset", "priority": 100}]}
2 {"song": {"biz_key": "song_beta", "title": "song beta", "artist_name": "artist b"}, "asset": {"source_type": "official", "storage_uri": "/workspace/acr-engine/data/songcentric_builder_smoke/song_beta/artist_b/clip2.wav", "storage_scheme": "file", "checksum": "path:/workspace/acr-engine/data/songcentric_builder_smoke/song_beta/artist_b/clip2.wav", "codec": "wav", "sample_rate": 16000, "channels": 1, "duration_ms": 6000}, "windows": [{"start_ms": 0, "end_ms": 5000}, {"start_ms": 1000, "end_ms": 6000}], "memberships": [{"set_type": "reference_set", "set_name": "phase1_hot_reference_v1", "member_type": "asset", "priority": 100}]}
1 {
2 "input_root": "/workspace/acr-engine/data/songcentric_builder_smoke",
3 "output": "/workspace/acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
4 "song_count": 2,
5 "asset_count": 2,
6 "window_count": 5,
7 "window_ms": 5000,
8 "stride_ms": 2500,
9 "set_name": "phase1_hot_reference_v1"
10 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "schema": "acr_songcentric_test",
3 "manifest": "acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
4 "imported": [
5 {
6 "song_id": 8,
7 "asset_id": 16,
8 "window_ids": [
9 17,
10 18,
11 19
12 ],
13 "feature_ids": [],
14 "membership_ids": [
15 8
16 ]
17 },
18 {
19 "song_id": 9,
20 "asset_id": 20,
21 "window_ids": [
22 21,
23 22
24 ],
25 "feature_ids": [],
26 "membership_ids": [
27 9
28 ]
29 }
30 ],
31 "counts": {
32 "media_entity": 9,
33 "audio_object": 22,
34 "feature_fact": 9,
35 "set_membership": 9
36 },
37 "window_lineage_sample": {
38 "window_id": 22,
39 "asset_id": 20,
40 "song_id": 9,
41 "title": "song beta",
42 "start_ms": 1000,
43 "end_ms": 6000
44 },
45 "feature_lineage_sample": {
46 "feature_type": "embedding",
47 "model_name": "muq",
48 "model_version": "large-msd-iter",
49 "feature_set_name": "muq_5s_hop2.5_meanpool",
50 "window_id": 15,
51 "song_id": 7,
52 "title": "Feature Manifest Song 2"
53 }
54 }
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "schema": "acr_songcentric_test",
3 "manifest": "acr-engine/data/pgvector_eval/music20/songcentric_directory_manifest.jsonl",
4 "imported": [
5 {
6 "song_id": 8,
7 "asset_id": 16,
8 "window_ids": [
9 17,
10 18,
11 19
12 ],
13 "feature_ids": [],
14 "membership_ids": [
15 8
16 ]
17 },
18 {
19 "song_id": 9,
20 "asset_id": 20,
21 "window_ids": [
22 21,
23 22
24 ],
25 "feature_ids": [],
26 "membership_ids": [
27 9
28 ]
29 }
30 ],
31 "counts": {
32 "media_entity": 9,
33 "audio_object": 22,
34 "feature_fact": 9,
35 "set_membership": 9
36 },
37 "window_lineage_sample": {
38 "window_id": 22,
39 "asset_id": 20,
40 "song_id": 9,
41 "title": "song beta",
42 "start_ms": 1000,
43 "end_ms": 6000
44 },
45 "feature_lineage_sample": {
46 "feature_type": "embedding",
47 "model_name": "muq",
48 "model_version": "large-msd-iter",
49 "feature_set_name": "muq_5s_hop2.5_meanpool",
50 "window_id": 15,
51 "song_id": 7,
52 "title": "Feature Manifest Song 2"
53 }
54 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 import math
7 import wave
8 from pathlib import Path
9
# File suffixes (lowercase, with leading dot) that the directory scan treats as audio assets.
AUDIO_EXTS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a'}
11
12
def detect_duration_ms(path: Path) -> int | None:
    """Return the playback length of a WAV file in whole milliseconds.

    Only files whose suffix is ``.wav`` (case-insensitive) are probed, using
    the standard-library ``wave`` reader. ``None`` is returned for any other
    suffix, for a header that reports a non-positive frame rate, and whenever
    opening or reading the file fails.
    """
    is_wav = path.suffix.lower() == '.wav'
    if not is_wav:
        return None
    try:
        with wave.open(str(path), 'rb') as reader:
            frame_total = reader.getnframes()
            frame_rate = reader.getframerate()
    except Exception:
        # Probing is best-effort: an unreadable file simply has unknown duration.
        return None
    if frame_rate <= 0:
        return None
    # Truncate (not round) to whole milliseconds.
    return int(frame_total * 1000 / frame_rate)
24
25
def infer_song_meta(file_path: Path, root: Path) -> tuple[str, str, str]:
    """Derive ``(song_key, title, artist)`` from a file's location under *root*.

    The path of *file_path* relative to *root* decides the result:

    * ``<song>/<artist>/.../<file>``: key and title come from ``<song>``,
      artist from ``<artist>``.
    * ``<song>/<file>``: key and title come from ``<song>``; artist is
      ``'unknown'``.
    * ``<file>`` directly in *root*: key and title come from the file stem;
      artist is ``'unknown'``.

    Underscores become spaces in the title and artist; the key is returned
    unchanged. ``Path.relative_to`` raises ``ValueError`` when *file_path* is
    not located under *root*.
    """
    segments = file_path.relative_to(root).parts
    depth = len(segments)
    if depth < 2:
        # File sits directly in the root: the file name is all we have.
        base = file_path.stem
        return base, base.replace('_', ' '), 'unknown'
    song_dir = segments[0]
    artist = 'unknown'
    if depth >= 3:
        # Only a directory level (not the file name itself) counts as the artist.
        artist = segments[1].replace('_', ' ')
    return song_dir, song_dir.replace('_', ' '), artist
35
36
def build_windows(duration_ms: int | None, window_ms: int, stride_ms: int) -> list[dict]:
    """Split an asset into fixed-length analysis windows.

    Args:
        duration_ms: Asset length in milliseconds, or ``None`` when unknown.
        window_ms: Length of each window in milliseconds; must be positive.
        stride_ms: Distance between consecutive window starts in
            milliseconds; must be positive.

    Returns:
        A list of ``{'start_ms': ..., 'end_ms': ...}`` dicts ordered by start.

        * Unknown duration: a single window ``[0, window_ms]``.
        * Duration not longer than one window: a single window
          ``[0, duration_ms]``.
        * Otherwise: full-length windows every ``stride_ms``, plus one final
          full-length window aligned to the end of the asset when the regular
          grid does not already reach it.

    Raises:
        ValueError: If ``window_ms`` or ``stride_ms`` is not positive. A
            non-positive stride would otherwise never advance the window
            start and loop forever.
    """
    if window_ms <= 0:
        raise ValueError(f'window_ms must be positive, got {window_ms}')
    if stride_ms <= 0:
        raise ValueError(f'stride_ms must be positive, got {stride_ms}')

    if duration_ms is None:
        return [{'start_ms': 0, 'end_ms': window_ms}]
    if duration_ms <= window_ms:
        return [{'start_ms': 0, 'end_ms': duration_ms}]

    # Latest start at which a full window still fits inside the asset.
    last_start = duration_ms - window_ms
    windows = [
        {'start_ms': start, 'end_ms': start + window_ms}
        for start in range(0, last_start + 1, stride_ms)
    ]
    # Cover the tail the stride grid missed. The tail window ends exactly at
    # duration_ms, which no grid window does in this branch, so no duplicate
    # can be produced.
    if windows[-1]['end_ms'] < duration_ms:
        windows.append({'start_ms': last_start, 'end_ms': duration_ms})
    return windows
57
58
def _probe_wav_format(path: Path) -> tuple[int, int] | None:
    """Return ``(sample_rate, channels)`` read from a WAV header, or ``None`` if unreadable."""
    try:
        with wave.open(str(path), 'rb') as wf:
            return wf.getframerate(), wf.getnchannels()
    except (wave.Error, EOFError, OSError):
        return None


def main() -> int:
    """Scan an audio directory and write a song-centric JSONL manifest.

    Command-line options:
        --input-root     Directory scanned recursively for audio files (required).
        --output         Path of the JSONL manifest to write (required).
        --window-ms      Window length in milliseconds (default 5000).
        --stride-ms      Window stride in milliseconds (default 2500).
        --set-type       ``set_type`` of the membership row (default ``reference_set``).
        --set-name       ``set_name`` of the membership row (default ``phase1_hot_reference_v1``).
        --source-type    ``source_type`` of every asset (default ``official``).
        --report-output  Optional path that also receives the JSON summary.

    One manifest row is emitted per audio file, in sorted path order. A JSON
    summary (counts and parameters) is always printed to stdout.

    Returns:
        ``0`` on success. Invalid arguments terminate through
        ``ArgumentParser.error`` (exit status 2) before anything is written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-root', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--window-ms', type=int, default=5000)
    parser.add_argument('--stride-ms', type=int, default=2500)
    parser.add_argument('--set-type', default='reference_set')
    parser.add_argument('--set-name', default='phase1_hot_reference_v1')
    parser.add_argument('--source-type', default='official')
    parser.add_argument('--report-output')
    args = parser.parse_args()

    # Reject values that would make windowing meaningless; a zero stride in
    # particular would never advance the window start.
    if args.window_ms <= 0:
        parser.error(f'--window-ms must be a positive integer, got {args.window_ms}')
    if args.stride_ms <= 0:
        parser.error(f'--stride-ms must be a positive integer, got {args.stride_ms}')

    root = Path(args.input_root).resolve()
    # Fail before touching the output so a mistyped root cannot replace an
    # existing manifest with an empty one.
    if not root.is_dir():
        parser.error(f'--input-root is not a directory: {root}')

    output = Path(args.output).resolve()
    output.parent.mkdir(parents=True, exist_ok=True)
    report_output = Path(args.report_output).resolve() if args.report_output else None
    if report_output:
        report_output.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    file_count = 0
    window_count = 0
    song_keys = set()

    for path in sorted(root.rglob('*')):
        suffix = path.suffix.lower()
        if not path.is_file() or suffix not in AUDIO_EXTS:
            continue
        file_count += 1
        song_key, title, artist = infer_song_meta(path, root)
        song_keys.add(song_key)
        duration_ms = detect_duration_ms(path)
        windows = build_windows(duration_ms, args.window_ms, args.stride_ms)
        window_count += len(windows)

        sample_rate = None
        channels = None
        if suffix == '.wav':
            # Report what the header actually says; keep the 16 kHz mono
            # assumption only when the header cannot be read.
            sample_rate, channels = _probe_wav_format(path) or (16000, 1)

        rows.append(
            {
                'song': {
                    'biz_key': song_key,
                    'title': title,
                    'artist_name': artist,
                },
                'asset': {
                    'source_type': args.source_type,
                    'storage_uri': str(path),
                    'storage_scheme': 'file',
                    # Path-derived placeholder, not a content hash.
                    'checksum': f'path:{path}',
                    'codec': suffix.lstrip('.'),
                    'sample_rate': sample_rate,
                    'channels': channels,
                    'duration_ms': duration_ms,
                },
                'windows': windows,
                'memberships': [
                    {
                        'set_type': args.set_type,
                        'set_name': args.set_name,
                        'member_type': 'asset',
                        'priority': 100,
                    }
                ],
            }
        )

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned to UTF-8 instead of depending on the process locale.
    output.write_text(
        ''.join(json.dumps(row, ensure_ascii=False) + '\n' for row in rows),
        encoding='utf-8',
    )

    report = {
        'input_root': str(root),
        'output': str(output),
        'song_count': len(song_keys),
        'asset_count': file_count,
        'window_count': window_count,
        'window_ms': args.window_ms,
        'stride_ms': args.stride_ms,
        'set_name': args.set_name,
    }
    report_text = json.dumps(report, ensure_ascii=False, indent=2)
    if report_output:
        report_output.write_text(report_text, encoding='utf-8')
    print(report_text)
    return 0
137
138
# Run the CLI only when executed as a script; main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
1 ## 2026-06-04 1 ## 2026-06-04
2 2
3 - 新增 `acr-engine/scripts/build_songcentric_manifest_from_directory.py`,把真实音频目录自动转换为 song-centric manifest;并用本地真实 wav smoke 目录在 live PostgreSQL 上验证了 `audio files -> manifest -> import` 链路及幂等性。
4
3 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并用 `songcentric_feature_manifest_sample.jsonl` 在 live PostgreSQL 上验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环与幂等性。 5 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并用 `songcentric_feature_manifest_sample.jsonl` 在 live PostgreSQL 上验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环与幂等性。
4 6
5 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并新增 `songcentric_feature_manifest_sample.jsonl` 用于验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环。 7 - 扩展 `import_songcentric_manifest_live.py` 支持从 manifest 的 `windows[].features[]` 直接落 `feature_fact`,并新增 `songcentric_feature_manifest_sample.jsonl` 用于验证 `song -> asset -> window -> feature -> membership` 的完整导入闭环。
......
...@@ -270,6 +270,21 @@ flowchart TD ...@@ -270,6 +270,21 @@ flowchart TD
270 270
271 当前带 feature 样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl) 271 当前带 feature 样例 manifest:[`acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl`](../acr-engine/data/pgvector_eval/music20/songcentric_feature_manifest_sample.jsonl)
272 272
273
274 ### 4.6 真实目录生成 manifest 流程
275
276 ```mermaid
277 flowchart TD
278 A[real audio directory] --> B[build_songcentric_manifest_from_directory.py]
279 B --> C[songcentric_directory_manifest.jsonl]
280 C --> D[import_songcentric_manifest_live.py]
281 D --> E[media_entity]
282 D --> F[audio_object]
283 D --> G[set_membership]
284 ```
285
286 当前目录构建脚本:[`acr-engine/scripts/build_songcentric_manifest_from_directory.py`](../acr-engine/scripts/build_songcentric_manifest_from_directory.py)
287
273 --- 288 ---
274 289
275 ## 5. 最常用 SQL 样例 290 ## 5. 最常用 SQL 样例
......