Commit 7eff944b 7eff944bedeed205a3bf5d9f2225d954ceb13a61 by cnb.bofCdSsphPA

Prove the offline business-export chain with a runnable smoke over local audio

Constraint: Keep verification offline-only and avoid touching real databases or production assets
Rejected: Stop at manifest generation without execution evidence | A dry-run smoke gives the next session stronger handoff confidence
Confidence: high
Scope-risk: narrow
Directive: Stage local sample audio inside the smoke workspace so manifest paths remain self-contained and reproducible
Tested: Ran business_export_offline_smoke.py end-to-end; verified normalize/build summaries and train.py --dry-run success; rechecked adapter doc links
Not-tested: Did not run full training/evaluation on live business exports or connect to any database
1 parent 3bdc0139
1 #!/usr/bin/env python3
2 from __future__ import annotations
3
import argparse
import csv
import json
import shutil
import subprocess
import sys
from pathlib import Path
10
# Interpreter used to launch the pipeline scripts. The pinned conda install is
# preferred when it is present; on hosts without it, fall back to the
# interpreter running this script so the smoke does not die with
# FileNotFoundError before any pipeline step has run.
_PINNED_PYTHON = "/usr/local/miniconda3/bin/python"
PYTHON = _PINNED_PYTHON if Path(_PINNED_PYTHON).is_file() else sys.executable
12
13
def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* with *cwd* as working directory and return its stdout.

    Only stdout is captured (decoded as text); stderr is left attached to the
    parent process. A non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout
16
17
def stage_audio(output_root: Path, src_rel: str, dst_name: str) -> str:
    """Copy a sample clip into ``<output_root>/audio`` and return its relative path.

    Args:
        output_root: Smoke workspace; the ``audio`` subdirectory is created
            on demand.
        src_rel: Clip location, joined onto the directory one level above the
            one containing this script.
        dst_name: File name to give the staged copy.

    Returns:
        The staged file's path relative to ``output_root``, as a string.

    An already-staged file is left untouched, so repeated runs do not re-copy.
    """
    source = (Path(__file__).resolve().parents[1] / src_rel).resolve()

    staging_dir = output_root / "audio"
    staging_dir.mkdir(parents=True, exist_ok=True)

    target = staging_dir / dst_name
    relative = str(target.relative_to(output_root))
    if target.exists():
        return relative

    shutil.copy2(source, target)
    return relative
27
28
def _smoke_rows(output_root: Path) -> list[dict[str, object]]:
    """Stage the sample clips under *output_root* and build the export rows.

    Returns five rows: a reference and a query asset for songs A and B, plus
    one excluded asset for song C. Key order matters because the first row's
    keys become the CSV header.
    """
    # Columns: song letter, asset suffix, type, role, split, source clip,
    # bucket, offset_sec, duration_sec, license.
    # NOTE(review): the numeric ``type`` codes are defined by the business
    # export schema, not by this script -- confirm against the adapter docs.
    specs = [
        ("a", "ref", 11, "reference", "train", "fma_00000.mp3", "lossless_reference_core", 0, 29.976, "licensed"),
        ("a", "query", 7, "query", "test", "fma_00000.mp3", "short_video_hook", 4.0, 8.0, "licensed"),
        ("b", "ref", 11, "reference", "train", "fma_00001.mp3", "lossless_reference_core", 0, 30.002, "licensed"),
        ("b", "query", 8, "query", "train", "fma_00001.mp3", "short_video_hook", 6.0, 8.0, "licensed"),
        ("c", "excluded", 18, "excluded", "holdout", "fma_00002.mp3", "demo_variation_pool", 0, 29.976, "review_pending"),
    ]

    rows: list[dict[str, object]] = []
    for song, suffix, type_code, role, split, clip, bucket, offset, duration, license_tag in specs:
        rows.append(
            {
                "song_id": f"song_smoke_{song}",
                "asset_id": f"asset_smoke_{song}_{suffix}",
                "type": type_code,
                "role": role,
                "split": split,
                # Each asset gets its own staged copy so every manifest path
                # stays inside the smoke workspace.
                "audio_path": stage_audio(
                    output_root,
                    f"data/external_smoke/fma/audio/{clip}",
                    f"song_smoke_{song}_{suffix}.mp3",
                ),
                "source_dataset": "internal_catalog",
                "title": f"Smoke Song {song.upper()}",
                "artist": "Smoke Artist",
                "album_id": "smoke_album",
                "bucket": bucket,
                "offset_sec": offset,
                "duration_sec": duration,
                "sample_rate": 44100,
                "bitrate": 192,
                "license": license_tag,
                "is_lossless": False,
            }
        )
    return rows


def main() -> None:
    """Run the offline smoke for the business-export adapter chain.

    Steps:
      1. Stage sample audio and write a five-row export CSV in the workspace.
      2. Run ``scripts/normalize_business_export.py`` and
         ``scripts/build_business_project_manifests.py``; the stdout of each
         is parsed as JSON.
      3. Run ``train.py --dry-run`` against the generated manifests.
      4. Print a JSON summary of the run.

    Command-line options:
      --output-root  Workspace directory (default
                     ``/tmp/business_export_offline_smoke``).
      --device       Value forwarded to ``train.py --device`` (default ``cpu``).

    Raises:
        SystemExit: with status 1 when the dry-run output lacks the
            ``Dry run passed!`` marker, so callers can rely on the exit code.
        subprocess.CalledProcessError: when any child process exits non-zero.
    """
    parser = argparse.ArgumentParser(description="Run an offline smoke for the business-export adapter chain")
    parser.add_argument("--output-root", default="/tmp/business_export_offline_smoke")
    parser.add_argument("--device", default="cpu")
    args = parser.parse_args()

    repo = Path(__file__).resolve().parents[1]
    output_root = Path(args.output_root).resolve()
    output_root.mkdir(parents=True, exist_ok=True)

    sample_csv = output_root / "business_asset_export_real_smoke.csv"
    manifests_dir = output_root / "manifests"
    manifest_ready = output_root / "manifest_ready.jsonl"

    rows = _smoke_rows(output_root)

    # Explicit encoding keeps the CSV bytes independent of the host locale.
    with sample_csv.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)

    # Child scripts are addressed relative to the repository root, hence cwd=repo.
    normalize_summary = json.loads(
        run(
            [PYTHON, "scripts/normalize_business_export.py", "--input", str(sample_csv), "--output", str(manifest_ready)],
            cwd=repo,
        )
    )
    build_summary = json.loads(
        run(
            [PYTHON, "scripts/build_business_project_manifests.py", "--input", str(manifest_ready), "--output-dir", str(manifests_dir)],
            cwd=repo,
        )
    )
    dryrun = run(
        [
            PYTHON,
            "train.py",
            "--data", str(manifests_dir),
            "--output", str(output_root / "models"),
            "--device", args.device,
            "--epochs", "1",
            "--batch-size", "2",
            "--dry-run",
        ],
        cwd=repo,
    )

    dry_run_passed = "Dry run passed!" in dryrun
    summary = {
        "manifest_ready": str(manifest_ready),
        "manifests_dir": str(manifests_dir),
        "normalize_summary": normalize_summary,
        "build_summary": build_summary,
        "dry_run_passed": dry_run_passed,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))

    # A smoke that reports failure only inside its JSON is invisible to
    # automation; surface it through the exit status as well.
    if not dry_run_passed:
        raise SystemExit(1)
167
168
# Script entry point: main() returns None, so a normal run still exits with status 0.
if __name__ == "__main__":
    raise SystemExit(main())