business_export_offline_smoke.py 5.94 KB
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import csv
import json
import shutil
import subprocess
import sys
from pathlib import Path

# Interpreter used to launch the pipeline scripts as subprocesses. The pinned
# miniconda build is preferred; when that path does not exist on this machine,
# fall back to the interpreter running this script so the smoke does not die
# with FileNotFoundError before any pipeline step has started.
_PINNED_PYTHON = "/usr/local/miniconda3/bin/python"
PYTHON = _PINNED_PYTHON if Path(_PINNED_PYTHON).exists() else sys.executable


def run(cmd: list[str], cwd: Path) -> str:
    """Execute *cmd* with *cwd* as working directory and return its stdout.

    Standard output is captured and decoded as text; standard error is not
    captured. A non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout


def stage_audio(output_root: Path, src_rel: str, dst_name: str) -> str:
    """Copy a repository audio file into ``<output_root>/audio`` once.

    Args:
        output_root: Directory that receives the ``audio`` sub-directory.
        src_rel: Source file path, joined onto the directory one level above
            the one containing this script.
        dst_name: File name to give the staged copy.

    Returns:
        The staged file's path relative to ``output_root``, as a string.

    An already-present destination file is left untouched, so repeated runs
    against the same ``output_root`` do not copy again.
    """
    repo_root = Path(__file__).resolve().parents[1]
    source = (repo_root / src_rel).resolve()

    staging_dir = output_root / "audio"
    staging_dir.mkdir(parents=True, exist_ok=True)

    target = staging_dir / dst_name
    relative_target = str(target.relative_to(output_root))
    if target.exists():
        return relative_target

    shutil.copy2(source, target)
    return relative_target


def main() -> None:
    """Run the offline smoke end to end and print a JSON summary.

    Everything is written under ``--output-root``:

    1. Stage the sample audio files and write a five-row export CSV.
    2. Run ``scripts/normalize_business_export.py`` on that CSV.
    3. Run ``scripts/build_business_project_manifests.py`` on the result.
    4. Run ``train.py`` with ``--dry-run`` against the built manifests.

    The stdout of steps 2 and 3 is parsed as JSON and echoed in the summary.
    The dry run is reported as passed when its stdout contains
    ``"Dry run passed!"``.
    """
    cli = argparse.ArgumentParser(description="Run an offline smoke for the business-export adapter chain")
    cli.add_argument("--output-root", default="/tmp/business_export_offline_smoke")
    cli.add_argument("--device", default="cpu")
    options = cli.parse_args()

    repo_root = Path(__file__).resolve().parents[1]
    out_dir = Path(options.output_root).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    export_csv = out_dir / "business_asset_export_real_smoke.csv"
    manifest_dir = out_dir / "manifests"
    ready_jsonl = out_dir / "manifest_ready.jsonl"

    # One tuple per exported asset, holding only the fields that vary:
    # (song_id, asset_id, type, role, split, source audio, staged file name,
    #  title, bucket, offset_sec, duration_sec, license)
    sample_assets = (
        (
            "song_smoke_a", "asset_smoke_a_ref", 11, "reference", "train",
            "data/external_smoke/fma/audio/fma_00000.mp3", "song_smoke_a_ref.mp3",
            "Smoke Song A", "lossless_reference_core", 0, 29.976, "licensed",
        ),
        (
            "song_smoke_a", "asset_smoke_a_query", 7, "query", "test",
            "data/external_smoke/fma/audio/fma_00000.mp3", "song_smoke_a_query.mp3",
            "Smoke Song A", "short_video_hook", 4.0, 8.0, "licensed",
        ),
        (
            "song_smoke_b", "asset_smoke_b_ref", 11, "reference", "train",
            "data/external_smoke/fma/audio/fma_00001.mp3", "song_smoke_b_ref.mp3",
            "Smoke Song B", "lossless_reference_core", 0, 30.002, "licensed",
        ),
        (
            "song_smoke_b", "asset_smoke_b_query", 8, "query", "train",
            "data/external_smoke/fma/audio/fma_00001.mp3", "song_smoke_b_query.mp3",
            "Smoke Song B", "short_video_hook", 6.0, 8.0, "licensed",
        ),
        (
            "song_smoke_c", "asset_smoke_c_excluded", 18, "excluded", "holdout",
            "data/external_smoke/fma/audio/fma_00002.mp3", "song_smoke_c_excluded.mp3",
            "Smoke Song C", "demo_variation_pool", 0, 29.976, "review_pending",
        ),
    )

    # Key order matters: the first row's keys become the CSV header.
    rows = [
        {
            "song_id": song_id,
            "asset_id": asset_id,
            "type": type_code,
            "role": role,
            "split": split,
            "audio_path": stage_audio(out_dir, source_audio, staged_name),
            "source_dataset": "internal_catalog",
            "title": title,
            "artist": "Smoke Artist",
            "album_id": "smoke_album",
            "bucket": bucket,
            "offset_sec": offset_sec,
            "duration_sec": duration_sec,
            "sample_rate": 44100,
            "bitrate": 192,
            "license": license_tag,
            "is_lossless": False,
        }
        for (
            song_id,
            asset_id,
            type_code,
            role,
            split,
            source_audio,
            staged_name,
            title,
            bucket,
            offset_sec,
            duration_sec,
            license_tag,
        ) in sample_assets
    ]

    with export_csv.open("w", newline="") as handle:
        sheet = csv.DictWriter(handle, fieldnames=list(rows[0]))
        sheet.writeheader()
        sheet.writerows(rows)

    # Each stage runs from the repository root so the relative script paths resolve.
    normalize_cmd = [
        PYTHON,
        "scripts/normalize_business_export.py",
        "--input", str(export_csv),
        "--output", str(ready_jsonl),
    ]
    normalize_summary = json.loads(run(normalize_cmd, cwd=repo_root))

    build_cmd = [
        PYTHON,
        "scripts/build_business_project_manifests.py",
        "--input", str(ready_jsonl),
        "--output-dir", str(manifest_dir),
    ]
    build_summary = json.loads(run(build_cmd, cwd=repo_root))

    train_cmd = [
        PYTHON,
        "train.py",
        "--data", str(manifest_dir),
        "--output", str(out_dir / "models"),
        "--device", options.device,
        "--epochs", "1",
        "--batch-size", "2",
        "--dry-run",
    ]
    dry_run_stdout = run(train_cmd, cwd=repo_root)

    report = {
        "manifest_ready": str(ready_jsonl),
        "manifests_dir": str(manifest_dir),
        "normalize_summary": normalize_summary,
        "build_summary": build_summary,
        "dry_run_passed": "Dry run passed!" in dry_run_stdout,
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


# Run the smoke only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()