# fetch_fma_subset.py 2.5 KB
#!/usr/bin/env python3
"""Download a bounded real FMA subset through yt-dlp when direct archive URLs are unavailable."""

from __future__ import annotations

import argparse
import json
import shutil
import subprocess
from pathlib import Path

# Track IDs fetched when --track-ids is not supplied on the command line.
DEFAULT_TRACK_IDS = [2, 5, 10, 20, 26, 30, 46, 48]
# Track page URL template; fetch_one() substitutes {track_id} and passes the
# resulting URL to yt-dlp.
FMA_TRACK_URL = "https://freemusicarchive.org/music/track/{track_id}"


def ensure_ytdlp() -> str:
    """Return the location of the ``yt-dlp`` executable found on ``PATH``.

    Raises:
        SystemExit: If ``yt-dlp`` cannot be located. The exit payload is a
            pretty-printed JSON object with ``status``, ``reason`` and
            ``recommendation`` keys.
    """
    executable = shutil.which("yt-dlp")
    if executable:
        return executable

    blocked = {
        "status": "blocked",
        "reason": "yt_dlp_missing",
        "recommendation": "Install yt-dlp or provide local FMA audio manually into data/raw/fma_small_audio",
    }
    raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))


def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = False) -> dict:
    """Download a single FMA track page with yt-dlp and report the outcome.

    Args:
        track_id: Numeric track ID substituted into ``FMA_TRACK_URL``.
        output_dir: Directory that receives the download; the file name comes
            from yt-dlp's ``%(id)s.%(ext)s`` output template.
        ytdlp: Path (or command name) of the yt-dlp executable to run.
        overwrite: When true, ask yt-dlp to replace files that already exist;
            otherwise existing files are left untouched.

    Returns:
        A dict with ``track_id``, ``url``, ``status`` (``"downloaded"`` when
        yt-dlp exits with 0, else ``"failed"``), ``returncode``, and the last
        1200 characters of ``stdout`` and ``stderr``. If the process could not
        be started at all, ``returncode`` is ``None`` and ``stderr`` carries
        the OS error text.
    """
    url = FMA_TRACK_URL.format(track_id=track_id)
    cmd = [
        ytdlp,
        "--no-playlist",
        "-o", str(output_dir / "%(id)s.%(ext)s"),
        # yt-dlp keeps existing media files by default (--no-force-overwrites),
        # so merely omitting --no-overwrites does not overwrite anything; the
        # overwrite request has to be passed explicitly.
        "--force-overwrites" if overwrite else "--no-overwrites",
        url,
    ]
    try:
        # errors="replace" keeps undecodable bytes in yt-dlp's output from
        # raising and aborting the whole batch.
        proc = subprocess.run(cmd, text=True, errors="replace", capture_output=True)
    except OSError as exc:
        # The executable vanished or is not runnable: report this track as
        # failed instead of crashing the remaining downloads.
        return {
            "track_id": track_id,
            "url": url,
            "status": "failed",
            "returncode": None,
            "stdout": "",
            "stderr": str(exc)[-1200:],
        }
    return {
        "track_id": track_id,
        "url": url,
        "status": "downloaded" if proc.returncode == 0 else "failed",
        "returncode": proc.returncode,
        # Only the tail is kept so the JSON report stays small.
        "stdout": proc.stdout[-1200:],
        "stderr": proc.stderr[-1200:],
    }


def main() -> None:
    """Parse CLI options, fetch each requested track, and emit a JSON summary.

    Options:
        --output-dir: Download directory (created if missing).
        --track-ids: Zero or more integer track IDs; defaults to
            ``DEFAULT_TRACK_IDS``.
        --overwrite: Forwarded to ``fetch_one`` as ``overwrite=True``.
        --report: Optional path that also receives the JSON summary.

    The summary (output directory, requested/downloaded/failed counts and the
    per-track results) is always printed to stdout.

    Raises:
        SystemExit: Propagated from ``ensure_ytdlp`` when yt-dlp is not found,
            or from argparse on invalid arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--track-ids", nargs="*", type=int, default=DEFAULT_TRACK_IDS)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--report", default=None)
    args = parser.parse_args()

    # The directory is created before the yt-dlp check, so it exists even when
    # the run is blocked.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    ytdlp = ensure_ytdlp()

    results = [
        fetch_one(track_id, output_dir, ytdlp, overwrite=args.overwrite)
        for track_id in args.track_ids
    ]
    downloaded = sum(1 for item in results if item["status"] == "downloaded")
    summary = {
        "output_dir": str(output_dir.resolve()),
        "requested": len(args.track_ids),
        "downloaded": downloaded,
        "failed": len(results) - downloaded,
        "results": results,
    }
    text = json.dumps(summary, indent=2, ensure_ascii=False)
    if args.report:
        report_path = Path(args.report)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False may leave non-ASCII characters in the JSON, so the
        # encoding is pinned instead of depending on the locale default.
        report_path.write_text(text, encoding="utf-8")
    print(text)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()