fetch_fma_subset.py
2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
"""Download a bounded real FMA subset through yt-dlp when direct archive URLs are unavailable."""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
from pathlib import Path
# Track IDs fetched when --track-ids is not given on the command line.
DEFAULT_TRACK_IDS = [
    2, 5, 10, 20,
    26, 30, 46, 48,
]

# Track page URL template; ``{track_id}`` is filled in per track.
FMA_TRACK_URL = "https://freemusicarchive.org/music/track/{track_id}"
def ensure_ytdlp() -> str:
    """Return the location of the ``yt-dlp`` executable found on PATH.

    Raises:
        SystemExit: If ``shutil.which`` cannot find ``yt-dlp``. The exit
            payload is a JSON document with ``status``, ``reason`` and
            ``recommendation`` keys describing the blocked state.
    """
    executable = shutil.which("yt-dlp")
    if executable:
        return executable

    # No downloader available: exit with a machine-readable explanation.
    blocked = {
        "status": "blocked",
        "reason": "yt_dlp_missing",
        "recommendation": "Install yt-dlp or provide local FMA audio manually into data/raw/fma_small_audio",
    }
    raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))
def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = False) -> dict:
    """Run yt-dlp for one track page and return a result record.

    Args:
        track_id: Numeric ID substituted into ``FMA_TRACK_URL``.
        output_dir: Directory that the yt-dlp output template points into.
        ytdlp: Path (or name) of the yt-dlp executable to invoke.
        overwrite: When true, pass ``--force-overwrites`` so an existing file
            is replaced; otherwise pass ``--no-overwrites``.

    Returns:
        A dict with ``track_id``, ``url``, ``status`` (``"downloaded"`` when
        the process exits with code 0, otherwise ``"failed"``),
        ``returncode``, and the last 1200 characters of the captured
        ``stdout`` and ``stderr``.
    """
    outtmpl = str(output_dir / "%(id)s.%(ext)s")
    url = FMA_TRACK_URL.format(track_id=track_id)
    cmd = [
        ytdlp,
        "--no-playlist",
        "-o", outtmpl,
        # Merely omitting --no-overwrites is not enough: yt-dlp's default
        # still keeps an existing media file, so overwriting has to be
        # requested explicitly.
        "--force-overwrites" if overwrite else "--no-overwrites",
        url,
    ]
    # errors="replace" keeps undecodable bytes in the tool's output from
    # raising UnicodeDecodeError and aborting the whole run.
    proc = subprocess.run(cmd, text=True, errors="replace", capture_output=True)
    return {
        "track_id": track_id,
        "url": url,
        "status": "downloaded" if proc.returncode == 0 else "failed",
        "returncode": proc.returncode,
        # Only the tail is kept so the report stays small.
        "stdout": proc.stdout[-1200:],
        "stderr": proc.stderr[-1200:],
    }
def main():
    """Parse CLI arguments, fetch each requested track, and print a JSON summary.

    Command-line options:
        --output-dir: Directory for downloaded files (created if missing).
        --track-ids: Zero or more integer track IDs; defaults to
            ``DEFAULT_TRACK_IDS``.
        --overwrite: Forwarded to ``fetch_one`` as ``overwrite=True``.
        --report: Optional path; when given, the JSON summary is also written
            there (parent directories are created).

    The summary contains the resolved output directory, the number of tracks
    requested / downloaded / failed, and the per-track result records.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--track-ids", nargs="*", type=int, default=DEFAULT_TRACK_IDS)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--report", default=None)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Exits via SystemExit when yt-dlp is not available.
    ytdlp = ensure_ytdlp()
    results = [
        fetch_one(track_id, output_dir, ytdlp, overwrite=args.overwrite)
        for track_id in args.track_ids
    ]

    summary = {
        "output_dir": str(output_dir.resolve()),
        "requested": len(args.track_ids),
        "downloaded": sum(1 for x in results if x["status"] == "downloaded"),
        "failed": sum(1 for x in results if x["status"] != "downloaded"),
        "results": results,
    }
    text = json.dumps(summary, indent=2, ensure_ascii=False)

    if args.report:
        report_path = Path(args.report)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        # The summary is serialised with ensure_ascii=False, so it may hold
        # non-ASCII characters; write UTF-8 explicitly instead of relying on
        # the platform's default encoding.
        report_path.write_text(text, encoding="utf-8")
    print(text)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()