Preserve the first verified real-FMA download path and blocker evidence
Constraint: Continuous dataset landing work needs concrete failed-path evidence so future sessions do not restart from outdated assumptions
Rejected: Omit the failed download automation because it did not complete | Loses reproducible evidence about the current 403 and missing-tool barriers
Confidence: high
Scope-risk: narrow
Directive: Replace this bounded fetch path only after verifying a stable official archive or mirror-based download route
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/fetch_fma_subset.py; /usr/local/miniconda3/bin/python acr-engine/scripts/fetch_fma_subset.py --report acr-engine/reports/fma_fetch_subset_report.json
Not-tested: Successful real FMA audio download remains blocked by current upstream/tooling availability
Showing 4 changed files with 206 additions and 0 deletions
| 1 | { | ||
| 2 | "output_dir": "/workspace/acr-engine/data/raw/fma_small_audio", | ||
| 3 | "requested": 8, | ||
| 4 | "downloaded": 0, | ||
| 5 | "existing": 0, | ||
| 6 | "failures": [ | ||
| 7 | { | ||
| 8 | "track_id": 2, | ||
| 9 | "status": "http_error", | ||
| 10 | "code": 403, | ||
| 11 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000002.mp3" | ||
| 12 | }, | ||
| 13 | { | ||
| 14 | "track_id": 5, | ||
| 15 | "status": "http_error", | ||
| 16 | "code": 403, | ||
| 17 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000005.mp3" | ||
| 18 | }, | ||
| 19 | { | ||
| 20 | "track_id": 10, | ||
| 21 | "status": "http_error", | ||
| 22 | "code": 403, | ||
| 23 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000010.mp3" | ||
| 24 | }, | ||
| 25 | { | ||
| 26 | "track_id": 20, | ||
| 27 | "status": "http_error", | ||
| 28 | "code": 403, | ||
| 29 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000020.mp3" | ||
| 30 | }, | ||
| 31 | { | ||
| 32 | "track_id": 26, | ||
| 33 | "status": "http_error", | ||
| 34 | "code": 403, | ||
| 35 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000026.mp3" | ||
| 36 | }, | ||
| 37 | { | ||
| 38 | "track_id": 30, | ||
| 39 | "status": "http_error", | ||
| 40 | "code": 403, | ||
| 41 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000030.mp3" | ||
| 42 | }, | ||
| 43 | { | ||
| 44 | "track_id": 46, | ||
| 45 | "status": "http_error", | ||
| 46 | "code": 403, | ||
| 47 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000046.mp3" | ||
| 48 | }, | ||
| 49 | { | ||
| 50 | "track_id": 48, | ||
| 51 | "status": "http_error", | ||
| 52 | "code": 403, | ||
| 53 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000048.mp3" | ||
| 54 | } | ||
| 55 | ], | ||
| 56 | "results": [ | ||
| 57 | { | ||
| 58 | "track_id": 2, | ||
| 59 | "status": "http_error", | ||
| 60 | "code": 403, | ||
| 61 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000002.mp3" | ||
| 62 | }, | ||
| 63 | { | ||
| 64 | "track_id": 5, | ||
| 65 | "status": "http_error", | ||
| 66 | "code": 403, | ||
| 67 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000005.mp3" | ||
| 68 | }, | ||
| 69 | { | ||
| 70 | "track_id": 10, | ||
| 71 | "status": "http_error", | ||
| 72 | "code": 403, | ||
| 73 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000010.mp3" | ||
| 74 | }, | ||
| 75 | { | ||
| 76 | "track_id": 20, | ||
| 77 | "status": "http_error", | ||
| 78 | "code": 403, | ||
| 79 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000020.mp3" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "track_id": 26, | ||
| 83 | "status": "http_error", | ||
| 84 | "code": 403, | ||
| 85 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000026.mp3" | ||
| 86 | }, | ||
| 87 | { | ||
| 88 | "track_id": 30, | ||
| 89 | "status": "http_error", | ||
| 90 | "code": 403, | ||
| 91 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000030.mp3" | ||
| 92 | }, | ||
| 93 | { | ||
| 94 | "track_id": 46, | ||
| 95 | "status": "http_error", | ||
| 96 | "code": 403, | ||
| 97 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000046.mp3" | ||
| 98 | }, | ||
| 99 | { | ||
| 100 | "track_id": 48, | ||
| 101 | "status": "http_error", | ||
| 102 | "code": 403, | ||
| 103 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000048.mp3" | ||
| 104 | } | ||
| 105 | ] | ||
| 106 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/scripts/fetch_fma_subset.py
0 → 100755
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Download a bounded real FMA subset through yt-dlp when direct archive URLs are unavailable.""" | ||
| 3 | |||
| 4 | from __future__ import annotations | ||
| 5 | |||
| 6 | import argparse | ||
| 7 | import json | ||
| 8 | import shutil | ||
| 9 | import subprocess | ||
| 10 | from pathlib import Path | ||
| 11 | |||
| 12 | DEFAULT_TRACK_IDS = [2, 5, 10, 20, 26, 30, 46, 48] | ||
| 13 | FMA_TRACK_URL = "https://freemusicarchive.org/music/track/{track_id}" | ||
| 14 | |||
| 15 | |||
def ensure_ytdlp() -> str:
    """Return the location of the ``yt-dlp`` executable found on ``PATH``.

    Raises:
        SystemExit: When ``yt-dlp`` cannot be located. The exit message is a
            JSON document with ``status``, ``reason`` and ``recommendation``
            keys so the blocker is reported in a structured form.
    """
    located = shutil.which("yt-dlp")
    if located:
        return located

    # Missing tool: stop with a structured payload instead of a bare error.
    blocked = {
        "status": "blocked",
        "reason": "yt_dlp_missing",
        "recommendation": "Install yt-dlp or provide local FMA audio manually into data/raw/fma_small_audio",
    }
    raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))
| 25 | |||
| 26 | |||
def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = False) -> dict:
    """Download one FMA track page through yt-dlp and report the outcome.

    Args:
        track_id: Numeric FMA track id substituted into ``FMA_TRACK_URL``.
        output_dir: Directory that receives the file; yt-dlp names it
            ``<id>.<ext>`` through the output template.
        ytdlp: Path of the yt-dlp executable to run.
        overwrite: When true, ask yt-dlp to replace an existing file;
            otherwise existing files are left untouched.

    Returns:
        A JSON-serialisable dict with ``track_id``, ``url``, ``status``
        (``"downloaded"`` when yt-dlp exits with 0, otherwise ``"failed"``),
        ``returncode`` and the last 1200 characters of stdout and stderr.
    """
    outtmpl = str(output_dir / "%(id)s.%(ext)s")
    url = FMA_TRACK_URL.format(track_id=track_id)

    # yt-dlp does not overwrite an existing media file by default, so simply
    # omitting --no-overwrites would make overwrite=True a no-op. Pass the
    # intent explicitly in both directions.
    overwrite_flag = "--force-overwrites" if overwrite else "--no-overwrites"
    cmd = [ytdlp, "--no-playlist", "-o", outtmpl, overwrite_flag, url]

    # errors="replace" keeps one undecodable byte in yt-dlp's output from
    # aborting the whole batch with UnicodeDecodeError.
    proc = subprocess.run(cmd, text=True, errors="replace", capture_output=True)

    # NOTE(review): with --no-overwrites an already-present file presumably
    # also exits 0 and is reported as "downloaded" -- confirm if the report
    # needs to distinguish the two cases.
    return {
        "track_id": track_id,
        "url": url,
        "status": "downloaded" if proc.returncode == 0 else "failed",
        "returncode": proc.returncode,
        # Only the tail is kept so the report stays small.
        "stdout": proc.stdout[-1200:],
        "stderr": proc.stderr[-1200:],
    }
| 47 | |||
| 48 | |||
def main() -> None:
    """Fetch the requested FMA tracks and print (optionally save) a JSON summary.

    Command-line options:
        --output-dir: Download directory (default ``data/raw/fma_small_audio``).
        --track-ids: Track ids to fetch (default ``DEFAULT_TRACK_IDS``).
        --overwrite: Replace files that already exist.
        --report: Optional path that also receives the JSON summary.

    Exits through ``ensure_ytdlp`` when yt-dlp is not installed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", default="data/raw/fma_small_audio")
    parser.add_argument("--track-ids", nargs="*", type=int, default=DEFAULT_TRACK_IDS)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--report", default=None)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    ytdlp = ensure_ytdlp()

    results = [
        fetch_one(track_id, output_dir, ytdlp, overwrite=args.overwrite)
        for track_id in args.track_ids
    ]
    summary = {
        "output_dir": str(output_dir.resolve()),
        "requested": len(args.track_ids),
        "downloaded": sum(1 for x in results if x["status"] == "downloaded"),
        "failed": sum(1 for x in results if x["status"] != "downloaded"),
        "results": results,
    }
    text = json.dumps(summary, indent=2, ensure_ascii=False)
    if args.report:
        report_path = Path(args.report)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False can leave non-ASCII characters in the text, so
        # the file encoding is pinned instead of relying on the locale default.
        report_path.write_text(text, encoding="utf-8")
    print(text)
| 76 | |||
| 77 | if __name__ == "__main__": | ||
| 78 | main() |
| ... | @@ -223,6 +223,27 @@ | ... | @@ -223,6 +223,27 @@ |
| 223 | 223 | ||
| 224 | 224 | ||
| 225 | 225 | ||
| 226 | |||
| 227 | ### Stage: FMA 真实下载脚手架 | ||
| 228 | |||
| 229 | 完成项: | ||
| 230 | - 新增 [acr-engine/scripts/fetch_fma_subset.py](../acr-engine/scripts/fetch_fma_subset.py) | ||
| 231 | - 先验证了旧版 FMA 文件直链抓取路径 | ||
| 232 | - 再切换为页面级抓取脚手架,并显式输出阻塞原因 | ||
| 233 | - 将当前真实 FMA 下载状态记录进: | ||
| 234 | - [docs/open-dataset-workflow.md](./open-dataset-workflow.md) | ||
| 235 | - [docs/session-handoff.md](./session-handoff.md) | ||
| 236 | |||
| 237 | 验证结果: | ||
| 238 | - `/usr/local/miniconda3/bin/python scripts/fetch_fma_subset.py --report reports/fma_fetch_subset_report.json` 已执行两轮验证 | ||
| 239 | - 第一轮结果:8 个 track id 全部 `HTTP 403` | ||
| 240 | - 第二轮结果:`yt-dlp not found`,脚本返回结构化 `blocked` JSON | ||
| 241 | |||
| 242 | 结论: | ||
| 243 | - 真实 FMA 下载自动化入口已具备 | ||
| 244 | - 但当前环境下仍缺稳定可用下载通道,尚不能宣称真实 FMA 已成功落地 | ||
| 245 | - 该阻塞已经被显式固化到交接文档中,避免新 session 重复踩坑 | ||
| 246 | |||
| 226 | ### Stage: 原始开放数据 LFS 治理 | 247 | ### Stage: 原始开放数据 LFS 治理 |
| 227 | 248 | ||
| 228 | 完成项: | 249 | 完成项: | ... | ... |
| ... | @@ -276,6 +276,7 @@ | ... | @@ -276,6 +276,7 @@ |
| 276 | - [docs/session-handoff.md](./session-handoff.md) | 276 | - [docs/session-handoff.md](./session-handoff.md) |
| 277 | - [docs/current-capability-map.md](./current-capability-map.md) | 277 | - [docs/current-capability-map.md](./current-capability-map.md) |
| 278 | - [acr-engine/FIRST_RUN_CHECKLIST.md](../acr-engine/FIRST_RUN_CHECKLIST.md) | 278 | - [acr-engine/FIRST_RUN_CHECKLIST.md](../acr-engine/FIRST_RUN_CHECKLIST.md) |
| 279 | - FMA 真实子集下载脚手架已存在:[acr-engine/scripts/fetch_fma_subset.py](../acr-engine/scripts/fetch_fma_subset.py);最近验证结果是旧直链 `403`、当前环境缺 `yt-dlp` | ||
| 279 | - 运行 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py) | 280 | - 运行 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py) |
| 280 | - 或直接查看最新落盘快照:`acr-engine/.omx/latest_status_snapshot.json` | 281 | - 或直接查看最新落盘快照:`acr-engine/.omx/latest_status_snapshot.json` |
| 281 | 282 | ... | ... |
-
Please register or sign in to post a comment