Separate local tooling issues from upstream FMA URL breakage
Constraint: Real-data progress requires proving whether failures come from our environment or from changed upstream access paths
Rejected: Keep treating the fetch blocker as a missing-tool problem | Would misdirect future debugging after yt-dlp module support was verified
Confidence: high
Scope-risk: narrow
Directive: Do not retry historical FMA page URLs again unless a fresh source confirms their return; pivot to official archives or stable mirrors instead
Tested: which yt-dlp || true; /usr/local/miniconda3/bin/python -m yt_dlp --version; /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/fetch_fma_subset.py; /usr/local/miniconda3/bin/python acr-engine/scripts/fetch_fma_subset.py --report acr-engine/reports/fma_fetch_subset_report.json
Not-tested: Successful real FMA download still pending a valid upstream archive or mirror URL
Showing 4 changed files with 170 additions and 88 deletions
| ... | @@ -2,105 +2,156 @@ | ... | @@ -2,105 +2,156 @@ |
| 2 | "output_dir": "/workspace/acr-engine/data/raw/fma_small_audio", | 2 | "output_dir": "/workspace/acr-engine/data/raw/fma_small_audio", |
| 3 | "requested": 8, | 3 | "requested": 8, |
| 4 | "downloaded": 0, | 4 | "downloaded": 0, |
| 5 | "existing": 0, | 5 | "failed": 8, |
| 6 | "failures": [ | 6 | "ytdlp_cmd": [ |
| 7 | { | 7 | "/usr/local/miniconda3/bin/python", |
| 8 | "track_id": 2, | 8 | "-m", |
| 9 | "status": "http_error", | 9 | "yt_dlp" |
| 10 | "code": 403, | ||
| 11 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000002.mp3" | ||
| 12 | }, | ||
| 13 | { | ||
| 14 | "track_id": 5, | ||
| 15 | "status": "http_error", | ||
| 16 | "code": 403, | ||
| 17 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000005.mp3" | ||
| 18 | }, | ||
| 19 | { | ||
| 20 | "track_id": 10, | ||
| 21 | "status": "http_error", | ||
| 22 | "code": 403, | ||
| 23 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000010.mp3" | ||
| 24 | }, | ||
| 25 | { | ||
| 26 | "track_id": 20, | ||
| 27 | "status": "http_error", | ||
| 28 | "code": 403, | ||
| 29 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000020.mp3" | ||
| 30 | }, | ||
| 31 | { | ||
| 32 | "track_id": 26, | ||
| 33 | "status": "http_error", | ||
| 34 | "code": 403, | ||
| 35 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000026.mp3" | ||
| 36 | }, | ||
| 37 | { | ||
| 38 | "track_id": 30, | ||
| 39 | "status": "http_error", | ||
| 40 | "code": 403, | ||
| 41 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000030.mp3" | ||
| 42 | }, | ||
| 43 | { | ||
| 44 | "track_id": 46, | ||
| 45 | "status": "http_error", | ||
| 46 | "code": 403, | ||
| 47 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000046.mp3" | ||
| 48 | }, | ||
| 49 | { | ||
| 50 | "track_id": 48, | ||
| 51 | "status": "http_error", | ||
| 52 | "code": 403, | ||
| 53 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000048.mp3" | ||
| 54 | } | ||
| 55 | ], | 10 | ], |
| 56 | "results": [ | 11 | "results": [ |
| 57 | { | 12 | { |
| 58 | "track_id": 2, | 13 | "track_id": 2, |
| 59 | "status": "http_error", | 14 | "url": "https://freemusicarchive.org/music/track/2", |
| 60 | "code": 403, | 15 | "status": "failed", |
| 61 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000002.mp3" | 16 | "returncode": 1, |
| 17 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/2\n[generic] 2: Downloading webpage\n", | ||
| 18 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 19 | "command": [ | ||
| 20 | "/usr/local/miniconda3/bin/python", | ||
| 21 | "-m", | ||
| 22 | "yt_dlp", | ||
| 23 | "--no-playlist", | ||
| 24 | "-o", | ||
| 25 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 26 | "--no-overwrites", | ||
| 27 | "https://freemusicarchive.org/music/track/2" | ||
| 28 | ] | ||
| 62 | }, | 29 | }, |
| 63 | { | 30 | { |
| 64 | "track_id": 5, | 31 | "track_id": 5, |
| 65 | "status": "http_error", | 32 | "url": "https://freemusicarchive.org/music/track/5", |
| 66 | "code": 403, | 33 | "status": "failed", |
| 67 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000005.mp3" | 34 | "returncode": 1, |
| 35 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/5\n[generic] 5: Downloading webpage\n", | ||
| 36 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 37 | "command": [ | ||
| 38 | "/usr/local/miniconda3/bin/python", | ||
| 39 | "-m", | ||
| 40 | "yt_dlp", | ||
| 41 | "--no-playlist", | ||
| 42 | "-o", | ||
| 43 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 44 | "--no-overwrites", | ||
| 45 | "https://freemusicarchive.org/music/track/5" | ||
| 46 | ] | ||
| 68 | }, | 47 | }, |
| 69 | { | 48 | { |
| 70 | "track_id": 10, | 49 | "track_id": 10, |
| 71 | "status": "http_error", | 50 | "url": "https://freemusicarchive.org/music/track/10", |
| 72 | "code": 403, | 51 | "status": "failed", |
| 73 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000010.mp3" | 52 | "returncode": 1, |
| 53 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/10\n[generic] 10: Downloading webpage\n", | ||
| 54 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 55 | "command": [ | ||
| 56 | "/usr/local/miniconda3/bin/python", | ||
| 57 | "-m", | ||
| 58 | "yt_dlp", | ||
| 59 | "--no-playlist", | ||
| 60 | "-o", | ||
| 61 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 62 | "--no-overwrites", | ||
| 63 | "https://freemusicarchive.org/music/track/10" | ||
| 64 | ] | ||
| 74 | }, | 65 | }, |
| 75 | { | 66 | { |
| 76 | "track_id": 20, | 67 | "track_id": 20, |
| 77 | "status": "http_error", | 68 | "url": "https://freemusicarchive.org/music/track/20", |
| 78 | "code": 403, | 69 | "status": "failed", |
| 79 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000020.mp3" | 70 | "returncode": 1, |
| 71 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/20\n[generic] 20: Downloading webpage\n", | ||
| 72 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 73 | "command": [ | ||
| 74 | "/usr/local/miniconda3/bin/python", | ||
| 75 | "-m", | ||
| 76 | "yt_dlp", | ||
| 77 | "--no-playlist", | ||
| 78 | "-o", | ||
| 79 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 80 | "--no-overwrites", | ||
| 81 | "https://freemusicarchive.org/music/track/20" | ||
| 82 | ] | ||
| 80 | }, | 83 | }, |
| 81 | { | 84 | { |
| 82 | "track_id": 26, | 85 | "track_id": 26, |
| 83 | "status": "http_error", | 86 | "url": "https://freemusicarchive.org/music/track/26", |
| 84 | "code": 403, | 87 | "status": "failed", |
| 85 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000026.mp3" | 88 | "returncode": 1, |
| 89 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/26\n[generic] 26: Downloading webpage\n", | ||
| 90 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 91 | "command": [ | ||
| 92 | "/usr/local/miniconda3/bin/python", | ||
| 93 | "-m", | ||
| 94 | "yt_dlp", | ||
| 95 | "--no-playlist", | ||
| 96 | "-o", | ||
| 97 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 98 | "--no-overwrites", | ||
| 99 | "https://freemusicarchive.org/music/track/26" | ||
| 100 | ] | ||
| 86 | }, | 101 | }, |
| 87 | { | 102 | { |
| 88 | "track_id": 30, | 103 | "track_id": 30, |
| 89 | "status": "http_error", | 104 | "url": "https://freemusicarchive.org/music/track/30", |
| 90 | "code": 403, | 105 | "status": "failed", |
| 91 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000030.mp3" | 106 | "returncode": 1, |
| 107 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/30\n[generic] 30: Downloading webpage\n", | ||
| 108 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 109 | "command": [ | ||
| 110 | "/usr/local/miniconda3/bin/python", | ||
| 111 | "-m", | ||
| 112 | "yt_dlp", | ||
| 113 | "--no-playlist", | ||
| 114 | "-o", | ||
| 115 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 116 | "--no-overwrites", | ||
| 117 | "https://freemusicarchive.org/music/track/30" | ||
| 118 | ] | ||
| 92 | }, | 119 | }, |
| 93 | { | 120 | { |
| 94 | "track_id": 46, | 121 | "track_id": 46, |
| 95 | "status": "http_error", | 122 | "url": "https://freemusicarchive.org/music/track/46", |
| 96 | "code": 403, | 123 | "status": "failed", |
| 97 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000046.mp3" | 124 | "returncode": 1, |
| 125 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/46\n[generic] 46: Downloading webpage\n", | ||
| 126 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 127 | "command": [ | ||
| 128 | "/usr/local/miniconda3/bin/python", | ||
| 129 | "-m", | ||
| 130 | "yt_dlp", | ||
| 131 | "--no-playlist", | ||
| 132 | "-o", | ||
| 133 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 134 | "--no-overwrites", | ||
| 135 | "https://freemusicarchive.org/music/track/46" | ||
| 136 | ] | ||
| 98 | }, | 137 | }, |
| 99 | { | 138 | { |
| 100 | "track_id": 48, | 139 | "track_id": 48, |
| 101 | "status": "http_error", | 140 | "url": "https://freemusicarchive.org/music/track/48", |
| 102 | "code": 403, | 141 | "status": "failed", |
| 103 | "url": "https://files.freemusicarchive.org/storage-freemusicarchive-org/music/000/000048.mp3" | 142 | "returncode": 1, |
| 143 | "stdout": "[generic] Extracting URL: https://freemusicarchive.org/music/track/48\n[generic] 48: Downloading webpage\n", | ||
| 144 | "stderr": "ERROR: [generic] Unable to download webpage: HTTP Error 404: Not Found (caused by <HTTPError 404: Not Found>)\n", | ||
| 145 | "command": [ | ||
| 146 | "/usr/local/miniconda3/bin/python", | ||
| 147 | "-m", | ||
| 148 | "yt_dlp", | ||
| 149 | "--no-playlist", | ||
| 150 | "-o", | ||
| 151 | "data/raw/fma_small_audio/%(id)s.%(ext)s", | ||
| 152 | "--no-overwrites", | ||
| 153 | "https://freemusicarchive.org/music/track/48" | ||
| 154 | ] | ||
| 104 | } | 155 | } |
| 105 | ] | 156 | ] |
| 106 | } | 157 | } |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -11,24 +11,29 @@ from pathlib import Path | ... | @@ -11,24 +11,29 @@ from pathlib import Path |
| 11 | 11 | ||
| 12 | DEFAULT_TRACK_IDS = [2, 5, 10, 20, 26, 30, 46, 48] | 12 | DEFAULT_TRACK_IDS = [2, 5, 10, 20, 26, 30, 46, 48] |
| 13 | FMA_TRACK_URL = "https://freemusicarchive.org/music/track/{track_id}" | 13 | FMA_TRACK_URL = "https://freemusicarchive.org/music/track/{track_id}" |
| 14 | PYTHON = "/usr/local/miniconda3/bin/python" | ||
| 14 | 15 | ||
| 15 | 16 | ||
| 16 | def ensure_ytdlp() -> str: | 17 | def ensure_ytdlp_cmd() -> list[str]: |
| 17 | path = shutil.which("yt-dlp") | 18 | shell_entry = shutil.which("yt-dlp") |
| 18 | if not path: | 19 | if shell_entry: |
| 19 | raise SystemExit(json.dumps({ | 20 | return [shell_entry] |
| 20 | "status": "blocked", | 21 | probe = subprocess.run([PYTHON, "-m", "yt_dlp", "--version"], text=True, capture_output=True) |
| 21 | "reason": "yt_dlp_missing", | 22 | if probe.returncode == 0: |
| 22 | "recommendation": "Install yt-dlp or provide local FMA audio manually into data/raw/fma_small_audio", | 23 | return [PYTHON, "-m", "yt_dlp"] |
| 23 | }, indent=2, ensure_ascii=False)) | 24 | raise SystemExit(json.dumps({ |
| 24 | return path | 25 | "status": "blocked", |
| 26 | "reason": "yt_dlp_missing", | ||
| 27 | "recommendation": "Install yt-dlp or provide local FMA audio manually into data/raw/fma_small_audio", | ||
| 28 | "stderr": probe.stderr[-1200:], | ||
| 29 | }, indent=2, ensure_ascii=False)) | ||
| 25 | 30 | ||
| 26 | 31 | ||
| 27 | def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = False) -> dict: | 32 | def fetch_one(track_id: int, output_dir: Path, ytdlp_cmd: list[str], overwrite: bool = False) -> dict: |
| 28 | outtmpl = str(output_dir / "%(id)s.%(ext)s") | 33 | outtmpl = str(output_dir / "%(id)s.%(ext)s") |
| 29 | url = FMA_TRACK_URL.format(track_id=track_id) | 34 | url = FMA_TRACK_URL.format(track_id=track_id) |
| 30 | cmd = [ | 35 | cmd = [ |
| 31 | ytdlp, | 36 | *ytdlp_cmd, |
| 32 | "--no-playlist", | 37 | "--no-playlist", |
| 33 | "-o", outtmpl, | 38 | "-o", outtmpl, |
| 34 | ] | 39 | ] |
| ... | @@ -43,6 +48,7 @@ def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = Fal | ... | @@ -43,6 +48,7 @@ def fetch_one(track_id: int, output_dir: Path, ytdlp: str, overwrite: bool = Fal |
| 43 | "returncode": proc.returncode, | 48 | "returncode": proc.returncode, |
| 44 | "stdout": proc.stdout[-1200:], | 49 | "stdout": proc.stdout[-1200:], |
| 45 | "stderr": proc.stderr[-1200:], | 50 | "stderr": proc.stderr[-1200:], |
| 51 | "command": cmd, | ||
| 46 | } | 52 | } |
| 47 | 53 | ||
| 48 | 54 | ||
| ... | @@ -56,14 +62,15 @@ def main(): | ... | @@ -56,14 +62,15 @@ def main(): |
| 56 | 62 | ||
| 57 | output_dir = Path(args.output_dir) | 63 | output_dir = Path(args.output_dir) |
| 58 | output_dir.mkdir(parents=True, exist_ok=True) | 64 | output_dir.mkdir(parents=True, exist_ok=True) |
| 59 | ytdlp = ensure_ytdlp() | 65 | ytdlp_cmd = ensure_ytdlp_cmd() |
| 60 | 66 | ||
| 61 | results = [fetch_one(track_id, output_dir, ytdlp, overwrite=args.overwrite) for track_id in args.track_ids] | 67 | results = [fetch_one(track_id, output_dir, ytdlp_cmd, overwrite=args.overwrite) for track_id in args.track_ids] |
| 62 | summary = { | 68 | summary = { |
| 63 | "output_dir": str(output_dir.resolve()), | 69 | "output_dir": str(output_dir.resolve()), |
| 64 | "requested": len(args.track_ids), | 70 | "requested": len(args.track_ids), |
| 65 | "downloaded": sum(1 for x in results if x["status"] == "downloaded"), | 71 | "downloaded": sum(1 for x in results if x["status"] == "downloaded"), |
| 66 | "failed": sum(1 for x in results if x["status"] != "downloaded"), | 72 | "failed": sum(1 for x in results if x["status"] != "downloaded"), |
| 73 | "ytdlp_cmd": ytdlp_cmd, | ||
| 67 | "results": results, | 74 | "results": results, |
| 68 | } | 75 | } |
| 69 | text = json.dumps(summary, indent=2, ensure_ascii=False) | 76 | text = json.dumps(summary, indent=2, ensure_ascii=False) | ... | ... |
| ... | @@ -224,6 +224,30 @@ | ... | @@ -224,6 +224,30 @@ |
| 224 | 224 | ||
| 225 | 225 | ||
| 226 | 226 | ||
| 227 | |||
| 228 | ### Stage: FMA 下载器模块调用修复 | ||
| 229 | |||
| 230 | 完成项: | ||
| 231 | - 修复 [acr-engine/scripts/fetch_fma_subset.py](../acr-engine/scripts/fetch_fma_subset.py) 的 `yt-dlp` 检测方式 | ||
| 232 | - 从 shell 可执行查找改为优先支持: | ||
| 233 | - `yt-dlp` 可执行文件 | ||
| 234 | - `/usr/local/miniconda3/bin/python -m yt_dlp` 模块调用 | ||
| 235 | - 重新执行真实 FMA bounded 下载验证 | ||
| 236 | |||
| 237 | 验证结果: | ||
| 238 | - `which yt-dlp` 仍为空 | ||
| 239 | - `/usr/local/miniconda3/bin/python -m yt_dlp --version` 成功,版本 `2026.03.17` | ||
| 240 | - `/usr/local/miniconda3/bin/python scripts/fetch_fma_subset.py --report reports/fma_fetch_subset_report.json` 成功执行 | ||
| 241 | - 当前结果: | ||
| 242 | - `ytdlp_cmd=["/usr/local/miniconda3/bin/python", "-m", "yt_dlp"]` | ||
| 243 | - 8/8 请求均失败 | ||
| 244 | - 失败原因已从“工具缺失”收敛为 `freemusicarchive.org/music/track/<id>` 返回 `HTTP 404` | ||
| 245 | |||
| 246 | 结论: | ||
| 247 | - 下载脚本的模块调用问题已经修复 | ||
| 248 | - 当前真实阻塞不再是本地环境,而是 FMA 历史页面 URL 路径已不可用 | ||
| 249 | - 下一步应转向官方整包或稳定镜像,而不是继续重试旧页面 URL | ||
| 250 | |||
| 227 | ### Stage: FMA 真实下载脚手架 | 251 | ### Stage: FMA 真实下载脚手架 |
| 228 | 252 | ||
| 229 | 完成项: | 253 | 完成项: | ... | ... |
| ... | @@ -276,7 +276,7 @@ | ... | @@ -276,7 +276,7 @@ |
| 276 | - [docs/session-handoff.md](./session-handoff.md) | 276 | - [docs/session-handoff.md](./session-handoff.md) |
| 277 | - [docs/current-capability-map.md](./current-capability-map.md) | 277 | - [docs/current-capability-map.md](./current-capability-map.md) |
| 278 | - [acr-engine/FIRST_RUN_CHECKLIST.md](../acr-engine/FIRST_RUN_CHECKLIST.md) | 278 | - [acr-engine/FIRST_RUN_CHECKLIST.md](../acr-engine/FIRST_RUN_CHECKLIST.md) |
| 279 | - FMA 真实子集下载脚手架已存在:[acr-engine/scripts/fetch_fma_subset.py](../acr-engine/scripts/fetch_fma_subset.py);最近验证结果是旧直链 `403`、当前环境缺 `yt-dlp` | 279 | - FMA 真实子集下载脚手架已存在:[acr-engine/scripts/fetch_fma_subset.py](../acr-engine/scripts/fetch_fma_subset.py);最近验证结果是旧直链 `403`、随后已修复为 `python -m yt_dlp` 调用,但页面级历史 URL 又返回 `404` |
| 280 | - 运行 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py) | 280 | - 运行 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py) |
| 281 | - 或直接查看最新落盘快照:`acr-engine/.omx/latest_status_snapshot.json` | 281 | - 或直接查看最新落盘快照:`acr-engine/.omx/latest_status_snapshot.json` |
| 282 | 282 | ... | ... |
-
Please register or sign in to post a comment