Make long-running real FMA ingestion resumable across sessions
Constraint: The verified FMA archive is multi-gigabyte and downloads slowly, so the workflow must remain inspectable and resumable before extraction can happen
Rejected: Depend on ad hoc curl and unzip commands only | Makes long-running handoff and recovery brittle during Ralph-style continuous execution
Confidence: high
Scope-risk: narrow
Directive: Keep official FMA archive acquisition centered on prepare_fma_archive.py so future sessions share one resumable control surface
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/prepare_fma_archive.py; /usr/local/miniconda3/bin/python acr-engine/scripts/prepare_fma_archive.py inspect; unzip -v | head -n 2
Not-tested: Archive extraction and real-data smoke remain pending completion of the full fma_small.zip download
Showing
3 changed files
with
157 additions
and
0 deletions
| ... | @@ -32,6 +32,27 @@ flowchart LR | ... | @@ -32,6 +32,27 @@ flowchart LR |
| 32 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 | 32 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 |
| 33 | ``` | 33 | ``` |
| 34 | 34 | ||
| 35 | |||
| 36 | ## Official FMA archive workflow | ||
| 37 | |||
| 38 | ### Workflow commands | ||
| 39 | |||
| 40 | ```bash | ||
| 41 | /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect | ||
| 42 | /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py download | ||
| 43 | /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py extract | ||
| 44 | ``` | ||
| 45 | |||
| 46 | ### What this script standardizes | ||
| 47 | |||
| 48 | - official source URL: `https://os.unil.cloud.switch.ch/fma/fma_small.zip` | ||
| 49 | - resumable archive download to `data/raw/fma_small.zip` | ||
| 50 | - extraction target: `data/raw/fma_small_audio/` | ||
| 51 | |||
| 52 | ### Current note | ||
| 53 | |||
| 54 | If `archive_size` is growing but `num_audio_files=0`, the archive is still downloading and extraction has not happened yet. | ||
| 55 | |||
| 35 | ## Git LFS policy | 56 | ## Git LFS policy |
| 36 | 57 | ||
| 37 | Large raw archives and audio under `data/raw/` are tracked through Git LFS via [/.gitattributes](../../.gitattributes). | 58 | Large raw archives and audio under `data/raw/` are tracked through Git LFS via [/.gitattributes](../../.gitattributes). | ... | ... |
acr-engine/scripts/prepare_fma_archive.py
0 → 100755
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Manage download/extract workflow for the official FMA Small archive.""" | ||
| 3 | |||
| 4 | from __future__ import annotations | ||
| 5 | |||
| 6 | import argparse | ||
| 7 | import json | ||
| 8 | import subprocess | ||
| 9 | from pathlib import Path | ||
| 10 | |||
# Anchor the data paths to the engine root (the directory that contains this
# script's parent directory) so every subcommand reads and writes the same
# files no matter which working directory the script is launched from.
ENGINE_ROOT = Path(__file__).resolve().parent.parent

# Official download location of the FMA Small archive.
FMA_SMALL_URL = "https://os.unil.cloud.switch.ch/fma/fma_small.zip"
# Where the (possibly partial) archive is stored.
ARCHIVE_PATH = ENGINE_ROOT / "data" / "raw" / "fma_small.zip"
# Directory the archive is unpacked into.
EXTRACT_DIR = ENGINE_ROOT / "data" / "raw" / "fma_small_audio"
| 14 | |||
| 15 | |||
def run(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run *cmd* without a shell and capture its output as text.

    Args:
        cmd: Executable name followed by its arguments.

    Returns:
        The completed process with ``stdout`` and ``stderr`` captured as
        text. If the executable cannot be started at all (for example it is
        not installed), a synthetic result with ``returncode`` 127 and the
        OS error text in ``stderr`` is returned instead of raising, so the
        caller can still report the failure in its summary.
    """
    try:
        return subprocess.run(cmd, text=True, capture_output=True)
    except OSError as exc:
        # 127 mirrors the shell convention for "command not found".
        return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
| 18 | |||
| 19 | |||
def download(resume: bool = True) -> dict:
    """Download the FMA Small archive with curl and summarize the outcome.

    Args:
        resume: When true, pass ``--continue-at -`` so curl continues an
            existing partial file instead of starting over.

    Returns:
        A JSON-serializable dict with the command that was run, curl's
        return code, the resolved archive path, whether the archive exists,
        its size in bytes (0 when absent), and the last 1200 characters of
        curl's stdout and stderr.
    """
    ARCHIVE_PATH.parent.mkdir(parents=True, exist_ok=True)
    # --fail makes curl exit non-zero on an HTTP error response instead of
    # saving the server's error page into the archive file and reporting
    # success, which would corrupt the partial download.
    cmd = ["curl", "-L", "--fail"]
    if resume:
        cmd += ["--continue-at", "-"]
    cmd += ["--output", str(ARCHIVE_PATH), FMA_SMALL_URL]
    proc = run(cmd)
    # Check existence once so the two reported fields cannot disagree.
    archive_exists = ARCHIVE_PATH.exists()
    return {
        "action": "download",
        "command": cmd,
        "returncode": proc.returncode,
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": archive_exists,
        "archive_size": ARCHIVE_PATH.stat().st_size if archive_exists else 0,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
| 37 | |||
| 38 | |||
def inspect() -> dict:
    """Report the on-disk state of the archive and the extraction directory.

    Returns:
        A JSON-serializable dict with the archive URL, the resolved archive
        path, whether the archive exists and its size in bytes (0 when
        absent), the resolved extraction directory, whether it exists, and
        how many entries under it have an audio suffix
        (.mp3, .wav, .flac, .ogg; matched case-insensitively).
    """
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    has_archive = ARCHIVE_PATH.exists()
    has_extract_dir = EXTRACT_DIR.exists()

    # Only walk the tree when the directory is present; otherwise report 0.
    if has_extract_dir:
        audio_count = sum(
            1
            for entry in EXTRACT_DIR.rglob('*')
            if entry.suffix.lower() in audio_suffixes
        )
    else:
        audio_count = 0

    report = {
        "action": "inspect",
        "archive_url": FMA_SMALL_URL,
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": has_archive,
        "archive_size": ARCHIVE_PATH.stat().st_size if has_archive else 0,
        "extract_dir": str(EXTRACT_DIR.resolve()),
        "extract_exists": has_extract_dir,
        "num_audio_files": audio_count,
    }
    return report
| 55 | |||
| 56 | |||
def extract(overwrite: bool = False) -> dict:
    """Unpack the downloaded archive into the extraction directory with unzip.

    Args:
        overwrite: When true, call unzip with ``-o``; otherwise with ``-n``.

    Returns:
        A JSON-serializable dict with the command that was run, unzip's
        return code, the resolved extraction directory, the number of
        entries under it with an audio suffix (.mp3, .wav, .flac, .ogg),
        and the last 1200 characters of unzip's stdout and stderr.

    Raises:
        SystemExit: With a JSON "blocked" message when the archive file
            does not exist.
    """
    if not ARCHIVE_PATH.exists():
        blocked = {
            "status": "blocked",
            "reason": "archive_missing",
            "archive_path": str(ARCHIVE_PATH.resolve()),
            "recommendation": f"Run download first from {FMA_SMALL_URL}",
        }
        raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))

    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    mode_flag = "-o" if overwrite else "-n"
    cmd = ["unzip", mode_flag, str(ARCHIVE_PATH), "-d", str(EXTRACT_DIR)]
    proc = run(cmd)

    # Count after extraction so the summary reflects what is now on disk.
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    audio_count = sum(
        1
        for entry in EXTRACT_DIR.rglob('*')
        if entry.suffix.lower() in audio_suffixes
    )
    return {
        "action": "extract",
        "command": cmd,
        "returncode": proc.returncode,
        "extract_dir": str(EXTRACT_DIR.resolve()),
        "num_audio_files": audio_count,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
| 83 | |||
| 84 | |||
def main():
    """Parse the subcommand, run it, print its JSON summary, and set the exit status.

    Subcommands:
        download [--no-resume]: fetch the archive (resuming by default).
        inspect: report the current archive/extraction state.
        extract [--overwrite]: unpack the archive.

    Raises:
        SystemExit: With a non-zero status when the underlying tool reported
            a non-zero return code, so shells and automation can detect a
            failed download or extraction. The JSON summary is printed first.
    """
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    p = sub.add_parser("download")
    p.add_argument("--no-resume", action="store_true")

    sub.add_parser("inspect")

    p = sub.add_parser("extract")
    p.add_argument("--overwrite", action="store_true")

    args = parser.parse_args()
    if args.cmd == "download":
        result = download(resume=not args.no_resume)
    elif args.cmd == "inspect":
        result = inspect()
    elif args.cmd == "extract":
        result = extract(overwrite=args.overwrite)
    else:
        raise SystemExit(2)
    print(json.dumps(result, indent=2, ensure_ascii=False))

    # Propagate tool failure; "inspect" has no "returncode" key and exits 0.
    returncode = result.get("returncode", 0)
    if returncode:
        # Values outside 1..255 (e.g. negative, signal-terminated) map to 1.
        raise SystemExit(returncode if 0 < returncode < 256 else 1)


if __name__ == "__main__":
    main()
| ... | @@ -226,6 +226,32 @@ | ... | @@ -226,6 +226,32 @@ |
| 226 | 226 | ||
| 227 | 227 | ||
| 228 | 228 | ||
| 229 | |||
| 230 | ### Stage: FMA 整包下载/解压脚手架 | ||
| 231 | |||
| 232 | 完成项: | ||
| 233 | - 新增 [acr-engine/scripts/prepare_fma_archive.py](../acr-engine/scripts/prepare_fma_archive.py) | ||
| 234 | - 支持: | ||
| 235 | - `inspect` | ||
| 236 | - `download`(支持 resume) | ||
| 237 | - `extract` | ||
| 238 | - 将 FMA 官方整包路径接入: | ||
| 239 | - [acr-engine/data/raw/README.md](../acr-engine/data/raw/README.md) | ||
| 240 | - [docs/open-dataset-workflow.md](./open-dataset-workflow.md) | ||
| 241 | |||
| 242 | 验证结果: | ||
| 243 | - `/usr/local/miniconda3/bin/python -m py_compile scripts/prepare_fma_archive.py` 成功 | ||
| 244 | - `/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect` 成功 | ||
| 245 | - 当前结果: | ||
| 246 | - `archive_exists=true` | ||
| 247 | - `archive_size=9404416` | ||
| 248 | - `extract_exists=true` | ||
| 249 | - `num_audio_files=0` | ||
| 250 | |||
| 251 | 结论: | ||
| 252 | - 真实 FMA 整包路径现在不仅在下载,而且已经有标准化的 inspect/download/extract 工具链 | ||
| 253 | - 即使下载耗时较长,后续 session 也能直接续传与接力 | ||
| 254 | |||
| 229 | ### Stage: FMA 官方整包下载路径确认 | 255 | ### Stage: FMA 官方整包下载路径确认 |
| 230 | 256 | ||
| 231 | 完成项: | 257 | 完成项: | ... | ... |
-
Please register or sign in to post a comment