Commit 2b389caa 2b389caacb950c102349943297874408545d4f6f by cnb.bofCdSsphPA

Make long-running real FMA ingestion resumable across sessions

Constraint: The verified FMA archive is multi-gigabyte and downloads slowly, so the workflow must remain inspectable and resumable before extraction can happen
Rejected: Depend on ad hoc curl and unzip commands only | Makes long-running handoff and recovery brittle during Ralph-style continuous execution
Confidence: high
Scope-risk: narrow
Directive: Keep official FMA archive acquisition centered on prepare_fma_archive.py so future sessions share one resumable control surface
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/prepare_fma_archive.py; /usr/local/miniconda3/bin/python acr-engine/scripts/prepare_fma_archive.py inspect; unzip -v | head -n 2
Not-tested: Archive extraction and real-data smoke remain pending completion of the full fma_small.zip download
1 parent 7c54eb28
......@@ -32,6 +32,27 @@ flowchart LR
/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2
```
## Official FMA archive workflow
### Commands
```bash
/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect
/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py download
/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py extract
```
### What this script standardizes
- official source URL: `https://os.unil.cloud.switch.ch/fma/fma_small.zip`
- resumable archive download to `data/raw/fma_small.zip`
- extraction target: `data/raw/fma_small_audio/`
### Current note
If `archive_size` is growing but `num_audio_files=0`, the archive is still downloading and extraction has not happened yet.
## Git LFS policy
Large raw archives and audio under `data/raw/` are tracked through Git LFS via [/.gitattributes](../../.gitattributes).
......
#!/usr/bin/env python3
"""Manage download/extract workflow for the official FMA Small archive."""
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
# URL the FMA Small archive is downloaded from.
FMA_SMALL_URL = "https://os.unil.cloud.switch.ch/fma/fma_small.zip"
# Local file the archive is written to. The path is relative, so it resolves
# against the current working directory of the process.
# NOTE(review): presumably the script is meant to be run from the directory
# that contains data/ -- confirm, since another cwd yields a different location.
ARCHIVE_PATH = Path("data/raw/fma_small.zip")
# Directory the archive is unpacked into (also relative to the working directory).
EXTRACT_DIR = Path("data/raw/fma_small_audio")
def run(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run an external command, capturing stdout and stderr as text.

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable to launch.

    Returns:
        The ``CompletedProcess`` for the finished command. If the executable
        cannot be found, a synthetic ``CompletedProcess`` with return code 127
        (the shell convention for "command not found"), empty stdout and the
        OS error text as stderr is returned, so callers can still emit their
        structured report instead of dying with a traceback.
    """
    try:
        return subprocess.run(cmd, text=True, capture_output=True)
    except FileNotFoundError as exc:
        return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
def download(resume: bool = True) -> dict:
    """Download the FMA Small archive with curl and report the outcome.

    Args:
        resume: When true, pass ``--continue-at -`` so curl continues from the
            size of an existing partial file instead of starting over.

    Returns:
        A JSON-serializable dict holding the executed command, curl's return
        code, the resolved archive path, whether the archive exists, its size
        in bytes, and the last 1200 characters of curl's stdout and stderr.
    """
    ARCHIVE_PATH.parent.mkdir(parents=True, exist_ok=True)
    # --fail makes curl exit non-zero on HTTP errors (status >= 400) instead of
    # saving the server's error page into the archive file and reporting success.
    cmd = ["curl", "-L", "--fail"]
    if resume:
        cmd += ["--continue-at", "-"]
    cmd += ["--output", str(ARCHIVE_PATH), FMA_SMALL_URL]
    proc = run(cmd)
    # Sample existence once so the two fields below cannot disagree.
    archive_exists = ARCHIVE_PATH.exists()
    return {
        "action": "download",
        "command": cmd,
        "returncode": proc.returncode,
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": archive_exists,
        "archive_size": ARCHIVE_PATH.stat().st_size if archive_exists else 0,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
def inspect() -> dict:
    """Report the on-disk state of the archive and the extraction directory.

    Returns:
        A JSON-serializable dict with the archive URL, the resolved archive
        path, whether the archive exists and its size in bytes, the resolved
        extraction directory, whether it exists, and how many entries beneath
        it carry an audio suffix (.mp3, .wav, .flac, .ogg; case-insensitive).
    """
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    has_archive = ARCHIVE_PATH.exists()
    has_extract_dir = EXTRACT_DIR.exists()

    # Only walk the tree when the directory exists; otherwise the count is 0.
    audio_count = 0
    if has_extract_dir:
        audio_count = sum(
            1
            for entry in EXTRACT_DIR.rglob('*')
            if entry.suffix.lower() in audio_suffixes
        )

    report = {
        "action": "inspect",
        "archive_url": FMA_SMALL_URL,
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": has_archive,
        "archive_size": ARCHIVE_PATH.stat().st_size if has_archive else 0,
        "extract_dir": str(EXTRACT_DIR.resolve()),
        "extract_exists": has_extract_dir,
        "num_audio_files": audio_count,
    }
    return report
def extract(overwrite: bool = False) -> dict:
    """Unpack the downloaded archive into the extraction directory with unzip.

    Args:
        overwrite: When true, call unzip with ``-o`` (overwrite existing
            files); otherwise with ``-n`` (never overwrite existing files).

    Returns:
        A JSON-serializable dict with the executed command, unzip's return
        code, the resolved extraction directory, the number of entries with an
        audio suffix found beneath it, and the last 1200 characters of unzip's
        stdout and stderr.

    Raises:
        SystemExit: If the archive file does not exist; the exit payload is a
            JSON document describing the blocked state.
    """
    if not ARCHIVE_PATH.exists():
        blocked = {
            "status": "blocked",
            "reason": "archive_missing",
            "archive_path": str(ARCHIVE_PATH.resolve()),
            "recommendation": f"Run download first from {FMA_SMALL_URL}",
        }
        raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))

    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    conflict_flag = "-o" if overwrite else "-n"
    cmd = ["unzip", conflict_flag, str(ARCHIVE_PATH), "-d", str(EXTRACT_DIR)]
    proc = run(cmd)

    # Count after unzip has run, whatever its return code, so the report
    # reflects what is actually on disk.
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    audio_count = sum(
        1
        for entry in EXTRACT_DIR.rglob('*')
        if entry.suffix.lower() in audio_suffixes
    )
    return {
        "action": "extract",
        "command": cmd,
        "returncode": proc.returncode,
        "extract_dir": str(EXTRACT_DIR.resolve()),
        "num_audio_files": audio_count,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the chosen subcommand and print its report.

    Subcommands:
        download  Fetch the archive (``--no-resume`` disables resuming).
        inspect   Report the current archive/extraction state.
        extract   Unpack the archive (``--overwrite`` replaces existing files).

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        0 when the subcommand succeeded, 1 when the external tool it ran
        reported a non-zero return code. Returning a status lets shells and
        automation detect a failed download or extraction, which previously
        always exited 0.
    """
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)
    p = sub.add_parser("download")
    p.add_argument("--no-resume", action="store_true")
    sub.add_parser("inspect")
    p = sub.add_parser("extract")
    p.add_argument("--overwrite", action="store_true")
    args = parser.parse_args(argv)

    if args.cmd == "download":
        result = download(resume=not args.no_resume)
    elif args.cmd == "inspect":
        result = inspect()
    elif args.cmd == "extract":
        result = extract(overwrite=args.overwrite)
    else:
        # Defensive: argparse already rejects unknown subcommands.
        raise SystemExit(2)

    print(json.dumps(result, indent=2, ensure_ascii=False))
    # The inspect report carries no "returncode"; treat that as success.
    return 0 if result.get("returncode", 0) == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())
......@@ -226,6 +226,32 @@
### Stage: FMA 整包下载/解压脚手架
完成项:
- 新增 [acr-engine/scripts/prepare_fma_archive.py](../acr-engine/scripts/prepare_fma_archive.py)
- 支持:
- `inspect`
- `download`(支持 resume)
- `extract`
- 将 FMA 官方整包路径接入:
- [acr-engine/data/raw/README.md](../acr-engine/data/raw/README.md)
- [docs/open-dataset-workflow.md](./open-dataset-workflow.md)
验证结果:
- `/usr/local/miniconda3/bin/python -m py_compile scripts/prepare_fma_archive.py` 成功
- `/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect` 成功
- 当前结果:
- `archive_exists=true`
- `archive_size=9404416`
- `extract_exists=true`
- `num_audio_files=0`
结论:
- 真实 FMA 整包路径现在不仅在下载,而且已经有标准化的 inspect/download/extract 工具链
- 即使下载耗时较长,后续 session 也能直接续传与接力
### Stage: FMA 官方整包下载路径确认
完成项:
......