Commit 2b389caa 2b389caacb950c102349943297874408545d4f6f by cnb.bofCdSsphPA

Make long-running real FMA ingestion resumable across sessions

Constraint: The verified FMA archive is multi-gigabyte and downloads slowly, so the workflow must remain inspectable and resumable before extraction can happen
Rejected: Depend on ad hoc curl and unzip commands only | Makes long-running handoff and recovery brittle during Ralph-style continuous execution
Confidence: high
Scope-risk: narrow
Directive: Keep official FMA archive acquisition centered on prepare_fma_archive.py so future sessions share one resumable control surface
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/prepare_fma_archive.py; /usr/local/miniconda3/bin/python acr-engine/scripts/prepare_fma_archive.py inspect; unzip -v | head -n 2
Not-tested: Archive extraction and real-data smoke remain pending completion of the full fma_small.zip download
1 parent 7c54eb28
...@@ -32,6 +32,27 @@ flowchart LR ...@@ -32,6 +32,27 @@ flowchart LR
32 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 32 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2
33 ``` 33 ```
34 34
35
36 ## Official FMA archive workflow
37
38 ### Workflow commands
39
40 ```bash
41 /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect
42 /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py download
43 /usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py extract
44 ```
45
46 ### What this script standardizes
47
48 - official source URL: `https://os.unil.cloud.switch.ch/fma/fma_small.zip`
49 - resumable archive download to `data/raw/fma_small.zip`
50 - extraction target: `data/raw/fma_small_audio/`
51
52 ### Current note
53
54 If `archive_size` is growing but `num_audio_files=0`, the archive is still downloading and extraction has not happened yet.
55
35 ## Git LFS policy 56 ## Git LFS policy
36 57
37 Large raw archives and audio under `data/raw/` are tracked through Git LFS via [/.gitattributes](../../.gitattributes). 58 Large raw archives and audio under `data/raw/` are tracked through Git LFS via [/.gitattributes](../../.gitattributes).
......
1 #!/usr/bin/env python3
2 """Manage download/extract workflow for the official FMA Small archive."""
3
4 from __future__ import annotations
5
6 import argparse
7 import json
8 import subprocess
9 from pathlib import Path
10
# Source URL the archive is downloaded from.
FMA_SMALL_URL = "https://os.unil.cloud.switch.ch/fma/fma_small.zip"
# Destination of the downloaded zip. This is a relative path, so it is resolved
# against the current working directory at run time, not this script's location.
ARCHIVE_PATH = Path("data/raw/fma_small.zip")
# Directory the archive is unpacked into (also relative to the working directory).
EXTRACT_DIR = Path("data/raw/fma_small_audio")
14
15
def run(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run an external command without a shell and capture its output as text.

    Args:
        cmd: Program name followed by its arguments.

    Returns:
        The completed process with ``stdout`` and ``stderr`` as strings. When
        the program cannot be started at all (for example it is not installed),
        no exception is raised; a synthetic ``CompletedProcess`` is returned
        whose ``returncode`` is 127 if the executable was not found and 126 for
        any other ``OSError``, with an empty ``stdout`` and the error message
        in ``stderr``.
    """
    try:
        # errors="replace" keeps undecodable bytes in tool output (e.g. odd
        # file names printed by unzip) from raising UnicodeDecodeError.
        return subprocess.run(
            cmd,
            text=True,
            capture_output=True,
            errors="replace",
        )
    except OSError as exc:
        # Report launch failures the same way as tool failures so callers can
        # still emit their structured result instead of a traceback.
        code = 127 if isinstance(exc, FileNotFoundError) else 126
        return subprocess.CompletedProcess(
            cmd,
            code,
            stdout="",
            stderr=f"{type(exc).__name__}: {exc}",
        )
18
19
def download(resume: bool = True) -> dict:
    """Download the FMA Small archive with curl and report the outcome.

    Args:
        resume: When true, pass ``--continue-at -`` so curl continues from the
            size of an existing partial file instead of starting over.

    Returns:
        A JSON-serializable dict with the executed command, curl's return
        code, the archive's resolved path, existence and size after the
        attempt, and the last 1200 characters of curl's stdout and stderr.
    """
    ARCHIVE_PATH.parent.mkdir(parents=True, exist_ok=True)
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page as the archive, which a later resume would then
    # treat as the first bytes of the zip.
    cmd = ["curl", "-L", "--fail"]
    if resume:
        cmd += ["--continue-at", "-"]
    cmd += ["--output", str(ARCHIVE_PATH), FMA_SMALL_URL]
    proc = run(cmd)
    archive_exists = ARCHIVE_PATH.exists()
    return {
        "action": "download",
        "command": cmd,
        "returncode": proc.returncode,
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": archive_exists,
        "archive_size": ARCHIVE_PATH.stat().st_size if archive_exists else 0,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
37
38
def inspect() -> dict:
    """Report the on-disk state of the archive and the extraction directory.

    Returns:
        A JSON-serializable dict with the source URL, the resolved archive
        path, whether the archive exists and its size in bytes (0 when
        absent), the resolved extraction directory, whether it exists, and how
        many entries below it have an audio suffix (.mp3, .wav, .flac, .ogg,
        case-insensitive).
    """
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    has_archive = ARCHIVE_PATH.exists()
    has_extract_dir = EXTRACT_DIR.exists()

    # Only walk the tree when the directory is there; otherwise report zero.
    audio_count = 0
    if has_extract_dir:
        for entry in EXTRACT_DIR.rglob('*'):
            if entry.suffix.lower() in audio_suffixes:
                audio_count += 1

    report = {"action": "inspect", "archive_url": FMA_SMALL_URL}
    report["archive_path"] = str(ARCHIVE_PATH.resolve())
    report["archive_exists"] = has_archive
    report["archive_size"] = ARCHIVE_PATH.stat().st_size if has_archive else 0
    report["extract_dir"] = str(EXTRACT_DIR.resolve())
    report["extract_exists"] = has_extract_dir
    report["num_audio_files"] = audio_count
    return report
55
56
def extract(overwrite: bool = False) -> dict:
    """Unpack the downloaded archive into the extraction directory with unzip.

    Args:
        overwrite: When true, unzip is invoked with ``-o``; otherwise with
            ``-n`` so files that already exist are left untouched.

    Returns:
        A JSON-serializable dict with the executed command, unzip's return
        code, the resolved extraction directory, the number of entries below
        it with an audio suffix, and the last 1200 characters of unzip's
        stdout and stderr.

    Raises:
        SystemExit: If the archive file does not exist. The exit message is a
            JSON document describing the blocked state.
    """
    if not ARCHIVE_PATH.exists():
        blocked = {
            "status": "blocked",
            "reason": "archive_missing",
            "archive_path": str(ARCHIVE_PATH.resolve()),
            "recommendation": f"Run download first from {FMA_SMALL_URL}",
        }
        raise SystemExit(json.dumps(blocked, indent=2, ensure_ascii=False))

    EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
    conflict_flag = "-o" if overwrite else "-n"
    cmd = ["unzip", conflict_flag, str(ARCHIVE_PATH), "-d", str(EXTRACT_DIR)]
    proc = run(cmd)

    # Count after the unzip attempt so the figure reflects what is on disk now.
    audio_suffixes = {'.mp3', '.wav', '.flac', '.ogg'}
    audio_count = sum(
        1 for entry in EXTRACT_DIR.rglob('*') if entry.suffix.lower() in audio_suffixes
    )
    return {
        "action": "extract",
        "command": cmd,
        "returncode": proc.returncode,
        "extract_dir": str(EXTRACT_DIR.resolve()),
        "num_audio_files": audio_count,
        "stdout_tail": proc.stdout[-1200:],
        "stderr_tail": proc.stderr[-1200:],
    }
83
84
def main():
    """Parse the subcommand, run it, and print its result as indented JSON.

    Subcommands are ``download`` (``--no-resume`` disables resuming),
    ``inspect`` and ``extract`` (``--overwrite`` replaces existing files).

    Raises:
        SystemExit: With status 1, after the JSON has been printed, when the
            result carries a non-zero ``returncode`` from the external tool,
            so callers that only check the exit status still see the failure.
            With status 2 if no known subcommand was selected.
    """
    parser = argparse.ArgumentParser(
        description="Manage download/extract workflow for the official FMA Small archive.",
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    p = sub.add_parser("download")
    p.add_argument("--no-resume", action="store_true")

    sub.add_parser("inspect")

    p = sub.add_parser("extract")
    p.add_argument("--overwrite", action="store_true")

    args = parser.parse_args()
    if args.cmd == "download":
        result = download(resume=not args.no_resume)
    elif args.cmd == "inspect":
        result = inspect()
    elif args.cmd == "extract":
        result = extract(overwrite=args.overwrite)
    else:
        raise SystemExit(2)
    print(json.dumps(result, indent=2, ensure_ascii=False))
    # The inspect result has no "returncode" key; that counts as success.
    if result.get("returncode", 0) != 0:
        raise SystemExit(1)


if __name__ == "__main__":
    main()
...@@ -226,6 +226,32 @@ ...@@ -226,6 +226,32 @@
226 226
227 227
228 228
229
230 ### Stage: FMA 整包下载/解压脚手架
231
232 完成项:
233 - 新增 [acr-engine/scripts/prepare_fma_archive.py](../acr-engine/scripts/prepare_fma_archive.py)
234 - 支持:
235 - `inspect`
236 - `download`(支持 resume)
237 - `extract`
238 - 将 FMA 官方整包路径接入:
239 - [acr-engine/data/raw/README.md](../acr-engine/data/raw/README.md)
240 - [docs/open-dataset-workflow.md](./open-dataset-workflow.md)
241
242 验证结果:
243 - `/usr/local/miniconda3/bin/python -m py_compile scripts/prepare_fma_archive.py` 成功
244 - `/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py inspect` 成功
245 - 当前结果:
246 - `archive_exists=true`
247 - `archive_size=9404416`
248 - `extract_exists=true`
249 - `num_audio_files=0`
250
251 结论:
252 - 真实 FMA 整包路径现在不仅在下载,而且已经有标准化的 inspect/download/extract 工具链
253 - 即使下载耗时较长,后续 session 也能直接续传与接力
254
229 ### Stage: FMA 官方整包下载路径确认 255 ### Stage: FMA 官方整包下载路径确认
230 256
231 完成项: 257 完成项:
......