Recover stalled real-dataset transfer with a durable background resume path
Constraint: Long FMA archive downloads cannot rely on fragile foreground execution if Ralph-style work must continue across sessions
Rejected: Keep manually reissuing foreground download commands after stalls | Increases interruption risk and weakens resumability evidence
Confidence: high
Scope-risk: narrow
Directive: Prefer prepare_fma_archive.py bg-download for future large archive recovery so PID and log evidence remain standardized
Tested: /usr/local/miniconda3/bin/python acr-engine/scripts/prepare_fma_archive.py bg-download; /usr/local/miniconda3/bin/python acr-engine/scripts/prepare_fma_archive.py inspect; tail -n 40 /tmp/fma_modelscope_download.log
Not-tested: Full archive completion, extraction, and real-data smoke remain pending
Showing 2 changed files with 53 additions and 0 deletions
| ... | @@ -37,6 +37,28 @@ def download(resume: bool = True) -> dict: | ... | @@ -37,6 +37,28 @@ def download(resume: bool = True) -> dict: |
| 37 | } | 37 | } |
| 38 | 38 | ||
| 39 | 39 | ||
def bg_download(log_path: Path, resume: bool = True) -> dict:
    """Launch a detached ``curl`` download of the FMA archive.

    The transfer is started through ``nohup ... &`` inside a bash login
    shell so that it keeps running after this process returns. curl's
    stdout/stderr are appended to ``log_path``.

    Args:
        log_path: File that receives curl's combined stdout/stderr. Its
            parent directory is created if missing.
        resume: When true, pass ``--continue-at -`` so curl resumes from the
            size of the existing output file instead of starting over.

    Returns:
        A dict with the launcher's ``returncode``, the background ``pid``
        (as the string printed by ``echo $!``; empty if the shell printed
        nothing), the resolved ``log_path`` and ``archive_path``, a snapshot
        of ``archive_exists`` / ``archive_size`` taken right after launch,
        and the last 1200 characters of the launcher's stderr.
    """
    import shlex  # local import: only this function builds a shell string

    ARCHIVE_PATH.parent.mkdir(parents=True, exist_ok=True)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    cmd = ["nohup", "curl", "-L"]
    if resume:
        cmd += ["--continue-at", "-"]
    cmd += ["--output", str(ARCHIVE_PATH), FMA_SMALL_URL]

    # Quote every word before handing the line to bash: an unquoted "&", "?",
    # ";" or space in the URL, archive path, or log path would otherwise be
    # interpreted by the shell (e.g. "&" in a query string would background
    # the command early and drop the rest of the URL).
    quoted_cmd = " ".join(shlex.quote(part) for part in cmd)
    quoted_log = shlex.quote(str(log_path))
    shell_cmd = f"{quoted_cmd} >> {quoted_log} 2>&1 & echo $!"
    proc = subprocess.run(["bash", "-lc", shell_cmd], text=True, capture_output=True)

    # Take one stat() snapshot so "exists" and "size" cannot disagree while
    # the background curl is concurrently creating the file.
    try:
        archive_size = ARCHIVE_PATH.stat().st_size
        archive_exists = True
    except (FileNotFoundError, NotADirectoryError):
        archive_size = 0
        archive_exists = False

    return {
        "action": "bg-download",
        "returncode": proc.returncode,
        "pid": proc.stdout.strip(),
        "log_path": str(log_path.resolve()),
        "archive_path": str(ARCHIVE_PATH.resolve()),
        "archive_exists": archive_exists,
        "archive_size": archive_size,
        "stderr_tail": proc.stderr[-1200:],
    }
| 60 | |||
| 61 | |||
| 40 | def inspect() -> dict: | 62 | def inspect() -> dict: |
| 41 | archive_exists = ARCHIVE_PATH.exists() | 63 | archive_exists = ARCHIVE_PATH.exists() |
| 42 | extract_exists = EXTRACT_DIR.exists() | 64 | extract_exists = EXTRACT_DIR.exists() |
| ... | @@ -95,6 +117,10 @@ def main(): | ... | @@ -95,6 +117,10 @@ def main(): |
| 95 | p = sub.add_parser("download") | 117 | p = sub.add_parser("download") |
| 96 | p.add_argument("--no-resume", action="store_true") | 118 | p.add_argument("--no-resume", action="store_true") |
| 97 | 119 | ||
| 120 | p = sub.add_parser("bg-download") | ||
| 121 | p.add_argument("--no-resume", action="store_true") | ||
| 122 | p.add_argument("--log-path", default="/tmp/fma_modelscope_download.log") | ||
| 123 | |||
| 98 | sub.add_parser("inspect") | 124 | sub.add_parser("inspect") |
| 99 | 125 | ||
| 100 | p = sub.add_parser("extract") | 126 | p = sub.add_parser("extract") |
| ... | @@ -103,6 +129,8 @@ def main(): | ... | @@ -103,6 +129,8 @@ def main(): |
| 103 | args = parser.parse_args() | 129 | args = parser.parse_args() |
| 104 | if args.cmd == "download": | 130 | if args.cmd == "download": |
| 105 | result = download(resume=not args.no_resume) | 131 | result = download(resume=not args.no_resume) |
| 132 | elif args.cmd == "bg-download": | ||
| 133 | result = bg_download(Path(args.log_path), resume=not args.no_resume) | ||
| 106 | elif args.cmd == "inspect": | 134 | elif args.cmd == "inspect": |
| 107 | result = inspect() | 135 | result = inspect() |
| 108 | elif args.cmd == "extract": | 136 | elif args.cmd == "extract": | ... | ... |
| ... | @@ -231,6 +231,31 @@ | ... | @@ -231,6 +231,31 @@ |
| 231 | 231 | ||
| 232 | 232 | ||
| 233 | 233 | ||
| 234 | |||
| 235 | ### Stage: FMA 后台续传恢复 | ||
| 236 | |||
| 237 | 完成项: | ||
| 238 | - 为 [acr-engine/scripts/prepare_fma_archive.py](../acr-engine/scripts/prepare_fma_archive.py) 新增 `bg-download` | ||
| 239 | - 使用 `nohup curl` + 日志文件的方式增强大文件后台续传稳定性 | ||
| 240 | - 在发现下载停滞后,切换到新的后台恢复路径并重新托管 ModelScope 下载 | ||
| 241 | |||
| 242 | 验证结果: | ||
| 243 | - `/usr/local/miniconda3/bin/python scripts/prepare_fma_archive.py bg-download` 成功 | ||
| 244 | - 当前返回: | ||
| 245 | - `returncode=0` | ||
| 246 | - `pid=47175` | ||
| 247 | - `log_path=/tmp/fma_modelscope_download.log` | ||
| 248 | - 重新 inspect 后结果: | ||
| 249 | - `archive_size` 从 `61550592` 增长到 `71835648` | ||
| 250 | - `archive_progress_percent=0.9354` | ||
| 251 | - 日志验证: | ||
| 252 | - `Resuming transfer from byte position 61550592` | ||
| 253 | - 当前吞吐已达到 MB/s 级别 | ||
| 254 | |||
| 255 | 结论: | ||
| 256 | - FMA 真实数据下载不再依赖脆弱的一次性前台命令 | ||
| 257 | - 当前已恢复到可持续的后台续传状态,后续 session 更容易接力 | ||
| 258 | |||
| 234 | ### Stage: FMA 下载进度可视化 | 259 | ### Stage: FMA 下载进度可视化 |
| 235 | 260 | ||
| 236 | 完成项: | 261 | 完成项: | ... | ... |
-
Please register or sign in to post a comment