Prevent empty local dataset folders from masquerading as smoke-ready
Constraint: Real-data validation now depends on user-requested local corpus drop zones that may exist before they contain any audio
Rejected: Let smoke-local fail deep inside training | Produces slower and less actionable feedback for continuous sessions
Confidence: high
Scope-risk: narrow
Directive: Keep readiness thresholds aligned with the minimum viable query split assumptions before expanding real-data automation
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py scripts/status_snapshot.py; /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0; /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0; /usr/local/miniconda3/bin/python scripts/status_snapshot.py --output .omx/latest_status_snapshot.json
Not-tested: Full smoke-local on real FMA or MTG-Jamendo remains blocked until audio is actually downloaded
Showing 4 changed files with 197 additions and 21 deletions
| ... | @@ -6,39 +6,67 @@ from pathlib import Path | ... | @@ -6,39 +6,67 @@ from pathlib import Path |
| 6 | 6 | ||
| 7 | root = Path.cwd() | 7 | root = Path.cwd() |
| 8 | workspace_root = root.parent | 8 | workspace_root = root.parent |
| 9 | PYTHON = "/usr/local/miniconda3/bin/python" | ||
| 10 | |||
| 9 | 11 | ||
| 10 | def sh(cmd): | 12 | def sh(cmd): |
| 11 | return subprocess.check_output(cmd, shell=True, text=True).strip() | 13 | return subprocess.check_output(cmd, shell=True, text=True).strip() |
| 12 | 14 | ||
| 15 | |||
| 16 | def check_dataset(dataset: str, input_dir: str): | ||
| 17 | cmd = [ | ||
| 18 | PYTHON, | ||
| 19 | "src/data/external_adapters.py", | ||
| 20 | "check-local-ready", | ||
| 21 | dataset, | ||
| 22 | input_dir, | ||
| 23 | "--eval-ratio", "0.2", | ||
| 24 | "--query-duration", "8.0", | ||
| 25 | ] | ||
| 26 | result = subprocess.check_output(cmd, text=True) | ||
| 27 | return json.loads(result) | ||
| 28 | |||
| 29 | |||
| 13 | def build_snapshot(): | 30 | def build_snapshot(): |
| 31 | fma_dir = "data/raw/fma_small_audio" | ||
| 32 | jamendo_dir = "data/raw/mtg_jamendo_audio" | ||
| 33 | fma_ready = check_dataset("fma", fma_dir) | ||
| 34 | jamendo_ready = check_dataset("mtg_jamendo", jamendo_dir) | ||
| 14 | return { | 35 | return { |
| 15 | 'latest_commit': sh('git log --oneline -n 1'), | 36 | "latest_commit": sh("git log --oneline -n 1"), |
| 16 | 'docs': { | 37 | "docs": { |
| 17 | 'readme': str((workspace_root / 'docs/README.md').resolve()), | 38 | "readme": str((workspace_root / "docs/README.md").resolve()), |
| 18 | 'handoff': str((workspace_root / 'docs/session-handoff.md').resolve()), | 39 | "handoff": str((workspace_root / "docs/session-handoff.md").resolve()), |
| 19 | 'workflow': str((workspace_root / 'docs/open-dataset-workflow.md').resolve()), | 40 | "workflow": str((workspace_root / "docs/open-dataset-workflow.md").resolve()), |
| 41 | "capability_map": str((workspace_root / "docs/current-capability-map.md").resolve()), | ||
| 42 | }, | ||
| 43 | "drop_zones": { | ||
| 44 | "fma": str((root / fma_dir).resolve()), | ||
| 45 | "mtg_jamendo": str((root / jamendo_dir).resolve()), | ||
| 46 | }, | ||
| 47 | "dataset_readiness": { | ||
| 48 | "fma": fma_ready, | ||
| 49 | "mtg_jamendo": jamendo_ready, | ||
| 20 | }, | 50 | }, |
| 21 | 'drop_zones': { | 51 | "verified_open_smoke_dirs": { |
| 22 | 'fma': str((root / 'data/raw/fma_small_audio').resolve()), | 52 | "manifests": str((root / "data/external_ingested/synthetic_as_open_fixed/fma/manifests").resolve()), |
| 23 | 'mtg_jamendo': str((root / 'data/raw/mtg_jamendo_audio').resolve()), | 53 | "reports": str((root / "reports/open-smoke-fixed/fma").resolve()), |
| 54 | "one_shot_reports": str((root / "data/external_smoke/fma_reports_smoke").resolve()), | ||
| 24 | }, | 55 | }, |
| 25 | 'verified_open_smoke_dirs': { | 56 | "next_commands": { |
| 26 | 'manifests': str((root / 'data/external_ingested/synthetic_as_open_fixed/fma/manifests').resolve()), | 57 | "check_fma": f"{PYTHON} src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0", |
| 27 | 'reports': str((root / 'reports/open-smoke-fixed/fma').resolve()), | 58 | "inspect_fma": f"{PYTHON} src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0", |
| 28 | 'one_shot_reports': str((root / 'data/external_smoke/fma_reports_smoke').resolve()), | 59 | "smoke_fma": f"{PYTHON} src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2", |
| 60 | "check_jamendo": f"{PYTHON} src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0", | ||
| 61 | "inspect_jamendo": f"{PYTHON} src/data/external_adapters.py inspect-local mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0", | ||
| 62 | "smoke_jamendo": f"{PYTHON} src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2", | ||
| 29 | }, | 63 | }, |
| 30 | 'next_commands': { | ||
| 31 | 'inspect_fma': '/usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0', | ||
| 32 | 'smoke_fma': '/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2', | ||
| 33 | 'inspect_jamendo': '/usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0', | ||
| 34 | 'smoke_jamendo': '/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2' | ||
| 35 | } | ||
| 36 | } | 64 | } |
| 37 | 65 | ||
| 38 | 66 | ||
| 39 | def main(): | 67 | def main(): |
| 40 | parser = argparse.ArgumentParser() | 68 | parser = argparse.ArgumentParser() |
| 41 | parser.add_argument('--output', default=None) | 69 | parser.add_argument("--output", default=None) |
| 42 | args = parser.parse_args() | 70 | args = parser.parse_args() |
| 43 | 71 | ||
| 44 | snapshot = build_snapshot() | 72 | snapshot = build_snapshot() |
| ... | @@ -50,5 +78,5 @@ def main(): | ... | @@ -50,5 +78,5 @@ def main(): |
| 50 | print(text) | 78 | print(text) |
| 51 | 79 | ||
| 52 | 80 | ||
| 53 | if __name__ == '__main__': | 81 | if __name__ == "__main__": |
| 54 | main() | 82 | main() | ... | ... |
| ... | @@ -10,6 +10,11 @@ import json | ... | @@ -10,6 +10,11 @@ import json |
| 10 | import subprocess | 10 | import subprocess |
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg") | ||
| 14 | MIN_SMOKE_AUDIO_FILES = 2 | ||
| 15 | MIN_SMOKE_ELIGIBLE_QUERY_FILES = 2 | ||
| 16 | |||
| 17 | |||
| 13 | @dataclass | 18 | @dataclass |
| 14 | class DatasetRecord: | 19 | class DatasetRecord: |
| 15 | name: str | 20 | name: str |
| ... | @@ -198,6 +203,77 @@ REGISTRY: List[DatasetRecord] = [ | ... | @@ -198,6 +203,77 @@ REGISTRY: List[DatasetRecord] = [ |
| 198 | ] | 203 | ] |
| 199 | 204 | ||
| 200 | 205 | ||
| 206 | def count_audio_files(input_dir: Path) -> int: | ||
| 207 | return len([p for p in input_dir.rglob("*") if p.suffix.lower() in AUDIO_EXTS]) | ||
| 208 | |||
| 209 | |||
| 210 | def assess_local_dataset_ready( | ||
| 211 | dataset: str, | ||
| 212 | input_dir: Path, | ||
| 213 | query_duration: float = 8.0, | ||
| 214 | eval_ratio: float = 0.2, | ||
| 215 | ) -> Dict: | ||
| 216 | adapter = ADAPTERS[dataset] | ||
| 217 | input_dir = input_dir.resolve() | ||
| 218 | exists = input_dir.exists() | ||
| 219 | is_dir = input_dir.is_dir() | ||
| 220 | inspect_summary = None | ||
| 221 | num_audio_files = 0 | ||
| 222 | eligible_query_files = 0 | ||
| 223 | issues = [] | ||
| 224 | |||
| 225 | if not exists: | ||
| 226 | issues.append("input_dir_missing") | ||
| 227 | elif not is_dir: | ||
| 228 | issues.append("input_path_not_directory") | ||
| 229 | else: | ||
| 230 | inspect_summary = adapter.inspect_local_audio( | ||
| 231 | input_dir, | ||
| 232 | query_duration=query_duration, | ||
| 233 | eval_ratio=eval_ratio, | ||
| 234 | ) | ||
| 235 | num_audio_files = int(inspect_summary.get("num_audio_files", 0)) | ||
| 236 | eligible_query_files = int(inspect_summary.get("eligible_query_files", 0)) | ||
| 237 | if num_audio_files < MIN_SMOKE_AUDIO_FILES: | ||
| 238 | issues.append("not_enough_audio_files_for_smoke") | ||
| 239 | if eligible_query_files < MIN_SMOKE_ELIGIBLE_QUERY_FILES: | ||
| 240 | issues.append("not_enough_query_eligible_files_for_smoke") | ||
| 241 | |||
| 242 | ready = len(issues) == 0 | ||
| 243 | recommendations = [] | ||
| 244 | if "input_dir_missing" in issues: | ||
| 245 | recommendations.append(f"Create and populate {input_dir} with local audio files ({', '.join(AUDIO_EXTS)})") | ||
| 246 | if "input_path_not_directory" in issues: | ||
| 247 | recommendations.append("Replace the input path with a directory containing local audio files") | ||
| 248 | if "not_enough_audio_files_for_smoke" in issues: | ||
| 249 | recommendations.append(f"Add at least {MIN_SMOKE_AUDIO_FILES} audio files before running smoke-local") | ||
| 250 | if "not_enough_query_eligible_files_for_smoke" in issues: | ||
| 251 | recommendations.append( | ||
| 252 | f"Add at least {MIN_SMOKE_ELIGIBLE_QUERY_FILES} files with duration >= {query_duration:.1f}s" | ||
| 253 | ) | ||
| 254 | if ready: | ||
| 255 | recommendations.append("Run smoke-local to verify the full train/index/eval/artifact pipeline on this local dataset") | ||
| 256 | |||
| 257 | return { | ||
| 258 | "dataset": dataset, | ||
| 259 | "input_dir": str(input_dir), | ||
| 260 | "exists": exists, | ||
| 261 | "is_dir": is_dir, | ||
| 262 | "ready_for_smoke": ready, | ||
| 263 | "num_audio_files": num_audio_files, | ||
| 264 | "eligible_query_files": eligible_query_files, | ||
| 265 | "minimum_requirements": { | ||
| 266 | "audio_files": MIN_SMOKE_AUDIO_FILES, | ||
| 267 | "eligible_query_files": MIN_SMOKE_ELIGIBLE_QUERY_FILES, | ||
| 268 | "query_duration": query_duration, | ||
| 269 | "eval_ratio": eval_ratio, | ||
| 270 | }, | ||
| 271 | "issues": issues, | ||
| 272 | "recommendations": recommendations, | ||
| 273 | "inspect": inspect_summary, | ||
| 274 | } | ||
| 275 | |||
| 276 | |||
| 201 | def write_registry(output_path: str): | 277 | def write_registry(output_path: str): |
| 202 | out = Path(output_path) | 278 | out = Path(output_path) |
| 203 | out.parent.mkdir(parents=True, exist_ok=True) | 279 | out.parent.mkdir(parents=True, exist_ok=True) |
| ... | @@ -231,8 +307,21 @@ def smoke_local_dataset( | ... | @@ -231,8 +307,21 @@ def smoke_local_dataset( |
| 231 | train_epochs: int, | 307 | train_epochs: int, |
| 232 | batch_size: int, | 308 | batch_size: int, |
| 233 | ) -> Dict: | 309 | ) -> Dict: |
| 310 | readiness = assess_local_dataset_ready( | ||
| 311 | dataset, | ||
| 312 | input_dir, | ||
| 313 | query_duration=query_duration, | ||
| 314 | eval_ratio=eval_ratio, | ||
| 315 | ) | ||
| 316 | if not readiness["ready_for_smoke"]: | ||
| 317 | raise SystemExit(json.dumps({ | ||
| 318 | "status": "blocked", | ||
| 319 | "reason": "dataset_not_ready_for_smoke", | ||
| 320 | "readiness": readiness, | ||
| 321 | }, indent=2, ensure_ascii=False)) | ||
| 322 | |||
| 234 | adapter = ADAPTERS[dataset] | 323 | adapter = ADAPTERS[dataset] |
| 235 | inspect_summary = adapter.inspect_local_audio(input_dir, query_duration=query_duration, eval_ratio=eval_ratio) | 324 | inspect_summary = readiness["inspect"] |
| 236 | prepare_summary = adapter.prepare_local_audio( | 325 | prepare_summary = adapter.prepare_local_audio( |
| 237 | input_dir, | 326 | input_dir, |
| 238 | output_root / dataset, | 327 | output_root / dataset, |
| ... | @@ -305,6 +394,7 @@ def smoke_local_dataset( | ... | @@ -305,6 +394,7 @@ def smoke_local_dataset( |
| 305 | 394 | ||
| 306 | return { | 395 | return { |
| 307 | "dataset": dataset, | 396 | "dataset": dataset, |
| 397 | "readiness": readiness, | ||
| 308 | "inspect": inspect_summary, | 398 | "inspect": inspect_summary, |
| 309 | "prepare": prepare_summary, | 399 | "prepare": prepare_summary, |
| 310 | "validate": validate_summary, | 400 | "validate": validate_summary, |
| ... | @@ -352,6 +442,12 @@ def main(): | ... | @@ -352,6 +442,12 @@ def main(): |
| 352 | p.add_argument("dataset", choices=sorted(ADAPTERS)) | 442 | p.add_argument("dataset", choices=sorted(ADAPTERS)) |
| 353 | p.add_argument("manifests_dir") | 443 | p.add_argument("manifests_dir") |
| 354 | 444 | ||
| 445 | p = sub.add_parser("check-local-ready") | ||
| 446 | p.add_argument("dataset", choices=sorted(ADAPTERS)) | ||
| 447 | p.add_argument("input_dir") | ||
| 448 | p.add_argument("--eval-ratio", type=float, default=0.2) | ||
| 449 | p.add_argument("--query-duration", type=float, default=8.0) | ||
| 450 | |||
| 355 | p = sub.add_parser("smoke-local") | 451 | p = sub.add_parser("smoke-local") |
| 356 | p.add_argument("dataset", choices=sorted(ADAPTERS)) | 452 | p.add_argument("dataset", choices=sorted(ADAPTERS)) |
| 357 | p.add_argument("input_dir") | 453 | p.add_argument("input_dir") |
| ... | @@ -394,6 +490,14 @@ def main(): | ... | @@ -394,6 +490,14 @@ def main(): |
| 394 | elif args.cmd == "validate-local": | 490 | elif args.cmd == "validate-local": |
| 395 | summary = ADAPTERS[args.dataset].validate_local_manifests(Path(args.manifests_dir)) | 491 | summary = ADAPTERS[args.dataset].validate_local_manifests(Path(args.manifests_dir)) |
| 396 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | 492 | print(json.dumps(summary, indent=2, ensure_ascii=False)) |
| 493 | elif args.cmd == "check-local-ready": | ||
| 494 | summary = assess_local_dataset_ready( | ||
| 495 | dataset=args.dataset, | ||
| 496 | input_dir=Path(args.input_dir), | ||
| 497 | eval_ratio=args.eval_ratio, | ||
| 498 | query_duration=args.query_duration, | ||
| 499 | ) | ||
| 500 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | ||
| 397 | elif args.cmd == "smoke-local": | 501 | elif args.cmd == "smoke-local": |
| 398 | summary = smoke_local_dataset( | 502 | summary = smoke_local_dataset( |
| 399 | dataset=args.dataset, | 503 | dataset=args.dataset, | ... | ... |
| ... | @@ -221,6 +221,32 @@ | ... | @@ -221,6 +221,32 @@ |
| 221 | - 新 session 现在可以直接读取最近一次状态快照文件 | 221 | - 新 session 现在可以直接读取最近一次状态快照文件 |
| 222 | - 交接信息更适合自动化和长期持续开发 | 222 | - 交接信息更适合自动化和长期持续开发 |
| 223 | 223 | ||
| 224 | |||
| 225 | ### Stage: 真实数据就绪度守门 | ||
| 226 | |||
| 227 | 完成项: | ||
| 228 | - 为 [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py) 新增 `check-local-ready` | ||
| 229 | - 为 `smoke-local` 增加前置就绪度守门,避免对空目录直接进入训练链路 | ||
| 230 | - 增强 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py),输出: | ||
| 231 | - `dataset_readiness` | ||
| 232 | - `capability_map` 文档入口 | ||
| 233 | - `check-local-ready` 下一步命令 | ||
| 234 | - 补充 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 与 [docs/session-handoff.md](./session-handoff.md) 的真实数据检查说明 | ||
| 235 | |||
| 236 | 验证结果: | ||
| 237 | - `/usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py scripts/status_snapshot.py` 成功 | ||
| 238 | - `/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0` 成功 | ||
| 239 | - `/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0` 成功 | ||
| 240 | - `/usr/local/miniconda3/bin/python scripts/status_snapshot.py --output .omx/latest_status_snapshot.json` 成功 | ||
| 241 | - 当前结果: | ||
| 242 | - `fma.ready_for_smoke=false` | ||
| 243 | - `mtg_jamendo.ready_for_smoke=false` | ||
| 244 | - 原因均为音频文件数与可切 query 文件数不足 | ||
| 245 | |||
| 246 | 结论: | ||
| 247 | - 真实开放数据现在有了明确的“进入 smoke 前门槛” | ||
| 248 | - 新 session 和自动化脚本可以立刻识别空目录,而不是误以为真实数据已经准备完成 | ||
| 249 | |||
| 224 | ### Stage: 当前能力地图 | 250 | ### Stage: 当前能力地图 |
| 225 | 251 | ||
| 226 | 完成项: | 252 | 完成项: | ... | ... |
| 1 | # Open Dataset Workflow / 开放数据工作流 | 1 | # Open Dataset Workflow / 开放数据工作流 |
| 2 | 2 | ||
| 3 | ## 0. 本地真实数据就绪检查 | ||
| 4 | |||
| 5 | 在跑 `smoke-local` 前,先确认目录里真的有足够的音频: | ||
| 6 | |||
| 7 | ```bash | ||
| 8 | /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 | ||
| 9 | /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0 | ||
| 10 | ``` | ||
| 11 | |||
| 12 | 判定标准: | ||
| 13 | |||
| 14 | - 至少 `2` 个音频文件 | ||
| 15 | - 至少 `2` 个时长 `>= 8s` 的可切 query 文件 | ||
| 16 | - `ready_for_smoke=true` 才进入完整 smoke | ||
| 17 | |||
| 18 | 如果目录为空,状态快照脚本也会明确提示未就绪。 | ||
| 19 | |||
| 20 | |||
| 3 | > 更新:2026-06-02 | 21 | > 更新:2026-06-02 |
| 4 | 22 | ||
| 5 | ## 一页结论 | 23 | ## 一页结论 | ... | ... |
-
Please register or sign in to post a comment