Commit f2360135 f2360135d2a8b39067f80fe77a5aee5ea9ecf696 by cnb.bofCdSsphPA

Prevent empty local dataset folders from masquerading as smoke-ready

Constraint: Real-data validation now depends on user-requested local corpus drop zones that may exist before they contain any audio
Rejected: Let smoke-local fail deep inside training | Produces slower and less actionable feedback for continuous sessions
Confidence: high
Scope-risk: narrow
Directive: Keep readiness thresholds aligned with the minimum viable query split assumptions before expanding real-data automation
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py scripts/status_snapshot.py; /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0; /usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0; /usr/local/miniconda3/bin/python scripts/status_snapshot.py --output .omx/latest_status_snapshot.json
Not-tested: Full smoke-local on real FMA or MTG-Jamendo remains blocked until audio is actually downloaded
1 parent 18ba8663
......@@ -6,39 +6,67 @@ from pathlib import Path
# Directory the script is launched from; dataset, manifest and report paths
# in the snapshot are resolved against it.
root = Path.cwd()
# Parent of the launch directory; the docs/ entries in the snapshot are
# resolved against it.
workspace_root = root.parent
# Interpreter used to invoke src/data/external_adapters.py and embedded in the
# suggested follow-up commands.
PYTHON = "/usr/local/miniconda3/bin/python"
def sh(cmd):
    """Run *cmd* through the shell and return its stdout without surrounding whitespace.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout.strip()
def check_dataset(dataset: str, input_dir: str):
    """Run the ``check-local-ready`` sub-command and return its parsed JSON output.

    The adapter script is invoked with PYTHON as a separate process (no shell),
    using a fixed eval ratio of 0.2 and a query duration of 8.0 seconds.

    Args:
        dataset: Dataset name passed to the adapter CLI.
        input_dir: Local audio directory passed to the adapter CLI.

    Returns:
        The object decoded from the command's stdout.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    split_flags = ["--eval-ratio", "0.2", "--query-duration", "8.0"]
    argv = [
        PYTHON,
        "src/data/external_adapters.py",
        "check-local-ready",
        dataset,
        input_dir,
        *split_flags,
    ]
    completed = subprocess.run(argv, stdout=subprocess.PIPE, text=True, check=True)
    return json.loads(completed.stdout)
def build_snapshot():
    """Assemble the status snapshot dictionary.

    The snapshot contains the latest commit line, resolved paths to the docs,
    the local dataset drop zones, the readiness report of each dataset (as
    returned by ``check_dataset``), the verified smoke output directories and
    the suggested follow-up commands.

    Returns:
        A dict of strings and nested dicts describing the current state.
    """
    fma_dir = "data/raw/fma_small_audio"
    jamendo_dir = "data/raw/mtg_jamendo_audio"
    fma_ready = check_dataset("fma", fma_dir)
    jamendo_ready = check_dataset("mtg_jamendo", jamendo_dir)

    def resolved(base, relative):
        # Every path in the snapshot is reported in resolved string form.
        return str((base / relative).resolve())

    # Command fragments are built from the same directory variables as the
    # drop zones so the suggested commands cannot drift from the checked paths.
    cli = f"{PYTHON} src/data/external_adapters.py"
    split_flags = "--eval-ratio 0.2 --query-duration 8.0"
    smoke_flags = (
        f"--output-root data/external_smoke {split_flags} --train-epochs 1 --batch-size 2"
    )

    return {
        "latest_commit": sh("git log --oneline -n 1"),
        "docs": {
            "readme": resolved(workspace_root, "docs/README.md"),
            "handoff": resolved(workspace_root, "docs/session-handoff.md"),
            "workflow": resolved(workspace_root, "docs/open-dataset-workflow.md"),
            "capability_map": resolved(workspace_root, "docs/current-capability-map.md"),
        },
        "drop_zones": {
            "fma": resolved(root, fma_dir),
            "mtg_jamendo": resolved(root, jamendo_dir),
        },
        "dataset_readiness": {
            "fma": fma_ready,
            "mtg_jamendo": jamendo_ready,
        },
        "verified_open_smoke_dirs": {
            "manifests": resolved(root, "data/external_ingested/synthetic_as_open_fixed/fma/manifests"),
            "reports": resolved(root, "reports/open-smoke-fixed/fma"),
            "one_shot_reports": resolved(root, "data/external_smoke/fma_reports_smoke"),
        },
        "next_commands": {
            "check_fma": f"{cli} check-local-ready fma {fma_dir} {split_flags}",
            "inspect_fma": f"{cli} inspect-local fma {fma_dir} {split_flags}",
            "smoke_fma": f"{cli} smoke-local fma {fma_dir} {smoke_flags}",
            "check_jamendo": f"{cli} check-local-ready mtg_jamendo {jamendo_dir} {split_flags}",
            "inspect_jamendo": f"{cli} inspect-local mtg_jamendo {jamendo_dir} {split_flags}",
            "smoke_jamendo": f"{cli} smoke-local mtg_jamendo {jamendo_dir} {smoke_flags}",
        },
    }
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--output', default=None)
parser.add_argument("--output", default=None)
args = parser.parse_args()
snapshot = build_snapshot()
......@@ -50,5 +78,5 @@ def main():
print(text)
if __name__ == '__main__':
if __name__ == "__main__":
main()
......
......@@ -10,6 +10,11 @@ import json
import subprocess
# Lower-cased file suffixes treated as audio when scanning a local directory.
AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg")
# Minimum number of audio files a directory must contain to be smoke-ready.
MIN_SMOKE_AUDIO_FILES = 2
# Minimum number of query-eligible files required to be smoke-ready.
MIN_SMOKE_ELIGIBLE_QUERY_FILES = 2
@dataclass
class DatasetRecord:
name: str
......@@ -198,6 +203,77 @@ REGISTRY: List[DatasetRecord] = [
]
def count_audio_files(input_dir: Path) -> int:
    """Count the audio files found recursively under *input_dir*.

    A path is counted when its suffix, compared case-insensitively, is one of
    AUDIO_EXTS and it is a regular file. Directories that merely carry an
    audio-like suffix (for example ``album.mp3/``) are not counted.

    Args:
        input_dir: Directory to scan.

    Returns:
        The number of matching files.
    """
    return sum(
        1
        for candidate in input_dir.rglob("*")
        # Suffix test first: it is a pure string check, so the filesystem is
        # only queried for paths that already look like audio.
        if candidate.suffix.lower() in AUDIO_EXTS and candidate.is_file()
    )
def assess_local_dataset_ready(
    dataset: str,
    input_dir: Path,
    query_duration: float = 8.0,
    eval_ratio: float = 0.2,
) -> Dict:
    """Report whether a local audio directory meets the smoke-run minimums.

    The directory is only inspected through the dataset's adapter when it
    exists and is a directory; otherwise a single path-level issue is reported
    and the counts stay at zero.

    Args:
        dataset: Key into ADAPTERS selecting the adapter to use.
        input_dir: Directory expected to contain local audio; it is resolved
            before being checked and reported.
        query_duration: Forwarded to the adapter inspection and echoed in the
            minimum requirements.
        eval_ratio: Forwarded to the adapter inspection and echoed in the
            minimum requirements.

    Returns:
        A dict with the resolved path, existence flags, ``ready_for_smoke``,
        the file counts, the minimum requirements, the list of issue codes,
        one recommendation per issue (or a single "run smoke-local"
        recommendation when ready) and the raw inspection summary (``None``
        when the directory was not inspected).

    Raises:
        KeyError: If *dataset* is not a key of ADAPTERS.
    """
    adapter = ADAPTERS[dataset]
    resolved_dir = input_dir.resolve()
    exists = resolved_dir.exists()
    is_dir = resolved_dir.is_dir()

    inspect_summary = None
    audio_count = 0
    eligible_count = 0
    issues = []

    if exists and is_dir:
        inspect_summary = adapter.inspect_local_audio(
            resolved_dir,
            query_duration=query_duration,
            eval_ratio=eval_ratio,
        )
        audio_count = int(inspect_summary.get("num_audio_files", 0))
        eligible_count = int(inspect_summary.get("eligible_query_files", 0))
        if audio_count < MIN_SMOKE_AUDIO_FILES:
            issues.append("not_enough_audio_files_for_smoke")
        if eligible_count < MIN_SMOKE_ELIGIBLE_QUERY_FILES:
            issues.append("not_enough_query_eligible_files_for_smoke")
    elif exists:
        issues.append("input_path_not_directory")
    else:
        issues.append("input_dir_missing")

    # Messages are built lazily so that only the text for issues that actually
    # occurred is formatted. Issues are appended in the same order the
    # recommendations are expected in, so a plain lookup preserves ordering.
    advice_for = {
        "input_dir_missing": lambda: (
            f"Create and populate {resolved_dir} with local audio files ({', '.join(AUDIO_EXTS)})"
        ),
        "input_path_not_directory": lambda: (
            "Replace the input path with a directory containing local audio files"
        ),
        "not_enough_audio_files_for_smoke": lambda: (
            f"Add at least {MIN_SMOKE_AUDIO_FILES} audio files before running smoke-local"
        ),
        "not_enough_query_eligible_files_for_smoke": lambda: (
            f"Add at least {MIN_SMOKE_ELIGIBLE_QUERY_FILES} files with duration >= {query_duration:.1f}s"
        ),
    }
    recommendations = [advice_for[code]() for code in issues]

    ready = not issues
    if ready:
        recommendations.append(
            "Run smoke-local to verify the full train/index/eval/artifact pipeline on this local dataset"
        )

    minimum_requirements = {
        "audio_files": MIN_SMOKE_AUDIO_FILES,
        "eligible_query_files": MIN_SMOKE_ELIGIBLE_QUERY_FILES,
        "query_duration": query_duration,
        "eval_ratio": eval_ratio,
    }
    return {
        "dataset": dataset,
        "input_dir": str(resolved_dir),
        "exists": exists,
        "is_dir": is_dir,
        "ready_for_smoke": ready,
        "num_audio_files": audio_count,
        "eligible_query_files": eligible_count,
        "minimum_requirements": minimum_requirements,
        "issues": issues,
        "recommendations": recommendations,
        "inspect": inspect_summary,
    }
def write_registry(output_path: str):
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
......@@ -231,8 +307,21 @@ def smoke_local_dataset(
train_epochs: int,
batch_size: int,
) -> Dict:
readiness = assess_local_dataset_ready(
dataset,
input_dir,
query_duration=query_duration,
eval_ratio=eval_ratio,
)
if not readiness["ready_for_smoke"]:
raise SystemExit(json.dumps({
"status": "blocked",
"reason": "dataset_not_ready_for_smoke",
"readiness": readiness,
}, indent=2, ensure_ascii=False))
adapter = ADAPTERS[dataset]
inspect_summary = adapter.inspect_local_audio(input_dir, query_duration=query_duration, eval_ratio=eval_ratio)
inspect_summary = readiness["inspect"]
prepare_summary = adapter.prepare_local_audio(
input_dir,
output_root / dataset,
......@@ -305,6 +394,7 @@ def smoke_local_dataset(
return {
"dataset": dataset,
"readiness": readiness,
"inspect": inspect_summary,
"prepare": prepare_summary,
"validate": validate_summary,
......@@ -352,6 +442,12 @@ def main():
p.add_argument("dataset", choices=sorted(ADAPTERS))
p.add_argument("manifests_dir")
p = sub.add_parser("check-local-ready")
p.add_argument("dataset", choices=sorted(ADAPTERS))
p.add_argument("input_dir")
p.add_argument("--eval-ratio", type=float, default=0.2)
p.add_argument("--query-duration", type=float, default=8.0)
p = sub.add_parser("smoke-local")
p.add_argument("dataset", choices=sorted(ADAPTERS))
p.add_argument("input_dir")
......@@ -394,6 +490,14 @@ def main():
elif args.cmd == "validate-local":
summary = ADAPTERS[args.dataset].validate_local_manifests(Path(args.manifests_dir))
print(json.dumps(summary, indent=2, ensure_ascii=False))
elif args.cmd == "check-local-ready":
summary = assess_local_dataset_ready(
dataset=args.dataset,
input_dir=Path(args.input_dir),
eval_ratio=args.eval_ratio,
query_duration=args.query_duration,
)
print(json.dumps(summary, indent=2, ensure_ascii=False))
elif args.cmd == "smoke-local":
summary = smoke_local_dataset(
dataset=args.dataset,
......
......@@ -221,6 +221,32 @@
- 新 session 现在可以直接读取最近一次状态快照文件
- 交接信息更适合自动化和长期持续开发
### Stage: 真实数据就绪度守门
完成项:
- 在 [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py) 新增 `check-local-ready`
- 为 `smoke-local` 增加前置就绪度守门,避免对空目录直接进入训练链路
- 增强 [acr-engine/scripts/status_snapshot.py](../acr-engine/scripts/status_snapshot.py),输出:
- `dataset_readiness`
- `capability_map` 文档入口
- `check-local-ready` 下一步命令
- 补充 [docs/open-dataset-workflow.md](./open-dataset-workflow.md) 与 [docs/session-handoff.md](./session-handoff.md) 的真实数据检查说明
验证结果:
- `/usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py scripts/status_snapshot.py` 成功
- `/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0` 成功
- `/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0` 成功
- `/usr/local/miniconda3/bin/python scripts/status_snapshot.py --output .omx/latest_status_snapshot.json` 成功
- 当前结果:
- `fma.ready_for_smoke=false`
- `mtg_jamendo.ready_for_smoke=false`
- 原因均为音频文件数与可切 query 文件数不足
结论:
- 真实开放数据现在有了明确的“进入 smoke 前门槛”
- 新 session 和自动化脚本可以立刻识别空目录,而不是误以为真实数据已经准备完成
### Stage: 当前能力地图
完成项:
......
# Open Dataset Workflow / 开放数据工作流
## 0. 本地真实数据就绪检查
在跑 `smoke-local` 前,先确认目录里真的有足够的音频:
```bash
/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0
/usr/local/miniconda3/bin/python src/data/external_adapters.py check-local-ready mtg_jamendo data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0
```
判定标准:
- 至少 `2` 个音频文件
- 至少 `2` 个时长 `>= 8s` 的可切 query 文件
- `ready_for_smoke=true` 才进入完整 smoke
如果目录为空,状态快照脚本也会明确提示未就绪。
> 更新:2026-06-02
## 一页结论
......