Make smoke metadata explicit before more real-data comparisons
Constraint: Real-data smoke reports must distinguish manifest query duration from training segment duration to avoid 5s-vs-8s confusion across runs
Rejected: Keep a single ambiguous query_duration field | Makes cross-run analysis and handoff error-prone
Confidence: high
Scope-risk: narrow
Directive: Preserve explicit duration semantics in future smoke/report artifacts and keep legacy aliases only for compatibility
Tested: build_smoke_config_summary() emits manifest_query_duration=8.0 and train_segment_duration=5.0 using configs/default.yaml
Not-tested: End-to-end regeneration of the still-running real FMA smoke report bundle with the new config schema
Showing 3 changed files with 87 additions and 11 deletions
| ... | @@ -9,6 +9,7 @@ import argparse | ... | @@ -9,6 +9,7 @@ import argparse |
| 9 | import json | 9 | import json |
| 10 | import subprocess | 10 | import subprocess |
| 11 | import torch | 11 | import torch |
| 12 | import yaml | ||
| 12 | 13 | ||
| 13 | 14 | ||
| 14 | AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg") | 15 | AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg") |
| ... | @@ -22,6 +23,47 @@ def resolve_device(device: str) -> str: | ... | @@ -22,6 +23,47 @@ def resolve_device(device: str) -> str: |
| 22 | return device | 23 | return device |
| 23 | 24 | ||
| 24 | 25 | ||
| 26 | def load_default_training_config(config_path: str = "configs/default.yaml") -> Dict: | ||
| 27 | with open(config_path) as f: | ||
| 28 | return yaml.safe_load(f) | ||
| 29 | |||
| 30 | |||
| 31 | def build_smoke_config_summary( | ||
| 32 | dataset: str, | ||
| 33 | manifests_dir: Path, | ||
| 34 | manifest_query_duration: float, | ||
| 35 | train_epochs: int, | ||
| 36 | batch_size: int, | ||
| 37 | requested_device: str, | ||
| 38 | resolved_device: str, | ||
| 39 | base_cfg: Dict, | ||
| 40 | ) -> Dict: | ||
| 41 | return { | ||
| 42 | "model": { | ||
| 43 | "embed_dim": base_cfg["model"]["embed_dim"], | ||
| 44 | "channels": base_cfg["model"]["channels"], | ||
| 45 | "n_mels": base_cfg["model"]["n_mels"], | ||
| 46 | "use_band_split": base_cfg["model"].get("use_band_split", True), | ||
| 47 | }, | ||
| 48 | "data": { | ||
| 49 | "source_dataset": dataset, | ||
| 50 | "manifests_dir": str(manifests_dir), | ||
| 51 | "manifest_query_duration": manifest_query_duration, | ||
| 52 | "train_segment_duration": base_cfg["data"]["segment_dur"], | ||
| 53 | "sample_rate": base_cfg["data"]["sample_rate"], | ||
| 54 | "n_fft": base_cfg["data"]["n_fft"], | ||
| 55 | "hop_length": base_cfg["data"]["hop_length"], | ||
| 56 | "query_duration_legacy": manifest_query_duration, | ||
| 57 | }, | ||
| 58 | "run": { | ||
| 59 | "train_epochs": train_epochs, | ||
| 60 | "batch_size": batch_size, | ||
| 61 | "requested_device": requested_device, | ||
| 62 | "resolved_device": resolved_device, | ||
| 63 | }, | ||
| 64 | } | ||
| 65 | |||
| 66 | |||
| 25 | @dataclass | 67 | @dataclass |
| 26 | class DatasetRecord: | 68 | class DatasetRecord: |
| 27 | name: str | 69 | name: str |
| ... | @@ -340,6 +382,7 @@ def smoke_local_dataset( | ... | @@ -340,6 +382,7 @@ def smoke_local_dataset( |
| 340 | ) | 382 | ) |
| 341 | manifests_dir = Path(prepare_summary["output_dir"]) | 383 | manifests_dir = Path(prepare_summary["output_dir"]) |
| 342 | validate_summary = adapter.validate_local_manifests(manifests_dir) | 384 | validate_summary = adapter.validate_local_manifests(manifests_dir) |
| 385 | base_cfg = load_default_training_config() | ||
| 343 | 386 | ||
| 344 | model_dir = output_root / f"{dataset}_models_smoke" | 387 | model_dir = output_root / f"{dataset}_models_smoke" |
| 345 | index_dir = output_root / f"{dataset}_index_smoke" | 388 | index_dir = output_root / f"{dataset}_index_smoke" |
| ... | @@ -380,16 +423,16 @@ def smoke_local_dataset( | ... | @@ -380,16 +423,16 @@ def smoke_local_dataset( |
| 380 | "--output-json", str(eval_json), | 423 | "--output-json", str(eval_json), |
| 381 | ], check=True) | 424 | ], check=True) |
| 382 | 425 | ||
| 383 | config = { | 426 | config = build_smoke_config_summary( |
| 384 | "model": {"embed_dim": 192, "channels": 512, "n_mels": 128, "use_band_split": True}, | 427 | dataset=dataset, |
| 385 | "data": {"source_dataset": dataset, "manifests_dir": str(manifests_dir), "query_duration": query_duration}, | 428 | manifests_dir=manifests_dir, |
| 386 | "run": { | 429 | manifest_query_duration=query_duration, |
| 387 | "train_epochs": train_epochs, | 430 | train_epochs=train_epochs, |
| 388 | "batch_size": batch_size, | 431 | batch_size=batch_size, |
| 389 | "requested_device": device, | 432 | requested_device=device, |
| 390 | "resolved_device": resolved_device, | 433 | resolved_device=resolved_device, |
| 391 | }, | 434 | base_cfg=base_cfg, |
| 392 | } | 435 | ) |
| 393 | report_dir.mkdir(parents=True, exist_ok=True) | 436 | report_dir.mkdir(parents=True, exist_ok=True) |
| 394 | config_path.write_text(json.dumps(config, indent=2)) | 437 | config_path.write_text(json.dumps(config, indent=2)) |
| 395 | 438 | ... | ... |
| ... | @@ -2,6 +2,35 @@ | ... | @@ -2,6 +2,35 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 修改 `acr-engine/src/data/external_adapters.py` | ||
| 9 | - 新增 `load_default_training_config()` | ||
| 10 | - 新增 `build_smoke_config_summary()` | ||
| 11 | - 让 `smoke-local` 产出的 `config.json` 显式记录: | ||
| 12 | - `manifest_query_duration` | ||
| 13 | - `train_segment_duration` | ||
| 14 | - `sample_rate` | ||
| 15 | - `n_fft` | ||
| 16 | - `hop_length` | ||
| 17 | - `query_duration_legacy` | ||
| 18 | - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md),说明新旧配置口径 | ||
| 19 | |||
| 20 | 验证结果: | ||
| 21 | - 通过直接调用 `build_smoke_config_summary()` 验证输出: | ||
| 22 | - `manifest_query_duration = 8.0` | ||
| 23 | - `train_segment_duration = 5.0` | ||
| 24 | - `requested_device = auto` | ||
| 25 | - `resolved_device = cpu` | ||
| 26 | - 默认训练配置读取自: | ||
| 27 | - `configs/default.yaml` | ||
| 28 | - 其中 `data.segment_dur = 5.0` | ||
| 29 | |||
| 30 | 结论: | ||
| 31 | - 现在 smoke 配置摘要已经能明确区分“manifest 的 query 时长”和“训练 clip 时长” | ||
| 32 | - 后续即使 report 产物跨实验对比,也更容易避免 5s/8s 语义混淆 | ||
| 33 | |||
| 5 | ### Stage: 将连续开发偏好与当前进度固化到 AGENTS.md | 34 | ### Stage: 将连续开发偏好与当前进度固化到 AGENTS.md |
| 6 | 35 | ||
| 7 | 完成项: | 36 | 完成项: | ... | ... |
| ... | @@ -326,7 +326,11 @@ flowchart TD | ... | @@ -326,7 +326,11 @@ flowchart TD |
| 326 | 326 | ||
| 327 | 解释: | 327 | 解释: |
| 328 | - **manifest query 时长**、**训练 crop 时长**、**报告里记录的 query_duration** 当前不是完全同一个配置源; | 328 | - **manifest query 时长**、**训练 crop 时长**、**报告里记录的 query_duration** 当前不是完全同一个配置源; |
| 329 | - 现有 `fma_reports_smoke/config.json` 时间戳早于最新 manifests,属于需要继续治理的实验产物一致性问题; | 329 | - 旧的 `fma_reports_smoke/config.json` 时间戳早于最新 manifests,属于历史实验产物一致性问题; |
| 330 | - 当前代码侧已经开始把 smoke 配置摘要显式拆成: | ||
| 331 | - `manifest_query_duration` | ||
| 332 | - `train_segment_duration` | ||
| 333 | - `query_duration_legacy` | ||
| 330 | - 因此后续继续做工业级化时,应该把 “manifest query 时长 / train clip 时长 / eval query 时长 / report metadata” 统一纳入一个显式配置结构。 | 334 | - 因此后续继续做工业级化时,应该把 “manifest query 时长 / train clip 时长 / eval query 时长 / report metadata” 统一纳入一个显式配置结构。 |
| 331 | 335 | ||
| 332 | --- | 336 | --- | ... | ... |
-
Please register or sign in to post a comment