Commit d7df0087 d7df0087171d5c9e89596a4ea01a326bdf28c88f by cnb.bofCdSsphPA

Make smoke metadata explicit before more real-data comparisons

Constraint: Real-data smoke reports must distinguish manifest query duration from training segment duration to avoid 5s-vs-8s confusion across runs
Rejected: Keep a single ambiguous query_duration field | Makes cross-run analysis and handoff error-prone
Confidence: high
Scope-risk: narrow
Directive: Preserve explicit duration semantics in future smoke/report artifacts and keep legacy aliases only for compatibility
Tested: build_smoke_config_summary() emits manifest_query_duration=8.0 and train_segment_duration=5.0 using configs/default.yaml
Not-tested: End-to-end regeneration of the still-running real FMA smoke report bundle with the new config schema
1 parent 05a2ccca
...@@ -9,6 +9,7 @@ import argparse ...@@ -9,6 +9,7 @@ import argparse
9 import json 9 import json
10 import subprocess 10 import subprocess
11 import torch 11 import torch
12 import yaml
12 13
13 14
14 AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg") 15 AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg")
...@@ -22,6 +23,47 @@ def resolve_device(device: str) -> str: ...@@ -22,6 +23,47 @@ def resolve_device(device: str) -> str:
22 return device 23 return device
23 24
24 25
26 def load_default_training_config(config_path: str = "configs/default.yaml") -> Dict:
27 with open(config_path) as f:
28 return yaml.safe_load(f)
29
30
31 def build_smoke_config_summary(
32 dataset: str,
33 manifests_dir: Path,
34 manifest_query_duration: float,
35 train_epochs: int,
36 batch_size: int,
37 requested_device: str,
38 resolved_device: str,
39 base_cfg: Dict,
40 ) -> Dict:
41 return {
42 "model": {
43 "embed_dim": base_cfg["model"]["embed_dim"],
44 "channels": base_cfg["model"]["channels"],
45 "n_mels": base_cfg["model"]["n_mels"],
46 "use_band_split": base_cfg["model"].get("use_band_split", True),
47 },
48 "data": {
49 "source_dataset": dataset,
50 "manifests_dir": str(manifests_dir),
51 "manifest_query_duration": manifest_query_duration,
52 "train_segment_duration": base_cfg["data"]["segment_dur"],
53 "sample_rate": base_cfg["data"]["sample_rate"],
54 "n_fft": base_cfg["data"]["n_fft"],
55 "hop_length": base_cfg["data"]["hop_length"],
56 "query_duration_legacy": manifest_query_duration,
57 },
58 "run": {
59 "train_epochs": train_epochs,
60 "batch_size": batch_size,
61 "requested_device": requested_device,
62 "resolved_device": resolved_device,
63 },
64 }
65
66
25 @dataclass 67 @dataclass
26 class DatasetRecord: 68 class DatasetRecord:
27 name: str 69 name: str
...@@ -340,6 +382,7 @@ def smoke_local_dataset( ...@@ -340,6 +382,7 @@ def smoke_local_dataset(
340 ) 382 )
341 manifests_dir = Path(prepare_summary["output_dir"]) 383 manifests_dir = Path(prepare_summary["output_dir"])
342 validate_summary = adapter.validate_local_manifests(manifests_dir) 384 validate_summary = adapter.validate_local_manifests(manifests_dir)
385 base_cfg = load_default_training_config()
343 386
344 model_dir = output_root / f"{dataset}_models_smoke" 387 model_dir = output_root / f"{dataset}_models_smoke"
345 index_dir = output_root / f"{dataset}_index_smoke" 388 index_dir = output_root / f"{dataset}_index_smoke"
...@@ -380,16 +423,16 @@ def smoke_local_dataset( ...@@ -380,16 +423,16 @@ def smoke_local_dataset(
380 "--output-json", str(eval_json), 423 "--output-json", str(eval_json),
381 ], check=True) 424 ], check=True)
382 425
383 config = { 426 config = build_smoke_config_summary(
384 "model": {"embed_dim": 192, "channels": 512, "n_mels": 128, "use_band_split": True}, 427 dataset=dataset,
385 "data": {"source_dataset": dataset, "manifests_dir": str(manifests_dir), "query_duration": query_duration}, 428 manifests_dir=manifests_dir,
386 "run": { 429 manifest_query_duration=query_duration,
387 "train_epochs": train_epochs, 430 train_epochs=train_epochs,
388 "batch_size": batch_size, 431 batch_size=batch_size,
389 "requested_device": device, 432 requested_device=device,
390 "resolved_device": resolved_device, 433 resolved_device=resolved_device,
391 }, 434 base_cfg=base_cfg,
392 } 435 )
393 report_dir.mkdir(parents=True, exist_ok=True) 436 report_dir.mkdir(parents=True, exist_ok=True)
394 config_path.write_text(json.dumps(config, indent=2)) 437 config_path.write_text(json.dumps(config, indent=2))
395 438
......
...@@ -2,6 +2,35 @@ ...@@ -2,6 +2,35 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义
6
7 完成项:
8 - 修改 `acr-engine/src/data/external_adapters.py`
9 - 新增 `load_default_training_config()`
10 - 新增 `build_smoke_config_summary()`
11 - 让 `smoke-local` 产出的 `config.json` 显式记录:
12 - `manifest_query_duration`
13 - `train_segment_duration`
14 - `sample_rate`
15 - `n_fft`
16 - `hop_length`
17 - `query_duration_legacy`
18 - 更新 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md),说明新旧配置口径
19
20 验证结果:
21 - 通过直接调用 `build_smoke_config_summary()` 验证输出:
22 - `manifest_query_duration = 8.0`
23 - `train_segment_duration = 5.0`
24 - `requested_device = auto`
25 - `resolved_device = cpu`
26 - 默认训练配置读取自:
27 - `configs/default.yaml`
28 - 其中 `data.segment_dur = 5.0`
29
30 结论:
31 - 现在 smoke 配置摘要已经能明确区分“manifest 的 query 时长”和“训练 clip 时长”
32 - 后续即使 report 产物跨实验对比,也更容易避免 5s/8s 语义混淆
33
5 ### Stage: 将连续开发偏好与当前进度固化到 AGENTS.md 34 ### Stage: 将连续开发偏好与当前进度固化到 AGENTS.md
6 35
7 完成项: 36 完成项:
......
...@@ -326,7 +326,11 @@ flowchart TD ...@@ -326,7 +326,11 @@ flowchart TD
326 326
327 解释: 327 解释:
328 - **manifest query 时长**、**训练 crop 时长**、**报告里记录的 query_duration** 当前不是完全同一个配置源; 328 - **manifest query 时长**、**训练 crop 时长**、**报告里记录的 query_duration** 当前不是完全同一个配置源;
329 - 现有 `fma_reports_smoke/config.json` 时间戳早于最新 manifests,属于需要继续治理的实验产物一致性问题; 329 - 旧的 `fma_reports_smoke/config.json` 时间戳早于最新 manifests,属于历史实验产物一致性问题;
330 - 当前代码侧已经开始把 smoke 配置摘要显式拆成:
331 - `manifest_query_duration`
332 - `train_segment_duration`
333 - `query_duration_legacy`
330 - 因此后续继续做工业级化时,应该把 “manifest query 时长 / train clip 时长 / eval query 时长 / report metadata” 统一纳入一个显式配置结构。 334 - 因此后续继续做工业级化时,应该把 “manifest query 时长 / train clip 时长 / eval query 时长 / report metadata” 统一纳入一个显式配置结构。
331 335
332 --- 336 ---
......