Commit a68a7296 a68a7296b29419e3cac387ade21fdb03fa51121d by cnb.bofCdSsphPA

Expand external dataset coverage before harder real-data training

Constraint: Open-dataset ingestion needs a way to generate multiple overlapping queries per track, otherwise training/eval coverage stays too sparse
Rejected: Keep only one random external query per track | Leaves long songs underrepresented and weakens reproducibility
Confidence: high
Scope-risk: moderate
Directive: Preserve single-query behavior as the default, but keep overlap-query generation configurable through query_stride for future corpora
Tested: manifest_tools audio-dir-to-splits --help shows --query-stride; prepare-local on data/synthetic_v2/songs with query_duration=8.0 and query_stride=4.0 produced 72 queries with query_index fields
Not-tested: Full end-to-end smoke-local completion on the still-running real FMA corpus with overlap-query mode enabled
1 parent d7df0087
...@@ -103,6 +103,7 @@ class BaseAdapter: ...@@ -103,6 +103,7 @@ class BaseAdapter:
103 output_root: Path, 103 output_root: Path,
104 eval_ratio: float = 0.2, 104 eval_ratio: float = 0.2,
105 query_duration: float = 8.0, 105 query_duration: float = 8.0,
106 query_stride: float | None = None,
106 seed: int = 42, 107 seed: int = 42,
107 ) -> Dict: 108 ) -> Dict:
108 output_root.mkdir(parents=True, exist_ok=True) 109 output_root.mkdir(parents=True, exist_ok=True)
...@@ -118,9 +119,16 @@ class BaseAdapter: ...@@ -118,9 +119,16 @@ class BaseAdapter:
118 str(eval_ratio), 119 str(eval_ratio),
119 "--query-duration", 120 "--query-duration",
120 str(query_duration), 121 str(query_duration),
122 ]
123 if query_stride is not None:
124 cmd.extend([
125 "--query-stride",
126 str(query_stride),
127 ])
128 cmd.extend([
121 "--seed", 129 "--seed",
122 str(seed), 130 str(seed),
123 ] 131 ])
124 result = subprocess.check_output(cmd, text=True) 132 result = subprocess.check_output(cmd, text=True)
125 summary = json.loads(result) 133 summary = json.loads(result)
126 summary["input_dir"] = str(input_dir) 134 summary["input_dir"] = str(input_dir)
...@@ -352,6 +360,7 @@ def smoke_local_dataset( ...@@ -352,6 +360,7 @@ def smoke_local_dataset(
352 output_root: Path, 360 output_root: Path,
353 eval_ratio: float, 361 eval_ratio: float,
354 query_duration: float, 362 query_duration: float,
363 query_stride: float | None,
355 seed: int, 364 seed: int,
356 train_epochs: int, 365 train_epochs: int,
357 batch_size: int, 366 batch_size: int,
...@@ -378,6 +387,7 @@ def smoke_local_dataset( ...@@ -378,6 +387,7 @@ def smoke_local_dataset(
378 output_root / dataset, 387 output_root / dataset,
379 eval_ratio=eval_ratio, 388 eval_ratio=eval_ratio,
380 query_duration=query_duration, 389 query_duration=query_duration,
390 query_stride=query_stride,
381 seed=seed, 391 seed=seed,
382 ) 392 )
383 manifests_dir = Path(prepare_summary["output_dir"]) 393 manifests_dir = Path(prepare_summary["output_dir"])
...@@ -433,6 +443,7 @@ def smoke_local_dataset( ...@@ -433,6 +443,7 @@ def smoke_local_dataset(
433 resolved_device=resolved_device, 443 resolved_device=resolved_device,
434 base_cfg=base_cfg, 444 base_cfg=base_cfg,
435 ) 445 )
446 config["data"]["manifest_query_stride"] = query_stride
436 report_dir.mkdir(parents=True, exist_ok=True) 447 report_dir.mkdir(parents=True, exist_ok=True)
437 config_path.write_text(json.dumps(config, indent=2)) 448 config_path.write_text(json.dumps(config, indent=2))
438 449
...@@ -481,6 +492,7 @@ def main(): ...@@ -481,6 +492,7 @@ def main():
481 p.add_argument("--output-root", default="data/external_ingested") 492 p.add_argument("--output-root", default="data/external_ingested")
482 p.add_argument("--eval-ratio", type=float, default=0.2) 493 p.add_argument("--eval-ratio", type=float, default=0.2)
483 p.add_argument("--query-duration", type=float, default=8.0) 494 p.add_argument("--query-duration", type=float, default=8.0)
495 p.add_argument("--query-stride", type=float, default=None)
484 p.add_argument("--seed", type=int, default=42) 496 p.add_argument("--seed", type=int, default=42)
485 497
486 p = sub.add_parser("inspect-local") 498 p = sub.add_parser("inspect-local")
...@@ -510,6 +522,7 @@ def main(): ...@@ -510,6 +522,7 @@ def main():
510 p.add_argument("--output-root", default="data/external_smoke") 522 p.add_argument("--output-root", default="data/external_smoke")
511 p.add_argument("--eval-ratio", type=float, default=0.2) 523 p.add_argument("--eval-ratio", type=float, default=0.2)
512 p.add_argument("--query-duration", type=float, default=8.0) 524 p.add_argument("--query-duration", type=float, default=8.0)
525 p.add_argument("--query-stride", type=float, default=None)
513 p.add_argument("--seed", type=int, default=42) 526 p.add_argument("--seed", type=int, default=42)
514 p.add_argument("--train-epochs", type=int, default=1) 527 p.add_argument("--train-epochs", type=int, default=1)
515 p.add_argument("--batch-size", type=int, default=2) 528 p.add_argument("--batch-size", type=int, default=2)
...@@ -531,6 +544,7 @@ def main(): ...@@ -531,6 +544,7 @@ def main():
531 root, 544 root,
532 eval_ratio=args.eval_ratio, 545 eval_ratio=args.eval_ratio,
533 query_duration=args.query_duration, 546 query_duration=args.query_duration,
547 query_stride=args.query_stride,
534 seed=args.seed, 548 seed=args.seed,
535 ) 549 )
536 print(json.dumps(summary, indent=2, ensure_ascii=False)) 550 print(json.dumps(summary, indent=2, ensure_ascii=False))
...@@ -562,6 +576,7 @@ def main(): ...@@ -562,6 +576,7 @@ def main():
562 output_root=Path(args.output_root), 576 output_root=Path(args.output_root),
563 eval_ratio=args.eval_ratio, 577 eval_ratio=args.eval_ratio,
564 query_duration=args.query_duration, 578 query_duration=args.query_duration,
579 query_stride=args.query_stride,
565 seed=args.seed, 580 seed=args.seed,
566 train_epochs=args.train_epochs, 581 train_epochs=args.train_epochs,
567 batch_size=args.batch_size, 582 batch_size=args.batch_size,
......
...@@ -9,6 +9,7 @@ import random ...@@ -9,6 +9,7 @@ import random
9 import shutil 9 import shutil
10 from pathlib import Path 10 from pathlib import Path
11 from typing import List, Dict 11 from typing import List, Dict
12 import numpy as np
12 import soundfile as sf 13 import soundfile as sf
13 14
14 15
...@@ -43,6 +44,7 @@ def build_train_eval_from_audio_dir( ...@@ -43,6 +44,7 @@ def build_train_eval_from_audio_dir(
43 exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"), 44 exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"),
44 eval_ratio: float = 0.2, 45 eval_ratio: float = 0.2,
45 query_duration: float = 8.0, 46 query_duration: float = 8.0,
47 query_stride: float | None = None,
46 seed: int = 42, 48 seed: int = 42,
47 ): 49 ):
48 rng = random.Random(seed) 50 rng = random.Random(seed)
...@@ -80,16 +82,27 @@ def build_train_eval_from_audio_dir( ...@@ -80,16 +82,27 @@ def build_train_eval_from_audio_dir(
80 refs.append(ref) 82 refs.append(ref)
81 83
82 if duration >= query_duration: 84 if duration >= query_duration:
85 if query_stride and query_stride > 0:
83 max_offset = max(0.0, duration - query_duration) 86 max_offset = max(0.0, duration - query_duration)
84 offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0 87 offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()]
88 if not offsets:
89 offsets = [0.0]
90 if offsets[-1] < round(max_offset, 3):
91 offsets.append(round(max_offset, 3))
92 else:
93 max_offset = max(0.0, duration - query_duration)
94 offsets = [round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3)]
95
96 for seg_idx, offset in enumerate(offsets):
85 query = { 97 query = {
86 "song_id": song_id, 98 "song_id": song_id,
87 "audio_path": str(rel), 99 "audio_path": str(rel),
88 "duration": query_duration, 100 "duration": query_duration,
89 "type": "clean", 101 "type": "clean",
90 "offset": round(offset, 3), 102 "offset": offset,
91 "segment_type": "external_query", 103 "segment_type": "external_query",
92 "source_dataset": source_dataset, 104 "source_dataset": source_dataset,
105 "query_index": seg_idx,
93 } 106 }
94 if rng.random() < eval_ratio: 107 if rng.random() < eval_ratio:
95 test.append(query) 108 test.append(query)
...@@ -109,6 +122,8 @@ def build_train_eval_from_audio_dir( ...@@ -109,6 +122,8 @@ def build_train_eval_from_audio_dir(
109 "catalog": len(refs), 122 "catalog": len(refs),
110 "train_queries": len(train), 123 "train_queries": len(train),
111 "test_queries": len(test), 124 "test_queries": len(test),
125 "query_duration": query_duration,
126 "query_stride": query_stride,
112 "output_dir": str(manifests_dir), 127 "output_dir": str(manifests_dir),
113 } 128 }
114 129
...@@ -209,6 +224,7 @@ def main(): ...@@ -209,6 +224,7 @@ def main():
209 p.add_argument("--source-dataset", required=True) 224 p.add_argument("--source-dataset", required=True)
210 p.add_argument("--eval-ratio", type=float, default=0.2) 225 p.add_argument("--eval-ratio", type=float, default=0.2)
211 p.add_argument("--query-duration", type=float, default=8.0) 226 p.add_argument("--query-duration", type=float, default=8.0)
227 p.add_argument("--query-stride", type=float, default=None)
212 p.add_argument("--seed", type=int, default=42) 228 p.add_argument("--seed", type=int, default=42)
213 229
214 p = sub.add_parser("inspect-audio-dir") 230 p = sub.add_parser("inspect-audio-dir")
...@@ -230,6 +246,7 @@ def main(): ...@@ -230,6 +246,7 @@ def main():
230 source_dataset=args.source_dataset, 246 source_dataset=args.source_dataset,
231 eval_ratio=args.eval_ratio, 247 eval_ratio=args.eval_ratio,
232 query_duration=args.query_duration, 248 query_duration=args.query_duration,
249 query_stride=args.query_stride,
233 seed=args.seed, 250 seed=args.seed,
234 ) 251 )
235 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) 252 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
......
...@@ -2,6 +2,37 @@ ...@@ -2,6 +2,37 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 为外部数据集接入增加 overlap query manifest 能力
6
7 完成项:
8 - 修改 `acr-engine/src/data/manifest_tools.py`
9 - `audio-dir-to-splits` 增加 `--query-stride`
10 - 支持按 stride 为单首歌生成多个 query
11 - 新 query 记录增加 `query_index`
12 - 修改 `acr-engine/src/data/external_adapters.py`
13 - `prepare-local` / `smoke-local` 透传 `--query-stride`
14 - smoke 配置摘要里记录 `manifest_query_stride`
15 - 更新 [open-dataset-workflow.md](./open-dataset-workflow.md) 和 [dataset-spec.md](./dataset-spec.md)
16
17 验证结果:
18 - CLI 验证:
19 - `manifest_tools.py audio-dir-to-splits --help` 已出现 `--query-stride`
20 - 小数据验证:
21 - 执行:
22 - `prepare-local fma data/synthetic_v2/songs --query-duration 8.0 --query-stride 4.0`
23 - 返回:
24 - `train_queries = 57`
25 - `test_queries = 15`
26 - 解析 manifest 后得到:
27 - `num_queries = 72`
28 - `sample_query.offset = 0.0`
29 - `query_index = 0`
30 - `max_query_index = 2`
31
32 结论:
33 - 现在外部开源数据接入已经不再局限于“每首歌只采一个随机 query”
34 - 当需要更高覆盖率时,可以直接生成多 query / overlap query manifests,用于更稳定的训练与评测
35
5 ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义 36 ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义
6 37
7 完成项: 38 完成项:
......
...@@ -91,13 +91,14 @@ flowchart TD ...@@ -91,13 +91,14 @@ flowchart TD
91 |---|---|---:|---| 91 |---|---|---:|---|
92 | 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | 92 | 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) |
93 | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) | 93 | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) |
94 | `audio-dir-to-splits` | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | 94 | `audio-dir-to-splits` 默认 | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) |
95 | `audio-dir-to-splits --query-stride 4.0`(示例) | 对单首歌生成多个滑窗 query | 是,可配置 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) |
95 96
96 ### 直接回答你的问题 97 ### 直接回答你的问题
97 98
98 - **有重叠窗口,但只在检索/索引链路里有。** 99 - **有重叠窗口,但只在检索/索引链路里有。**
99 - **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。 100 - **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。
100 - **当前外部数据集 manifest 生成器也没有自动为每首歌生成多个重叠 query。** 101 - **当前外部数据集 manifest 生成器默认仍是一首歌 1 个随机 query,但现在已经支持通过 `--query-stride` 开启多 query / overlap query 生成。**
101 102
102 --- 103 ---
103 104
......
...@@ -68,6 +68,7 @@ flowchart LR ...@@ -68,6 +68,7 @@ flowchart LR
68 ```bash 68 ```bash
69 /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 69 /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0
70 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 70 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0
71 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0
71 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests 72 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests
72 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run 73 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run
73 /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu 74 /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu
...@@ -86,6 +87,7 @@ flowchart LR ...@@ -86,6 +87,7 @@ flowchart LR
86 ```bash 87 ```bash
87 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 88 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2
88 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 --device auto 89 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 --device auto
90 /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0 --train-epochs 1 --batch-size 2 --device auto
89 ``` 91 ```
90 92
91 真实目录放置位置可参考: 93 真实目录放置位置可参考:
...@@ -131,6 +133,8 @@ flowchart LR ...@@ -131,6 +133,8 @@ flowchart LR
131 - 会一次性返回 inspect / prepare / validate / report 路径摘要 133 - 会一次性返回 inspect / prepare / validate / report 路径摘要
132 - 现在支持 `--device cpu|cuda|auto` 134 - 现在支持 `--device cpu|cuda|auto`
133 - `auto` 会在 smoke 内部解析成实际设备,避免把字符串 `auto` 直接传给 embedding/eval 侧 135 - `auto` 会在 smoke 内部解析成实际设备,避免把字符串 `auto` 直接传给 embedding/eval 侧
136 - 现在支持 `--query-stride`
137 - 设置 `--query-stride` 后,会按该步长为单首歌生成多个 query,而不是只采 1 个随机 query;当 `--query-stride` 小于 `--query-duration` 时,相邻 query 之间会相互重叠
134 138
135 --- 139 ---
136 140
......