Expand external dataset coverage before harder real-data training
Constraint: Open-dataset ingestion needs a way to generate multiple overlapping queries per track, otherwise training/eval coverage stays too sparse
Rejected: Keep only one random external query per track | Leaves long songs underrepresented and weakens reproducibility
Confidence: high
Scope-risk: moderate
Directive: Preserve single-query behavior as the default, but keep overlap-query generation configurable through query_stride for future corpora
Tested: manifest_tools audio-dir-to-splits --help shows --query-stride; prepare-local on data/synthetic_v2/songs with query_duration=8.0 and query_stride=4.0 produced 72 queries with query_index fields
Not-tested: Full end-to-end smoke-local completion on the still-running real FMA corpus with overlap-query mode enabled
Showing 5 changed files with 85 additions and 17 deletions
| ... | @@ -103,6 +103,7 @@ class BaseAdapter: | ... | @@ -103,6 +103,7 @@ class BaseAdapter: |
| 103 | output_root: Path, | 103 | output_root: Path, |
| 104 | eval_ratio: float = 0.2, | 104 | eval_ratio: float = 0.2, |
| 105 | query_duration: float = 8.0, | 105 | query_duration: float = 8.0, |
| 106 | query_stride: float | None = None, | ||
| 106 | seed: int = 42, | 107 | seed: int = 42, |
| 107 | ) -> Dict: | 108 | ) -> Dict: |
| 108 | output_root.mkdir(parents=True, exist_ok=True) | 109 | output_root.mkdir(parents=True, exist_ok=True) |
| ... | @@ -118,9 +119,16 @@ class BaseAdapter: | ... | @@ -118,9 +119,16 @@ class BaseAdapter: |
| 118 | str(eval_ratio), | 119 | str(eval_ratio), |
| 119 | "--query-duration", | 120 | "--query-duration", |
| 120 | str(query_duration), | 121 | str(query_duration), |
| 122 | ] | ||
| 123 | if query_stride is not None: | ||
| 124 | cmd.extend([ | ||
| 125 | "--query-stride", | ||
| 126 | str(query_stride), | ||
| 127 | ]) | ||
| 128 | cmd.extend([ | ||
| 121 | "--seed", | 129 | "--seed", |
| 122 | str(seed), | 130 | str(seed), |
| 123 | ] | 131 | ]) |
| 124 | result = subprocess.check_output(cmd, text=True) | 132 | result = subprocess.check_output(cmd, text=True) |
| 125 | summary = json.loads(result) | 133 | summary = json.loads(result) |
| 126 | summary["input_dir"] = str(input_dir) | 134 | summary["input_dir"] = str(input_dir) |
| ... | @@ -352,6 +360,7 @@ def smoke_local_dataset( | ... | @@ -352,6 +360,7 @@ def smoke_local_dataset( |
| 352 | output_root: Path, | 360 | output_root: Path, |
| 353 | eval_ratio: float, | 361 | eval_ratio: float, |
| 354 | query_duration: float, | 362 | query_duration: float, |
| 363 | query_stride: float | None, | ||
| 355 | seed: int, | 364 | seed: int, |
| 356 | train_epochs: int, | 365 | train_epochs: int, |
| 357 | batch_size: int, | 366 | batch_size: int, |
| ... | @@ -378,6 +387,7 @@ def smoke_local_dataset( | ... | @@ -378,6 +387,7 @@ def smoke_local_dataset( |
| 378 | output_root / dataset, | 387 | output_root / dataset, |
| 379 | eval_ratio=eval_ratio, | 388 | eval_ratio=eval_ratio, |
| 380 | query_duration=query_duration, | 389 | query_duration=query_duration, |
| 390 | query_stride=query_stride, | ||
| 381 | seed=seed, | 391 | seed=seed, |
| 382 | ) | 392 | ) |
| 383 | manifests_dir = Path(prepare_summary["output_dir"]) | 393 | manifests_dir = Path(prepare_summary["output_dir"]) |
| ... | @@ -433,6 +443,7 @@ def smoke_local_dataset( | ... | @@ -433,6 +443,7 @@ def smoke_local_dataset( |
| 433 | resolved_device=resolved_device, | 443 | resolved_device=resolved_device, |
| 434 | base_cfg=base_cfg, | 444 | base_cfg=base_cfg, |
| 435 | ) | 445 | ) |
| 446 | config["data"]["manifest_query_stride"] = query_stride | ||
| 436 | report_dir.mkdir(parents=True, exist_ok=True) | 447 | report_dir.mkdir(parents=True, exist_ok=True) |
| 437 | config_path.write_text(json.dumps(config, indent=2)) | 448 | config_path.write_text(json.dumps(config, indent=2)) |
| 438 | 449 | ||
| ... | @@ -481,6 +492,7 @@ def main(): | ... | @@ -481,6 +492,7 @@ def main(): |
| 481 | p.add_argument("--output-root", default="data/external_ingested") | 492 | p.add_argument("--output-root", default="data/external_ingested") |
| 482 | p.add_argument("--eval-ratio", type=float, default=0.2) | 493 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 483 | p.add_argument("--query-duration", type=float, default=8.0) | 494 | p.add_argument("--query-duration", type=float, default=8.0) |
| 495 | p.add_argument("--query-stride", type=float, default=None) | ||
| 484 | p.add_argument("--seed", type=int, default=42) | 496 | p.add_argument("--seed", type=int, default=42) |
| 485 | 497 | ||
| 486 | p = sub.add_parser("inspect-local") | 498 | p = sub.add_parser("inspect-local") |
| ... | @@ -510,6 +522,7 @@ def main(): | ... | @@ -510,6 +522,7 @@ def main(): |
| 510 | p.add_argument("--output-root", default="data/external_smoke") | 522 | p.add_argument("--output-root", default="data/external_smoke") |
| 511 | p.add_argument("--eval-ratio", type=float, default=0.2) | 523 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 512 | p.add_argument("--query-duration", type=float, default=8.0) | 524 | p.add_argument("--query-duration", type=float, default=8.0) |
| 525 | p.add_argument("--query-stride", type=float, default=None) | ||
| 513 | p.add_argument("--seed", type=int, default=42) | 526 | p.add_argument("--seed", type=int, default=42) |
| 514 | p.add_argument("--train-epochs", type=int, default=1) | 527 | p.add_argument("--train-epochs", type=int, default=1) |
| 515 | p.add_argument("--batch-size", type=int, default=2) | 528 | p.add_argument("--batch-size", type=int, default=2) |
| ... | @@ -531,6 +544,7 @@ def main(): | ... | @@ -531,6 +544,7 @@ def main(): |
| 531 | root, | 544 | root, |
| 532 | eval_ratio=args.eval_ratio, | 545 | eval_ratio=args.eval_ratio, |
| 533 | query_duration=args.query_duration, | 546 | query_duration=args.query_duration, |
| 547 | query_stride=args.query_stride, | ||
| 534 | seed=args.seed, | 548 | seed=args.seed, |
| 535 | ) | 549 | ) |
| 536 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | 550 | print(json.dumps(summary, indent=2, ensure_ascii=False)) |
| ... | @@ -562,6 +576,7 @@ def main(): | ... | @@ -562,6 +576,7 @@ def main(): |
| 562 | output_root=Path(args.output_root), | 576 | output_root=Path(args.output_root), |
| 563 | eval_ratio=args.eval_ratio, | 577 | eval_ratio=args.eval_ratio, |
| 564 | query_duration=args.query_duration, | 578 | query_duration=args.query_duration, |
| 579 | query_stride=args.query_stride, | ||
| 565 | seed=args.seed, | 580 | seed=args.seed, |
| 566 | train_epochs=args.train_epochs, | 581 | train_epochs=args.train_epochs, |
| 567 | batch_size=args.batch_size, | 582 | batch_size=args.batch_size, | ... | ... |
| ... | @@ -9,6 +9,7 @@ import random | ... | @@ -9,6 +9,7 @@ import random |
| 9 | import shutil | 9 | import shutil |
| 10 | from pathlib import Path | 10 | from pathlib import Path |
| 11 | from typing import List, Dict | 11 | from typing import List, Dict |
| 12 | import numpy as np | ||
| 12 | import soundfile as sf | 13 | import soundfile as sf |
| 13 | 14 | ||
| 14 | 15 | ||
| ... | @@ -43,6 +44,7 @@ def build_train_eval_from_audio_dir( | ... | @@ -43,6 +44,7 @@ def build_train_eval_from_audio_dir( |
| 43 | exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"), | 44 | exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"), |
| 44 | eval_ratio: float = 0.2, | 45 | eval_ratio: float = 0.2, |
| 45 | query_duration: float = 8.0, | 46 | query_duration: float = 8.0, |
| 47 | query_stride: float | None = None, | ||
| 46 | seed: int = 42, | 48 | seed: int = 42, |
| 47 | ): | 49 | ): |
| 48 | rng = random.Random(seed) | 50 | rng = random.Random(seed) |
| ... | @@ -80,21 +82,32 @@ def build_train_eval_from_audio_dir( | ... | @@ -80,21 +82,32 @@ def build_train_eval_from_audio_dir( |
| 80 | refs.append(ref) | 82 | refs.append(ref) |
| 81 | 83 | ||
| 82 | if duration >= query_duration: | 84 | if duration >= query_duration: |
| 83 | max_offset = max(0.0, duration - query_duration) | 85 | if query_stride and query_stride > 0: |
| 84 | offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0 | 86 | max_offset = max(0.0, duration - query_duration) |
| 85 | query = { | 87 | offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] |
| 86 | "song_id": song_id, | 88 | if not offsets: |
| 87 | "audio_path": str(rel), | 89 | offsets = [0.0] |
| 88 | "duration": query_duration, | 90 | if offsets[-1] < round(max_offset, 3): |
| 89 | "type": "clean", | 91 | offsets.append(round(max_offset, 3)) |
| 90 | "offset": round(offset, 3), | ||
| 91 | "segment_type": "external_query", | ||
| 92 | "source_dataset": source_dataset, | ||
| 93 | } | ||
| 94 | if rng.random() < eval_ratio: | ||
| 95 | test.append(query) | ||
| 96 | else: | 92 | else: |
| 97 | train.append(query) | 93 | max_offset = max(0.0, duration - query_duration) |
| 94 | offsets = [round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3)] | ||
| 95 | |||
| 96 | for seg_idx, offset in enumerate(offsets): | ||
| 97 | query = { | ||
| 98 | "song_id": song_id, | ||
| 99 | "audio_path": str(rel), | ||
| 100 | "duration": query_duration, | ||
| 101 | "type": "clean", | ||
| 102 | "offset": offset, | ||
| 103 | "segment_type": "external_query", | ||
| 104 | "source_dataset": source_dataset, | ||
| 105 | "query_index": seg_idx, | ||
| 106 | } | ||
| 107 | if rng.random() < eval_ratio: | ||
| 108 | test.append(query) | ||
| 109 | else: | ||
| 110 | train.append(query) | ||
| 98 | 111 | ||
| 99 | if len(files) >= 2 and not train and test: | 112 | if len(files) >= 2 and not train and test: |
| 100 | train.append(test.pop()) | 113 | train.append(test.pop()) |
| ... | @@ -109,6 +122,8 @@ def build_train_eval_from_audio_dir( | ... | @@ -109,6 +122,8 @@ def build_train_eval_from_audio_dir( |
| 109 | "catalog": len(refs), | 122 | "catalog": len(refs), |
| 110 | "train_queries": len(train), | 123 | "train_queries": len(train), |
| 111 | "test_queries": len(test), | 124 | "test_queries": len(test), |
| 125 | "query_duration": query_duration, | ||
| 126 | "query_stride": query_stride, | ||
| 112 | "output_dir": str(manifests_dir), | 127 | "output_dir": str(manifests_dir), |
| 113 | } | 128 | } |
| 114 | 129 | ||
| ... | @@ -209,6 +224,7 @@ def main(): | ... | @@ -209,6 +224,7 @@ def main(): |
| 209 | p.add_argument("--source-dataset", required=True) | 224 | p.add_argument("--source-dataset", required=True) |
| 210 | p.add_argument("--eval-ratio", type=float, default=0.2) | 225 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 211 | p.add_argument("--query-duration", type=float, default=8.0) | 226 | p.add_argument("--query-duration", type=float, default=8.0) |
| 227 | p.add_argument("--query-stride", type=float, default=None) | ||
| 212 | p.add_argument("--seed", type=int, default=42) | 228 | p.add_argument("--seed", type=int, default=42) |
| 213 | 229 | ||
| 214 | p = sub.add_parser("inspect-audio-dir") | 230 | p = sub.add_parser("inspect-audio-dir") |
| ... | @@ -230,6 +246,7 @@ def main(): | ... | @@ -230,6 +246,7 @@ def main(): |
| 230 | source_dataset=args.source_dataset, | 246 | source_dataset=args.source_dataset, |
| 231 | eval_ratio=args.eval_ratio, | 247 | eval_ratio=args.eval_ratio, |
| 232 | query_duration=args.query_duration, | 248 | query_duration=args.query_duration, |
| 249 | query_stride=args.query_stride, | ||
| 233 | seed=args.seed, | 250 | seed=args.seed, |
| 234 | ) | 251 | ) |
| 235 | print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) | 252 | print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) | ... | ... |
| ... | @@ -2,6 +2,37 @@ | ... | @@ -2,6 +2,37 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 为外部数据集接入增加 overlap query manifest 能力 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 修改 `acr-engine/src/data/manifest_tools.py` | ||
| 9 | - 为 `audio-dir-to-splits` 增加 `--query-stride` | ||
| 10 | - 支持按 stride 为单首歌生成多个 query | ||
| 11 | - 新 query 记录增加 `query_index` | ||
| 12 | - 修改 `acr-engine/src/data/external_adapters.py` | ||
| 13 | - `prepare-local` / `smoke-local` 透传 `--query-stride` | ||
| 14 | - smoke 配置摘要里记录 `manifest_query_stride` | ||
| 15 | - 更新 [open-dataset-workflow.md](./open-dataset-workflow.md) 与 [dataset-spec.md](./dataset-spec.md) | ||
| 16 | |||
| 17 | 验证结果: | ||
| 18 | - CLI 验证: | ||
| 19 | - `manifest_tools.py audio-dir-to-splits --help` 已出现 `--query-stride` | ||
| 20 | - 小数据验证: | ||
| 21 | - 执行: | ||
| 22 | - `prepare-local fma data/synthetic_v2/songs --query-duration 8.0 --query-stride 4.0` | ||
| 23 | - 返回: | ||
| 24 | - `train_queries = 57` | ||
| 25 | - `test_queries = 15` | ||
| 26 | - 解析 manifest 后得到: | ||
| 27 | - `num_queries = 72` | ||
| 28 | - `sample_query.offset = 0.0` | ||
| 29 | - `query_index = 0` | ||
| 30 | - `max_query_index = 2` | ||
| 31 | |||
| 32 | 结论: | ||
| 33 | - 现在外部开源数据接入已经不再局限于“每首歌只采一个随机 query” | ||
| 34 | - 当需要更高覆盖率时,可以直接生成多 query / overlap query manifests,用于更稳定的训练与评测 | ||
| 35 | |||
| 5 | ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义 | 36 | ### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义 |
| 6 | 37 | ||
| 7 | 完成项: | 38 | 完成项: | ... | ... |
| ... | @@ -91,13 +91,14 @@ flowchart TD | ... | @@ -91,13 +91,14 @@ flowchart TD |
| 91 | |---|---|---:|---| | 91 | |---|---|---:|---| |
| 92 | | 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | | 92 | | 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) | |
| 93 | | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) | | 93 | | 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) | |
| 94 | | `audio-dir-to-splits` | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | | 94 | | `audio-dir-to-splits` 默认 | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | |
| 95 | | `audio-dir-to-splits --query-stride 4.0` 例 | 对单首歌生成多个滑窗 query | 是,可配置 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) | | ||
| 95 | 96 | ||
| 96 | ### 直接回答你的问题 | 97 | ### 直接回答你的问题 |
| 97 | 98 | ||
| 98 | - **有重叠窗口,但只在检索/索引链路里有。** | 99 | - **有重叠窗口,但只在检索/索引链路里有。** |
| 99 | - **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。 | 100 | - **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。 |
| 100 | - **当前外部数据集 manifest 生成器也没有自动为每首歌生成多个重叠 query。** | 101 | - **当前外部数据集 manifest 生成器默认仍是一首歌 1 个随机 query,但现在已经支持通过 `--query-stride` 开启多 query / overlap query 生成。** |
| 101 | 102 | ||
| 102 | --- | 103 | --- |
| 103 | 104 | ... | ... |
| ... | @@ -68,6 +68,7 @@ flowchart LR | ... | @@ -68,6 +68,7 @@ flowchart LR |
| 68 | ```bash | 68 | ```bash |
| 69 | /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 | 69 | /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 |
| 70 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 | 70 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 |
| 71 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0 | ||
| 71 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests | 72 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests |
| 72 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run | 73 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run |
| 73 | /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu | 74 | /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu |
| ... | @@ -86,6 +87,7 @@ flowchart LR | ... | @@ -86,6 +87,7 @@ flowchart LR |
| 86 | ```bash | 87 | ```bash |
| 87 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 | 88 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 |
| 88 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 --device auto | 89 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 --device auto |
| 90 | /usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0 --train-epochs 1 --batch-size 2 --device auto | ||
| 89 | ``` | 91 | ``` |
| 90 | 92 | ||
| 91 | 真实目录放置位置可参考: | 93 | 真实目录放置位置可参考: |
| ... | @@ -131,6 +133,8 @@ flowchart LR | ... | @@ -131,6 +133,8 @@ flowchart LR |
| 131 | - 会一次性返回 inspect / prepare / validate / report 路径摘要 | 133 | - 会一次性返回 inspect / prepare / validate / report 路径摘要 |
| 132 | - 现在支持 `--device cpu|cuda|auto` | 134 | - 现在支持 `--device cpu|cuda|auto` |
| 133 | - `auto` 会在 smoke 内部解析成实际设备,避免把字符串 `auto` 直接传给 embedding/eval 侧 | 135 | - `auto` 会在 smoke 内部解析成实际设备,避免把字符串 `auto` 直接传给 embedding/eval 侧 |
| 136 | - 现在支持 `--query-stride` | ||
| 137 | - 当设置 `--query-stride < query-duration` 时,会为单首歌生成多个重叠 query,而不是只采 1 个随机 query | ||
| 134 | 138 | ||
| 135 | --- | 139 | --- |
| 136 | 140 | ... | ... |
-
Please register or sign in to post a comment