Commit a68a7296 a68a7296b29419e3cac387ade21fdb03fa51121d by cnb.bofCdSsphPA

Expand external dataset coverage before harder real-data training

Constraint: Open-dataset ingestion needs a way to generate multiple overlapping queries per track, otherwise training/eval coverage stays too sparse
Rejected: Keep only one random external query per track | Leaves long songs underrepresented and weakens reproducibility
Confidence: high
Scope-risk: moderate
Directive: Preserve single-query behavior as the default, but keep overlap-query generation configurable through query_stride for future corpora
Tested: manifest_tools audio-dir-to-splits --help shows --query-stride; prepare-local on data/synthetic_v2/songs with query_duration=8.0 and query_stride=4.0 produced 72 queries with query_index fields
Not-tested: Full end-to-end smoke-local completion on the still-running real FMA corpus with overlap-query mode enabled
1 parent d7df0087
......@@ -103,6 +103,7 @@ class BaseAdapter:
output_root: Path,
eval_ratio: float = 0.2,
query_duration: float = 8.0,
query_stride: float | None = None,
seed: int = 42,
) -> Dict:
output_root.mkdir(parents=True, exist_ok=True)
......@@ -118,9 +119,16 @@ class BaseAdapter:
str(eval_ratio),
"--query-duration",
str(query_duration),
]
if query_stride is not None:
cmd.extend([
"--query-stride",
str(query_stride),
])
cmd.extend([
"--seed",
str(seed),
]
])
result = subprocess.check_output(cmd, text=True)
summary = json.loads(result)
summary["input_dir"] = str(input_dir)
......@@ -352,6 +360,7 @@ def smoke_local_dataset(
output_root: Path,
eval_ratio: float,
query_duration: float,
query_stride: float | None,
seed: int,
train_epochs: int,
batch_size: int,
......@@ -378,6 +387,7 @@ def smoke_local_dataset(
output_root / dataset,
eval_ratio=eval_ratio,
query_duration=query_duration,
query_stride=query_stride,
seed=seed,
)
manifests_dir = Path(prepare_summary["output_dir"])
......@@ -433,6 +443,7 @@ def smoke_local_dataset(
resolved_device=resolved_device,
base_cfg=base_cfg,
)
config["data"]["manifest_query_stride"] = query_stride
report_dir.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(config, indent=2))
......@@ -481,6 +492,7 @@ def main():
p.add_argument("--output-root", default="data/external_ingested")
p.add_argument("--eval-ratio", type=float, default=0.2)
p.add_argument("--query-duration", type=float, default=8.0)
p.add_argument("--query-stride", type=float, default=None)
p.add_argument("--seed", type=int, default=42)
p = sub.add_parser("inspect-local")
......@@ -510,6 +522,7 @@ def main():
p.add_argument("--output-root", default="data/external_smoke")
p.add_argument("--eval-ratio", type=float, default=0.2)
p.add_argument("--query-duration", type=float, default=8.0)
p.add_argument("--query-stride", type=float, default=None)
p.add_argument("--seed", type=int, default=42)
p.add_argument("--train-epochs", type=int, default=1)
p.add_argument("--batch-size", type=int, default=2)
......@@ -531,6 +544,7 @@ def main():
root,
eval_ratio=args.eval_ratio,
query_duration=args.query_duration,
query_stride=args.query_stride,
seed=args.seed,
)
print(json.dumps(summary, indent=2, ensure_ascii=False))
......@@ -562,6 +576,7 @@ def main():
output_root=Path(args.output_root),
eval_ratio=args.eval_ratio,
query_duration=args.query_duration,
query_stride=args.query_stride,
seed=args.seed,
train_epochs=args.train_epochs,
batch_size=args.batch_size,
......
......@@ -9,6 +9,7 @@ import random
import shutil
from pathlib import Path
from typing import List, Dict
import numpy as np
import soundfile as sf
......@@ -43,6 +44,7 @@ def build_train_eval_from_audio_dir(
exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"),
eval_ratio: float = 0.2,
query_duration: float = 8.0,
query_stride: float | None = None,
seed: int = 42,
):
rng = random.Random(seed)
......@@ -80,16 +82,27 @@ def build_train_eval_from_audio_dir(
refs.append(ref)
if duration >= query_duration:
if query_stride and query_stride > 0:
max_offset = max(0.0, duration - query_duration)
offset = rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0
offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()]
if not offsets:
offsets = [0.0]
if offsets[-1] < round(max_offset, 3):
offsets.append(round(max_offset, 3))
else:
max_offset = max(0.0, duration - query_duration)
offsets = [round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3)]
for seg_idx, offset in enumerate(offsets):
query = {
"song_id": song_id,
"audio_path": str(rel),
"duration": query_duration,
"type": "clean",
"offset": round(offset, 3),
"offset": offset,
"segment_type": "external_query",
"source_dataset": source_dataset,
"query_index": seg_idx,
}
if rng.random() < eval_ratio:
test.append(query)
......@@ -109,6 +122,8 @@ def build_train_eval_from_audio_dir(
"catalog": len(refs),
"train_queries": len(train),
"test_queries": len(test),
"query_duration": query_duration,
"query_stride": query_stride,
"output_dir": str(manifests_dir),
}
......@@ -209,6 +224,7 @@ def main():
p.add_argument("--source-dataset", required=True)
p.add_argument("--eval-ratio", type=float, default=0.2)
p.add_argument("--query-duration", type=float, default=8.0)
p.add_argument("--query-stride", type=float, default=None)
p.add_argument("--seed", type=int, default=42)
p = sub.add_parser("inspect-audio-dir")
......@@ -230,6 +246,7 @@ def main():
source_dataset=args.source_dataset,
eval_ratio=args.eval_ratio,
query_duration=args.query_duration,
query_stride=args.query_stride,
seed=args.seed,
)
print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
......
......@@ -2,6 +2,37 @@
## 2026-06-02
### Stage: 为外部数据集接入增加 overlap query manifest 能力
完成项:
- 修改 `acr-engine/src/data/manifest_tools.py`
- `audio-dir-to-splits` 增加 `--query-stride`
- 支持按 stride 为单首歌生成多个 query
- 新 query 记录增加 `query_index`
- 修改 `acr-engine/src/data/external_adapters.py`
- `prepare-local` / `smoke-local` 透传 `--query-stride`
- smoke 配置摘要里记录 `manifest_query_stride`
- 更新 [open-dataset-workflow.md](./open-dataset-workflow.md) 与 [dataset-spec.md](./dataset-spec.md)
验证结果:
- CLI 验证:
- `manifest_tools.py audio-dir-to-splits --help` 已出现 `--query-stride`
- 小数据验证:
- 执行:
- `prepare-local fma data/synthetic_v2/songs --query-duration 8.0 --query-stride 4.0`
- 返回:
- `train_queries = 57`
- `test_queries = 15`
- 解析 manifest 后得到:
- `num_queries = 72`
- `sample_query.offset = 0.0`
- `query_index = 0`
- `max_query_index = 2`
结论:
- 现在外部开源数据接入已经不再局限于“每首歌只采一个随机 query”
- 当需要更高覆盖率时,可以直接生成多 query / overlap query manifests,用于更稳定的训练与评测
### Stage: 显式拆分 smoke 配置里的 8s query 与 5s training segment 语义
完成项:
......
......@@ -91,13 +91,14 @@ flowchart TD
|---|---|---:|---|
| 训练 `SongPairDataset` | 每次采样随机取一个 5s clip | 否,**不是固定滑窗** | [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py) |
| 检索 / embedding / 建索引 | `window_sec=5.0`, `stride_sec=2.5` | 是,**50% overlap** | [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py) |
| `audio-dir-to-splits` | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) |
| `audio-dir-to-splits` 默认 | 每首歌只生成 1 个随机 query | 否 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) |
| `audio-dir-to-splits --query-stride 4.0` 例 | 对单首歌生成多个滑窗 query | 是,可配置 | [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py) |
### 直接回答你的问题
- **有重叠窗口,但只在检索/索引链路里有。**
- **当前训练主链路没有对 3 分钟 mp3 预展开成“全量重叠切片集”**,而是每次 batch 动态随机裁一个 5s 片段。
- **当前外部数据集 manifest 生成器也没有自动为每首歌生成多个重叠 query。**
- **当前外部数据集 manifest 生成器默认仍是一首歌 1 个随机 query,但现在已经支持通过 `--query-stride` 开启多 query / overlap query 生成。**
---
......
......@@ -68,6 +68,7 @@ flowchart LR
```bash
/usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0
/usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0
/usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0
/usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests
/usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run
/usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu
......@@ -86,6 +87,7 @@ flowchart LR
```bash
/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2
/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --train-epochs 1 --batch-size 2 --device auto
/usr/local/miniconda3/bin/python src/data/external_adapters.py smoke-local fma data/raw/fma_small_audio --output-root data/external_smoke --eval-ratio 0.2 --query-duration 8.0 --query-stride 4.0 --train-epochs 1 --batch-size 2 --device auto
```
真实目录放置位置可参考:
......@@ -131,6 +133,8 @@ flowchart LR
- 会一次性返回 inspect / prepare / validate / report 路径摘要
- 现在支持 `--device cpu|cuda|auto`
- `auto` 会在 smoke 内部解析成实际设备,避免把字符串 `auto` 直接传给 embedding/eval 侧
- 现在支持 `--query-stride`
- 当 `--query-stride` 小于 `--query-duration` 时,会为单首歌生成多个重叠 query,而不是只采 1 个随机 query
---
......