Close the open-dataset smoke loop through evaluation
Constraint: Open-dataset support was not complete until imported corpora could train, build indexes, and produce eval outputs without manual path surgery
Rejected: Stop at train.py dry-run | Does not prove the retrieval/evaluation half of the workflow actually works
Confidence: high
Scope-risk: moderate
Directive: Keep future external dataset layouts self-contained and manifests-root aware across training, indexing, and evaluation paths
Tested: /usr/local/miniconda3/bin/python train.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --output data/models_open_smoke_fixed --device cpu --epochs 1 --batch-size 2; /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --output data/index_open_smoke_fixed --device cpu; /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --index-prefix data/index_open_smoke_fixed/reference --split test --device cpu --fast-eval --output-json reports/open-smoke-fixed/fma/eval.json; /usr/local/miniconda3/bin/python -m py_compile evaluate.py run_demo.py src/engines/ecapa_embedder.py src/engines/chromaprint_matcher.py src/data/dataset.py src/data/manifest_tools.py src/data/external_adapters.py train.py
Not-tested: Real downloaded FMA or MTG-Jamendo corpora at larger scale
Showing 12 changed files with 70 additions and 8 deletions
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
| 1 | { | ||
| 2 | "fma_00001": 0, | ||
| 3 | "fma_00002": 1, | ||
| 4 | "fma_00005": 2, | ||
| 5 | "fma_00007": 3, | ||
| 6 | "fma_00008": 4, | ||
| 7 | "fma_00010": 5, | ||
| 8 | "fma_00012": 6, | ||
| 9 | "fma_00014": 7, | ||
| 10 | "fma_00015": 8, | ||
| 11 | "fma_00016": 9, | ||
| 12 | "fma_00017": 10, | ||
| 13 | "fma_00018": 11, | ||
| 14 | "fma_00019": 12, | ||
| 15 | "fma_00021": 13, | ||
| 16 | "fma_00022": 14, | ||
| 17 | "fma_00023": 15 | ||
| 18 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -31,6 +31,7 @@ def main(): | ... | @@ -31,6 +31,7 @@ def main(): |
| 31 | args = parser.parse_args() | 31 | args = parser.parse_args() |
| 32 | 32 | ||
| 33 | data_dir = Path(args.data) | 33 | data_dir = Path(args.data) |
| 34 | asset_root = data_dir.parent if data_dir.name == "manifests" else data_dir | ||
| 34 | matcher = ChromaprintMatcher() | 35 | matcher = ChromaprintMatcher() |
| 35 | matcher.load(str(Path(args.index_prefix).parent / "chromaprint.pkl")) | 36 | matcher.load(str(Path(args.index_prefix).parent / "chromaprint.pkl")) |
| 36 | embedder = ECAPAEmbedder(model_path=args.model, device=args.device) | 37 | embedder = ECAPAEmbedder(model_path=args.model, device=args.device) |
| ... | @@ -53,7 +54,7 @@ def main(): | ... | @@ -53,7 +54,7 @@ def main(): |
| 53 | engine.load_metadata(str(p)) | 54 | engine.load_metadata(str(p)) |
| 54 | 55 | ||
| 55 | items = load_items(data_dir / f"{args.split}.json") | 56 | items = load_items(data_dir / f"{args.split}.json") |
| 56 | queries = [x for x in items if str(x.get("audio_path", "")).startswith("segments/")] | 57 | queries = [x for x in items if x.get("type") != "reference"] |
| 57 | if not queries: | 58 | if not queries: |
| 58 | raise SystemExit("No segment queries found for evaluation") | 59 | raise SystemExit("No segment queries found for evaluation") |
| 59 | 60 | ||
| ... | @@ -63,7 +64,7 @@ def main(): | ... | @@ -63,7 +64,7 @@ def main(): |
| 63 | failures = [] | 64 | failures = [] |
| 64 | 65 | ||
| 65 | for item in queries: | 66 | for item in queries: |
| 66 | result = engine.recognize(str(data_dir / item["audio_path"]), top_n=args.top_k) | 67 | result = engine.recognize(str(asset_root / item["audio_path"]), top_n=args.top_k) |
| 67 | preds = [c["song_id"] for c in result["candidates"]] | 68 | preds = [c["song_id"] for c in result["candidates"]] |
| 68 | truth = item["song_id"] | 69 | truth = item["song_id"] |
| 69 | qtype = item.get("type", "unknown") | 70 | qtype = item.get("type", "unknown") | ... | ... |
| ... | @@ -29,9 +29,10 @@ def cmd_generate_data(args): | ... | @@ -29,9 +29,10 @@ def cmd_generate_data(args): |
| 29 | 29 | ||
| 30 | def build_chroma_index(data_dir: Path, output_dir: Path): | 30 | def build_chroma_index(data_dir: Path, output_dir: Path): |
| 31 | matcher = ChromaprintMatcher() | 31 | matcher = ChromaprintMatcher() |
| 32 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' | ||
| 32 | matcher.index_songs_from_dir( | 33 | matcher.index_songs_from_dir( |
| 33 | songs_dir=str(data_dir / 'songs'), | 34 | songs_dir=str(data_dir), |
| 34 | metadata_path=str(data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'), | 35 | metadata_path=str(metadata_path), |
| 35 | cache_path=str(output_dir / 'chromaprint.pkl'), | 36 | cache_path=str(output_dir / 'chromaprint.pkl'), |
| 36 | ) | 37 | ) |
| 37 | print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") | 38 | print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") |
| ... | @@ -40,9 +41,10 @@ def build_chroma_index(data_dir: Path, output_dir: Path): | ... | @@ -40,9 +41,10 @@ def build_chroma_index(data_dir: Path, output_dir: Path): |
| 40 | 41 | ||
| 41 | def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str): | 42 | def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str): |
| 42 | embedder = ECAPAEmbedder(model_path=str(model_path), device=device) | 43 | embedder = ECAPAEmbedder(model_path=str(model_path), device=device) |
| 44 | metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json' | ||
| 43 | ref_embs, ref_ids = embedder.build_reference_index( | 45 | ref_embs, ref_ids = embedder.build_reference_index( |
| 44 | songs_dir=str(data_dir / 'songs'), | 46 | songs_dir=str(data_dir), |
| 45 | metadata_path=str(data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'), | 47 | metadata_path=str(metadata_path), |
| 46 | output_path=str(output_prefix), | 48 | output_path=str(output_prefix), |
| 47 | ) | 49 | ) |
| 48 | print(f"[done] embedding index built: {len(ref_ids)} refs") | 50 | print(f"[done] embedding index built: {len(ref_ids)} refs") | ... | ... |
| ... | @@ -82,7 +82,7 @@ class ChromaprintMatcher: | ... | @@ -82,7 +82,7 @@ class ChromaprintMatcher: |
| 82 | 82 | ||
| 83 | songs_dir = Path(songs_dir) | 83 | songs_dir = Path(songs_dir) |
| 84 | for item in meta: | 84 | for item in meta: |
| 85 | if "songs" not in item.get("audio_path", ""): | 85 | if item.get("type") != "reference": |
| 86 | continue | 86 | continue |
| 87 | audio_path = songs_dir.parent / item["audio_path"] | 87 | audio_path = songs_dir.parent / item["audio_path"] |
| 88 | if not audio_path.exists(): | 88 | if not audio_path.exists(): | ... | ... |
| ... | @@ -103,7 +103,7 @@ class ECAPAEmbedder: | ... | @@ -103,7 +103,7 @@ class ECAPAEmbedder: |
| 103 | songs_dir = Path(songs_dir) | 103 | songs_dir = Path(songs_dir) |
| 104 | 104 | ||
| 105 | for item in meta: | 105 | for item in meta: |
| 106 | if item.get("type") != "reference" and "songs/" not in item.get("audio_path", ""): | 106 | if item.get("type") != "reference": |
| 107 | continue | 107 | continue |
| 108 | audio_path = songs_dir.parent / item["audio_path"] | 108 | audio_path = songs_dir.parent / item["audio_path"] |
| 109 | if not audio_path.exists(): | 109 | if not audio_path.exists(): | ... | ... |
| ... | @@ -72,6 +72,27 @@ | ... | @@ -72,6 +72,27 @@ |
| 72 | - 开放数据路径现在不仅能生成 manifests,还能真正进入训练 | 72 | - 开放数据路径现在不仅能生成 manifests,还能真正进入训练 |
| 73 | - 后续接入真实 FMA / MTG-Jamendo 时,可以直接走同一链路 | 73 | - 后续接入真实 FMA / MTG-Jamendo 时,可以直接走同一链路 |
| 74 | 74 | ||
| 75 | ### Stage: 开放数据完整 smoke 闭环(train/index/eval) | ||
| 76 | |||
| 77 | 完成项: | ||
| 78 | - 修复 `run_demo.py` 对开放数据自包含布局的索引入口假设 | ||
| 79 | - 修复 `src/engines/ecapa_embedder.py` / `src/engines/chromaprint_matcher.py` 对 reference 路径的硬编码筛选 | ||
| 80 | - 修复 `evaluate.py` 对开放数据 query 与 `manifests` 根路径的解析 | ||
| 81 | - 打通开放数据 `prepare-local -> validate-local -> train -> build-index -> evaluate` | ||
| 82 | |||
| 83 | 验证结果: | ||
| 84 | - `/usr/local/miniconda3/bin/python train.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --output data/models_open_smoke_fixed --device cpu --epochs 1 --batch-size 2` 成功 | ||
| 85 | - `/usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --output data/index_open_smoke_fixed --device cpu` 成功 | ||
| 86 | - `/usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --index-prefix data/index_open_smoke_fixed/reference --split test --device cpu --fast-eval --output-json reports/open-smoke-fixed/fma/eval.json` 成功 | ||
| 87 | - 当前结果: | ||
| 88 | - `num_queries=8` | ||
| 89 | - `top1=1.0` | ||
| 90 | - `topk=1.0` | ||
| 91 | |||
| 92 | 结论: | ||
| 93 | - 开放数据接入链路现在已经完整闭环 | ||
| 94 | - 真实 FMA / MTG-Jamendo 本地目录接入时,可直接复用同一流程 | ||
| 95 | |||
| 75 | ### Stage: confused 定向优化 v6(sample-level weighting) | 96 | ### Stage: confused 定向优化 v6(sample-level weighting) |
| 76 | 97 | ||
| 77 | 完成项: | 98 | 完成项: | ... | ... |
| ... | @@ -47,6 +47,8 @@ flowchart LR | ... | @@ -47,6 +47,8 @@ flowchart LR |
| 47 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 | 47 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 |
| 48 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests | 48 | /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests |
| 49 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run | 49 | /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run |
| 50 | /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu | ||
| 51 | /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json | ||
| 50 | ``` | 52 | ``` |
| 51 | 53 | ||
| 52 | ### 3.2 多目录比较 | 54 | ### 3.2 多目录比较 |
| ... | @@ -82,6 +84,9 @@ flowchart LR | ... | @@ -82,6 +84,9 @@ flowchart LR |
| 82 | - `ok=true` | 84 | - `ok=true` |
| 83 | - `train.py --dry-run`: | 85 | - `train.py --dry-run`: |
| 84 | - `Dry run passed! Pipeline is working.` | 86 | - `Dry run passed! Pipeline is working.` |
| 87 | - `build-index + evaluate`: | ||
| 88 | - `top1=1.0` | ||
| 89 | - `topk=1.0` | ||
| 85 | 90 | ||
| 86 | --- | 91 | --- |
| 87 | 92 | ... | ... |
-
Please register or sign in to post a comment