Commit dc9ef1b8 dc9ef1b810185d2eda7258f5095c9dd5cb297f3f by cnb.bofCdSsphPA

Close the open-dataset smoke loop through evaluation

Constraint: Open-dataset support was not complete until imported corpora could train, build indexes, and produce eval outputs without manual path surgery
Rejected: Stop at train.py dry-run | Does not prove the retrieval/evaluation half of the workflow actually works
Confidence: high
Scope-risk: moderate
Directive: Keep future external dataset layouts self-contained and manifests-root-aware across the training, indexing, and evaluation paths
Tested: /usr/local/miniconda3/bin/python train.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --output data/models_open_smoke_fixed --device cpu --epochs 1 --batch-size 2; /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --output data/index_open_smoke_fixed --device cpu; /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --index-prefix data/index_open_smoke_fixed/reference --split test --device cpu --fast-eval --output-json reports/open-smoke-fixed/fma/eval.json; /usr/local/miniconda3/bin/python -m py_compile evaluate.py run_demo.py src/engines/ecapa_embedder.py src/engines/chromaprint_matcher.py src/data/dataset.py src/data/manifest_tools.py src/data/external_adapters.py train.py
Not-tested: Real downloaded FMA or MTG-Jamendo corpora at larger scale
1 parent b766c74e
This file is too large to display.
1 {
2 "fma_00001": 0,
3 "fma_00002": 1,
4 "fma_00005": 2,
5 "fma_00007": 3,
6 "fma_00008": 4,
7 "fma_00010": 5,
8 "fma_00012": 6,
9 "fma_00014": 7,
10 "fma_00015": 8,
11 "fma_00016": 9,
12 "fma_00017": 10,
13 "fma_00018": 11,
14 "fma_00019": 12,
15 "fma_00021": 13,
16 "fma_00022": 14,
17 "fma_00023": 15
18 }
...\ No newline at end of file ...\ No newline at end of file
...@@ -31,6 +31,7 @@ def main(): ...@@ -31,6 +31,7 @@ def main():
31 args = parser.parse_args() 31 args = parser.parse_args()
32 32
33 data_dir = Path(args.data) 33 data_dir = Path(args.data)
34 asset_root = data_dir.parent if data_dir.name == "manifests" else data_dir
34 matcher = ChromaprintMatcher() 35 matcher = ChromaprintMatcher()
35 matcher.load(str(Path(args.index_prefix).parent / "chromaprint.pkl")) 36 matcher.load(str(Path(args.index_prefix).parent / "chromaprint.pkl"))
36 embedder = ECAPAEmbedder(model_path=args.model, device=args.device) 37 embedder = ECAPAEmbedder(model_path=args.model, device=args.device)
...@@ -53,7 +54,7 @@ def main(): ...@@ -53,7 +54,7 @@ def main():
53 engine.load_metadata(str(p)) 54 engine.load_metadata(str(p))
54 55
55 items = load_items(data_dir / f"{args.split}.json") 56 items = load_items(data_dir / f"{args.split}.json")
56 queries = [x for x in items if str(x.get("audio_path", "")).startswith("segments/")] 57 queries = [x for x in items if x.get("type") != "reference"]
57 if not queries: 58 if not queries:
58 raise SystemExit("No segment queries found for evaluation") 59 raise SystemExit("No segment queries found for evaluation")
59 60
...@@ -63,7 +64,7 @@ def main(): ...@@ -63,7 +64,7 @@ def main():
63 failures = [] 64 failures = []
64 65
65 for item in queries: 66 for item in queries:
66 result = engine.recognize(str(data_dir / item["audio_path"]), top_n=args.top_k) 67 result = engine.recognize(str(asset_root / item["audio_path"]), top_n=args.top_k)
67 preds = [c["song_id"] for c in result["candidates"]] 68 preds = [c["song_id"] for c in result["candidates"]]
68 truth = item["song_id"] 69 truth = item["song_id"]
69 qtype = item.get("type", "unknown") 70 qtype = item.get("type", "unknown")
......
1 {
2 "split": "test",
3 "num_queries": 8,
4 "top1": 1.0,
5 "topk": 1.0,
6 "by_type": {
7 "clean": {
8 "n": 8,
9 "top1": 1.0,
10 "topk": 1.0
11 }
12 },
13 "hard_case_summary": {},
14 "sample_failures": []
15 }
...\ No newline at end of file ...\ No newline at end of file
...@@ -29,9 +29,10 @@ def cmd_generate_data(args): ...@@ -29,9 +29,10 @@ def cmd_generate_data(args):
29 29
30 def build_chroma_index(data_dir: Path, output_dir: Path): 30 def build_chroma_index(data_dir: Path, output_dir: Path):
31 matcher = ChromaprintMatcher() 31 matcher = ChromaprintMatcher()
32 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'
32 matcher.index_songs_from_dir( 33 matcher.index_songs_from_dir(
33 songs_dir=str(data_dir / 'songs'), 34 songs_dir=str(data_dir),
34 metadata_path=str(data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'), 35 metadata_path=str(metadata_path),
35 cache_path=str(output_dir / 'chromaprint.pkl'), 36 cache_path=str(output_dir / 'chromaprint.pkl'),
36 ) 37 )
37 print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}") 38 print(f"[done] chromaprint index built: hashes={matcher.num_hashes}, postings={matcher.index_size}")
...@@ -40,9 +41,10 @@ def build_chroma_index(data_dir: Path, output_dir: Path): ...@@ -40,9 +41,10 @@ def build_chroma_index(data_dir: Path, output_dir: Path):
40 41
41 def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str): 42 def build_embedding_index(data_dir: Path, model_path: Path, output_prefix: Path, device: str):
42 embedder = ECAPAEmbedder(model_path=str(model_path), device=device) 43 embedder = ECAPAEmbedder(model_path=str(model_path), device=device)
44 metadata_path = data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'
43 ref_embs, ref_ids = embedder.build_reference_index( 45 ref_embs, ref_ids = embedder.build_reference_index(
44 songs_dir=str(data_dir / 'songs'), 46 songs_dir=str(data_dir),
45 metadata_path=str(data_dir / 'catalog.json' if (data_dir / 'catalog.json').exists() else data_dir / 'train.json'), 47 metadata_path=str(metadata_path),
46 output_path=str(output_prefix), 48 output_path=str(output_prefix),
47 ) 49 )
48 print(f"[done] embedding index built: {len(ref_ids)} refs") 50 print(f"[done] embedding index built: {len(ref_ids)} refs")
......
...@@ -82,7 +82,7 @@ class ChromaprintMatcher: ...@@ -82,7 +82,7 @@ class ChromaprintMatcher:
82 82
83 songs_dir = Path(songs_dir) 83 songs_dir = Path(songs_dir)
84 for item in meta: 84 for item in meta:
85 if "songs" not in item.get("audio_path", ""): 85 if item.get("type") != "reference":
86 continue 86 continue
87 audio_path = songs_dir.parent / item["audio_path"] 87 audio_path = songs_dir.parent / item["audio_path"]
88 if not audio_path.exists(): 88 if not audio_path.exists():
......
...@@ -103,7 +103,7 @@ class ECAPAEmbedder: ...@@ -103,7 +103,7 @@ class ECAPAEmbedder:
103 songs_dir = Path(songs_dir) 103 songs_dir = Path(songs_dir)
104 104
105 for item in meta: 105 for item in meta:
106 if item.get("type") != "reference" and "songs/" not in item.get("audio_path", ""): 106 if item.get("type") != "reference":
107 continue 107 continue
108 audio_path = songs_dir.parent / item["audio_path"] 108 audio_path = songs_dir.parent / item["audio_path"]
109 if not audio_path.exists(): 109 if not audio_path.exists():
......
...@@ -72,6 +72,27 @@ ...@@ -72,6 +72,27 @@
72 - 开放数据路径现在不仅能生成 manifests,还能真正进入训练 72 - 开放数据路径现在不仅能生成 manifests,还能真正进入训练
73 - 后续接入真实 FMA / MTG-Jamendo 时,可以直接走同一链路 73 - 后续接入真实 FMA / MTG-Jamendo 时,可以直接走同一链路
74 74
75 ### Stage: 开放数据完整 smoke 闭环(train/index/eval)
76
77 完成项:
78 - 修复 `run_demo.py` 对开放数据自包含布局的索引入口假设
79 - 修复 `src/engines/ecapa_embedder.py` / `src/engines/chromaprint_matcher.py` 对 reference 路径的硬编码筛选
80 - 修复 `evaluate.py` 对开放数据 query 与 `manifests` 根路径的解析
81 - 打通开放数据 `prepare-local -> validate-local -> train -> build-index -> evaluate`
82
83 验证结果:
84 - `/usr/local/miniconda3/bin/python train.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --output data/models_open_smoke_fixed --device cpu --epochs 1 --batch-size 2` 成功
85 - `/usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --output data/index_open_smoke_fixed --device cpu` 成功
86 - `/usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/synthetic_as_open_fixed/fma/manifests --model data/models_open_smoke_fixed/best_model.pt --index-prefix data/index_open_smoke_fixed/reference --split test --device cpu --fast-eval --output-json reports/open-smoke-fixed/fma/eval.json` 成功
87 - 当前结果:
88 - `num_queries=8`
89 - `top1=1.0`
90 - `topk=1.0`
91
92 结论:
93 - 开放数据接入链路现在已经完整闭环
94 - 真实 FMA / MTG-Jamendo 本地目录接入时,可直接复用同一流程
95
75 ### Stage: confused 定向优化 v6(sample-level weighting) 96 ### Stage: confused 定向优化 v6(sample-level weighting)
76 97
77 完成项: 98 完成项:
......
...@@ -47,6 +47,8 @@ flowchart LR ...@@ -47,6 +47,8 @@ flowchart LR
47 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 47 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0
48 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests 48 /usr/local/miniconda3/bin/python src/data/external_adapters.py validate-local fma data/external_ingested/fma/manifests
49 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run 49 /usr/local/miniconda3/bin/python train.py --data data/external_ingested/fma/manifests --output data/models_fma_smoke --device cpu --epochs 1 --batch-size 2 --dry-run
50 /usr/local/miniconda3/bin/python run_demo.py build-index --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --output data/index_fma_smoke --device cpu
51 /usr/local/miniconda3/bin/python evaluate.py --data data/external_ingested/fma/manifests --model data/models_fma_smoke/best_model.pt --index-prefix data/index_fma_smoke/reference --split test --device cpu --fast-eval --output-json reports/fma-smoke/eval.json
50 ``` 52 ```
51 53
52 ### 3.2 多目录比较 54 ### 3.2 多目录比较
...@@ -82,6 +84,9 @@ flowchart LR ...@@ -82,6 +84,9 @@ flowchart LR
82 - `ok=true` 84 - `ok=true`
83 - `train.py --dry-run` 85 - `train.py --dry-run`
84 - `Dry run passed! Pipeline is working.` 86 - `Dry run passed! Pipeline is working.`
87 - `build-index + evaluate`
88 - `top1=1.0`
89 - `topk=1.0`
85 90
86 --- 91 ---
87 92
......