Commit c734a31e c734a31eeaa91c64e491d2601b3161043045fefb by cnb.bofCdSsphPA

Add open-dataset inventory checks before ingestion

Constraint: Personal-use dataset setup needs quick scale visibility before generating train/eval manifests
Rejected: Generate splits blindly | Hides whether a local corpus is large enough for meaningful train/test separation
Confidence: high
Scope-risk: narrow
Directive: Run inspect-local on real FMA or MTG-Jamendo folders before prepare-local and training
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/manifest_tools.py src/data/external_adapters.py; /usr/local/miniconda3/bin/python src/data/manifest_tools.py inspect-audio-dir tmp/open_music_demo --query-duration 5.0 --eval-ratio 0.5; /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma tmp/open_music_demo --eval-ratio 0.5 --query-duration 5.0
Not-tested: Real large external corpus inventory on downloaded FMA or MTG-Jamendo directories
1 parent fb1d00b6
1 # External Open-Music Ingestion
2
3 ## Goal
4 Convert local open-music audio folders into ACR-ready manifests for:
5 - training queries
6 - evaluation queries
7 - reference catalog indexing
8
9 ## Recommended personal-use flow
10
11 ### 1. Prepare a local audio directory
12 Examples:
13 - `data/raw/fma_small_audio/`
14 - `data/raw/mtg_jamendo_audio/`
15
16 ### 2. Generate manifests through the adapter entrypoint
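17 Optional pre-check (prints the audio file count, the number of files long enough to cut a query from, recommended train/test query counts, and basic duration stats; no manifests are written):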
18 ```bash
19 /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0
20 ```
21
22 Then generate manifests:
23 ```bash
24 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0
25 ```
26
27 or
28
29 ```bash
30 /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local mtg_jamendo data/raw/mtg_jamendo_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0
31 ```
32
33 ### 3. Use outputs
34 Generated files:
35 - `catalog.json`: reference tracks for indexing
36 - `train.json`: train queries + references
37 - `test.json`: held-out eval queries + references
38 - `val.json`: optional validation split
39
40 ## Notes
41 - Small datasets are automatically protected so both train/test query sets exist.
42 - For personal use, FMA and MTG-Jamendo should be the first real baselines.
43 - Keep `test.json` fixed across experiments to compare models fairly.
...@@ -73,6 +73,27 @@ class BaseAdapter: ...@@ -73,6 +73,27 @@ class BaseAdapter:
73 summary["dataset"] = self.name 73 summary["dataset"] = self.name
74 return summary 74 return summary
75 75
def inspect_local_audio(
    self,
    input_dir: Path,
    query_duration: float = 8.0,
    eval_ratio: float = 0.2,
) -> Dict:
    """Report the size of a local audio folder without generating manifests.

    Runs the ``inspect-audio-dir`` sub-command of ``manifest_tools.py`` in a
    child process, parses the JSON it prints and tags the result with this
    adapter's dataset name.

    Args:
        input_dir: Folder containing the dataset's audio files.
        query_duration: Forwarded as ``--query-duration``.
        eval_ratio: Forwarded as ``--eval-ratio``.

    Returns:
        The parsed JSON summary with an added ``"dataset"`` key set to
        ``self.name``.

    Raises:
        subprocess.CalledProcessError: If the child process exits non-zero.
        json.JSONDecodeError: If the child process prints something that is
            not valid JSON.
    """
    # Function-scope import: the module's top-level import block is outside
    # this block, so the new dependency is brought into scope here.
    import sys

    # Use the interpreter that is already running this module instead of a
    # machine-specific absolute path, and locate manifest_tools.py next to this
    # file so the command works regardless of the current working directory.
    tool_script = Path(__file__).resolve().with_name("manifest_tools.py")
    cmd = [
        sys.executable,
        str(tool_script),
        "inspect-audio-dir",
        str(input_dir),
        "--query-duration",
        str(query_duration),
        "--eval-ratio",
        str(eval_ratio),
    ]
    result = subprocess.check_output(cmd, text=True)
    summary = json.loads(result)
    summary["dataset"] = self.name
    return summary
96
76 97
77 class FMAAdapter(BaseAdapter): 98 class FMAAdapter(BaseAdapter):
78 name = "fma" 99 name = "fma"
...@@ -195,6 +216,12 @@ def main(): ...@@ -195,6 +216,12 @@ def main():
195 p.add_argument("--query-duration", type=float, default=8.0) 216 p.add_argument("--query-duration", type=float, default=8.0)
196 p.add_argument("--seed", type=int, default=42) 217 p.add_argument("--seed", type=int, default=42)
197 218
219 p = sub.add_parser("inspect-local")
220 p.add_argument("dataset", choices=sorted(ADAPTERS))
221 p.add_argument("input_dir")
222 p.add_argument("--eval-ratio", type=float, default=0.2)
223 p.add_argument("--query-duration", type=float, default=8.0)
224
198 args = parser.parse_args() 225 args = parser.parse_args()
199 if args.cmd == "registry": 226 if args.cmd == "registry":
200 path = write_registry(args.output) 227 path = write_registry(args.output)
...@@ -214,6 +241,13 @@ def main(): ...@@ -214,6 +241,13 @@ def main():
214 seed=args.seed, 241 seed=args.seed,
215 ) 242 )
216 print(json.dumps(summary, indent=2, ensure_ascii=False)) 243 print(json.dumps(summary, indent=2, ensure_ascii=False))
244 elif args.cmd == "inspect-local":
245 summary = ADAPTERS[args.dataset].inspect_local_audio(
246 Path(args.input_dir),
247 eval_ratio=args.eval_ratio,
248 query_duration=args.query_duration,
249 )
250 print(json.dumps(summary, indent=2, ensure_ascii=False))
217 251
218 252
219 if __name__ == "__main__": 253 if __name__ == "__main__":
......
...@@ -106,6 +106,44 @@ def build_train_eval_from_audio_dir( ...@@ -106,6 +106,44 @@ def build_train_eval_from_audio_dir(
106 } 106 }
107 107
108 108
def inspect_audio_dir(
    audio_dir: Path,
    exts: tuple[str, ...] = (".wav", ".mp3", ".flac", ".ogg"),
    query_duration: float = 8.0,
    eval_ratio: float = 0.2,
):
    """Summarise a local audio folder before any manifests are generated.

    Recursively collects files under ``audio_dir`` whose lower-cased suffix is
    in ``exts``, reads each file's duration via ``sf.info`` and reports how
    many are long enough to serve as queries.

    Args:
        audio_dir: Root folder to scan (searched recursively).
        exts: Accepted file suffixes, compared against the lower-cased suffix.
        query_duration: Minimum duration, in the unit reported by ``sf.info``,
            for a file to count as query-eligible.
        eval_ratio: Fraction of eligible files suggested for the test split.

    Returns:
        A dict with the scanned directory, the number of matching files, the
        number of files whose duration could not be read, the number of
        query-eligible files, the recommended train/test query counts and
        min/median/max duration statistics (all ``0.0`` when no duration could
        be read).
    """
    # Function-scope import: the module's top-level import block is outside
    # this block, so the new dependency is brought into scope here.
    import statistics

    # A directory may carry an audio-like suffix (e.g. "album.mp3/"); only
    # regular files are inventory items.
    files = [
        p
        for p in sorted(audio_dir.rglob("*"))
        if p.is_file() and p.suffix.lower() in exts
    ]

    durations = []
    unreadable = 0
    for path in files:
        try:
            durations.append(float(sf.info(str(path)).duration))
        except Exception:
            # Best-effort scan: one bad file must not abort the inventory, but
            # it is counted separately instead of being recorded as a 0.0 s
            # track that would silently drag min/median down.
            unreadable += 1

    eligible = sum(1 for d in durations if d >= query_duration)

    requested_eval = round(eligible * eval_ratio)
    # With two or more eligible files at least one is always held out.
    # NOTE(review): with exactly one eligible file both counts can come out
    # as 1 — confirm this matches what the actual split generation does.
    train_holdout = max(1 if eligible >= 2 else 0, requested_eval)
    test_holdout = max(1 if eligible >= 2 else eligible, requested_eval)
    train_queries = max(0, eligible - train_holdout)
    # Never recommend more test queries than there are eligible files
    # (possible when eval_ratio > 1).
    test_queries = min(eligible, test_holdout)

    if durations:
        duration_stats = {
            "min": round(min(durations), 3),
            # True median: averages the two middle values for even counts.
            "median": round(statistics.median(durations), 3),
            "max": round(max(durations), 3),
        }
    else:
        duration_stats = {"min": 0.0, "median": 0.0, "max": 0.0}

    return {
        "audio_dir": str(audio_dir),
        "num_audio_files": len(files),
        "unreadable_files": unreadable,
        "eligible_query_files": eligible,
        "query_duration": query_duration,
        "recommended_train_queries": train_queries,
        "recommended_test_queries": test_queries,
        "duration_stats": duration_stats,
    }
145
146
109 def main(): 147 def main():
110 parser = argparse.ArgumentParser() 148 parser = argparse.ArgumentParser()
111 sub = parser.add_subparsers(dest="cmd", required=True) 149 sub = parser.add_subparsers(dest="cmd", required=True)
...@@ -124,6 +162,11 @@ def main(): ...@@ -124,6 +162,11 @@ def main():
124 p.add_argument("--query-duration", type=float, default=8.0) 162 p.add_argument("--query-duration", type=float, default=8.0)
125 p.add_argument("--seed", type=int, default=42) 163 p.add_argument("--seed", type=int, default=42)
126 164
165 p = sub.add_parser("inspect-audio-dir")
166 p.add_argument("audio_dir")
167 p.add_argument("--query-duration", type=float, default=8.0)
168 p.add_argument("--eval-ratio", type=float, default=0.2)
169
127 args = parser.parse_args() 170 args = parser.parse_args()
128 if args.cmd == "csv-to-catalog": 171 if args.cmd == "csv-to-catalog":
129 count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field) 172 count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field)
...@@ -138,6 +181,13 @@ def main(): ...@@ -138,6 +181,13 @@ def main():
138 seed=args.seed, 181 seed=args.seed,
139 ) 182 )
140 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) 183 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
184 elif args.cmd == "inspect-audio-dir":
185 summary = inspect_audio_dir(
186 Path(args.audio_dir),
187 query_duration=args.query_duration,
188 eval_ratio=args.eval_ratio,
189 )
190 print(json.dumps({"status": "ok", **summary}, ensure_ascii=False))
141 191
142 192
143 if __name__ == "__main__": 193 if __name__ == "__main__":
......
...@@ -120,6 +120,33 @@ ...@@ -120,6 +120,33 @@
120 - 现在接入真实 FMA / MTG-Jamendo 目录时,不需要再手动拼 manifests 120 - 现在接入真实 FMA / MTG-Jamendo 目录时,不需要再手动拼 manifests
121 - adapter 已经能作为统一入口管理开放数据集的训练/评估切分 121 - adapter 已经能作为统一入口管理开放数据集的训练/评估切分
122 122
123 ### Stage: 开源目录规模扫描(inspect-local)
124
125 完成项:
126 - 扩展 `src/data/manifest_tools.py`
127 - 新增 `inspect-audio-dir`
128 - 扩展 `src/data/external_adapters.py`
129 - 新增 `inspect-local`
130 - 在真正生成 manifests 之前,可以先报告:
131 - 音频文件数量
132 - 可切 query 的文件数
133 - 推荐 train/test query 数
134 - 基础时长统计
135
136 验证结果:
137 - `python -m py_compile src/data/manifest_tools.py src/data/external_adapters.py` 成功
138 - `python src/data/manifest_tools.py inspect-audio-dir tmp/open_music_demo --query-duration 5.0 --eval-ratio 0.5` 成功
139 - `python src/data/external_adapters.py inspect-local fma tmp/open_music_demo --eval-ratio 0.5 --query-duration 5.0` 成功
140 - 返回结果:
141 - `num_audio_files=2`
142 - `eligible_query_files=2`
143 - `recommended_train_queries=1`
144 - `recommended_test_queries=1`
145
146 结论:
147 - 现在真实 FMA / MTG-Jamendo 目录在导入前就能先做规模预估
148 - 这对个人使用下的快速数据准备非常有帮助
149
123 ## 2026-06-02 150 ## 2026-06-02
124 151
125 ### Stage: 文档补全 + ACR 最小可运行链路 152 ### Stage: 文档补全 + ACR 最小可运行链路
......
...@@ -145,6 +145,7 @@ flowchart LR ...@@ -145,6 +145,7 @@ flowchart LR
145 CLI 入口: 145 CLI 入口:
146 - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits` 146 - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits`
147 - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>` 147 - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>`
148 - 导入前预检查:`src/data/external_adapters.py inspect-local <dataset> <input_dir>`
148 149
149 ## 5. 文字说明 150 ## 5. 文字说明
150 151
......