Add batch inventory for multiple open music directories
Constraint: Personal-use dataset preparation needs fast comparison across several local open-music corpora before ingestion
Rejected: Inspect each dataset directory manually one by one | Slows repeated train/eval setup and comparison
Confidence: high
Scope-risk: narrow
Directive: Use inspect-batch on real FMA and MTG-Jamendo folders before selecting training and held-out evaluation corpora
Tested: /usr/local/miniconda3/bin/python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py; /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-batch fma=tmp/open_music_demo_fma mtg_jamendo=tmp/open_music_demo_jamendo --eval-ratio 0.5 --query-duration 5.0
Not-tested: Real upstream corpus inventory on downloaded full-size open datasets
Showing
4 changed files
with
49 additions
and
0 deletions
| ... | @@ -19,6 +19,11 @@ Optional pre-check: | ... | @@ -19,6 +19,11 @@ Optional pre-check: |
| 19 | /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 | 19 | /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-local fma data/raw/fma_small_audio --eval-ratio 0.2 --query-duration 8.0 |
| 20 | ``` | 20 | ``` |
| 21 | 21 | ||
| 22 | Batch pre-check across multiple candidate corpora: | ||
| 23 | ```bash | ||
| 24 | /usr/local/miniconda3/bin/python src/data/external_adapters.py inspect-batch fma=data/raw/fma_small_audio mtg_jamendo=data/raw/mtg_jamendo_audio --eval-ratio 0.2 --query-duration 8.0 | ||
| 25 | ``` | ||
| 26 | |||
| 22 | Then generate manifests: | 27 | Then generate manifests: |
| 23 | ```bash | 28 | ```bash |
| 24 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 | 29 | /usr/local/miniconda3/bin/python src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio --output-root data/external_ingested --eval-ratio 0.2 --query-duration 8.0 | ... | ... |
| ... | @@ -194,6 +194,21 @@ def write_registry(output_path: str): | ... | @@ -194,6 +194,21 @@ def write_registry(output_path: str): |
| 194 | return out | 194 | return out |
| 195 | 195 | ||
| 196 | 196 | ||
def inspect_batch(pairs: List[str], eval_ratio: float, query_duration: float) -> Dict:
    """Inspect several local audio directories in one call.

    Args:
        pairs: Entries of the form ``dataset=input_dir``. ``dataset`` must be
            a key of ``ADAPTERS``; everything after the first ``=`` is taken
            as the directory path.
        eval_ratio: Passed through unchanged to each adapter's
            ``inspect_local_audio``.
        query_duration: Passed through unchanged to each adapter's
            ``inspect_local_audio``.

    Returns:
        ``{"datasets": [...], "count": N}`` where ``datasets`` holds one
        adapter summary per pair, in input order.

    Raises:
        SystemExit: If a pair is not of the form ``dataset=input_dir`` or
            names a dataset that is not registered in ``ADAPTERS``.
    """
    # Validate every pair before inspecting anything, so a typo in a later
    # pair is reported immediately instead of after earlier directories
    # have already been scanned.
    parsed = []
    for pair in pairs:
        dataset, sep, input_dir = pair.partition("=")
        if not sep or not input_dir:
            # Without this check a missing "=" surfaces as a bare ValueError
            # traceback, and "name=" silently inspects the current directory.
            raise SystemExit(
                f"Invalid pair {pair!r}: expected dataset=input_dir"
            )
        if dataset not in ADAPTERS:
            raise SystemExit(f"Unknown dataset adapter: {dataset}")
        parsed.append((dataset, input_dir))

    results = [
        ADAPTERS[dataset].inspect_local_audio(
            Path(input_dir),
            eval_ratio=eval_ratio,
            query_duration=query_duration,
        )
        for dataset, input_dir in parsed
    ]
    return {"datasets": results, "count": len(results)}
| 210 | |||
| 211 | |||
| 197 | def main(): | 212 | def main(): |
| 198 | parser = argparse.ArgumentParser() | 213 | parser = argparse.ArgumentParser() |
| 199 | sub = parser.add_subparsers(dest="cmd", required=True) | 214 | sub = parser.add_subparsers(dest="cmd", required=True) |
| ... | @@ -222,6 +237,11 @@ def main(): | ... | @@ -222,6 +237,11 @@ def main(): |
| 222 | p.add_argument("--eval-ratio", type=float, default=0.2) | 237 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 223 | p.add_argument("--query-duration", type=float, default=8.0) | 238 | p.add_argument("--query-duration", type=float, default=8.0) |
| 224 | 239 | ||
| 240 | p = sub.add_parser("inspect-batch") | ||
| 241 | p.add_argument("pairs", nargs="+", help="dataset=input_dir") | ||
| 242 | p.add_argument("--eval-ratio", type=float, default=0.2) | ||
| 243 | p.add_argument("--query-duration", type=float, default=8.0) | ||
| 244 | |||
| 225 | args = parser.parse_args() | 245 | args = parser.parse_args() |
| 226 | if args.cmd == "registry": | 246 | if args.cmd == "registry": |
| 227 | path = write_registry(args.output) | 247 | path = write_registry(args.output) |
| ... | @@ -248,6 +268,9 @@ def main(): | ... | @@ -248,6 +268,9 @@ def main(): |
| 248 | query_duration=args.query_duration, | 268 | query_duration=args.query_duration, |
| 249 | ) | 269 | ) |
| 250 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | 270 | print(json.dumps(summary, indent=2, ensure_ascii=False)) |
| 271 | elif args.cmd == "inspect-batch": | ||
| 272 | summary = inspect_batch(args.pairs, args.eval_ratio, args.query_duration) | ||
| 273 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | ||
| 251 | 274 | ||
| 252 | 275 | ||
| 253 | if __name__ == "__main__": | 276 | if __name__ == "__main__": | ... | ... |
| ... | @@ -147,6 +147,26 @@ | ... | @@ -147,6 +147,26 @@ |
| 147 | - 现在真实 FMA / MTG-Jamendo 目录在导入前就能先做规模预估 | 147 | - 现在真实 FMA / MTG-Jamendo 目录在导入前就能先做规模预估 |
| 148 | - 这对个人使用下的快速数据准备非常有帮助 | 148 | - 这对个人使用下的快速数据准备非常有帮助 |
| 149 | 149 | ||
| 150 | ### Stage: 多目录批量 inventory(inspect-batch) | ||
| 151 | |||
| 152 | 完成项: | ||
| 153 | - 扩展 `src/data/external_adapters.py` | ||
| 154 | - 新增 `inspect-batch` | ||
| 155 | - 支持一次性检查多个开放数据目录,例如: | ||
| 156 | - `fma=<dir>` | ||
| 157 | - `mtg_jamendo=<dir>` | ||
| 158 | |||
| 159 | 验证结果: | ||
| 160 | - `python -m py_compile src/data/external_adapters.py src/data/manifest_tools.py` 成功 | ||
| 161 | - `python src/data/external_adapters.py inspect-batch fma=tmp/open_music_demo_fma mtg_jamendo=tmp/open_music_demo_jamendo --eval-ratio 0.5 --query-duration 5.0` 成功 | ||
| 162 | - 返回: | ||
| 163 | - `count=2` | ||
| 164 | - 每个数据源均给出 `num_audio_files / eligible_query_files / recommended_train_queries / recommended_test_queries` | ||
| 165 | |||
| 166 | 结论: | ||
| 167 | - 现在可以批量对比多个候选开放数据目录的可用规模 | ||
| 168 | - 这让后续接入真实 FMA / MTG-Jamendo / 其他音乐集更高效 | ||
| 169 | |||
| 150 | ## 2026-06-02 | 170 | ## 2026-06-02 |
| 151 | 171 | ||
| 152 | ### Stage: 文档补全 + ACR 最小可运行链路 | 172 | ### Stage: 文档补全 + ACR 最小可运行链路 | ... | ... |
| ... | @@ -146,6 +146,7 @@ CLI 入口: | ... | @@ -146,6 +146,7 @@ CLI 入口: |
| 146 | - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits` | 146 | - 低层工具:`src/data/manifest_tools.py audio-dir-to-splits` |
| 147 | - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>` | 147 | - 高层统一入口:`src/data/external_adapters.py prepare-local <dataset> <input_dir>` |
| 148 | - 导入前预检查:`src/data/external_adapters.py inspect-local <dataset> <input_dir>` | 148 | - 导入前预检查:`src/data/external_adapters.py inspect-local <dataset> <input_dir>` |
| 149 | - 多目录批量预检查:`src/data/external_adapters.py inspect-batch fma=<dir> mtg_jamendo=<dir> ...` | ||
| 149 | 150 | ||
| 150 | ## 5. 文字说明 | 151 | ## 5. 文字说明 |
| 151 | 152 | ... | ... |
-
Please register or sign in to post a comment