# external_adapters.py (9.87 KB)
"""Dataset adapter skeletons for external/open music corpora."""

from __future__ import annotations

from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List
import argparse
import json
import subprocess


@dataclass
class DatasetRecord:
    """One entry of the dataset registry that ``write_registry`` serialises to JSON.

    All fields are plain strings; the field order is part of the positional
    constructor signature generated by ``@dataclass``.
    """

    # Human-readable dataset name, e.g. "FMA".
    name: str
    # URL of the upstream project or landing page.
    source_url: str
    # Free-text summary of the licensing situation.
    license: str
    # Policy marker; the registry in this file uses "review_required" and
    # "deny_until_whitelisted".
    commercial_use: str
    # Free-text remarks about the dataset.
    notes: str


class BaseAdapter:
    """Shared behaviour for every external-dataset adapter.

    Subclasses set ``name`` and implement :meth:`describe`. The other methods
    either create the on-disk layout for a dataset or delegate to the
    ``src/data/manifest_tools.py`` command-line tool and parse the JSON
    document that tool prints on stdout.
    """

    name = "base"

    # Interpreter the project historically used for the manifest tool. It is
    # still preferred when it exists so established setups behave as before.
    _PREFERRED_PYTHON = "/usr/local/miniconda3/bin/python"
    # Relative path: it is resolved against the caller's working directory.
    _MANIFEST_TOOL = "src/data/manifest_tools.py"
    # Sub-directories created under a dataset root by ``init_layout``.
    _LAYOUT_SUBDIRS = ("raw", "processed", "manifests", "licenses")

    def describe(self) -> Dict:
        """Return descriptive metadata for the dataset.

        Raises:
            NotImplementedError: always, in this base class; subclasses override.
        """
        raise NotImplementedError

    def _python_executable(self) -> str:
        """Return the interpreter used to launch the manifest tool.

        The historical hard-coded interpreter wins when it is present;
        otherwise the interpreter running this process is used, so the adapter
        also works on machines without that specific installation.
        """
        import sys  # local import: the module header does not import sys

        if Path(self._PREFERRED_PYTHON).is_file():
            return self._PREFERRED_PYTHON
        # sys.executable may be empty in embedded interpreters; fall back to
        # the historical path so the failure mode matches the old behaviour.
        return sys.executable or self._PREFERRED_PYTHON

    def _run_manifest_tool(self, subcommand: str, *tool_args: str) -> Dict:
        """Run one manifest-tool sub-command and return its parsed JSON output.

        Raises:
            subprocess.CalledProcessError: if the tool exits with a non-zero status.
            json.JSONDecodeError: if the tool's stdout is not valid JSON.
        """
        cmd = [self._python_executable(), self._MANIFEST_TOOL, subcommand, *tool_args]
        output = subprocess.check_output(cmd, text=True)
        return json.loads(output)

    def init_layout(self, root: Path) -> Dict:
        """Create the directory skeleton for this dataset under ``root``.

        Creates ``root`` (including parents) and the ``raw``, ``processed``,
        ``manifests`` and ``licenses`` sub-directories, then writes
        ``manifests/bootstrap.json``. Existing directories are left alone and
        an existing bootstrap file is overwritten.

        Args:
            root: Directory that will hold the dataset.

        Returns:
            The manifest dict that was written to ``bootstrap.json``.
        """
        root.mkdir(parents=True, exist_ok=True)
        for sub in self._LAYOUT_SUBDIRS:
            (root / sub).mkdir(exist_ok=True)
        manifest = {
            "dataset": self.name,
            "root": str(root),
            "status": "initialized",
            "next_steps": [
                "download raw audio according to upstream license terms",
                "convert to catalog/query manifests",
                "record license evidence before training",
            ],
        }
        # ensure_ascii=False can emit non-ASCII characters (e.g. from ``root``),
        # so the encoding is pinned instead of depending on the locale default.
        bootstrap_path = root / "manifests" / "bootstrap.json"
        with open(bootstrap_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        return manifest

    def prepare_local_audio(
        self,
        input_dir: Path,
        output_root: Path,
        eval_ratio: float = 0.2,
        query_duration: float = 8.0,
        seed: int = 42,
    ) -> Dict:
        """Run the manifest tool's ``audio-dir-to-splits`` command.

        Args:
            input_dir: Directory passed to the tool as the audio source.
            output_root: Output directory; created here if it does not exist.
            eval_ratio: Forwarded as ``--eval-ratio``.
            query_duration: Forwarded as ``--query-duration``.
            seed: Forwarded as ``--seed``.

        Returns:
            The tool's JSON summary with ``input_dir`` and ``dataset`` added.
        """
        output_root.mkdir(parents=True, exist_ok=True)
        summary = self._run_manifest_tool(
            "audio-dir-to-splits",
            str(input_dir),
            str(output_root),
            "--source-dataset",
            self.name,
            "--eval-ratio",
            str(eval_ratio),
            "--query-duration",
            str(query_duration),
            "--seed",
            str(seed),
        )
        summary["input_dir"] = str(input_dir)
        summary["dataset"] = self.name
        return summary

    def inspect_local_audio(
        self,
        input_dir: Path,
        query_duration: float = 8.0,
        eval_ratio: float = 0.2,
    ) -> Dict:
        """Run the manifest tool's ``inspect-audio-dir`` command.

        Args:
            input_dir: Directory passed to the tool for inspection.
            query_duration: Forwarded as ``--query-duration``.
            eval_ratio: Forwarded as ``--eval-ratio``.

        Returns:
            The tool's JSON summary with ``dataset`` added.
        """
        summary = self._run_manifest_tool(
            "inspect-audio-dir",
            str(input_dir),
            "--query-duration",
            str(query_duration),
            "--eval-ratio",
            str(eval_ratio),
        )
        summary["dataset"] = self.name
        return summary

    def validate_local_manifests(self, manifests_dir: Path) -> Dict:
        """Run the manifest tool's ``validate-splits`` command.

        Args:
            manifests_dir: Directory passed to the tool for validation.

        Returns:
            The tool's JSON summary with ``dataset`` added.
        """
        summary = self._run_manifest_tool("validate-splits", str(manifests_dir))
        summary["dataset"] = self.name
        return summary


class FMAAdapter(BaseAdapter):
    """Adapter for the FMA corpus."""

    name = "fma"

    def describe(self) -> Dict:
        """Return a freshly built dict of descriptive metadata for FMA."""
        return dict(
            name="FMA",
            source_url="https://github.com/mdeff/fma",
            recommended_subset="fma_small",
            catalog_strategy="full tracks as references; random 5-15s crops as queries",
            license_policy="review per subset/track before commercial training",
        )


class MTGJamendoAdapter(BaseAdapter):
    """Adapter for the MTG-Jamendo corpus."""

    name = "mtg_jamendo"

    def describe(self) -> Dict:
        """Return a freshly built dict of descriptive metadata for MTG-Jamendo."""
        description: Dict = {}
        description["name"] = "MTG-Jamendo"
        description["source_url"] = "https://github.com/MTG/mtg-jamendo-dataset"
        description["recommended_subset"] = "small curated slice"
        description["catalog_strategy"] = (
            "download upstream audio subset then build catalog/query manifests"
        )
        description["license_policy"] = "verify CC terms for intended commercial use"
        return description


class CCMusicAdapter(BaseAdapter):
    """Adapter for the CCMusic database."""

    name = "ccmusic"

    def describe(self) -> Dict:
        """Return a freshly built dict of descriptive metadata for CCMusic."""
        source_url = "https://ccmusic-database.github.io/en/database/ccm.html"
        return dict(
            name="CCMusic",
            source_url=source_url,
            recommended_subset="whitelisted approved subset only",
            catalog_strategy="use approved corpora only; normalize to project manifests",
            license_policy="application/permission review required before use",
        )


class ModelScopeMusicAdapter(BaseAdapter):
    """Adapter placeholder for music datasets listed on ModelScope."""

    name = "modelscope_music"

    def describe(self) -> Dict:
        """Return a freshly built dict of descriptive metadata for ModelScope."""
        description: Dict = {}
        description["name"] = "ModelScope music datasets"
        description["source_url"] = (
            "https://modelscope.cn/search?page=1&search=music&type=dataset"
        )
        description["recommended_subset"] = "manual whitelist only"
        description["catalog_strategy"] = (
            "treat as discovery surface; add per-dataset adapter after legal review"
        )
        description["license_policy"] = "deny until whitelisted"
        return description


# One shared instance per adapter, keyed by the adapter's ``name`` attribute.
# The keys are also what the CLI accepts as dataset names.
ADAPTERS = {
    adapter.name: adapter
    for adapter in (
        FMAAdapter(),
        MTGJamendoAdapter(),
        CCMusicAdapter(),
        ModelScopeMusicAdapter(),
    )
}

# Registry rows, in output order. Columns:
# (name, source_url, license, commercial_use, notes)
REGISTRY: List[DatasetRecord] = [
    DatasetRecord(
        name=title,
        source_url=url,
        license=terms,
        commercial_use=policy,
        notes=remark,
    )
    for title, url, terms, policy, remark in (
        (
            "FMA",
            "https://github.com/mdeff/fma",
            "Track-dependent / metadata CC BY 4.0; verify per subset",
            "review_required",
            "Good first realistic MIR baseline",
        ),
        (
            "MTG-Jamendo",
            "https://github.com/MTG/mtg-jamendo-dataset",
            "Creative Commons source tracks; verify exact subset terms",
            "review_required",
            "Good retrieval/tagging corpus with scripts",
        ),
        (
            "CCMusic",
            "https://ccmusic-database.github.io/en/database/ccm.html",
            "varies / application may be required",
            "review_required",
            "Useful Chinese MIR source, needs permission review",
        ),
        (
            "ModelScope-music",
            "https://modelscope.cn/search?page=1&search=music&type=dataset",
            "varies by dataset",
            "deny_until_whitelisted",
            "Discovery surface only until per-dataset review is complete",
        ),
    )
]


def write_registry(output_path: str) -> Path:
    """Serialise ``REGISTRY`` to a JSON file and return its path.

    Missing parent directories are created and an existing file is
    overwritten.

    Args:
        output_path: Destination file path.

    Returns:
        ``output_path`` as a ``Path``.
    """
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the encoding
    # is pinned rather than left to the platform's locale default.
    with open(out, "w", encoding="utf-8") as f:
        json.dump([asdict(x) for x in REGISTRY], f, indent=2, ensure_ascii=False)
    return out


def inspect_batch(pairs: List[str], eval_ratio: float, query_duration: float) -> Dict:
    """Inspect several local audio directories in one call.

    Args:
        pairs: Strings of the form ``dataset=input_dir``. Only the first ``=``
            separates the two parts, so the directory may itself contain ``=``.
        eval_ratio: Passed to each adapter's ``inspect_local_audio``.
        query_duration: Passed to each adapter's ``inspect_local_audio``.

    Returns:
        ``{"datasets": [...], "count": n}`` with one summary per pair, in
        input order.

    Raises:
        SystemExit: if a pair has no ``=`` separator or names an adapter that
            is not in ``ADAPTERS``.
    """
    # Validate every pair before launching any inspection, so a typo in a
    # later pair fails immediately rather than after earlier runs complete.
    requests = []
    for pair in pairs:
        dataset, separator, input_dir = pair.partition("=")
        if not separator:
            raise SystemExit(f"Invalid pair {pair!r}: expected dataset=input_dir")
        if dataset not in ADAPTERS:
            raise SystemExit(f"Unknown dataset adapter: {dataset}")
        requests.append((dataset, input_dir))

    results = [
        ADAPTERS[dataset].inspect_local_audio(
            Path(input_dir),
            eval_ratio=eval_ratio,
            query_duration=query_duration,
        )
        for dataset, input_dir in requests
    ]
    return {"datasets": results, "count": len(results)}


def main():
    """Parse the command line and run the selected sub-command.

    Sub-commands: ``registry``, ``init``, ``describe``, ``prepare-local``,
    ``inspect-local``, ``inspect-batch`` and ``validate-local``. ``registry``
    prints the written path; every other sub-command prints a JSON document.
    """
    dataset_choices = sorted(ADAPTERS)

    def add_sampling_options(command):
        # Options shared by the prepare/inspect sub-commands.
        command.add_argument("--eval-ratio", type=float, default=0.2)
        command.add_argument("--query-duration", type=float, default=8.0)

    def emit(payload):
        # All JSON output uses the same formatting.
        print(json.dumps(payload, indent=2, ensure_ascii=False))

    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    registry_cmd = commands.add_parser("registry")
    registry_cmd.add_argument("--output", default="data/dataset_registry.json")

    init_cmd = commands.add_parser("init")
    init_cmd.add_argument("dataset", choices=dataset_choices)
    init_cmd.add_argument("--root", default="data/external")

    describe_cmd = commands.add_parser("describe")
    describe_cmd.add_argument("dataset", choices=dataset_choices)

    prepare_cmd = commands.add_parser("prepare-local")
    prepare_cmd.add_argument("dataset", choices=dataset_choices)
    prepare_cmd.add_argument("input_dir")
    prepare_cmd.add_argument("--output-root", default="data/external_ingested")
    add_sampling_options(prepare_cmd)
    prepare_cmd.add_argument("--seed", type=int, default=42)

    inspect_cmd = commands.add_parser("inspect-local")
    inspect_cmd.add_argument("dataset", choices=dataset_choices)
    inspect_cmd.add_argument("input_dir")
    add_sampling_options(inspect_cmd)

    batch_cmd = commands.add_parser("inspect-batch")
    batch_cmd.add_argument("pairs", nargs="+", help="dataset=input_dir")
    add_sampling_options(batch_cmd)

    validate_cmd = commands.add_parser("validate-local")
    validate_cmd.add_argument("dataset", choices=dataset_choices)
    validate_cmd.add_argument("manifests_dir")

    args = parser.parse_args()
    selected = args.cmd

    # The two sub-commands that do not take a single ``dataset`` argument.
    if selected == "registry":
        print(write_registry(args.output))
        return
    if selected == "inspect-batch":
        emit(inspect_batch(args.pairs, args.eval_ratio, args.query_duration))
        return

    # Every remaining sub-command operates on exactly one adapter.
    adapter = ADAPTERS[args.dataset]
    if selected == "init":
        emit(adapter.init_layout(Path(args.root) / args.dataset))
    elif selected == "describe":
        emit(adapter.describe())
    elif selected == "prepare-local":
        destination = Path(args.output_root) / args.dataset
        emit(
            adapter.prepare_local_audio(
                Path(args.input_dir),
                destination,
                eval_ratio=args.eval_ratio,
                query_duration=args.query_duration,
                seed=args.seed,
            )
        )
    elif selected == "inspect-local":
        emit(
            adapter.inspect_local_audio(
                Path(args.input_dir),
                eval_ratio=args.eval_ratio,
                query_duration=args.query_duration,
            )
        )
    elif selected == "validate-local":
        emit(adapter.validate_local_manifests(Path(args.manifests_dir)))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()