# bootstrap_external.py
#!/usr/bin/env python3
"""Bootstrap manifest skeletons for whitelisted external datasets."""

import argparse
import json
from pathlib import Path

# Per-dataset fields that `bootstrap` merges into every placeholder row.
# Each entry carries its own dataset name plus the license status recorded
# for that dataset; the keys also serve as the CLI's accepted dataset names.
TEMPLATES = {
    name: {'source_dataset': name, 'license_status': status}
    for name, status in (
        ('fma', 'review_required'),
        ('mtg_jamendo', 'review_required'),
        ('ccmusic', 'review_required'),
        ('modelscope_music', 'deny_until_whitelisted'),
    )
}


def bootstrap(dataset: str, output_dir: str, num_placeholders: int = 3) -> Path:
    """Write a placeholder catalog and README skeleton for ``dataset``.

    Creates ``output_dir`` (including parents) with a ``raw/`` directory, a
    ``manifests/catalog.bootstrap.json`` file and a ``README.bootstrap.md``
    file. Files with those names that already exist are overwritten.

    Args:
        dataset: Key into ``TEMPLATES``; the matching template fields are
            merged into every placeholder row.
        output_dir: Directory to populate; created if it does not exist.
        num_placeholders: Number of placeholder rows to write. Zero or a
            negative value produces an empty catalog.

    Returns:
        Path of the written ``catalog.bootstrap.json``.

    Raises:
        KeyError: If ``dataset`` is not a key of ``TEMPLATES``. Raised before
            anything is created on disk.
    """
    # Resolve the template first so that an unknown dataset cannot leave a
    # partially initialised directory tree behind.
    base = TEMPLATES[dataset]

    out = Path(output_dir)
    raw_dir = out / 'raw'
    manifest_dir = out / 'manifests'
    for directory in (out, raw_dir, manifest_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # `base` is unpacked last, so template fields win over the generic ones.
    rows = [
        {
            'song_id': f'{dataset}_track_{i:04d}',
            'audio_path': f'raw/{dataset}_track_{i:04d}.wav',
            'duration': 0.0,
            'type': 'reference',
            **base,
        }
        for i in range(num_placeholders)
    ]

    # The JSON is dumped with ensure_ascii=False, so pin the encoding instead
    # of relying on the platform's locale default.
    catalog_path = manifest_dir / 'catalog.bootstrap.json'
    with open(catalog_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    with open(out / 'README.bootstrap.md', 'w', encoding='utf-8') as f:
        f.write(f'# {dataset} bootstrap\n\n- Fill raw audio files under `raw/`\n- Review license before training\n- Convert to final catalog/query manifests\n')
    return catalog_path


def main() -> None:
    """Parse command-line arguments, run ``bootstrap`` and print the catalog path."""
    # Reuse the module docstring as the --help description.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dataset',
        choices=sorted(TEMPLATES),
        help='dataset whose manifest skeleton should be created',
    )
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to populate (created if missing)',
    )
    parser.add_argument(
        '--num-placeholders',
        type=int,
        default=3,
        help='number of placeholder catalog rows to write (default: %(default)s)',
    )
    args = parser.parse_args()
    print(bootstrap(args.dataset, args.output_dir, args.num_placeholders))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()