#!/usr/bin/env python3
# bootstrap_external.py
"""Bootstrap manifest skeletons for whitelisted external datasets."""
import argparse
import json
from pathlib import Path
# Per-dataset manifest fields, keyed by dataset name. Every entry repeats its
# own name under 'source_dataset' and carries the dataset's current
# 'license_status'; bootstrap() merges these fields into each placeholder row.
TEMPLATES = {
    name: {'source_dataset': name, 'license_status': status}
    for name, status in (
        ('fma', 'review_required'),
        ('mtg_jamendo', 'review_required'),
        ('ccmusic', 'review_required'),
        ('modelscope_music', 'deny_until_whitelisted'),
    )
}
def bootstrap(dataset: str, output_dir: str, num_placeholders: int = 3) -> Path:
    """Write a placeholder manifest skeleton for one external dataset.

    Creates ``output_dir`` (including missing parents) with ``raw/`` and
    ``manifests/`` subdirectories, then writes two files:

    * ``manifests/catalog.bootstrap.json`` -- a JSON list with one row per
      placeholder track. Each row has ``song_id``, ``audio_path`` (under
      ``raw/``), ``duration`` (0.0), ``type`` ('reference') and the fields
      of the dataset's ``TEMPLATES`` entry.
    * ``README.bootstrap.md`` -- a short checklist of follow-up steps.

    Existing files with those names are overwritten.

    Args:
        dataset: Key into ``TEMPLATES`` naming the dataset.
        output_dir: Directory to create and populate.
        num_placeholders: Number of placeholder rows to generate; zero or a
            negative value produces an empty catalog.

    Returns:
        Path of the catalog JSON file that was written.

    Raises:
        KeyError: If ``dataset`` is not a key of ``TEMPLATES``.
    """
    # Look the template up before touching the filesystem so an unknown
    # dataset does not leave an empty output directory behind.
    template = TEMPLATES[dataset]

    out = Path(output_dir)
    manifests_dir = out / 'manifests'
    catalog_path = manifests_dir / 'catalog.bootstrap.json'

    out.mkdir(parents=True, exist_ok=True)
    (out / 'raw').mkdir(exist_ok=True)
    manifests_dir.mkdir(exist_ok=True)

    rows = [
        {
            'song_id': f'{dataset}_track_{i:04d}',
            'audio_path': f'raw/{dataset}_track_{i:04d}.wav',
            'duration': 0.0,
            'type': 'reference',
            # Spread last: template fields win if a key ever collides.
            **template,
        }
        for i in range(num_placeholders)
    ]

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned instead of inherited from the platform locale.
    with open(catalog_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    with open(out / 'README.bootstrap.md', 'w', encoding='utf-8') as f:
        f.write(f'# {dataset} bootstrap\n\n- Fill raw audio files under `raw/`\n- Review license before training\n- Convert to final catalog/query manifests\n')
    return catalog_path
def main(argv=None):
    """Command-line entry point: bootstrap one dataset and print its catalog path.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behavior when the script is run from the command line.
    """
    parser = argparse.ArgumentParser(
        description='Bootstrap manifest skeletons for whitelisted external datasets.',
    )
    # Restrict the positional argument to the datasets that have a template.
    parser.add_argument('dataset', choices=sorted(TEMPLATES),
                        help='dataset to bootstrap')
    parser.add_argument('--output-dir', required=True,
                        help='directory to create and populate')
    parser.add_argument('--num-placeholders', type=int, default=3,
                        help='number of placeholder rows to generate (default: 3)')
    args = parser.parse_args(argv)

    path = bootstrap(args.dataset, args.output_dir, args.num_placeholders)
    print(path)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()