build_humming_eval_manifest.py
1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
from pathlib import Path
def _build_rows(chunks: list[dict], *, song_id: str, split: str, source_dataset: str) -> list[dict]:
    """Return one manifest row per chunk record, in input order.

    Each chunk must provide ``audio_path``, ``duration_sec`` and
    ``start_sec``; a missing key raises ``KeyError``.
    """
    return [
        {
            'song_id': song_id,
            'audio_path': chunk['audio_path'],
            'duration': chunk['duration_sec'],
            # 'type' is a fixed label; it is intentionally not tied to
            # --source-dataset even though the default value matches.
            'type': 'humming_real',
            'segment_type': 'humming_query',
            'offset': chunk['start_sec'],
            'source_dataset': source_dataset,
            'split': split,
        }
        for chunk in chunks
    ]


def main(argv: list[str] | None = None) -> None:
    """Build a humming evaluation manifest from a chunks JSON file.

    Reads the JSON document given by ``--chunks-json``, converts every entry
    of its top-level ``chunks`` list into a manifest row tagged with
    ``--song-id``, ``--split`` and ``--source-dataset``, writes the rows as a
    JSON array to ``--output`` (creating parent directories as needed), and
    prints a short JSON summary with the row count and output path.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the behaviour when the
            file is run as a script.

    Raises:
        KeyError: If a chunk lacks ``audio_path``, ``duration_sec`` or
            ``start_sec``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--chunks-json', required=True)
    ap.add_argument('--song-id', required=True)
    ap.add_argument('--split', default='test')
    ap.add_argument('--output', required=True)
    ap.add_argument('--source-dataset', default='humming_real')
    args = ap.parse_args(argv)

    payload = json.loads(Path(args.chunks_json).read_text(encoding='utf-8'))
    # `or []` also covers an explicit `"chunks": null`, which a plain
    # .get('chunks', []) would hand to the loop as None.
    chunks = payload.get('chunks') or []
    rows = _build_rows(
        chunks,
        song_id=args.song_id,
        split=args.split,
        source_dataset=args.source_dataset,
    )

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding='utf-8')
    print(json.dumps({'rows': len(rows), 'output': str(out)}, ensure_ascii=False, indent=2))
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()