manifest_tools.py
1.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""External dataset manifest conversion templates."""
from __future__ import annotations
import argparse
import csv
import json
from pathlib import Path
from typing import List, Dict
def write_catalog(records: List[Dict], output_path: Path):
    """Write catalog records to ``output_path`` as pretty-printed JSON.

    Missing parent directories are created. The file is always encoded as
    UTF-8: ``ensure_ascii=False`` emits non-ASCII characters verbatim, so
    relying on the platform default encoding could raise
    ``UnicodeEncodeError`` or produce non-UTF-8 JSON on some locales.

    Args:
        records: JSON-serializable list of record dicts.
        output_path: Destination file (``Path`` or ``str``); overwritten if
            it already exists.
    """
    # Accept plain strings as well as Path objects.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_path", id_field: str = "song_id"):
    """Convert a CSV manifest into a JSON catalog file.

    Each CSV row becomes one record with the keys ``song_id``,
    ``audio_path``, ``duration``, ``type`` (always ``"reference"``) and
    ``source_dataset``. The resulting list is written with
    ``write_catalog``.

    Args:
        csv_path: CSV file with a header row.
        output_path: Destination JSON file.
        path_field: Name of the CSV column holding the audio path.
        id_field: Name of the CSV column holding the song identifier.

    Returns:
        The number of records written.

    Raises:
        KeyError: If ``id_field`` or ``path_field`` is not a column of the
            CSV (raised when the first data row is processed).
        ValueError: If a non-blank ``duration`` value is not a valid float.
    """
    records = []
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which
    # would otherwise become part of the first column name and make the
    # id/path lookup fail.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                song_id = row[id_field]
                audio_path = row[path_field]
            except KeyError as err:
                # Same exception type as a bare lookup, but with enough
                # context to locate the problem in the input file.
                raise KeyError(
                    f"column {err.args[0]!r} not found in {csv_path} "
                    f"(line {reader.line_num}, header: {reader.fieldnames})"
                ) from None
            # A missing, empty or whitespace-only duration counts as 0.0.
            raw_duration = (row.get("duration") or "").strip()
            records.append(
                {
                    "song_id": song_id,
                    "audio_path": audio_path,
                    "duration": float(raw_duration) if raw_duration else 0.0,
                    "type": "reference",
                    "source_dataset": row.get("source_dataset", "external"),
                }
            )
    write_catalog(records, output_path)
    return len(records)
def main(argv: list[str] | None = None):
    """Command-line entry point.

    Supports one sub-command, ``csv-to-catalog``, which converts a CSV
    manifest to a JSON catalog and prints a JSON status line with the
    number of records written.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` calls behave as
            before; passing a list allows programmatic invocation.
    """
    parser = argparse.ArgumentParser(
        description="External dataset manifest conversion tools."
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    p = sub.add_parser("csv-to-catalog", help="convert a CSV manifest to a JSON catalog")
    p.add_argument("csv_path", help="input CSV file with a header row")
    p.add_argument("output_path", help="output JSON catalog file")
    p.add_argument("--path-field", default="audio_path", help="CSV column holding the audio path")
    p.add_argument("--id-field", default="song_id", help="CSV column holding the song id")

    args = parser.parse_args(argv)
    if args.cmd == "csv-to-catalog":
        count = csv_to_catalog(Path(args.csv_path), Path(args.output_path), args.path_field, args.id_field)
        print(json.dumps({"status": "ok", "records": count}, ensure_ascii=False))
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()