# manifest_tools.py (1.66 KB)
"""External dataset manifest conversion templates."""

from __future__ import annotations

import argparse
import csv
import json
from pathlib import Path
from typing import List, Dict


def write_catalog(records: List[Dict], output_path: Path):
    """Serialize catalog records to a JSON file.

    Creates any missing parent directories of ``output_path`` and writes
    ``records`` as indented JSON, overwriting an existing file.

    Args:
        records: JSON-serializable record dictionaries.
        output_path: Destination file path.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; relying on the platform default encoding
    # can raise UnicodeEncodeError or produce a file that is not UTF-8.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)


def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_path", id_field: str = "song_id") -> int:
    """Convert a CSV manifest into a JSON catalog of reference records.

    Each CSV row becomes one record with the keys ``song_id``, ``audio_path``,
    ``duration``, ``type`` (always ``"reference"``) and ``source_dataset``.
    The records are written to ``output_path`` via ``write_catalog``.

    Args:
        csv_path: CSV file with a header row, read as UTF-8 (a leading BOM
            is tolerated).
        output_path: Destination path of the JSON catalog.
        path_field: Name of the CSV column holding the audio path.
        id_field: Name of the CSV column holding the song identifier.

    Returns:
        The number of records written.

    Raises:
        KeyError: If a row has no ``id_field`` or ``path_field`` column.
        ValueError: If a non-empty ``duration`` value is not a valid float.
    """
    # "utf-8-sig" decodes plain UTF-8 and additionally strips a leading BOM;
    # without it the BOM becomes part of the first header name and the
    # id/path column lookup fails with KeyError.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        records = [
            {
                "song_id": row[id_field],
                "audio_path": row[path_field],
                # A missing column or an empty cell falls back to 0.0.
                "duration": float(row.get("duration", 0.0) or 0.0),
                "type": "reference",
                "source_dataset": row.get("source_dataset", "external"),
            }
            for row in csv.DictReader(f)
        ]
    write_catalog(records, output_path)
    return len(records)


def main():
    """Command-line entry point.

    Parses the arguments, runs the ``csv-to-catalog`` sub-command and prints
    a one-line JSON summary containing the number of records written.
    """
    cli = argparse.ArgumentParser()
    commands = cli.add_subparsers(dest="cmd", required=True)

    convert = commands.add_parser("csv-to-catalog")
    for positional in ("csv_path", "output_path"):
        convert.add_argument(positional)
    convert.add_argument("--path-field", default="audio_path")
    convert.add_argument("--id-field", default="song_id")

    opts = cli.parse_args()
    if opts.cmd != "csv-to-catalog":
        return

    source = Path(opts.csv_path)
    destination = Path(opts.output_path)
    total = csv_to_catalog(source, destination, opts.path_field, opts.id_field)
    summary = {"status": "ok", "records": total}
    print(json.dumps(summary, ensure_ascii=False))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()