# build_business_project_manifests.py (2.99 KB)
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
from pathlib import Path


def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded rows.

    The file is decoded as UTF-8 and iterated line by line, so records are
    split only on real newlines. ``str.splitlines()`` would additionally break
    on characters such as U+2028, U+2029 or U+0085, which JSON permits
    unescaped inside string values and which would cut a valid record in two.
    Whitespace-only lines are skipped.

    Args:
        path: Location of the JSONL file.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open(encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]


def write_json(path: Path, rows: list[dict]) -> None:
    """Serialize *rows* as an indented JSON document at *path*.

    Missing parent directories are created first. Because the payload is
    dumped with ``ensure_ascii=False`` it may contain raw non-ASCII text, so
    the file is written explicitly as UTF-8 rather than in the platform's
    locale encoding (which can raise ``UnicodeEncodeError`` or produce a
    file that is not valid UTF-8).

    Args:
        path: Destination file; overwritten if it already exists.
        rows: JSON-serializable rows to write.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If *rows* contains a value ``json.dumps`` cannot serialize.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding='utf-8')


def build_reference(row: dict) -> dict:
    """Convert one input row into a catalog reference entry.

    ``song_id`` and ``audio_path`` are required keys. ``duration`` is taken
    from ``duration_sec`` and falls back to ``0.0`` when that value is absent
    or falsy. ``source_dataset`` defaults to ``'business_music'`` only when
    the key is missing from the row.

    Raises:
        KeyError: If ``song_id`` or ``audio_path`` is missing.
    """
    entry = {'song_id': row['song_id'], 'audio_path': row['audio_path']}
    length = row.get('duration_sec')
    entry['duration'] = length if length else 0.0
    entry['type'] = 'reference'
    entry['source_dataset'] = row.get('source_dataset', 'business_music')
    return entry


def build_query(row: dict) -> dict:
    """Convert one input row into a query manifest entry.

    ``song_id`` and ``audio_path`` are required keys. ``duration`` comes from
    ``duration_sec`` and falls back to ``8.0`` when absent or falsy;
    ``offset`` comes from ``offset_sec`` and falls back to ``0.0`` likewise.
    ``type`` is always ``'clean'`` and ``segment_type`` is always
    ``'external_query'``. ``source_dataset`` defaults to
    ``'business_music'`` only when the key is missing from the row.

    Raises:
        KeyError: If ``song_id`` or ``audio_path`` is missing.
    """
    entry = {'song_id': row['song_id'], 'audio_path': row['audio_path']}
    length = row.get('duration_sec')
    entry['duration'] = length if length else 8.0
    entry['type'] = 'clean'
    start = row.get('offset_sec')
    entry['offset'] = start if start else 0.0
    entry['segment_type'] = 'external_query'
    entry['source_dataset'] = row.get('source_dataset', 'business_music')
    return entry


def dedupe_refs(rows: list[dict]) -> list[dict]:
    """Drop rows that repeat an earlier ``(song_id, audio_path)`` pair.

    The first row seen for each pair is kept and the original relative order
    is preserved. The input list is not modified; the returned list holds the
    same row objects.

    Raises:
        KeyError: If a row lacks ``song_id`` or ``audio_path``.
    """
    # Dicts keep insertion order, and setdefault only stores the first row
    # encountered for a given key.
    first_by_key: dict = {}
    for candidate in rows:
        first_by_key.setdefault((candidate['song_id'], candidate['audio_path']), candidate)
    return list(first_by_key.values())


def main() -> None:
    """Command-line entry point: build project manifests from a JSONL file.

    Rows whose ``role`` is ``'reference'`` become de-duplicated catalog
    entries; rows whose ``role`` is ``'query'`` are routed by their ``split``
    value. Four files are written under ``--output-dir``:

    * ``catalog.json`` - references only
    * ``train.json``   - train queries followed by the references
    * ``test.json``    - test queries followed by the references
    * ``val.json``     - val queries, plus holdout queries when
      ``--include-holdout-in-val`` is passed; no references are appended

    Rows with any other role or split are not written anywhere. A JSON
    summary with the counts and the resolved output directory is printed to
    stdout.
    """
    cli = argparse.ArgumentParser(description='Build project manifests from business manifest-ready JSONL')
    cli.add_argument('--input', required=True, help='manifest-ready JSONL from normalize_business_export.py')
    cli.add_argument('--output-dir', required=True, help='output manifests dir')
    cli.add_argument('--include-holdout-in-val', action='store_true', help='map holdout queries into val.json')
    opts = cli.parse_args()

    records = load_jsonl(Path(opts.input).resolve())
    reference_rows = [rec for rec in records if rec.get('role') == 'reference']
    query_rows = [rec for rec in records if rec.get('role') == 'query']

    def queries_for(split: str) -> list[dict]:
        # Only rows of the requested split are converted, so rows belonging
        # to unused splits are never passed to build_query.
        return [build_query(rec) for rec in query_rows if rec.get('split') == split]

    catalog = dedupe_refs([build_reference(rec) for rec in reference_rows])
    train = queries_for('train')
    test = queries_for('test')
    val = queries_for('val')
    if opts.include_holdout_in_val:
        val += queries_for('holdout')

    target_dir = Path(opts.output_dir).resolve()
    manifests = {
        'catalog.json': catalog,
        'train.json': train + catalog,
        'test.json': test + catalog,
        'val.json': val,
    }
    for filename, payload in manifests.items():
        write_json(target_dir / filename, payload)

    report = {
        'catalog_refs': len(catalog),
        'train_queries': len(train),
        'test_queries': len(test),
        'val_queries': len(val),
        'output_dir': str(target_dir),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()