split_business_manifest_ready.py 1.7 KB
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
from collections import Counter
from pathlib import Path


def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of decoded records.

    The file is read as UTF-8, one physical line at a time. Lines that are
    empty or contain only whitespace are skipped.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
        OSError: If the file cannot be opened or read.
    """
    rows = []
    # Iterate the file object rather than using str.splitlines():
    # splitlines() also breaks on U+2028, U+2029 and U+0085, which may
    # legally appear unescaped inside a JSON string and would cut a valid
    # record in two. Text-mode iteration only splits on \n, \r and \r\n.
    with path.open(encoding='utf-8') as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, with file/line context added.
                raise json.JSONDecodeError(
                    f'{path}:{lineno}: {err.msg}', err.doc, err.pos
                ) from err
    return rows


def write_json(path: Path, rows: list[dict]) -> None:
    """Serialize ``rows`` as an indented JSON array and write it to ``path``.

    Missing parent directories are created first. An existing file at
    ``path`` is overwritten.

    Args:
        path: Destination file.
        rows: Records to serialize; must be JSON-serializable.

    Raises:
        TypeError: If ``rows`` contains a value ``json.dumps`` cannot encode.
        OSError: If the directory or file cannot be created or written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
    # encoding must be pinned to UTF-8 instead of the locale default, which
    # may be unable to encode them.
    path.write_text(payload, encoding='utf-8')


def main() -> None:
    """Split a JSONL manifest into reference/query/excluded JSON files.

    Command-line arguments:
        --input: Path to the JSONL file to read.
        --output-dir: Directory that receives ``reference.json``,
            ``query.json`` and ``excluded.json``.

    Each row is bucketed by its ``role`` key; rows without a ``role`` key
    are treated as ``excluded``. All three output files are always written,
    even when their bucket is empty. A JSON summary (row count, per-role
    counts, output paths) is printed to stdout.

    Rows whose role is not one of the three written roles are not saved to
    any file; a warning naming those roles is printed to stderr so the
    omission is visible.
    """
    import sys  # local import: only needed for the stderr warning below

    parser = argparse.ArgumentParser(description='Split manifest-ready business JSONL into reference/query/excluded JSON files')
    parser.add_argument('--input', required=True)
    parser.add_argument('--output-dir', required=True)
    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    output_dir = Path(args.output_dir).resolve()
    rows = load_jsonl(input_path)

    written_roles = ('reference', 'query', 'excluded')
    output_paths = {role: output_dir / f'{role}.json' for role in written_roles}

    # Pre-seed the written roles so each output file exists even with no rows.
    grouped: dict = {role: [] for role in written_roles}
    for row in rows:
        grouped.setdefault(row.get('role', 'excluded'), []).append(row)

    for role, out_path in output_paths.items():
        write_json(out_path, grouped[role])

    # Rows with any other role are counted in the summary but land in no
    # output file; report them instead of dropping them silently.
    unwritten = {
        repr(role): len(members)
        for role, members in grouped.items()
        if role not in output_paths
    }
    if unwritten:
        details = ', '.join(f'{role} ({count})' for role, count in sorted(unwritten.items()))
        print(f'warning: rows with unrecognized role were not written to any output file: {details}', file=sys.stderr)

    summary = {
        'input_rows': len(rows),
        'role_counts': dict(Counter(row.get('role', 'excluded') for row in rows)),
        'outputs': {role: str(out_path.resolve()) for role, out_path in output_paths.items()},
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()