split_business_manifest_ready.py
1.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list with one decoded value per line.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated)
    and split only on real newlines.  ``str.splitlines()`` is deliberately
    avoided: it also breaks on characters such as U+2028, U+2029 and U+0085,
    which may legally appear unescaped inside a JSON string and would cut a
    valid record in half.

    Args:
        path: Location of the JSONL file.

    Returns:
        The decoded values in file order.  Lines that are empty or contain
        only whitespace are skipped.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON.  The
            message is prefixed with ``<path>:<line number>``.
    """
    rows: list[dict] = []
    with path.open(encoding='utf-8-sig') as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, but now it says where in
                # the file the bad record lives.
                raise json.JSONDecodeError(
                    f'{path}:{lineno}: {err.msg}', err.doc, err.pos
                ) from err
    return rows
def write_json(path: Path, rows: list[dict]) -> None:
    """Serialize ``rows`` as an indented JSON array and write it to ``path``.

    Missing parent directories are created first.  Non-ASCII characters are
    written verbatim (``ensure_ascii=False``), so the file is always encoded
    as UTF-8 rather than the platform's locale encoding; a non-UTF-8 locale
    would otherwise fail on, or mis-encode, such characters.

    Args:
        path: Destination file; an existing file is overwritten.
        rows: JSON-serializable rows to write.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    path.write_text(payload, encoding='utf-8')
def main() -> None:
    """Split a manifest-ready JSONL file into one JSON file per role.

    Command-line arguments:
        --input: Path of the JSONL file to read.
        --output-dir: Directory that receives ``reference.json``,
            ``query.json`` and ``excluded.json``.  All three files are
            always written, as empty arrays when no row has that role.

    A row without a ``role`` key is treated as ``excluded``.  A row whose
    role is none of the three known values is counted in the summary but is
    not written to any file; a warning about such rows is printed to stderr
    so the omission is not silent.

    A JSON summary (row count, per-role counts, absolute output paths) is
    printed to stdout.
    """
    import sys  # local import: only needed for the stderr warning below

    known_roles = ('reference', 'query', 'excluded')

    parser = argparse.ArgumentParser(description='Split manifest-ready business JSONL into reference/query/excluded JSON files')
    parser.add_argument('--input', required=True)
    parser.add_argument('--output-dir', required=True)
    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    output_dir = Path(args.output_dir).resolve()

    rows = load_jsonl(input_path)

    # Group and count in one pass; Counter keeps first-seen order, so the
    # summary lists roles in the order they appear in the input.
    grouped = {role: [] for role in known_roles}
    role_counts = Counter()
    for row in rows:
        role = row.get('role', 'excluded')
        role_counts[role] += 1
        grouped.setdefault(role, []).append(row)

    outputs = {}
    for role in known_roles:
        target = output_dir / f'{role}.json'
        write_json(target, grouped[role])
        outputs[role] = str(target.resolve())

    # Rows with an unrecognized role have no output file; say so on stderr
    # (stdout stays pure JSON for anything parsing the summary).
    unwritten = {
        role: count
        for role, count in role_counts.items()
        if role not in known_roles
    }
    if unwritten:
        details = ', '.join(
            f'{role!r} ({count})' for role, count in unwritten.items()
        )
        print(
            f'warning: {sum(unwritten.values())} row(s) with unrecognized '
            f'role not written to any output file: {details}',
            file=sys.stderr,
        )

    summary = {
        'input_rows': len(rows),
        'role_counts': dict(role_counts),
        'outputs': outputs,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()