loaders.py
2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from __future__ import annotations
import json
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Any
def setup_logging(level: int = logging.INFO) -> None:
    """Configure the root logger through ``logging.basicConfig``.

    Args:
        level: Threshold passed to ``basicConfig``; defaults to
            ``logging.INFO``.
    """
    # Timestamp, severity, logger name, then the message itself.
    log_format = "%(asctime)s %(levelname)s %(name)s: %(message)s"
    logging.basicConfig(level=level, format=log_format)
def ensure_parent(path: str | Path) -> Path:
    """Create the parent directory of *path* and return *path* as a ``Path``.

    Missing intermediate directories are created as well, and an already
    existing parent directory is not an error. The file at *path* itself is
    not created.

    Args:
        path: Location whose parent directory must exist.

    Returns:
        ``Path(path)``.
    """
    resolved = Path(path)
    parent_dir = resolved.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return resolved
def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]:
    """Load every record of a JSON Lines file into a list.

    Lines that are empty or contain only whitespace are skipped. Each
    remaining line is parsed with ``json.loads`` after stripping surrounding
    whitespace.

    Args:
        path: Location of the JSONL file.
        missing_ok: When true, a non-existent file yields an empty list
            instead of raising.

    Returns:
        The parsed value of every non-blank line, in file order.

    Raises:
        FileNotFoundError: If the file does not exist and *missing_ok* is
            false.
        ValueError: If a line is not valid JSON; the message names the file
            and the 1-based line number.
    """
    target = Path(path)
    if not target.exists():
        if missing_ok:
            return []
        raise FileNotFoundError(target)

    rows: list[dict[str, Any]] = []
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte order
    # mark, which json.loads would otherwise reject on the first line.
    with target.open("r", encoding="utf-8-sig") as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
            rows.append(row)
    return rows
def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]:
    """Lazily yield the records of a JSON Lines file one at a time.

    This is a generator: nothing happens until iteration starts, so the
    existence check, and therefore any ``FileNotFoundError``, occurs on the
    first ``next()`` rather than at call time. Lines that are empty or
    contain only whitespace are skipped.

    Args:
        path: Location of the JSONL file.
        missing_ok: When true, a non-existent file produces an empty
            iteration instead of raising.

    Yields:
        The parsed value of each non-blank line, in file order.

    Raises:
        FileNotFoundError: If the file does not exist and *missing_ok* is
            false.
        ValueError: If a line is not valid JSON; the message names the file
            and the 1-based line number.
    """
    target = Path(path)
    if not target.exists():
        if missing_ok:
            return
        raise FileNotFoundError(target)

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte order
    # mark, which json.loads would otherwise reject on the first line.
    with target.open("r", encoding="utf-8-sig") as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
            # Yield outside the try so an exception thrown into the generator
            # by the consumer is never mistaken for a parse error in the file.
            yield row
def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int:
    """Write *rows* to *path* as JSON Lines and report how many were written.

    The parent directory is prepared via ``ensure_parent``. The file is
    opened in ``"w"`` mode, so any existing content is replaced. Each row is
    serialized with ``ensure_ascii=False`` and terminated by a newline.

    Args:
        path: Destination file.
        rows: Records to serialize, one per output line.

    Returns:
        The number of rows written.
    """
    destination = ensure_parent(path)
    written = 0
    with destination.open("w", encoding="utf-8") as sink:
        # enumerate from 1 so the loop variable doubles as the running total.
        for written, record in enumerate(rows, start=1):
            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
    return written
def append_jsonl(path: str | Path, row: dict[str, Any]) -> None:
    """Append a single record to a JSON Lines file.

    The parent directory is prepared via ``ensure_parent``. The file is
    opened in ``"a"`` mode and *row* is written as one newline-terminated
    JSON document serialized with ``ensure_ascii=False``.

    Args:
        path: Destination file.
        row: Record to serialize onto a new line.
    """
    destination = ensure_parent(path)
    with destination.open("a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(row, ensure_ascii=False)}\n")
def write_json(path: str | Path, payload: dict[str, Any]) -> None:
    """Write *payload* to *path* as indented JSON followed by a newline.

    The parent directory is prepared via ``ensure_parent``. The file is
    opened in ``"w"`` mode, so any existing content is replaced. Output uses
    a two-space indent and ``ensure_ascii=False``.

    Args:
        path: Destination file.
        payload: Object to serialize.
    """
    destination = ensure_parent(path)
    with destination.open("w", encoding="utf-8") as sink:
        json.dump(payload, sink, indent=2, ensure_ascii=False)
        # json.dump emits no final line break; add one explicitly.
        sink.write("\n")
def compact_text(value: Any) -> str:
    """Collapse *value* into trimmed, non-blank lines joined by ``"\\n"``.

    ``None`` becomes the empty string; any other value is converted with
    ``str``. Every line is stripped of surrounding whitespace, and lines
    that are empty after stripping are dropped.

    Args:
        value: Object to render as compact text.

    Returns:
        The remaining lines joined with single newline characters.
    """
    if value is None:
        return ""
    trimmed_lines = map(str.strip, str(value).splitlines())
    # filter(None, ...) discards the lines that stripped down to "".
    return "\n".join(filter(None, trimmed_lines))