chunks.py
2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from __future__ import annotations
from typing import Any
from weknora_eval.loaders import compact_text, read_jsonl, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary
from weknora_eval.schemas import ParsedDocument
def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Turn exported chunk records into parsed-document rows and write the results.

    Records are read from the JSONL file at ``parsing.chunks.input_path``. Each
    record's ``content`` is passed through ``compact_text``; records whose
    resulting content is empty or shorter than the minimum length are logged as
    skipped failures instead of becoming documents.

    Files written:
      * ``parsing.output_path``  - the accepted document rows (always written).
      * ``parsing.failed_path``  - the skipped records (only when there are any).
      * ``parsing.summary_path`` - the result of ``build_parse_summary``.

    Args:
        config: Configuration mapping; must contain a ``"parsing"`` entry.

    Returns:
        A ``(rows, summary)`` tuple: the accepted document dicts and the
        summary object produced by ``build_parse_summary``.

    Raises:
        KeyError: If ``config`` has no ``"parsing"`` key.
    """
    parser_name = "weknora:chunks"

    parsing = config["parsing"]
    chunks_config = parsing.get("chunks", {})
    input_path = chunks_config.get("input_path", "data/exported/chunks.jsonl")
    # Threshold precedence: parsing.chunks.min_chars, then parsing.local.min_chars, then 80.
    fallback_min_chars = parsing.get("local", {}).get("min_chars", 80)
    min_chars = int(chunks_config.get("min_chars", fallback_min_chars))

    rows: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []

    for record in read_jsonl(input_path):
        text = compact_text(record.get("content"))
        chunk_id = record.get("chunk_id") or record.get("id")
        source_file = (
            record.get("source_file")
            or record.get("knowledge_filename")
            or "unknown"
        )

        if text and len(text) >= min_chars:
            # Without a chunk id, derive one from the source file and the
            # 1-based position among the rows accepted so far.
            doc_id = chunk_id or f"{source_file}::chunk-{len(rows) + 1}"
            metadata = {
                "parser": parser_name,
                "chunk_id": chunk_id,
                "knowledge_id": record.get("knowledge_id"),
                "knowledge_base_id": record.get("knowledge_base_id"),
                "chunk_index": record.get("chunk_index"),
            }
            parsed = ParsedDocument(
                doc_id=str(doc_id),
                source_file=str(source_file),
                file_type=str(record.get("chunk_type") or "chunk"),
                content=text,
                metadata=metadata,
            )
            rows.append(parsed.to_dict())
        else:
            # Empty or too-short content: record the skip and move on.
            skipped = {
                "source_file": source_file,
                "parser": parser_name,
                "status": "skipped",
                "error": f"chunk content shorter than min_chars={min_chars}",
                "fallback_used": None,
                "chunk_id": chunk_id,
            }
            failures.append(skipped)

    output_path = parsing.get("output_path", "data/parsed_docs/documents.jsonl")
    write_jsonl(output_path, rows)

    # The failures file is only touched when at least one record was skipped.
    if failures:
        failed_path = parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl")
        write_jsonl(failed_path, failures)

    summary = build_parse_summary(rows, failures, parser=parser_name)
    summary_path = parsing.get("summary_path", "data/parsed_docs/parse_summary.json")
    write_json(summary_path, summary)

    return rows, summary