chunks.py 2.32 KB
from __future__ import annotations

from typing import Any

from weknora_eval.loaders import compact_text, read_jsonl, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary
from weknora_eval.schemas import ParsedDocument


def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Convert exported chunk records into parsed-document rows.

    Reads chunk records from the JSONL file configured under
    ``parsing.chunks.input_path``, keeps those whose compacted content has at
    least ``min_chars`` characters, and wraps each kept chunk in a
    ``ParsedDocument``. Every other chunk is recorded as a ``"skipped"``
    failure row. The document rows, the failure rows (only when there are
    any) and a summary are written to the paths configured under ``parsing``.

    Args:
        config: Configuration mapping. ``config["parsing"]`` is required. The
            keys read from it are ``chunks`` (``input_path``, ``min_chars``),
            ``local`` (``min_chars``, used as a fallback), ``output_path``,
            ``failed_path`` and ``summary_path``. The ``chunks`` and ``local``
            sections, and either ``min_chars`` value, may be missing or
            ``None``; the built-in defaults are used in that case.

    Returns:
        A ``(rows, summary)`` tuple: the list of ``ParsedDocument.to_dict()``
        results for the kept chunks, and the value returned by
        ``build_parse_summary``.

    Raises:
        KeyError: If ``config`` has no ``"parsing"`` key.
    """
    parser_name = "weknora:chunks"
    parsing = config["parsing"]

    # ``dict.get(key, default)`` only falls back when the key is missing, so a
    # section that is present but null would come back as None and break the
    # chained ``.get`` calls. ``or {}`` treats such a section as empty.
    chunks_config = parsing.get("chunks") or {}
    local_config = parsing.get("local") or {}
    input_path = chunks_config.get("input_path", "data/exported/chunks.jsonl")

    # Resolution order: chunks.min_chars -> local.min_chars -> 80. ``is None``
    # is used instead of truthiness so an explicitly configured 0 is kept.
    min_chars_value = chunks_config.get("min_chars")
    if min_chars_value is None:
        min_chars_value = local_config.get("min_chars")
    if min_chars_value is None:
        min_chars_value = 80
    min_chars = int(min_chars_value)

    rows: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []
    for chunk in read_jsonl(input_path):
        content = compact_text(chunk.get("content"))
        # Exports may name these fields differently; accept either spelling.
        chunk_id = chunk.get("chunk_id") or chunk.get("id")
        source_file = chunk.get("source_file") or chunk.get("knowledge_filename") or "unknown"

        # Empty content is always skipped, even when min_chars is 0.
        if not content or len(content) < min_chars:
            failures.append(
                {
                    "source_file": source_file,
                    "parser": parser_name,
                    "status": "skipped",
                    "error": f"chunk content shorter than min_chars={min_chars}",
                    "fallback_used": None,
                    "chunk_id": chunk_id,
                }
            )
            continue

        document = ParsedDocument(
            # Without an id in the record, synthesize one from the source file
            # and the 1-based position among the chunks kept so far.
            doc_id=str(chunk_id or f"{source_file}::chunk-{len(rows) + 1}"),
            source_file=str(source_file),
            file_type=str(chunk.get("chunk_type") or "chunk"),
            content=content,
            metadata={
                "parser": parser_name,
                "chunk_id": chunk_id,
                "knowledge_id": chunk.get("knowledge_id"),
                "knowledge_base_id": chunk.get("knowledge_base_id"),
                "chunk_index": chunk.get("chunk_index"),
            },
        )
        rows.append(document.to_dict())

    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
    # NOTE(review): the failure file is only written when something was
    # skipped, so this function does not touch an existing file at
    # ``failed_path`` on a run without failures - confirm that is intended.
    if failures:
        write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)

    summary = build_parse_summary(rows, failures, parser=parser_name)
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return rows, summary