# 03_export_chunks.py (1.91 KB)
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl


def main() -> int:
    """Export knowledge records and their usable chunks as JSONL.

    Every record returned by ``client.list_knowledge()`` is passed to
    ``write_jsonl`` for ``data/exported/knowledge.jsonl``. Chunks are then
    listed only for records that have an ``id``, a ``parse_status`` of
    ``"completed"`` and an ``enable_status`` of ``"enabled"``. A chunk is
    kept when its stripped ``content`` is non-empty and its ``is_enabled``
    field is not literally ``False``; the kept chunks are passed to
    ``write_jsonl`` for ``data/exported/chunks.jsonl``.

    Returns:
        Always ``0``, which the script uses as its process exit status.
    """
    setup_logging()
    cfg = load_config()
    api = client_from_config(cfg)

    records = api.list_knowledge()
    write_jsonl("data/exported/knowledge.jsonl", records)
    records_by_id = {record.get("id"): record for record in records}

    exported = []
    for record in records:
        record_id = record.get("id")
        ready = (
            record_id
            and record.get("parse_status") == "completed"
            and record.get("enable_status") == "enabled"
        )
        if not ready:
            continue

        for chunk in api.list_chunks(str(record_id)):
            text = (chunk.get("content") or "").strip()
            # Drop blank chunks and chunks explicitly disabled; a missing
            # or None ``is_enabled`` still counts as enabled.
            if not text or chunk.get("is_enabled") is False:
                continue

            chunk_knowledge_id = chunk.get("knowledge_id")
            # Prefer the record the chunk itself points at; fall back to the
            # record currently being iterated when that lookup yields nothing.
            origin = records_by_id.get(chunk_knowledge_id) or record

            kb_id = chunk.get("knowledge_base_id")
            if not kb_id:
                # Config is consulted only when the chunk lacks its own value.
                kb_id = cfg["weknora"]["knowledge_base_id"]

            row = {
                "chunk_id": chunk.get("id"),
                "knowledge_id": chunk_knowledge_id or record_id,
                "knowledge_base_id": kb_id,
                "chunk_index": chunk.get("chunk_index"),
                "content": text,
                "source_file": origin.get("file_name") or origin.get("title"),
                "chunk_type": chunk.get("chunk_type"),
                "raw": chunk,
            }
            exported.append(row)

    write_jsonl("data/exported/chunks.jsonl", exported)
    chunk_count = len(exported)
    record_count = len(records)
    print(f"Exported {chunk_count} chunks from {record_count} knowledge records")
    return 0


# Script entry point: run the export and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)