03_export_chunks.py
1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
    """Export knowledge records and their usable chunks to JSONL files.

    Every knowledge record returned by the client is written to
    ``data/exported/knowledge.jsonl``. Chunks are then listed for each record
    that has an id, a ``parse_status`` of ``"completed"`` and an
    ``enable_status`` of ``"enabled"``; chunks with non-blank content that are
    not explicitly disabled are written to ``data/exported/chunks.jsonl``.

    Returns:
        ``0`` once both files are written and the summary line is printed.
    """
    setup_logging()
    config = load_config()
    client = client_from_config(config)

    records = client.list_knowledge()
    write_jsonl("data/exported/knowledge.jsonl", records)

    # Index records by id so a chunk can be matched to the record it names.
    records_by_id = {}
    for record in records:
        records_by_id[record.get("id")] = record

    exported = []
    for record in records:
        record_id = record.get("id")
        is_ready = (
            bool(record_id)
            and record.get("parse_status") == "completed"
            and record.get("enable_status") == "enabled"
        )
        if not is_ready:
            continue

        for chunk in client.list_chunks(str(record_id)):
            text = chunk.get("content") or ""
            text = text.strip()
            if not text:
                continue
            # Only an explicit False disables a chunk; a missing or None flag
            # still exports.
            if chunk.get("is_enabled") is False:
                continue

            # Prefer the record the chunk itself points at; fall back to the
            # record whose chunks are being listed.
            origin = records_by_id.get(chunk.get("knowledge_id")) or record
            row = {
                "chunk_id": chunk.get("id"),
                "knowledge_id": chunk.get("knowledge_id") or record_id,
                # The configured knowledge base id is only looked up when the
                # chunk does not carry its own.
                "knowledge_base_id": chunk.get("knowledge_base_id")
                or config["weknora"]["knowledge_base_id"],
                "chunk_index": chunk.get("chunk_index"),
                "content": text,
                "source_file": origin.get("file_name") or origin.get("title"),
                "chunk_type": chunk.get("chunk_type"),
                "raw": chunk,
            }
            exported.append(row)

    write_jsonl("data/exported/chunks.jsonl", exported)
    print(f"Exported {len(exported)} chunks from {len(records)} knowledge records")
    return 0
if __name__ == "__main__":
    # Run the export and hand its return value to the interpreter as the
    # process exit status.
    exit_code = main()
    sys.exit(exit_code)