01_upload_docs.py
1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from __future__ import annotations
import sys
from pathlib import Path
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
    """Upload the raw PDF and XLSX documents and export one record per upload.

    Every ``*.pdf`` under ``data/raw_docs/pdf`` is uploaded first (sorted),
    followed by every ``*.xlsx`` under ``data/raw_docs/xlsx`` (sorted). The
    collected records are written to ``data/exported/knowledge_uploads.jsonl``
    only after all uploads have completed.

    Returns:
        ``0``, used by the caller as the process exit status.
    """
    setup_logging()
    client = client_from_config(load_config())

    def upload(doc: Path) -> dict:
        # NOTE(review): assumes upload_file returns a dict-like response
        # (it is only accessed through ``.get``) -- confirm against the client.
        response = client.upload_file(doc)
        # Fall back to what the local path tells us when the response
        # carries no (or an empty) name / type.
        name = response.get("file_name") or response.get("title") or doc.name
        kind = response.get("file_type") or doc.suffix.lstrip(".")
        return {
            "knowledge_id": response.get("id"),
            "file_name": name,
            "file_type": kind,
            "parse_status": response.get("parse_status"),
            "enable_status": response.get("enable_status"),
            "raw": response,
        }

    pdf_docs = sorted(Path("data/raw_docs/pdf").glob("*.pdf"))
    xlsx_docs = sorted(Path("data/raw_docs/xlsx").glob("*.xlsx"))

    rows = [upload(doc) for doc in [*pdf_docs, *xlsx_docs]]

    write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
    print(f"Uploaded {len(rows)} files")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())