# 01_upload_docs.py
from __future__ import annotations

import sys
from pathlib import Path

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl


def main() -> int:
    """Upload the raw PDF and XLSX documents and write an upload manifest.

    Collects ``*.pdf`` files from ``data/raw_docs/pdf`` and ``*.xlsx`` files
    from ``data/raw_docs/xlsx`` (each group sorted by path, PDFs first),
    uploads them one at a time through the configured client, and writes one
    record per upload to ``data/exported/knowledge_uploads.jsonl``.

    Returns:
        ``0`` once every file has been uploaded and the manifest written.
        Any exception raised by an upload propagates before the manifest
        is written.
    """
    setup_logging()
    client = client_from_config(load_config())

    # Both directory listings are materialised before the first upload starts.
    pdf_paths = sorted(Path("data/raw_docs/pdf").glob("*.pdf"))
    xlsx_paths = sorted(Path("data/raw_docs/xlsx").glob("*.xlsx"))

    def manifest_row(doc_path: Path) -> dict:
        """Upload *doc_path* and summarise the response as a manifest record."""
        # The response is treated as a mapping (only ``.get`` is used on it).
        response = client.upload_file(doc_path)
        # Fall back to values derived from the local path when the response
        # leaves the name or type empty.
        display_name = (
            response.get("file_name") or response.get("title") or doc_path.name
        )
        kind = response.get("file_type") or doc_path.suffix.lstrip(".")
        return {
            "knowledge_id": response.get("id"),
            "file_name": display_name,
            "file_type": kind,
            "parse_status": response.get("parse_status"),
            "enable_status": response.get("enable_status"),
            # Keep the full response alongside the extracted fields.
            "raw": response,
        }

    rows = [manifest_row(doc_path) for doc_path in (*pdf_paths, *xlsx_paths)]
    write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
    print(f"Uploaded {len(rows)} files")
    return 0


# When run as a script, execute the upload and use main()'s return value as
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())