Support raw docs root directory

沈秋雨
Commit 854ed21c ... 854ed21c35156acb1cd1d590b7a794f12dfebd39 authored 2026-04-21 15:54:42 +0800 by 沈秋雨
Showing 8 changed files with 61 additions and 10 deletions
.gitignore
README.md
TESTING_GUIDE.md
data/raw_docs/.gitkeep
scripts/01_upload_docs.py
src/weknora_eval/parsers/local.py
src/weknora_eval/parsers/mineru.py
src/weknora_eval/raw_docs.py
--- a/.gitignore
View file @854ed21
+++ b/.gitignore
View file @854ed21
@@ -6,6 +6,10 @@ __pycache__/
 .pytest_cache/
 .ruff_cache/

+data/raw_docs/*
+!data/raw_docs/.gitkeep
+!data/raw_docs/pdf/
+!data/raw_docs/xlsx/
 data/raw_docs/pdf/*
 data/raw_docs/xlsx/*
 data/parsed_docs/*.json
--- a/README.md
View file @854ed21
+++ b/README.md
View file @854ed21
@@ -42,10 +42,12 @@ cp .env.example .env

 ## 首轮 Pilot

-把原始文件放到：
+把原始文件放到 `data/raw_docs/`，脚本会按扩展名自动识别 PDF 和 XLSX；旧的 `pdf/`、`xlsx/` 子目录仍然兼容。支持的位置如下：

- `data/raw_docs/pdf/`
- `data/raw_docs/xlsx/`
+- `data/raw_docs/*.pdf`
+- `data/raw_docs/*.xlsx`
+- `data/raw_docs/pdf/*.pdf`
+- `data/raw_docs/xlsx/*.xlsx`

 按顺序执行：

--- a/TESTING_GUIDE.md
View file @854ed21
+++ b/TESTING_GUIDE.md
View file @854ed21
@@ -132,6 +132,14 @@ All configured model services are reachable.
 放置文件：

 ```bash
+mkdir -p data/raw_docs
+cp /path/to/*.pdf data/raw_docs/
+cp /path/to/*.xlsx data/raw_docs/
+```
+
+也兼容旧目录：
+
+```bash
 mkdir -p data/raw_docs/pdf data/raw_docs/xlsx
 cp /path/to/*.pdf data/raw_docs/pdf/
 cp /path/to/*.xlsx data/raw_docs/xlsx/
--- a/data/raw_docs/.gitkeep 0 → 100644
View file @854ed21
+++ b/data/raw_docs/.gitkeep 0 → 100644
View file @854ed21
+
--- a/scripts/01_upload_docs.py
View file @854ed21
+++ b/scripts/01_upload_docs.py
View file @854ed21
 from __future__ import annotations

 import sys
-from pathlib import Path

 import _bootstrap  # noqa: F401

 from weknora_eval.api import client_from_config
 from weknora_eval.config import load_config
 from weknora_eval.loaders import setup_logging, write_jsonl
+from weknora_eval.raw_docs import iter_raw_doc_files


 def main() -> int:
    setup_logging()
    config = load_config()
    client = client_from_config(config)
-    files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
-        Path("data/raw_docs/xlsx").glob("*.xlsx")
-    )
+    files = iter_raw_doc_files()
    rows = []
    for path in files:
        data = client.upload_file(path)
--- a/src/weknora_eval/parsers/local.py
View file @854ed21
+++ b/src/weknora_eval/parsers/local.py
View file @854ed21
@@ -7,6 +7,7 @@ from typing import Any
 from openpyxl import load_workbook

 from weknora_eval.loaders import compact_text, write_json, write_jsonl
+from weknora_eval.raw_docs import iter_pdf_files, iter_xlsx_files
 from weknora_eval.schemas import ParsedDocument


@@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
    docs: list[ParsedDocument] = []
    failures: list[dict[str, Any]] = []

-    for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
+    for pdf_path in iter_pdf_files():
        try:
            docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
        except Exception as exc:  # noqa: BLE001 - parser failures must be persisted.
@@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
                }
            )

-    for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
+    for xlsx_path in iter_xlsx_files():
        try:
            docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
        except Exception as exc:  # noqa: BLE001
--- a/src/weknora_eval/parsers/mineru.py
View file @854ed21
+++ b/src/weknora_eval/parsers/mineru.py
View file @854ed21
@@ -8,6 +8,7 @@ import requests

 from weknora_eval.loaders import compact_text, write_json, write_jsonl
 from weknora_eval.parsers.local import build_parse_summary, parse_pdf
+from weknora_eval.raw_docs import iter_pdf_files
 from weknora_eval.schemas import ParsedDocument


@@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic
    docs: list[ParsedDocument] = []
    failures: list[dict[str, Any]] = []

-    for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
+    for pdf_path in iter_pdf_files():
        parser_name = f"mineru:{mode}"
        try:
            if mode == "cli":
--- a/src/weknora_eval/raw_docs.py 0 → 100644
View file @854ed21
+++ b/src/weknora_eval/raw_docs.py 0 → 100644
View file @854ed21
+from __future__ import annotations
+
+from pathlib import Path
+
+
+# Directory scanned for raw input documents. The path is relative, so it is
+# resolved against the working directory of the process that uses it.
+RAW_DOCS_ROOT = Path("data/raw_docs")
+# Lowercase file suffixes (leading dot included) accepted by
+# iter_raw_doc_files when the caller supplies no ``extensions`` filter.
+SUPPORTED_RAW_EXTENSIONS = {".pdf", ".xlsx"}
+
+
+def iter_raw_doc_files(*, extensions: set[str] | None = None) -> list[Path]:
+    """List raw document files found under ``RAW_DOCS_ROOT``.
+
+    Scans ``RAW_DOCS_ROOT`` itself plus its legacy ``pdf/`` and ``xlsx/``
+    subdirectories (non-recursively) and keeps regular files whose suffix
+    matches one of ``extensions``, compared case-insensitively.
+
+    Args:
+        extensions: Suffixes to accept, leading dot included (for example
+            ``{".pdf"}``). ``None`` selects ``SUPPORTED_RAW_EXTENSIONS``;
+            an explicitly empty set selects nothing.
+
+    Returns:
+        Matching paths, de-duplicated by resolved location and sorted by
+        their string form. Empty when ``RAW_DOCS_ROOT`` is not a directory.
+    """
+    # ``is None`` rather than ``or``: an explicit empty set must not silently
+    # widen to every supported extension.
+    requested = SUPPORTED_RAW_EXTENSIONS if extensions is None else extensions
+    wanted = {item.lower() for item in requested}
+    if not wanted or not RAW_DOCS_ROOT.is_dir():
+        return []
+
+    # Keyed by resolved path so a file reachable twice (e.g. through a
+    # symlink) is reported once; a later hit replaces an earlier one.
+    files: dict[Path, Path] = {}
+    for directory in (RAW_DOCS_ROOT, RAW_DOCS_ROOT / "pdf", RAW_DOCS_ROOT / "xlsx"):
+        # is_dir() instead of exists(): a stray regular file named "pdf" or
+        # "xlsx" would otherwise make iterdir() raise NotADirectoryError.
+        if not directory.is_dir():
+            continue
+        for path in directory.iterdir():
+            if path.is_file() and path.suffix.lower() in wanted:
+                files[path.resolve()] = path
+
+    return sorted(files.values(), key=str)
+
+
+def iter_pdf_files() -> list[Path]:
+    """Return the raw documents whose suffix is ``.pdf``.
+
+    Thin wrapper that delegates to ``iter_raw_doc_files`` with the extension
+    filter fixed to ``.pdf``.
+    """
+    pdf_only = {".pdf"}
+    return iter_raw_doc_files(extensions=pdf_only)
+
+
+def iter_xlsx_files() -> list[Path]:
+    """Return the raw documents whose suffix is ``.xlsx``.
+
+    Thin wrapper that delegates to ``iter_raw_doc_files`` with the extension
+    filter fixed to ``.xlsx``.
+    """
+    xlsx_only = {".xlsx"}
+    return iter_raw_doc_files(extensions=xlsx_only)