Commit 854ed21c 854ed21c35156acb1cd1d590b7a794f12dfebd39 by 沈秋雨

Support raw docs root directory

1 parent a1c6a382
......@@ -6,6 +6,10 @@ __pycache__/
.pytest_cache/
.ruff_cache/
data/raw_docs/*
!data/raw_docs/.gitkeep
!data/raw_docs/pdf/
!data/raw_docs/xlsx/
data/raw_docs/pdf/*
data/raw_docs/xlsx/*
data/parsed_docs/*.json
......
......@@ -42,10 +42,12 @@ cp .env.example .env
## 首轮 Pilot
把原始文件放到:
把原始文件放到 `data/raw_docs/`,脚本会按扩展名自动识别 PDF 和 XLSX。也兼容旧目录:
- `data/raw_docs/pdf/`
- `data/raw_docs/xlsx/`
- `data/raw_docs/*.pdf`
- `data/raw_docs/*.xlsx`
- `data/raw_docs/pdf/*.pdf`
- `data/raw_docs/xlsx/*.xlsx`
按顺序执行:
......
......@@ -132,6 +132,14 @@ All configured model services are reachable.
放置文件:
```bash
mkdir -p data/raw_docs
cp /path/to/*.pdf data/raw_docs/
cp /path/to/*.xlsx data/raw_docs/
```
也兼容旧目录:
```bash
mkdir -p data/raw_docs/pdf data/raw_docs/xlsx
cp /path/to/*.pdf data/raw_docs/pdf/
cp /path/to/*.xlsx data/raw_docs/xlsx/
......
from __future__ import annotations
import sys
from pathlib import Path
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
from weknora_eval.raw_docs import iter_raw_doc_files
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
Path("data/raw_docs/xlsx").glob("*.xlsx")
)
files = iter_raw_doc_files()
rows = []
for path in files:
data = client.upload_file(path)
......
......@@ -7,6 +7,7 @@ from typing import Any
from openpyxl import load_workbook
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.raw_docs import iter_pdf_files, iter_xlsx_files
from weknora_eval.schemas import ParsedDocument
......@@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
for pdf_path in iter_pdf_files():
try:
docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
except Exception as exc: # noqa: BLE001 - parser failures must be persisted.
......@@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
}
)
for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
for xlsx_path in iter_xlsx_files():
try:
docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
except Exception as exc: # noqa: BLE001
......
......@@ -8,6 +8,7 @@ import requests
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary, parse_pdf
from weknora_eval.raw_docs import iter_pdf_files
from weknora_eval.schemas import ParsedDocument
......@@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
for pdf_path in iter_pdf_files():
parser_name = f"mineru:{mode}"
try:
if mode == "cli":
......
from __future__ import annotations
from pathlib import Path
# Directory scanned for raw input documents. The path is relative, so it is
# resolved against the current working directory of the running process.
RAW_DOCS_ROOT = Path("data/raw_docs")
# File suffixes used when a caller does not pass ``extensions`` explicitly.
# Candidate suffixes are lower-cased before being compared against this set.
SUPPORTED_RAW_EXTENSIONS = {".pdf", ".xlsx"}
def iter_raw_doc_files(
    *,
    extensions: set[str] | None = None,
    root: Path | None = None,
) -> list[Path]:
    """Return the raw document files found under the raw-docs directory.

    Files are collected from the root directory itself and from its legacy
    ``pdf/`` and ``xlsx/`` subdirectories (one level deep, no recursion).
    A file is kept when its suffix, compared case-insensitively, is one of
    the wanted extensions.

    Args:
        extensions: Suffixes (including the leading dot) to accept. ``None``
            or an empty set falls back to ``SUPPORTED_RAW_EXTENSIONS``.
        root: Directory to scan. ``None`` falls back to ``RAW_DOCS_ROOT``.

    Returns:
        The matching paths, de-duplicated by resolved location and sorted by
        their string form. An empty list when the root is missing or is not
        a directory.
    """
    wanted = {item.lower() for item in (extensions or SUPPORTED_RAW_EXTENSIONS)}
    base = RAW_DOCS_ROOT if root is None else Path(root)

    # Keyed by resolved path so the same file reached through two routes
    # (for example via a symlink) is only reported once.
    files: dict[Path, Path] = {}
    for directory in (base, base / "pdf", base / "xlsx"):
        # is_dir() rather than exists(): a stray regular file occupying one
        # of these names would otherwise make iterdir() raise
        # NotADirectoryError.
        if not directory.is_dir():
            continue
        for path in directory.iterdir():
            if path.is_file() and path.suffix.lower() in wanted:
                files[path.resolve()] = path
    return sorted(files.values(), key=str)
def iter_pdf_files() -> list[Path]:
    """Return the discovered raw documents restricted to the ``.pdf`` suffix."""
    pdf_only = {".pdf"}
    return iter_raw_doc_files(extensions=pdf_only)
def iter_xlsx_files() -> list[Path]:
    """Return the discovered raw documents restricted to the ``.xlsx`` suffix."""
    xlsx_only = {".xlsx"}
    return iter_raw_doc_files(extensions=xlsx_only)