Support raw docs root directory
Showing 8 changed files with 61 additions and 10 deletions
| ... | @@ -6,6 +6,10 @@ __pycache__/ | ... | @@ -6,6 +6,10 @@ __pycache__/ |
| 6 | .pytest_cache/ | 6 | .pytest_cache/ |
| 7 | .ruff_cache/ | 7 | .ruff_cache/ |
| 8 | 8 | ||
| 9 | data/raw_docs/* | ||
| 10 | !data/raw_docs/.gitkeep | ||
| 11 | !data/raw_docs/pdf/ | ||
| 12 | !data/raw_docs/xlsx/ | ||
| 9 | data/raw_docs/pdf/* | 13 | data/raw_docs/pdf/* |
| 10 | data/raw_docs/xlsx/* | 14 | data/raw_docs/xlsx/* |
| 11 | data/parsed_docs/*.json | 15 | data/parsed_docs/*.json | ... | ... |
| ... | @@ -42,10 +42,12 @@ cp .env.example .env | ... | @@ -42,10 +42,12 @@ cp .env.example .env |
| 42 | 42 | ||
| 43 | ## 首轮 Pilot | 43 | ## 首轮 Pilot |
| 44 | 44 | ||
| 45 | 把原始文件放到: | 45 | 把原始文件放到 `data/raw_docs/`,脚本会按扩展名自动识别 PDF 和 XLSX。也兼容旧目录: |
| 46 | 46 | ||
| 47 | - `data/raw_docs/pdf/` | 47 | - `data/raw_docs/*.pdf` |
| 48 | - `data/raw_docs/xlsx/` | 48 | - `data/raw_docs/*.xlsx` |
| 49 | - `data/raw_docs/pdf/*.pdf` | ||
| 50 | - `data/raw_docs/xlsx/*.xlsx` | ||
| 49 | 51 | ||
| 50 | 按顺序执行: | 52 | 按顺序执行: |
| 51 | 53 | ... | ... |
| ... | @@ -132,6 +132,14 @@ All configured model services are reachable. | ... | @@ -132,6 +132,14 @@ All configured model services are reachable. |
| 132 | 放置文件: | 132 | 放置文件: |
| 133 | 133 | ||
| 134 | ```bash | 134 | ```bash |
| 135 | mkdir -p data/raw_docs | ||
| 136 | cp /path/to/*.pdf data/raw_docs/ | ||
| 137 | cp /path/to/*.xlsx data/raw_docs/ | ||
| 138 | ``` | ||
| 139 | |||
| 140 | 也兼容旧目录: | ||
| 141 | |||
| 142 | ```bash | ||
| 135 | mkdir -p data/raw_docs/pdf data/raw_docs/xlsx | 143 | mkdir -p data/raw_docs/pdf data/raw_docs/xlsx |
| 136 | cp /path/to/*.pdf data/raw_docs/pdf/ | 144 | cp /path/to/*.pdf data/raw_docs/pdf/ |
| 137 | cp /path/to/*.xlsx data/raw_docs/xlsx/ | 145 | cp /path/to/*.xlsx data/raw_docs/xlsx/ | ... | ... |
data/raw_docs/.gitkeep
0 → 100644
| 1 | from __future__ import annotations | 1 | from __future__ import annotations |
| 2 | 2 | ||
| 3 | import sys | 3 | import sys |
| 4 | from pathlib import Path | ||
| 5 | 4 | ||
| 6 | import _bootstrap # noqa: F401 | 5 | import _bootstrap # noqa: F401 |
| 7 | 6 | ||
| 8 | from weknora_eval.api import client_from_config | 7 | from weknora_eval.api import client_from_config |
| 9 | from weknora_eval.config import load_config | 8 | from weknora_eval.config import load_config |
| 10 | from weknora_eval.loaders import setup_logging, write_jsonl | 9 | from weknora_eval.loaders import setup_logging, write_jsonl |
| 10 | from weknora_eval.raw_docs import iter_raw_doc_files | ||
| 11 | 11 | ||
| 12 | 12 | ||
| 13 | def main() -> int: | 13 | def main() -> int: |
| 14 | setup_logging() | 14 | setup_logging() |
| 15 | config = load_config() | 15 | config = load_config() |
| 16 | client = client_from_config(config) | 16 | client = client_from_config(config) |
| 17 | files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted( | 17 | files = iter_raw_doc_files() |
| 18 | Path("data/raw_docs/xlsx").glob("*.xlsx") | ||
| 19 | ) | ||
| 20 | rows = [] | 18 | rows = [] |
| 21 | for path in files: | 19 | for path in files: |
| 22 | data = client.upload_file(path) | 20 | data = client.upload_file(path) | ... | ... |
| ... | @@ -7,6 +7,7 @@ from typing import Any | ... | @@ -7,6 +7,7 @@ from typing import Any |
| 7 | from openpyxl import load_workbook | 7 | from openpyxl import load_workbook |
| 8 | 8 | ||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl |
| 10 | from weknora_eval.raw_docs import iter_pdf_files, iter_xlsx_files | ||
| 10 | from weknora_eval.schemas import ParsedDocument | 11 | from weknora_eval.schemas import ParsedDocument |
| 11 | 12 | ||
| 12 | 13 | ||
| ... | @@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s | ... | @@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s |
| 20 | docs: list[ParsedDocument] = [] | 21 | docs: list[ParsedDocument] = [] |
| 21 | failures: list[dict[str, Any]] = [] | 22 | failures: list[dict[str, Any]] = [] |
| 22 | 23 | ||
| 23 | for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): | 24 | for pdf_path in iter_pdf_files(): |
| 24 | try: | 25 | try: |
| 25 | docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars)) | 26 | docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars)) |
| 26 | except Exception as exc: # noqa: BLE001 - parser failures must be persisted. | 27 | except Exception as exc: # noqa: BLE001 - parser failures must be persisted. |
| ... | @@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s | ... | @@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s |
| 34 | } | 35 | } |
| 35 | ) | 36 | ) |
| 36 | 37 | ||
| 37 | for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")): | 38 | for xlsx_path in iter_xlsx_files(): |
| 38 | try: | 39 | try: |
| 39 | docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars)) | 40 | docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars)) |
| 40 | except Exception as exc: # noqa: BLE001 | 41 | except Exception as exc: # noqa: BLE001 | ... | ... |
| ... | @@ -8,6 +8,7 @@ import requests | ... | @@ -8,6 +8,7 @@ import requests |
| 8 | 8 | ||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl |
| 10 | from weknora_eval.parsers.local import build_parse_summary, parse_pdf | 10 | from weknora_eval.parsers.local import build_parse_summary, parse_pdf |
| 11 | from weknora_eval.raw_docs import iter_pdf_files | ||
| 11 | from weknora_eval.schemas import ParsedDocument | 12 | from weknora_eval.schemas import ParsedDocument |
| 12 | 13 | ||
| 13 | 14 | ||
| ... | @@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic | ... | @@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic |
| 26 | docs: list[ParsedDocument] = [] | 27 | docs: list[ParsedDocument] = [] |
| 27 | failures: list[dict[str, Any]] = [] | 28 | failures: list[dict[str, Any]] = [] |
| 28 | 29 | ||
| 29 | for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): | 30 | for pdf_path in iter_pdf_files(): |
| 30 | parser_name = f"mineru:{mode}" | 31 | parser_name = f"mineru:{mode}" |
| 31 | try: | 32 | try: |
| 32 | if mode == "cli": | 33 | if mode == "cli": | ... | ... |
src/weknora_eval/raw_docs.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from pathlib import Path | ||
| 4 | |||
| 5 | |||
| 6 | RAW_DOCS_ROOT = Path("data/raw_docs") | ||
| 7 | SUPPORTED_RAW_EXTENSIONS = {".pdf", ".xlsx"} | ||
| 8 | |||
| 9 | |||
| 10 | def iter_raw_doc_files(*, extensions: set[str] | None = None) -> list[Path]: | ||
| 11 | wanted = {item.lower() for item in (extensions or SUPPORTED_RAW_EXTENSIONS)} | ||
| 12 | files: dict[Path, Path] = {} | ||
| 13 | if not RAW_DOCS_ROOT.exists(): | ||
| 14 | return [] | ||
| 15 | |||
| 16 | for path in RAW_DOCS_ROOT.iterdir(): | ||
| 17 | if path.is_file() and path.suffix.lower() in wanted: | ||
| 18 | files[path.resolve()] = path | ||
| 19 | |||
| 20 | for subdir in ("pdf", "xlsx"): | ||
| 21 | directory = RAW_DOCS_ROOT / subdir | ||
| 22 | if not directory.exists(): | ||
| 23 | continue | ||
| 24 | for path in directory.iterdir(): | ||
| 25 | if path.is_file() and path.suffix.lower() in wanted: | ||
| 26 | files[path.resolve()] = path | ||
| 27 | |||
| 28 | return sorted(files.values(), key=lambda item: str(item)) | ||
| 29 | |||
| 30 | |||
| 31 | def iter_pdf_files() -> list[Path]: | ||
| 32 | return iter_raw_doc_files(extensions={".pdf"}) | ||
| 33 | |||
| 34 | |||
| 35 | def iter_xlsx_files() -> list[Path]: | ||
| 36 | return iter_raw_doc_files(extensions={".xlsx"}) |
-
Please register or sign in to post a comment