Commit 854ed21c 854ed21c35156acb1cd1d590b7a794f12dfebd39 by 沈秋雨

Support raw docs root directory

1 parent a1c6a382
...@@ -6,6 +6,10 @@ __pycache__/ ...@@ -6,6 +6,10 @@ __pycache__/
6 .pytest_cache/ 6 .pytest_cache/
7 .ruff_cache/ 7 .ruff_cache/
8 8
9 data/raw_docs/*
10 !data/raw_docs/.gitkeep
11 !data/raw_docs/pdf/
12 !data/raw_docs/xlsx/
9 data/raw_docs/pdf/* 13 data/raw_docs/pdf/*
10 data/raw_docs/xlsx/* 14 data/raw_docs/xlsx/*
11 data/parsed_docs/*.json 15 data/parsed_docs/*.json
......
...@@ -42,10 +42,12 @@ cp .env.example .env ...@@ -42,10 +42,12 @@ cp .env.example .env
42 42
43 ## 首轮 Pilot 43 ## 首轮 Pilot
44 44
45 把原始文件放到: 45 把原始文件放到 `data/raw_docs/`,脚本会按扩展名自动识别 PDF 和 XLSX。也兼容旧目录
46 46
47 - `data/raw_docs/pdf/` 47 - `data/raw_docs/*.pdf`
48 - `data/raw_docs/xlsx/` 48 - `data/raw_docs/*.xlsx`
49 - `data/raw_docs/pdf/*.pdf`
50 - `data/raw_docs/xlsx/*.xlsx`
49 51
50 按顺序执行: 52 按顺序执行:
51 53
......
...@@ -132,6 +132,14 @@ All configured model services are reachable. ...@@ -132,6 +132,14 @@ All configured model services are reachable.
132 放置文件: 132 放置文件:
133 133
134 ```bash 134 ```bash
135 mkdir -p data/raw_docs
136 cp /path/to/*.pdf data/raw_docs/
137 cp /path/to/*.xlsx data/raw_docs/
138 ```
139
140 也兼容旧目录:
141
142 ```bash
135 mkdir -p data/raw_docs/pdf data/raw_docs/xlsx 143 mkdir -p data/raw_docs/pdf data/raw_docs/xlsx
136 cp /path/to/*.pdf data/raw_docs/pdf/ 144 cp /path/to/*.pdf data/raw_docs/pdf/
137 cp /path/to/*.xlsx data/raw_docs/xlsx/ 145 cp /path/to/*.xlsx data/raw_docs/xlsx/
......
1 from __future__ import annotations 1 from __future__ import annotations
2 2
3 import sys 3 import sys
4 from pathlib import Path
5 4
6 import _bootstrap # noqa: F401 5 import _bootstrap # noqa: F401
7 6
8 from weknora_eval.api import client_from_config 7 from weknora_eval.api import client_from_config
9 from weknora_eval.config import load_config 8 from weknora_eval.config import load_config
10 from weknora_eval.loaders import setup_logging, write_jsonl 9 from weknora_eval.loaders import setup_logging, write_jsonl
10 from weknora_eval.raw_docs import iter_raw_doc_files
11 11
12 12
13 def main() -> int: 13 def main() -> int:
14 setup_logging() 14 setup_logging()
15 config = load_config() 15 config = load_config()
16 client = client_from_config(config) 16 client = client_from_config(config)
17 files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted( 17 files = iter_raw_doc_files()
18 Path("data/raw_docs/xlsx").glob("*.xlsx")
19 )
20 rows = [] 18 rows = []
21 for path in files: 19 for path in files:
22 data = client.upload_file(path) 20 data = client.upload_file(path)
......
...@@ -7,6 +7,7 @@ from typing import Any ...@@ -7,6 +7,7 @@ from typing import Any
7 from openpyxl import load_workbook 7 from openpyxl import load_workbook
8 8
9 from weknora_eval.loaders import compact_text, write_json, write_jsonl 9 from weknora_eval.loaders import compact_text, write_json, write_jsonl
10 from weknora_eval.raw_docs import iter_pdf_files, iter_xlsx_files
10 from weknora_eval.schemas import ParsedDocument 11 from weknora_eval.schemas import ParsedDocument
11 12
12 13
...@@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s ...@@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
20 docs: list[ParsedDocument] = [] 21 docs: list[ParsedDocument] = []
21 failures: list[dict[str, Any]] = [] 22 failures: list[dict[str, Any]] = []
22 23
23 for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): 24 for pdf_path in iter_pdf_files():
24 try: 25 try:
25 docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars)) 26 docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
26 except Exception as exc: # noqa: BLE001 - parser failures must be persisted. 27 except Exception as exc: # noqa: BLE001 - parser failures must be persisted.
...@@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s ...@@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
34 } 35 }
35 ) 36 )
36 37
37 for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")): 38 for xlsx_path in iter_xlsx_files():
38 try: 39 try:
39 docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars)) 40 docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
40 except Exception as exc: # noqa: BLE001 41 except Exception as exc: # noqa: BLE001
......
...@@ -8,6 +8,7 @@ import requests ...@@ -8,6 +8,7 @@ import requests
8 8
9 from weknora_eval.loaders import compact_text, write_json, write_jsonl 9 from weknora_eval.loaders import compact_text, write_json, write_jsonl
10 from weknora_eval.parsers.local import build_parse_summary, parse_pdf 10 from weknora_eval.parsers.local import build_parse_summary, parse_pdf
11 from weknora_eval.raw_docs import iter_pdf_files
11 from weknora_eval.schemas import ParsedDocument 12 from weknora_eval.schemas import ParsedDocument
12 13
13 14
...@@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic ...@@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic
26 docs: list[ParsedDocument] = [] 27 docs: list[ParsedDocument] = []
27 failures: list[dict[str, Any]] = [] 28 failures: list[dict[str, Any]] = []
28 29
29 for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): 30 for pdf_path in iter_pdf_files():
30 parser_name = f"mineru:{mode}" 31 parser_name = f"mineru:{mode}"
31 try: 32 try:
32 if mode == "cli": 33 if mode == "cli":
......
1 from __future__ import annotations
2
3 from pathlib import Path
4
5
6 RAW_DOCS_ROOT = Path("data/raw_docs")
7 SUPPORTED_RAW_EXTENSIONS = {".pdf", ".xlsx"}
8
9
10 def iter_raw_doc_files(*, extensions: set[str] | None = None) -> list[Path]:
11 wanted = {item.lower() for item in (extensions or SUPPORTED_RAW_EXTENSIONS)}
12 files: dict[Path, Path] = {}
13 if not RAW_DOCS_ROOT.exists():
14 return []
15
16 for path in RAW_DOCS_ROOT.iterdir():
17 if path.is_file() and path.suffix.lower() in wanted:
18 files[path.resolve()] = path
19
20 for subdir in ("pdf", "xlsx"):
21 directory = RAW_DOCS_ROOT / subdir
22 if not directory.exists():
23 continue
24 for path in directory.iterdir():
25 if path.is_file() and path.suffix.lower() in wanted:
26 files[path.resolve()] = path
27
28 return sorted(files.values(), key=lambda item: str(item))
29
30
def iter_pdf_files() -> list[Path]:
    """Return the raw document files whose suffix is ``.pdf``.

    Delegates to ``iter_raw_doc_files`` with the extension filter restricted
    to PDF; ordering and de-duplication are whatever that function provides.
    """
    pdf_only = {".pdf"}
    return iter_raw_doc_files(extensions=pdf_only)
33
34
def iter_xlsx_files() -> list[Path]:
    """Return the raw document files whose suffix is ``.xlsx``.

    Delegates to ``iter_raw_doc_files`` with the extension filter restricted
    to XLSX; ordering and de-duplication are whatever that function provides.
    """
    xlsx_only = {".xlsx"}
    return iter_raw_doc_files(extensions=xlsx_only)