Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
weknora_ragas
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
854ed21c
...
854ed21c35156acb1cd1d590b7a794f12dfebd39
authored
2026-04-21 15:54:42 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Support raw docs root directory
1 parent
a1c6a382
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
61 additions
and
10 deletions
.gitignore
README.md
TESTING_GUIDE.md
data/raw_docs/.gitkeep
scripts/01_upload_docs.py
src/weknora_eval/parsers/local.py
src/weknora_eval/parsers/mineru.py
src/weknora_eval/raw_docs.py
.gitignore
View file @
854ed21
...
...
@@ -6,6 +6,10 @@ __pycache__/
.pytest_cache/
.ruff_cache/
data/raw_docs/*
!data/raw_docs/.gitkeep
!data/raw_docs/pdf/
!data/raw_docs/xlsx/
data/raw_docs/pdf/*
data/raw_docs/xlsx/*
data/parsed_docs/*.json
...
...
README.md
View file @
854ed21
...
...
@@ -42,10 +42,12 @@ cp .env.example .env
## 首轮 Pilot
把原始文件放到:
把原始文件放到
`data/raw_docs/`
,脚本会按扩展名自动识别 PDF 和 XLSX。也兼容旧目录
:
-
`data/raw_docs/pdf/`
-
`data/raw_docs/xlsx/`
-
`data/raw_docs/*.pdf`
-
`data/raw_docs/*.xlsx`
-
`data/raw_docs/pdf/*.pdf`
-
`data/raw_docs/xlsx/*.xlsx`
按顺序执行:
...
...
TESTING_GUIDE.md
View file @
854ed21
...
...
@@ -132,6 +132,14 @@ All configured model services are reachable.
放置文件:
```
bash
mkdir -p data/raw_docs
cp /path/to/
*
.pdf data/raw_docs/
cp /path/to/
*
.xlsx data/raw_docs/
```
也兼容旧目录:
```
bash
mkdir -p data/raw_docs/pdf data/raw_docs/xlsx
cp /path/to/
*
.pdf data/raw_docs/pdf/
cp /path/to/
*
.xlsx data/raw_docs/xlsx/
...
...
data/raw_docs/.gitkeep
0 → 100644
View file @
854ed21
scripts/01_upload_docs.py
View file @
854ed21
from
__future__
import
annotations
import
sys
from
pathlib
import
Path
import
_bootstrap
# noqa: F401
from
weknora_eval.api
import
client_from_config
from
weknora_eval.config
import
load_config
from
weknora_eval.loaders
import
setup_logging
,
write_jsonl
from
weknora_eval.raw_docs
import
iter_raw_doc_files
def
main
()
->
int
:
setup_logging
()
config
=
load_config
()
client
=
client_from_config
(
config
)
files
=
sorted
(
Path
(
"data/raw_docs/pdf"
)
.
glob
(
"*.pdf"
))
+
sorted
(
Path
(
"data/raw_docs/xlsx"
)
.
glob
(
"*.xlsx"
)
)
files
=
iter_raw_doc_files
()
rows
=
[]
for
path
in
files
:
data
=
client
.
upload_file
(
path
)
...
...
src/weknora_eval/parsers/local.py
View file @
854ed21
...
...
@@ -7,6 +7,7 @@ from typing import Any
from
openpyxl
import
load_workbook
from
weknora_eval.loaders
import
compact_text
,
write_json
,
write_jsonl
from
weknora_eval.raw_docs
import
iter_pdf_files
,
iter_xlsx_files
from
weknora_eval.schemas
import
ParsedDocument
...
...
@@ -20,7 +21,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
docs
:
list
[
ParsedDocument
]
=
[]
failures
:
list
[
dict
[
str
,
Any
]]
=
[]
for
pdf_path
in
sorted
(
Path
(
"data/raw_docs/pdf"
)
.
glob
(
"*.pdf"
)
):
for
pdf_path
in
iter_pdf_files
(
):
try
:
docs
.
extend
(
parse_pdf
(
pdf_path
,
backend
=
pdf_backend
,
min_chars
=
min_chars
))
except
Exception
as
exc
:
# noqa: BLE001 - parser failures must be persisted.
...
...
@@ -34,7 +35,7 @@ def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[s
}
)
for
xlsx_path
in
sorted
(
Path
(
"data/raw_docs/xlsx"
)
.
glob
(
"*.xlsx"
)
):
for
xlsx_path
in
iter_xlsx_files
(
):
try
:
docs
.
extend
(
parse_xlsx
(
xlsx_path
,
mode
=
xlsx_mode
,
min_chars
=
min_chars
))
except
Exception
as
exc
:
# noqa: BLE001
...
...
src/weknora_eval/parsers/mineru.py
View file @
854ed21
...
...
@@ -8,6 +8,7 @@ import requests
from
weknora_eval.loaders
import
compact_text
,
write_json
,
write_jsonl
from
weknora_eval.parsers.local
import
build_parse_summary
,
parse_pdf
from
weknora_eval.raw_docs
import
iter_pdf_files
from
weknora_eval.schemas
import
ParsedDocument
...
...
@@ -26,7 +27,7 @@ def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dic
docs
:
list
[
ParsedDocument
]
=
[]
failures
:
list
[
dict
[
str
,
Any
]]
=
[]
for
pdf_path
in
sorted
(
Path
(
"data/raw_docs/pdf"
)
.
glob
(
"*.pdf"
)
):
for
pdf_path
in
iter_pdf_files
(
):
parser_name
=
f
"mineru:{mode}"
try
:
if
mode
==
"cli"
:
...
...
src/weknora_eval/raw_docs.py
0 → 100644
View file @
854ed21
from
__future__
import
annotations
from
pathlib
import
Path
# Root directory scanned for raw input documents; relative, so it is resolved
# against the current working directory of the calling script.
RAW_DOCS_ROOT = Path("data/raw_docs")

# Suffixes (lower-case, with the leading dot) treated as raw documents when the
# caller does not ask for a specific subset.
SUPPORTED_RAW_EXTENSIONS = {".pdf", ".xlsx"}

# Legacy per-type sub-directories that are still scanned next to the root.
_LEGACY_SUBDIRS = ("pdf", "xlsx")


def _normalize_extension(extension: str) -> str:
    """Return *extension* lower-cased, with a leading dot added if it is missing."""
    lowered = extension.lower()
    # An empty string is left alone so it keeps meaning "no suffix".
    if lowered and not lowered.startswith("."):
        return f".{lowered}"
    return lowered


def _matching_files(directory: Path, wanted: set[str]) -> list[Path]:
    """Return the regular files directly inside *directory* whose suffix is in *wanted*."""
    # is_dir() rather than exists(): a plain file sitting at this path must be
    # skipped instead of making iterdir() raise NotADirectoryError.
    if not directory.is_dir():
        return []
    return [
        path
        for path in directory.iterdir()
        if path.is_file() and path.suffix.lower() in wanted
    ]


def iter_raw_doc_files(*, extensions: set[str] | None = None) -> list[Path]:
    """List raw document files under ``RAW_DOCS_ROOT`` and its legacy sub-directories.

    The root directory itself and the ``pdf`` / ``xlsx`` sub-directories are
    scanned (one level each, not recursively). A file is kept when its suffix,
    compared case-insensitively, is one of the wanted extensions.

    Args:
        extensions: Suffixes to keep, e.g. ``{".pdf"}``. Matching ignores case
            and a missing leading dot is tolerated (``"pdf"`` equals ``".pdf"``).
            ``None`` or an empty set selects ``SUPPORTED_RAW_EXTENSIONS``.

    Returns:
        The matching paths sorted by their string form. Paths that resolve to
        the same file are reported once. An empty list is returned when
        ``RAW_DOCS_ROOT`` is missing or is not a directory.
    """
    wanted = {
        _normalize_extension(item)
        for item in (extensions or SUPPORTED_RAW_EXTENSIONS)
    }
    if not RAW_DOCS_ROOT.is_dir():
        return []

    # Keyed by resolved path so the same file reached twice is listed only once;
    # the value is the path as discovered, which is what callers receive.
    found: dict[Path, Path] = {}
    scan_dirs = (RAW_DOCS_ROOT, *(RAW_DOCS_ROOT / name for name in _LEGACY_SUBDIRS))
    for directory in scan_dirs:
        for path in _matching_files(directory, wanted):
            found[path.resolve()] = path
    # Sort on the string form (not Path ordering) to keep the original order.
    return sorted(found.values(), key=str)
def iter_pdf_files() -> list[Path]:
    """Return the raw document files whose extension is ``.pdf``.

    Delegates to ``iter_raw_doc_files`` with the extension filter set to PDFs.
    """
    pdf_only = {".pdf"}
    return iter_raw_doc_files(extensions=pdf_only)
def iter_xlsx_files() -> list[Path]:
    """Return the raw document files whose extension is ``.xlsx``.

    Delegates to ``iter_raw_doc_files`` with the extension filter set to XLSX.
    """
    xlsx_only = {".xlsx"}
    return iter_raw_doc_files(extensions=xlsx_only)
Please
register
or
sign in
to post a comment