Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
weknora_ragas
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
b9e14ffa
...
b9e14ffa1378c453b090e158c910410e26d2eafa
authored
2026-04-21 16:54:27 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Pass MinerU HTTP device field
1 parent
127df0d6
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
32 additions
and
2 deletions
.env.example
configs/eval.yaml
src/weknora_eval/parsers/mineru.py
.env.example
View file @
b9e14ff
...
...
@@ -3,6 +3,10 @@ WEKNORA_API_KEY=
WEKNORA_KB_ID=
WEKNORA_KB_NAME=ragas-eval-pilot
# MinerU HTTP parser. Use cpu, cuda, cuda:0, etc. according to the deployed
# MinerU backend.
MINERU_DEVICE=cpu
# Ragas generation and judge models. These are evaluation-side models, not the
# model configuration used by the WeKnora backend.
RAGAS_LLM_API_KEY=replace-me
...
...
configs/eval.yaml
View file @
b9e14ff
...
...
@@ -29,6 +29,8 @@ parsing:
output_dir
:
"
data/parsed_docs/mineru_raw"
http_base_url
:
"
http://172.23.184.9:8002"
http_parse_path
:
"
/file_parse"
http_form_fields
:
device
:
"
${MINERU_DEVICE:-cpu}"
api_key
:
"
mineru"
timeout_seconds
:
600
fallback_to_local
:
false
...
...
src/weknora_eval/parsers/mineru.py
View file @
b9e14ff
...
...
@@ -131,16 +131,23 @@ def parse_pdf_with_http(
endpoint
=
str
(
mineru_config
.
get
(
"http_parse_path"
)
or
"/file_parse"
)
if
not
endpoint
.
startswith
(
"/"
):
endpoint
=
"/"
+
endpoint
form_fields
=
{
str
(
key
):
str
(
value
)
for
key
,
value
in
(
mineru_config
.
get
(
"http_form_fields"
)
or
{})
.
items
()
if
value
not
in
{
None
,
""
}
}
with
target
.
open
(
"rb"
)
as
file
:
response
=
requests
.
post
(
f
"{base_url}{endpoint}"
,
files
=
[(
"files"
,
(
target
.
name
,
file
,
"application/pdf"
))],
data
=
form_fields
,
headers
=
headers
,
timeout
=
int
(
mineru_config
.
get
(
"timeout_seconds"
,
600
)),
)
if
response
.
status_code
>=
400
:
raise
MinerUParseError
(
f
"MinerU HTTP failed with {response.status_code}: {response.text[:500]}"
)
error_detail
=
_mineru_error_detail
(
response
)
raise
MinerUParseError
(
f
"MinerU HTTP failed with {response.status_code}: {error_detail}"
)
payload
=
response
.
json
()
contents
=
extract_mineru_contents
(
payload
)
...
...
@@ -161,12 +168,29 @@ def parse_pdf_with_http(
source_file
=
target
.
name
,
file_type
=
"pdf"
,
content
=
content
,
metadata
=
{
"parser"
:
"mineru:http"
,
"mineru_endpoint"
:
endpoint
},
metadata
=
{
"parser"
:
"mineru:http"
,
"mineru_endpoint"
:
endpoint
,
"mineru_form_fields"
:
form_fields
,
},
)
)
return
docs
def _mineru_error_detail(response: requests.Response) -> str:
    """Build a short error description from a failed MinerU HTTP response.

    If the body is a JSON object carrying a truthy ``error`` field, the
    result is ``"task_id=<task_id> status=<status> error=<error>"`` using
    the object's ``task_id`` and ``status`` fields (``None`` when absent).
    In every other case the first 1000 characters of the raw body are
    returned.

    Args:
        response: The HTTP response whose body should be summarised.

    Returns:
        A string suitable for embedding in an exception message.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON at all; fall back to the raw text.
        return response.text[:1000]

    # A valid JSON body is not necessarily an object: a list, string, number
    # or null has no ``.get`` and must not crash the error-reporting path.
    if isinstance(payload, dict):
        error = payload.get("error")
        if error:
            task_id = payload.get("task_id")
            status = payload.get("status")
            return f"task_id={task_id} status={status} error={error}"

    return response.text[:1000]
def
extract_mineru_contents
(
payload
:
Any
)
->
list
[
str
]:
contents
:
list
[
str
]
=
[]
_collect_text_values
(
payload
,
contents
)
...
...
Please
register
or
sign in
to post a comment