Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
weknora_ragas
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
599453fa
...
599453faf39d1936a4b7439c4d296d35f200c9e5
authored
2026-04-21 16:47:55 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Support MinerU HTTP file parse endpoint
1 parent
1e0c82cf
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
65 additions
and
13 deletions
configs/eval.yaml
src/weknora_eval/parsers/mineru.py
configs/eval.yaml
View file @
599453f
...
...
@@ -24,10 +24,11 @@ parsing:
xlsx_mode
:
"
row_text"
min_chars
:
80
mineru
:
mode
:
"
cli
"
mode
:
"
http
"
cli_bin
:
"
mineru"
output_dir
:
"
data/parsed_docs/mineru_raw"
http_base_url
:
"
http://172.23.184.9:8002"
http_parse_path
:
"
/file_parse"
api_key
:
"
mineru"
timeout_seconds
:
600
fallback_to_local
:
false
...
...
src/weknora_eval/parsers/mineru.py
View file @
599453f
...
...
@@ -128,12 +128,13 @@ def parse_pdf_with_http(
if
mineru_config
.
get
(
"api_key"
):
headers
[
"Authorization"
]
=
f
"Bearer {mineru_config['api_key']}"
# The checklist does not define a universal MinerU HTTP contract. This
# implementation expects a replaceable service that accepts a POST on the
# configured `http_parse_path` (default "/file_parse") and returns JSON whose
# text lives under keys such as {"markdown": "..."} or
# {"documents": [{"content": "..."}]}.
endpoint
=
str
(
mineru_config
.
get
(
"http_parse_path"
)
or
"/file_parse"
)
if
not
endpoint
.
startswith
(
"/"
):
endpoint
=
"/"
+
endpoint
with
target
.
open
(
"rb"
)
as
file
:
response
=
requests
.
post
(
f
"{base_url}
/parse
"
,
f
"{base_url}
{endpoint}
"
,
files
=
{
"file"
:
(
target
.
name
,
file
,
"application/pdf"
)},
headers
=
headers
,
timeout
=
int
(
mineru_config
.
get
(
"timeout_seconds"
,
600
)),
...
...
@@ -142,13 +143,13 @@ def parse_pdf_with_http(
raise
MinerUParseError
(
f
"MinerU HTTP failed with {response.status_code}: {response.text[:500]}"
)
payload
=
response
.
json
()
contents
:
list
[
str
]
=
[]
if
isinstance
(
payload
.
get
(
"documents"
),
list
)
:
contents
=
[
compact_text
(
item
.
get
(
"content"
))
for
item
in
payload
[
"documents"
]]
elif
payload
.
get
(
"markdown"
):
contents
=
[
compact_text
(
payload
[
"markdown"
])]
else
:
raise
MinerUParseError
(
"MinerU HTTP response must include `markdown` or `documents`"
)
contents
=
extract_mineru_contents
(
payload
)
if
not
contents
:
raw_path
=
_write_unrecognized_mineru_payload
(
target
,
payload
,
mineru_config
)
raise
MinerUParseError
(
"MinerU HTTP response did not include recognizable text content. "
f
"Saved raw response to {raw_path}"
)
docs
:
list
[
ParsedDocument
]
=
[]
for
index
,
content
in
enumerate
(
contents
,
start
=
1
):
...
...
@@ -160,7 +161,57 @@ def parse_pdf_with_http(
source_file
=
target
.
name
,
file_type
=
"pdf"
,
content
=
content
,
metadata
=
{
"parser"
:
"mineru:http"
},
metadata
=
{
"parser"
:
"mineru:http"
,
"mineru_endpoint"
:
endpoint
},
)
)
return
docs
def extract_mineru_contents(payload: Any) -> list[str]:
    """Return the distinct text snippets found in a MinerU HTTP payload.

    The payload is walked by ``_collect_text_values``; the snippets it gathers
    are de-duplicated while keeping the order in which they were first seen.

    Args:
        payload: Decoded JSON response body (any JSON-compatible value).

    Returns:
        Unique, non-empty text snippets in first-seen order. The list is empty
        when no recognizable text was found.
    """
    gathered: list[str] = []
    _collect_text_values(payload, gathered)

    unique_snippets: list[str] = []
    already_seen: set[str] = set()
    for snippet in gathered:
        # Skip empty strings and anything that was already emitted once.
        if not snippet or snippet in already_seen:
            continue
        already_seen.add(snippet)
        unique_snippets.append(snippet)
    return unique_snippets
def _collect_text_values(value: Any, contents: list[str]) -> None:
    """Recursively append text found in ``value`` to ``contents``.

    Strings are normalized with ``compact_text`` and kept only when the
    normalized text has at least 20 characters. Lists are walked element by
    element. Dicts are inspected only under a fixed set of known keys; any
    other value type is ignored.

    Args:
        value: A node of the decoded JSON payload.
        contents: Accumulator that is mutated in place.
    """
    if isinstance(value, dict):
        # Text-bearing keys are visited first, then container keys, so the
        # accumulated order matches that priority.
        # NOTE(review): values stored under any other key (for example a
        # mapping keyed by file name) are never traversed -- confirm this
        # against the response schema of the target service.
        known_keys = (
            "markdown",
            "md",
            "content",
            "text",
            "plain_text",
            "page_content",
            "document",
            "documents",
            "pages",
            "chunks",
            "data",
            "result",
            "results",
        )
        for field in known_keys:
            if field in value:
                _collect_text_values(value[field], contents)
    elif isinstance(value, list):
        for element in value:
            _collect_text_values(element, contents)
    elif isinstance(value, str):
        normalized = compact_text(value)
        # Short strings are dropped; only snippets of 20+ characters are kept.
        if len(normalized) >= 20:
            contents.append(normalized)
def _write_unrecognized_mineru_payload(
    pdf_path: Path,
    payload: dict[str, Any],
    mineru_config: dict[str, Any],
) -> Path:
    """Save an unrecognized MinerU HTTP response so it can be inspected later.

    The payload is written with ``write_json`` to
    ``<output_dir>/<pdf stem>.response.json``. The output directory is created
    if it does not exist.

    Args:
        pdf_path: Path of the PDF that was sent for parsing; only its stem is
            used to name the dump file.
        payload: Decoded response body to persist.
        mineru_config: MinerU settings; ``output_dir`` selects the dump folder.

    Returns:
        Path of the file the payload was written to.
    """
    # Use `or` instead of a `.get` default: a key that is present but null or
    # empty (e.g. `output_dir:` in YAML) would otherwise reach `Path(None)`
    # and raise TypeError, hiding the parse error the caller is about to
    # raise. This matches how `http_parse_path` is resolved.
    output_root = Path(mineru_config.get("output_dir") or "data/parsed_docs/mineru_raw")
    output_root.mkdir(parents=True, exist_ok=True)
    raw_path = output_root / f"{pdf_path.stem}.response.json"
    write_json(raw_path, payload)
    return raw_path
...
...
Please
register
or
sign in
to post a comment