Commit b9e14ffa b9e14ffa1378c453b090e158c910410e26d2eafa by 沈秋雨

Pass MinerU HTTP device field

1 parent 127df0d6
......@@ -3,6 +3,10 @@ WEKNORA_API_KEY=
WEKNORA_KB_ID=
WEKNORA_KB_NAME=ragas-eval-pilot
# Device passed to the MinerU HTTP parser. Use cpu, cuda, cuda:0, etc., to match
# the deployed MinerU backend.
MINERU_DEVICE=cpu
# Ragas generation and judge models. These are evaluation-side models, not the
# model configuration used by the WeKnora backend.
RAGAS_LLM_API_KEY=replace-me
......
......@@ -29,6 +29,8 @@ parsing:
output_dir: "data/parsed_docs/mineru_raw"
http_base_url: "http://172.23.184.9:8002"
http_parse_path: "/file_parse"
http_form_fields:
device: "${MINERU_DEVICE:-cpu}"
api_key: "mineru"
timeout_seconds: 600
fallback_to_local: false
......
......@@ -131,16 +131,23 @@ def parse_pdf_with_http(
endpoint = str(mineru_config.get("http_parse_path") or "/file_parse")
if not endpoint.startswith("/"):
endpoint = "/" + endpoint
form_fields = {
str(key): str(value)
for key, value in (mineru_config.get("http_form_fields") or {}).items()
if value not in {None, ""}
}
with target.open("rb") as file:
response = requests.post(
f"{base_url}{endpoint}",
files=[("files", (target.name, file, "application/pdf"))],
data=form_fields,
headers=headers,
timeout=int(mineru_config.get("timeout_seconds", 600)),
)
if response.status_code >= 400:
raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")
error_detail = _mineru_error_detail(response)
raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {error_detail}")
payload = response.json()
contents = extract_mineru_contents(payload)
......@@ -161,12 +168,29 @@ def parse_pdf_with_http(
source_file=target.name,
file_type="pdf",
content=content,
metadata={"parser": "mineru:http", "mineru_endpoint": endpoint},
metadata={
"parser": "mineru:http",
"mineru_endpoint": endpoint,
"mineru_form_fields": form_fields,
},
)
)
return docs
def _mineru_error_detail(response: "requests.Response") -> str:
    """Return a short description of a failed MinerU HTTP response.

    When the body is a JSON object with a truthy ``error`` field, the result
    is ``"task_id=<task_id> status=<status> error=<error>"`` built from that
    object (missing ``task_id``/``status`` render as ``None``). In every other
    case the first 1000 characters of the raw body text are returned.

    Args:
        response: The HTTP response received from the MinerU endpoint.

    Returns:
        A string suitable for embedding in an exception message.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON at all (e.g. an HTML error page from a proxy).
        return response.text[:1000]
    # Valid JSON is not necessarily an object: a list, string, number or null
    # has no ``.get`` and would otherwise raise AttributeError here, hiding
    # the real HTTP failure from the caller.
    if not isinstance(payload, dict):
        return response.text[:1000]
    error = payload.get("error")
    if not error:
        return response.text[:1000]
    task_id = payload.get("task_id")
    status = payload.get("status")
    return f"task_id={task_id} status={status} error={error}"
def extract_mineru_contents(payload: Any) -> list[str]:
contents: list[str] = []
_collect_text_values(payload, contents)
......