Commit 599453fa 599453faf39d1936a4b7439c4d296d35f200c9e5 by 沈秋雨

Support MinerU HTTP file parse endpoint

1 parent 1e0c82cf
......@@ -24,10 +24,11 @@ parsing:
xlsx_mode: "row_text"
min_chars: 80
mineru:
mode: "cli"
mode: "http"
cli_bin: "mineru"
output_dir: "data/parsed_docs/mineru_raw"
http_base_url: "http://172.23.184.9:8002"
http_parse_path: "/file_parse"
api_key: "mineru"
timeout_seconds: 600
fallback_to_local: false
......
......@@ -128,12 +128,13 @@ def parse_pdf_with_http(
if mineru_config.get("api_key"):
headers["Authorization"] = f"Bearer {mineru_config['api_key']}"
# The checklist does not define a universal MinerU HTTP contract. This
# implementation expects a replaceable service exposing POST /parse and
# returning {"markdown": "..."} or {"documents": [{"content": "..."}]}.
endpoint = str(mineru_config.get("http_parse_path") or "/file_parse")
if not endpoint.startswith("/"):
endpoint = "/" + endpoint
with target.open("rb") as file:
response = requests.post(
f"{base_url}/parse",
f"{base_url}{endpoint}",
files={"file": (target.name, file, "application/pdf")},
headers=headers,
timeout=int(mineru_config.get("timeout_seconds", 600)),
......@@ -142,13 +143,13 @@ def parse_pdf_with_http(
raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")
payload = response.json()
contents: list[str] = []
if isinstance(payload.get("documents"), list):
contents = [compact_text(item.get("content")) for item in payload["documents"]]
elif payload.get("markdown"):
contents = [compact_text(payload["markdown"])]
else:
raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`")
contents = extract_mineru_contents(payload)
if not contents:
raw_path = _write_unrecognized_mineru_payload(target, payload, mineru_config)
raise MinerUParseError(
"MinerU HTTP response did not include recognizable text content. "
f"Saved raw response to {raw_path}"
)
docs: list[ParsedDocument] = []
for index, content in enumerate(contents, start=1):
......@@ -160,7 +161,57 @@ def parse_pdf_with_http(
source_file=target.name,
file_type="pdf",
content=content,
metadata={"parser": "mineru:http"},
metadata={"parser": "mineru:http", "mineru_endpoint": endpoint},
)
)
return docs
def extract_mineru_contents(payload: Any) -> list[str]:
    """Return the distinct text fragments found in a MinerU HTTP response.

    The payload is walked by ``_collect_text_values``; whatever it gathers is
    returned in first-seen order with exact duplicates and empty strings
    removed.

    Args:
        payload: Decoded JSON body of the MinerU response (any JSON shape).

    Returns:
        The unique, non-empty text fragments, in the order they were found.
    """
    gathered: list[str] = []
    _collect_text_values(payload, gathered)

    unique: list[str] = []
    seen: set[str] = set()
    for fragment in gathered:
        # Drop empties and anything already emitted, keeping the first hit.
        if not fragment or fragment in seen:
            continue
        seen.add(fragment)
        unique.append(fragment)
    return unique
def _collect_text_values(value: Any, contents: list[str]) -> None:
    """Recursively append recognizable text from a MinerU payload to *contents*.

    Strings are normalised with ``compact_text`` and kept only when the result
    is at least 20 characters long. Lists are walked element by element.
    Mappings are searched under a fixed set of text keys first and container
    keys second, so the order of collected fragments is deterministic.

    A mapping that carries none of the recognised keys is treated as an index
    keyed by arbitrary names and only its nested mappings are searched. This
    covers responses shaped like
    ``{"results": {"<file name>": {"md_content": "..."}}}``.
    NOTE(review): that is believed to be the shape returned by MinerU's
    ``/file_parse`` endpoint -- confirm against the deployed server version.

    Args:
        value: Any node of the decoded JSON response.
        contents: Output list; matching text is appended in place.
    """
    if isinstance(value, str):
        text = compact_text(value)
        # Very short strings are dropped; only longer text is treated as content.
        if len(text) >= 20:
            contents.append(text)
        return
    if isinstance(value, list):
        for item in value:
            _collect_text_values(item, contents)
        return
    if not isinstance(value, dict):
        return

    text_keys = (
        "markdown",
        "md",
        "md_content",
        "content",
        "text",
        "plain_text",
        "page_content",
        "document",
    )
    container_keys = ("documents", "pages", "chunks", "data", "result", "results")

    matched = False
    for key in text_keys + container_keys:
        if key in value:
            matched = True
            _collect_text_values(value[key], contents)
    if matched:
        return

    # No recognised key: assume a name-keyed index (e.g. one entry per file).
    # Only nested mappings are followed so that stray strings or lists under
    # unknown keys are not mistaken for document text.
    for nested in value.values():
        if isinstance(nested, dict):
            _collect_text_values(nested, contents)
def _write_unrecognized_mineru_payload(
    pdf_path: Path,
    payload: dict[str, Any],
    mineru_config: dict[str, Any],
) -> Path:
    """Save a MinerU response that yielded no text so it can be inspected.

    The payload is written with the file's ``write_json`` helper to
    ``<output_dir>/<pdf stem>.response.json``; the directory is created when
    missing.

    Args:
        pdf_path: Path of the PDF that was sent to MinerU; its stem names the
            dump file.
        payload: Decoded JSON body of the MinerU response.
        mineru_config: MinerU settings; ``output_dir`` selects the dump
            directory.

    Returns:
        Path of the written JSON file.
    """
    # `or` rather than a `.get` default: a key that is present but null/empty
    # in the config must still fall back, otherwise `Path(None)` raises
    # TypeError here and hides the parse error the caller is about to raise.
    output_root = Path(mineru_config.get("output_dir") or "data/parsed_docs/mineru_raw")
    output_root.mkdir(parents=True, exist_ok=True)
    raw_path = output_root / f"{pdf_path.stem}.response.json"
    write_json(raw_path, payload)
    return raw_path
......