Support MinerU HTTP file parse endpoint
Showing 2 changed files with 65 additions and 13 deletions
| ... | @@ -24,10 +24,11 @@ parsing: | ... | @@ -24,10 +24,11 @@ parsing: |
| 24 | xlsx_mode: "row_text" | 24 | xlsx_mode: "row_text" |
| 25 | min_chars: 80 | 25 | min_chars: 80 |
| 26 | mineru: | 26 | mineru: |
| 27 | mode: "cli" | 27 | mode: "http" |
| 28 | cli_bin: "mineru" | 28 | cli_bin: "mineru" |
| 29 | output_dir: "data/parsed_docs/mineru_raw" | 29 | output_dir: "data/parsed_docs/mineru_raw" |
| 30 | http_base_url: "http://172.23.184.9:8002" | 30 | http_base_url: "http://172.23.184.9:8002" |
| 31 | http_parse_path: "/file_parse" | ||
| 31 | api_key: "mineru" | 32 | api_key: "mineru" |
| 32 | timeout_seconds: 600 | 33 | timeout_seconds: 600 |
| 33 | fallback_to_local: false | 34 | fallback_to_local: false | ... | ... |
| ... | @@ -128,12 +128,13 @@ def parse_pdf_with_http( | ... | @@ -128,12 +128,13 @@ def parse_pdf_with_http( |
| 128 | if mineru_config.get("api_key"): | 128 | if mineru_config.get("api_key"): |
| 129 | headers["Authorization"] = f"Bearer {mineru_config['api_key']}" | 129 | headers["Authorization"] = f"Bearer {mineru_config['api_key']}" |
| 130 | 130 | ||
| 131 | # The checklist does not define a universal MinerU HTTP contract. This | 131 | endpoint = str(mineru_config.get("http_parse_path") or "/file_parse") |
| 132 | # implementation expects a replaceable service exposing POST /parse and | 132 | if not endpoint.startswith("/"): |
| 133 | # returning {"markdown": "..."} or {"documents": [{"content": "..."}]}. | 133 | endpoint = "/" + endpoint |
| 134 | |||
| 134 | with target.open("rb") as file: | 135 | with target.open("rb") as file: |
| 135 | response = requests.post( | 136 | response = requests.post( |
| 136 | f"{base_url}/parse", | 137 | f"{base_url}{endpoint}", |
| 137 | files={"file": (target.name, file, "application/pdf")}, | 138 | files={"file": (target.name, file, "application/pdf")}, |
| 138 | headers=headers, | 139 | headers=headers, |
| 139 | timeout=int(mineru_config.get("timeout_seconds", 600)), | 140 | timeout=int(mineru_config.get("timeout_seconds", 600)), |
| ... | @@ -142,13 +143,13 @@ def parse_pdf_with_http( | ... | @@ -142,13 +143,13 @@ def parse_pdf_with_http( |
| 142 | raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") | 143 | raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") |
| 143 | 144 | ||
| 144 | payload = response.json() | 145 | payload = response.json() |
| 145 | contents: list[str] = [] | 146 | contents = extract_mineru_contents(payload) |
| 146 | if isinstance(payload.get("documents"), list): | 147 | if not contents: |
| 147 | contents = [compact_text(item.get("content")) for item in payload["documents"]] | 148 | raw_path = _write_unrecognized_mineru_payload(target, payload, mineru_config) |
| 148 | elif payload.get("markdown"): | 149 | raise MinerUParseError( |
| 149 | contents = [compact_text(payload["markdown"])] | 150 | "MinerU HTTP response did not include recognizable text content. " |
| 150 | else: | 151 | f"Saved raw response to {raw_path}" |
| 151 | raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`") | 152 | ) |
| 152 | 153 | ||
| 153 | docs: list[ParsedDocument] = [] | 154 | docs: list[ParsedDocument] = [] |
| 154 | for index, content in enumerate(contents, start=1): | 155 | for index, content in enumerate(contents, start=1): |
| ... | @@ -160,7 +161,57 @@ def parse_pdf_with_http( | ... | @@ -160,7 +161,57 @@ def parse_pdf_with_http( |
| 160 | source_file=target.name, | 161 | source_file=target.name, |
| 161 | file_type="pdf", | 162 | file_type="pdf", |
| 162 | content=content, | 163 | content=content, |
| 163 | metadata={"parser": "mineru:http"}, | 164 | metadata={"parser": "mineru:http", "mineru_endpoint": endpoint}, |
| 164 | ) | 165 | ) |
| 165 | ) | 166 | ) |
| 166 | return docs | 167 | return docs |
| 168 | |||
| 169 | |||
| 170 | def extract_mineru_contents(payload: Any) -> list[str]: | ||
| 171 | contents: list[str] = [] | ||
| 172 | _collect_text_values(payload, contents) | ||
| 173 | return [content for content in dict.fromkeys(contents) if content] | ||
| 174 | |||
| 175 | |||
| 176 | def _collect_text_values(value: Any, contents: list[str]) -> None: | ||
| 177 | if isinstance(value, str): | ||
| 178 | text = compact_text(value) | ||
| 179 | if len(text) >= 20: | ||
| 180 | contents.append(text) | ||
| 181 | return | ||
| 182 | |||
| 183 | if isinstance(value, list): | ||
| 184 | for item in value: | ||
| 185 | _collect_text_values(item, contents) | ||
| 186 | return | ||
| 187 | |||
| 188 | if not isinstance(value, dict): | ||
| 189 | return | ||
| 190 | |||
| 191 | for key in ( | ||
| 192 | "markdown", | ||
| 193 | "md", | ||
| 194 | "content", | ||
| 195 | "text", | ||
| 196 | "plain_text", | ||
| 197 | "page_content", | ||
| 198 | "document", | ||
| 199 | ): | ||
| 200 | if key in value: | ||
| 201 | _collect_text_values(value[key], contents) | ||
| 202 | |||
| 203 | for key in ("documents", "pages", "chunks", "data", "result", "results"): | ||
| 204 | if key in value: | ||
| 205 | _collect_text_values(value[key], contents) | ||
| 206 | |||
| 207 | |||
| 208 | def _write_unrecognized_mineru_payload( | ||
| 209 | pdf_path: Path, | ||
| 210 | payload: dict[str, Any], | ||
| 211 | mineru_config: dict[str, Any], | ||
| 212 | ) -> Path: | ||
| 213 | output_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw")) | ||
| 214 | output_root.mkdir(parents=True, exist_ok=True) | ||
| 215 | raw_path = output_root / f"{pdf_path.stem}.response.json" | ||
| 216 | write_json(raw_path, payload) | ||
| 217 | return raw_path | ... | ... |
-
Please register or sign in to post a comment