Commit 599453fa 599453faf39d1936a4b7439c4d296d35f200c9e5 by 沈秋雨

Support MinerU HTTP file parse endpoint

1 parent 1e0c82cf
...@@ -24,10 +24,11 @@ parsing: ...@@ -24,10 +24,11 @@ parsing:
24 xlsx_mode: "row_text" 24 xlsx_mode: "row_text"
25 min_chars: 80 25 min_chars: 80
26 mineru: 26 mineru:
27 mode: "cli" 27 mode: "http"
28 cli_bin: "mineru" 28 cli_bin: "mineru"
29 output_dir: "data/parsed_docs/mineru_raw" 29 output_dir: "data/parsed_docs/mineru_raw"
30 http_base_url: "http://172.23.184.9:8002" 30 http_base_url: "http://172.23.184.9:8002"
31 http_parse_path: "/file_parse"
31 api_key: "mineru" 32 api_key: "mineru"
32 timeout_seconds: 600 33 timeout_seconds: 600
33 fallback_to_local: false 34 fallback_to_local: false
......
...@@ -128,12 +128,13 @@ def parse_pdf_with_http( ...@@ -128,12 +128,13 @@ def parse_pdf_with_http(
128 if mineru_config.get("api_key"): 128 if mineru_config.get("api_key"):
129 headers["Authorization"] = f"Bearer {mineru_config['api_key']}" 129 headers["Authorization"] = f"Bearer {mineru_config['api_key']}"
130 130
131 # The checklist does not define a universal MinerU HTTP contract. This 131 endpoint = str(mineru_config.get("http_parse_path") or "/file_parse")
132 # implementation expects a replaceable service exposing POST /parse and 132 if not endpoint.startswith("/"):
133 # returning {"markdown": "..."} or {"documents": [{"content": "..."}]}. 133 endpoint = "/" + endpoint
134
134 with target.open("rb") as file: 135 with target.open("rb") as file:
135 response = requests.post( 136 response = requests.post(
136 f"{base_url}/parse", 137 f"{base_url}{endpoint}",
137 files={"file": (target.name, file, "application/pdf")}, 138 files={"file": (target.name, file, "application/pdf")},
138 headers=headers, 139 headers=headers,
139 timeout=int(mineru_config.get("timeout_seconds", 600)), 140 timeout=int(mineru_config.get("timeout_seconds", 600)),
...@@ -142,13 +143,13 @@ def parse_pdf_with_http( ...@@ -142,13 +143,13 @@ def parse_pdf_with_http(
142 raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") 143 raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")
143 144
144 payload = response.json() 145 payload = response.json()
145 contents: list[str] = [] 146 contents = extract_mineru_contents(payload)
146 if isinstance(payload.get("documents"), list): 147 if not contents:
147 contents = [compact_text(item.get("content")) for item in payload["documents"]] 148 raw_path = _write_unrecognized_mineru_payload(target, payload, mineru_config)
148 elif payload.get("markdown"): 149 raise MinerUParseError(
149 contents = [compact_text(payload["markdown"])] 150 "MinerU HTTP response did not include recognizable text content. "
150 else: 151 f"Saved raw response to {raw_path}"
151 raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`") 152 )
152 153
153 docs: list[ParsedDocument] = [] 154 docs: list[ParsedDocument] = []
154 for index, content in enumerate(contents, start=1): 155 for index, content in enumerate(contents, start=1):
...@@ -160,7 +161,57 @@ def parse_pdf_with_http( ...@@ -160,7 +161,57 @@ def parse_pdf_with_http(
160 source_file=target.name, 161 source_file=target.name,
161 file_type="pdf", 162 file_type="pdf",
162 content=content, 163 content=content,
163 metadata={"parser": "mineru:http"}, 164 metadata={"parser": "mineru:http", "mineru_endpoint": endpoint},
164 ) 165 )
165 ) 166 )
166 return docs 167 return docs
168
169
def extract_mineru_contents(payload: Any) -> list[str]:
    """Return the distinct text fragments found in a MinerU response payload.

    The payload is walked by ``_collect_text_values``; the fragments it
    gathers are returned in first-seen order with exact duplicates and empty
    strings removed.
    """
    gathered: list[str] = []
    _collect_text_values(payload, gathered)

    unique: list[str] = []
    seen: set[str] = set()
    for fragment in gathered:
        # Skip blanks and anything already emitted, keeping first-seen order.
        if not fragment or fragment in seen:
            continue
        seen.add(fragment)
        unique.append(fragment)
    return unique
174
175
# Keys whose values are expected to carry document text directly.
_MINERU_TEXT_KEYS = (
    "markdown",
    "md",
    "md_content",
    "content",
    "text",
    "plain_text",
    "page_content",
    "document",
)

# Keys whose values wrap further structure (lists or nested mappings).
_MINERU_CONTAINER_KEYS = ("documents", "pages", "chunks", "data", "result", "results")


def _collect_text_values(value: Any, contents: list[str]) -> None:
    """Recursively append text fragments found in ``value`` to ``contents``.

    * A string is normalized with ``compact_text`` and appended when the
      result is at least 20 characters long.
    * A list is walked element by element.
    * A dict is searched under the known text keys and then under the known
      wrapper keys. When a wrapper's value is itself a dict, its dict/list
      children stored under unrecognized keys are walked as well.
    * Any other type is ignored.

    NOTE(review): the ``md_content`` key and the extra descent exist because
    MinerU's FastAPI ``/file_parse`` endpoint is understood to respond with
    ``{"results": {<file name>: {"md_content": "..."}}}`` - confirm against
    the deployed service version.

    Args:
        value: Any JSON-decoded value (or sub-value) from the response.
        contents: Output list, mutated in place. Duplicates are not removed
            here.
    """
    if isinstance(value, str):
        text = compact_text(value)
        # Strings shorter than 20 characters after compaction are dropped
        # (presumably ids/status fields rather than document text).
        if len(text) >= 20:
            contents.append(text)
        return

    if isinstance(value, list):
        for item in value:
            _collect_text_values(item, contents)
        return

    if not isinstance(value, dict):
        return

    for key in _MINERU_TEXT_KEYS:
        if key in value:
            _collect_text_values(value[key], contents)

    for key in _MINERU_CONTAINER_KEYS:
        if key not in value:
            continue
        nested = value[key]
        _collect_text_values(nested, contents)
        if not isinstance(nested, dict):
            continue
        # The wrapper may be a mapping keyed by arbitrary names (e.g. one
        # entry per uploaded file), which the key lookups above cannot see.
        for child_key, child in nested.items():
            if child_key in _MINERU_TEXT_KEYS or child_key in _MINERU_CONTAINER_KEYS:
                continue  # already visited by the recursive call above
            # Only descend into structure; bare strings under unknown keys
            # are not treated as document text.
            if isinstance(child, (dict, list)):
                _collect_text_values(child, contents)
206
207
def _write_unrecognized_mineru_payload(
    pdf_path: Path,
    payload: dict[str, Any],
    mineru_config: dict[str, Any],
) -> Path:
    """Dump an unrecognized MinerU response next to the other raw outputs.

    Args:
        pdf_path: Source PDF; its stem names the dump file.
        payload: Decoded response body, passed to ``write_json``.
        mineru_config: MinerU settings; ``output_dir`` selects the target
            directory.

    Returns:
        Path of the written ``<stem>.response.json`` file.
    """
    # `or` (not a .get default) so a null/empty `output_dir` also falls back
    # instead of raising from Path(None) or writing into the working
    # directory while an error is being reported.
    output_root = Path(mineru_config.get("output_dir") or "data/parsed_docs/mineru_raw")
    output_root.mkdir(parents=True, exist_ok=True)
    raw_path = output_root / f"{pdf_path.stem}.response.json"
    write_json(raw_path, payload)
    return raw_path
......