Commit b9e14ffa b9e14ffa1378c453b090e158c910410e26d2eafa by 沈秋雨

Pass MinerU HTTP device field

1 parent 127df0d6
...@@ -3,6 +3,10 @@ WEKNORA_API_KEY= ...@@ -3,6 +3,10 @@ WEKNORA_API_KEY=
3 WEKNORA_KB_ID= 3 WEKNORA_KB_ID=
4 WEKNORA_KB_NAME=ragas-eval-pilot 4 WEKNORA_KB_NAME=ragas-eval-pilot
5 5
6 # MinerU HTTP parser. Use cpu, cuda, cuda:0, etc. according to the deployed
7 # MinerU backend.
8 MINERU_DEVICE=cpu
9
6 # Ragas generation and judge models. These are evaluation-side models, not the 10 # Ragas generation and judge models. These are evaluation-side models, not the
7 # model configuration used by the WeKnora backend. 11 # model configuration used by the WeKnora backend.
8 RAGAS_LLM_API_KEY=replace-me 12 RAGAS_LLM_API_KEY=replace-me
......
...@@ -29,6 +29,8 @@ parsing: ...@@ -29,6 +29,8 @@ parsing:
29 output_dir: "data/parsed_docs/mineru_raw" 29 output_dir: "data/parsed_docs/mineru_raw"
30 http_base_url: "http://172.23.184.9:8002" 30 http_base_url: "http://172.23.184.9:8002"
31 http_parse_path: "/file_parse" 31 http_parse_path: "/file_parse"
32 http_form_fields:
33 device: "${MINERU_DEVICE:-cpu}"
32 api_key: "mineru" 34 api_key: "mineru"
33 timeout_seconds: 600 35 timeout_seconds: 600
34 fallback_to_local: false 36 fallback_to_local: false
......
...@@ -131,16 +131,23 @@ def parse_pdf_with_http( ...@@ -131,16 +131,23 @@ def parse_pdf_with_http(
131 endpoint = str(mineru_config.get("http_parse_path") or "/file_parse") 131 endpoint = str(mineru_config.get("http_parse_path") or "/file_parse")
132 if not endpoint.startswith("/"): 132 if not endpoint.startswith("/"):
133 endpoint = "/" + endpoint 133 endpoint = "/" + endpoint
134 form_fields = {
135 str(key): str(value)
136 for key, value in (mineru_config.get("http_form_fields") or {}).items()
137 if value not in {None, ""}
138 }
134 139
135 with target.open("rb") as file: 140 with target.open("rb") as file:
136 response = requests.post( 141 response = requests.post(
137 f"{base_url}{endpoint}", 142 f"{base_url}{endpoint}",
138 files=[("files", (target.name, file, "application/pdf"))], 143 files=[("files", (target.name, file, "application/pdf"))],
144 data=form_fields,
139 headers=headers, 145 headers=headers,
140 timeout=int(mineru_config.get("timeout_seconds", 600)), 146 timeout=int(mineru_config.get("timeout_seconds", 600)),
141 ) 147 )
142 if response.status_code >= 400: 148 if response.status_code >= 400:
143 raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") 149 error_detail = _mineru_error_detail(response)
150 raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {error_detail}")
144 151
145 payload = response.json() 152 payload = response.json()
146 contents = extract_mineru_contents(payload) 153 contents = extract_mineru_contents(payload)
...@@ -161,12 +168,29 @@ def parse_pdf_with_http( ...@@ -161,12 +168,29 @@ def parse_pdf_with_http(
161 source_file=target.name, 168 source_file=target.name,
162 file_type="pdf", 169 file_type="pdf",
163 content=content, 170 content=content,
164 metadata={"parser": "mineru:http", "mineru_endpoint": endpoint}, 171 metadata={
172 "parser": "mineru:http",
173 "mineru_endpoint": endpoint,
174 "mineru_form_fields": form_fields,
175 },
165 ) 176 )
166 ) 177 )
167 return docs 178 return docs
168 179
169 180
def _mineru_error_detail(response: requests.Response) -> str:
    """Build a short description of a failed MinerU HTTP response.

    When the body is a JSON object with a truthy ``error`` field, the result
    is ``"task_id=<...> status=<...> error=<...>"`` built from that object's
    ``task_id``, ``status`` and ``error`` keys. In every other case (body is
    not JSON, JSON is not an object, or there is no ``error`` value) the first
    1000 characters of the raw response text are returned.

    Args:
        response: The HTTP response received from the MinerU endpoint.

    Returns:
        A string suitable for embedding in an error message.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not valid JSON; fall back to the raw text.
        return response.text[:1000]

    # Valid JSON is not necessarily an object: a list, string, number or null
    # has no ``.get``. Treat those like "no structured error" instead of
    # letting an AttributeError mask the real HTTP failure.
    error = payload.get("error") if isinstance(payload, dict) else None
    if not error:
        return response.text[:1000]

    task_id = payload.get("task_id")
    status = payload.get("status")
    return f"task_id={task_id} status={status} error={error}"
192
193
170 def extract_mineru_contents(payload: Any) -> list[str]: 194 def extract_mineru_contents(payload: Any) -> list[str]:
171 contents: list[str] = [] 195 contents: list[str] = []
172 _collect_text_values(payload, contents) 196 _collect_text_values(payload, contents)
......