Pass MinerU HTTP device field
Showing 3 changed files with 32 additions and 2 deletions
| ... | @@ -3,6 +3,10 @@ WEKNORA_API_KEY= | ... | @@ -3,6 +3,10 @@ WEKNORA_API_KEY= |
| 3 | WEKNORA_KB_ID= | 3 | WEKNORA_KB_ID= |
| 4 | WEKNORA_KB_NAME=ragas-eval-pilot | 4 | WEKNORA_KB_NAME=ragas-eval-pilot |
| 5 | 5 | ||
| 6 | # MinerU HTTP parser. Use cpu, cuda, cuda:0, etc. according to the deployed | ||
| 7 | # MinerU backend. | ||
| 8 | MINERU_DEVICE=cpu | ||
| 9 | |||
| 6 | # Ragas generation and judge models. These are evaluation-side models, not the | 10 | # Ragas generation and judge models. These are evaluation-side models, not the |
| 7 | # model configuration used by the WeKnora backend. | 11 | # model configuration used by the WeKnora backend. |
| 8 | RAGAS_LLM_API_KEY=replace-me | 12 | RAGAS_LLM_API_KEY=replace-me | ... | ... |
| ... | @@ -29,6 +29,8 @@ parsing: | ... | @@ -29,6 +29,8 @@ parsing: |
| 29 | output_dir: "data/parsed_docs/mineru_raw" | 29 | output_dir: "data/parsed_docs/mineru_raw" |
| 30 | http_base_url: "http://172.23.184.9:8002" | 30 | http_base_url: "http://172.23.184.9:8002" |
| 31 | http_parse_path: "/file_parse" | 31 | http_parse_path: "/file_parse" |
| 32 | http_form_fields: | ||
| 33 | device: "${MINERU_DEVICE:-cpu}" | ||
| 32 | api_key: "mineru" | 34 | api_key: "mineru" |
| 33 | timeout_seconds: 600 | 35 | timeout_seconds: 600 |
| 34 | fallback_to_local: false | 36 | fallback_to_local: false | ... | ... |
| ... | @@ -131,16 +131,23 @@ def parse_pdf_with_http( | ... | @@ -131,16 +131,23 @@ def parse_pdf_with_http( |
| 131 | endpoint = str(mineru_config.get("http_parse_path") or "/file_parse") | 131 | endpoint = str(mineru_config.get("http_parse_path") or "/file_parse") |
| 132 | if not endpoint.startswith("/"): | 132 | if not endpoint.startswith("/"): |
| 133 | endpoint = "/" + endpoint | 133 | endpoint = "/" + endpoint |
| 134 | form_fields = { | ||
| 135 | str(key): str(value) | ||
| 136 | for key, value in (mineru_config.get("http_form_fields") or {}).items() | ||
| 137 | if value not in {None, ""} | ||
| 138 | } | ||
| 134 | 139 | ||
| 135 | with target.open("rb") as file: | 140 | with target.open("rb") as file: |
| 136 | response = requests.post( | 141 | response = requests.post( |
| 137 | f"{base_url}{endpoint}", | 142 | f"{base_url}{endpoint}", |
| 138 | files=[("files", (target.name, file, "application/pdf"))], | 143 | files=[("files", (target.name, file, "application/pdf"))], |
| 144 | data=form_fields, | ||
| 139 | headers=headers, | 145 | headers=headers, |
| 140 | timeout=int(mineru_config.get("timeout_seconds", 600)), | 146 | timeout=int(mineru_config.get("timeout_seconds", 600)), |
| 141 | ) | 147 | ) |
| 142 | if response.status_code >= 400: | 148 | if response.status_code >= 400: |
| 143 | raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") | 149 | error_detail = _mineru_error_detail(response) |
| 150 | raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {error_detail}") | ||
| 144 | 151 | ||
| 145 | payload = response.json() | 152 | payload = response.json() |
| 146 | contents = extract_mineru_contents(payload) | 153 | contents = extract_mineru_contents(payload) |
| ... | @@ -161,12 +168,29 @@ def parse_pdf_with_http( | ... | @@ -161,12 +168,29 @@ def parse_pdf_with_http( |
| 161 | source_file=target.name, | 168 | source_file=target.name, |
| 162 | file_type="pdf", | 169 | file_type="pdf", |
| 163 | content=content, | 170 | content=content, |
| 164 | metadata={"parser": "mineru:http", "mineru_endpoint": endpoint}, | 171 | metadata={ |
| 172 | "parser": "mineru:http", | ||
| 173 | "mineru_endpoint": endpoint, | ||
| 174 | "mineru_form_fields": form_fields, | ||
| 175 | }, | ||
| 165 | ) | 176 | ) |
| 166 | ) | 177 | ) |
| 167 | return docs | 178 | return docs |
| 168 | 179 | ||
| 169 | 180 | ||
def _mineru_error_detail(response: "requests.Response") -> str:
    """Build a short, human-readable description of a failed MinerU response.

    When the body decodes to a JSON object carrying a truthy ``error`` field,
    the result is ``task_id=... status=... error=...`` built from that object.
    In every other case (body is not JSON, JSON is not an object, or there is
    no ``error`` field) the first 1000 characters of the raw body are returned.

    Args:
        response: The HTTP response received from the MinerU endpoint.

    Returns:
        The detail string to embed in the error raised by the caller.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON at all (e.g. an HTML error page from a proxy).
        return response.text[:1000]

    # Valid JSON is not necessarily an object: a bare list, string, number or
    # null has no ``.get`` and would raise AttributeError here, replacing the
    # real HTTP failure with an unrelated traceback.
    if isinstance(payload, dict):
        error = payload.get("error")
        if error:
            task_id = payload.get("task_id")
            status = payload.get("status")
            return f"task_id={task_id} status={status} error={error}"

    return response.text[:1000]
| 192 | |||
| 193 | |||
| 170 | def extract_mineru_contents(payload: Any) -> list[str]: | 194 | def extract_mineru_contents(payload: Any) -> list[str]: |
| 171 | contents: list[str] = [] | 195 | contents: list[str] = [] |
| 172 | _collect_text_values(payload, contents) | 196 | _collect_text_values(payload, contents) | ... | ... |
-
Please register or sign in to post a comment