Commit 66bac6d5 66bac6d5362a372f5bcc3627eb1a33c147795bbc by 沈秋雨

Improve ingestion status diagnostics

1 parent 56b1b00a
from __future__ import annotations
from collections import Counter
import sys
import _bootstrap # noqa: F401
......@@ -18,12 +19,37 @@ def main() -> int:
result = client.wait_ingestion_completed(knowledge_ids=knowledge_ids)
knowledge = client.list_knowledge()
write_jsonl("data/exported/knowledge.jsonl", knowledge)
target_knowledge = [
row for row in knowledge if not knowledge_ids or row.get("id") in knowledge_ids
]
print(
"Ingestion status: "
f"completed={len(result['completed'])} failed={len(result['failed'])} "
f"pending={len(result['pending'])}"
)
print(
"Status distribution: "
f"parse_status={dict(Counter(str(row.get('parse_status')) for row in target_knowledge))} "
f"enable_status={dict(Counter(str(row.get('enable_status')) for row in target_knowledge))}"
)
if result["pending"]:
print("Pending samples:")
for row in result["pending"][:5]:
print(
"- "
f"id={row.get('id')} title={row.get('title') or row.get('file_name')} "
f"parse_status={row.get('parse_status')} enable_status={row.get('enable_status')} "
f"error={row.get('error_message') or ''}"
)
if result["failed"]:
print("Failed samples:")
for row in result["failed"][:10]:
print(
"- "
f"id={row.get('id')} title={row.get('title') or row.get('file_name')} "
f"error={row.get('error_message') or ''}"
)
return 1 if result["failed"] or result["pending"] else 0
......
......@@ -2,6 +2,7 @@ from __future__ import annotations
import logging
import time
from collections import Counter
from pathlib import Path
from typing import Any
from urllib.parse import urljoin
......@@ -90,7 +91,7 @@ class WeKnoraClient:
completed = [
row
for row in rows
if row.get("parse_status") == "completed" and row.get("enable_status") == "enabled"
if self._is_ingestion_completed(row)
]
failed = [row for row in rows if row.get("parse_status") == "failed"]
......@@ -100,7 +101,13 @@ class WeKnoraClient:
return {"completed": completed, "failed": [], "pending": []}
pending = [row for row in rows if row not in completed]
logger.info("Waiting for ingestion: completed=%s pending=%s", len(completed), len(pending))
logger.info(
"Waiting for ingestion: completed=%s pending=%s parse_status=%s enable_status=%s",
len(completed),
len(pending),
dict(Counter(str(row.get("parse_status")) for row in rows)),
dict(Counter(str(row.get("enable_status")) for row in rows)),
)
time.sleep(poll_interval_seconds)
rows = self.list_knowledge()
......@@ -109,7 +116,7 @@ class WeKnoraClient:
completed = [
row
for row in rows
if row.get("parse_status") == "completed" and row.get("enable_status") == "enabled"
if self._is_ingestion_completed(row)
]
failed = [row for row in rows if row.get("parse_status") == "failed"]
pending = [row for row in rows if row not in completed and row not in failed]
......@@ -118,6 +125,13 @@ class WeKnoraClient:
def list_chunks(self, knowledge_id: str, *, page_size: int = 100) -> list[dict[str, Any]]:
    """Return the chunk rows for one knowledge item.

    Delegates to ``self._paginate`` on the ``chunks/<knowledge_id>``
    path, forwarding ``page_size`` unchanged.

    Args:
        knowledge_id: Identifier interpolated into the request path.
        page_size: Page size handed to ``self._paginate`` (keyword-only).

    Returns:
        Whatever ``self._paginate`` returns for that path.
    """
    endpoint = f"chunks/{knowledge_id}"
    return self._paginate(endpoint, page_size=page_size)
def _is_ingestion_completed(self, row: dict[str, Any]) -> bool:
parse_status = row.get("parse_status")
enable_status = row.get("enable_status")
parsed = parse_status in {"completed", "success", "done"} or parse_status in {2, "2"}
enabled = enable_status in {"enabled", "success", "done"} or enable_status in {1, 2, "1", "2"}
return parsed and enabled
def knowledge_chat_sse(
self,
*,
......