Commit 147c79e0 (147c79e07bbaa525c4f6c6ac143e9c5c109362f8) by 沈秋雨

Initial WeKnora Ragas evaluation project

WEKNORA_BASE_URL=http://localhost:8080/api/v1
WEKNORA_API_KEY=
WEKNORA_KB_ID=
WEKNORA_KB_NAME=ragas-eval-pilot
# Ragas generation and judge models. These are evaluation-side models, not the
# model configuration used by the WeKnora backend.
OPENAI_API_KEY=replace-me
OPENAI_BASE_URL=https://api.openai.com/v1
# Optional split deployment. Use these when LLM and embedding are served by
# different OpenAI-compatible services, such as vLLM + Infinity.
RAGAS_LLM_API_KEY=replace-me
RAGAS_LLM_BASE_URL=http://localhost:8000/v1
RAGAS_EMBEDDING_API_KEY=replace-me
RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
RAGAS_RERANKER_API_KEY=replace-me
RAGAS_RERANKER_BASE_URL=http://localhost:7998/v1
RAGAS_RERANKER_MODEL=replace-me
RAGAS_GENERATOR_MODEL=gpt-4o-mini
RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
TESTSET_SIZE=50
REQUEST_INTERVAL_SECONDS=0.2
.env
.venv/
__pycache__/
*.py[cod]
*.egg-info/
.pytest_cache/
.ruff_cache/
data/raw_docs/pdf/*
data/raw_docs/xlsx/*
data/parsed_docs/*.json
data/parsed_docs/*.jsonl
data/parsed_docs/mineru_raw/*
data/exported/*.json
data/exported/*.jsonl
data/testsets/*.jsonl
data/runs/*.jsonl
data/reports/*.csv
data/reports/*.md
!data/raw_docs/pdf/.gitkeep
!data/raw_docs/xlsx/.gitkeep
!data/parsed_docs/mineru_raw/.gitkeep
# WeKnora Ragas Eval
A standalone WeKnora Ragas evaluation project. It calls only WeKnora's public APIs and does not depend on WeKnora's built-in `/evaluation` endpoint.
## Installation
```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```
For stronger PDF parsing support:
```bash
pip install -e ".[pdf]"
```
Development and test tooling:
```bash
pip install -e ".[dev,pdf]"
```
## Configuration
```bash
cp .env.example .env
```
After editing `.env`, confirm that:
- `WEKNORA_BASE_URL` points at the WeKnora API v1, e.g. `http://localhost:8080/api/v1`
- `WEKNORA_API_KEY` is a valid WeKnora API key
- `WEKNORA_KB_ID` is the target knowledge base ID; if you do not have one yet, run `python scripts/00_create_kb.py` first
- `WEKNORA_KB_NAME` is the name used when creating the knowledge base
- `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `RAGAS_*_MODEL` configure the evaluation-side models
- If the LLM and embedding services are deployed separately, point `RAGAS_LLM_BASE_URL` at vLLM's `/v1` and `RAGAS_EMBEDDING_BASE_URL` at Infinity's `/v1`, as in the example below
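A split vLLM + Infinity deployment could look like this in `.env` (hosts, ports, and keys are placeholders mirroring `.env.example`):
```bash
RAGAS_LLM_API_KEY=replace-me
RAGAS_LLM_BASE_URL=http://localhost:8000/v1
RAGAS_EMBEDDING_API_KEY=replace-me
RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
```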
## First Pilot Run
Place the raw files in:
- `data/raw_docs/pdf/`
- `data/raw_docs/xlsx/`
Run the scripts in order:
```bash
python scripts/00_create_kb.py
python scripts/01_upload_docs.py
python scripts/02_wait_ingestion.py
python scripts/03_export_chunks.py
python scripts/04_parse_docs.py
python scripts/05_generate_testset.py
python scripts/06_review_testset.py
python scripts/07_run_weknora_qa.py
python scripts/08_build_ragas_input.py
python scripts/09_run_ragas_eval.py
python scripts/10_report.py
```
For the first round, use only 2 PDFs, 1 XLSX file, and 10 approved QA records; expand the sample size only after confirming that `retrieved_contexts`, `response`, and the rest of the Ragas input fields all look correct (a minimal check is sketched below).
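A minimal sanity check over the Ragas input, assuming the default `data/runs/ragas_input.jsonl` path and the required fields enforced by `scripts/08_build_ragas_input.py`:
```python
import json
from pathlib import Path

# Fields that scripts/08_build_ragas_input.py requires to be non-empty.
REQUIRED = ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")

lines = Path("data/runs/ragas_input.jsonl").read_text(encoding="utf-8").splitlines()
for line_no, line in enumerate(lines, start=1):
    row = json.loads(line)
    missing = [key for key in REQUIRED if not row.get(key)]
    if missing:
        print(f"line {line_no} ({row.get('sample_id')}): missing {', '.join(missing)}")
```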
## Key Artifacts
- `data/exported/knowledge.jsonl`
- `data/exported/chunks.jsonl`
- `data/parsed_docs/documents.jsonl`
- `data/parsed_docs/parse_summary.json`
- `data/testsets/testset.raw.jsonl`
- `data/testsets/testset.reviewed.jsonl`
- `data/runs/weknora_answers.jsonl`
- `data/runs/ragas_input.jsonl`
- `data/reports/ragas_scores.csv`
- `data/reports/summary.md`
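For reference, one `data/runs/ragas_input.jsonl` record assembled by `scripts/08_build_ragas_input.py` looks roughly like this (values are purely illustrative; field names come from the script):
```json
{"sample_id": "qa-0001", "user_input": "请根据 example.pdf 第 1 页回答:该片段的主要内容是什么?", "response": "...", "retrieved_contexts": ["..."], "reference": "...", "reference_contexts": ["..."], "session_id": "...", "request_id": "...", "weknora_references": [], "source_file": "example.pdf", "gold_chunk_ids": []}
```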
weknora:
base_url: "${WEKNORA_BASE_URL}"
api_key: "${WEKNORA_API_KEY}"
knowledge_base_id: "${WEKNORA_KB_ID}"
knowledge_base_name: "${WEKNORA_KB_NAME:-ragas-eval-pilot}"
knowledge_base_description: "Knowledge base for independent Ragas evaluation."
timeout_seconds: 300
request_interval_seconds: "${REQUEST_INTERVAL_SECONDS:-0.2}"
testset:
size: "${TESTSET_SIZE:-50}"
include_pdf: true
include_xlsx: true
min_context_chars: 80
require_manual_review: true
parsing:
provider: "local"
output_path: "data/parsed_docs/documents.jsonl"
failed_path: "data/parsed_docs/failed_parse.jsonl"
summary_path: "data/parsed_docs/parse_summary.json"
local:
pdf_backend: "pymupdf"
xlsx_mode: "row_text"
min_chars: 80
mineru:
mode: "cli"
cli_bin: "mineru"
output_dir: "data/parsed_docs/mineru_raw"
http_base_url: "http://172.23.184.9:8002"
api_key: "mineru"
timeout_seconds: 600
fallback_to_local: false
qa:
one_session_per_question: true
disable_title: true
enable_memory: false
channel: "api"
verify_with_messages: false
ragas:
provider: "openai-compatible"
# Backward-compatible defaults. If the split LLM/embedding values below are
# empty, these values are used for both clients.
api_key: "${OPENAI_API_KEY}"
base_url: "${OPENAI_BASE_URL}"
# vLLM OpenAI-compatible endpoint, for example http://localhost:8000/v1.
llm_api_key: "${RAGAS_LLM_API_KEY}"
llm_base_url: "${RAGAS_LLM_BASE_URL}"
# Infinity OpenAI-compatible embedding endpoint, for example
# http://localhost:7997/v1.
embedding_api_key: "${RAGAS_EMBEDDING_API_KEY}"
embedding_base_url: "${RAGAS_EMBEDDING_BASE_URL}"
# Reserved for future retrieval/rerank metrics. The current Ragas pipeline
# does not call reranker APIs.
reranker_api_key: "${RAGAS_RERANKER_API_KEY}"
reranker_base_url: "${RAGAS_RERANKER_BASE_URL}"
reranker_model: "${RAGAS_RERANKER_MODEL}"
generator_model: "${RAGAS_GENERATOR_MODEL}"
judge_model: "${RAGAS_JUDGE_MODEL}"
embedding_model: "${RAGAS_EMBEDDING_MODEL}"
temperature: 0
max_tokens: 4096
timeout_seconds: 600
max_workers: 1
metrics:
- faithfulness
- response_relevancy
- context_precision
- context_recall
- factual_correctness
[project]
name = "weknora-ragas-eval"
version = "0.1.0"
description = "Independent Ragas evaluation pipeline for WeKnora public APIs."
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"ragas>=0.3,<0.5",
"datasets>=2.19.0",
"pandas>=2.2.0",
"openpyxl>=3.1.0",
"requests>=2.32.0",
"sseclient-py>=1.8.0",
"python-dotenv>=1.0.0",
"pyyaml>=6.0.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
"pypdf>=4.2.0"
]
[project.optional-dependencies]
pdf = [
"pymupdf>=1.24.0",
"pdfplumber>=0.11.0"
]
dev = [
"ruff>=0.6.0",
"pytest>=8.0.0"
]
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]
[tool.ruff]
line-length = 100
target-version = "py310"
[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B"]
from __future__ import annotations
import sys
from typing import Any
import _bootstrap # noqa: F401
from weknora_eval.api import bootstrap_client_from_config
from weknora_eval.config import load_config, require_config
from weknora_eval.envfile import set_env_value
from weknora_eval.loaders import setup_logging, write_json
def main() -> int:
setup_logging()
config = load_config()
client = bootstrap_client_from_config(config)
weknora = config["weknora"]
existing_id = str(weknora.get("knowledge_base_id") or "")
name = str(require_config(config, "weknora.knowledge_base_name"))
if existing_id and existing_id != "replace-me":
record = {"id": existing_id, "name": name, "source": "env"}
write_json("data/exported/knowledge_base.json", record)
print(f"WEKNORA_KB_ID already set: {existing_id}")
return 0
created = client.create_knowledge_base(name=name)
knowledge_base_id = _extract_knowledge_base_id(created)
if not knowledge_base_id:
print(f"Created knowledge base but could not extract id from response: {created}")
return 1
set_env_value(".env", "WEKNORA_KB_ID", knowledge_base_id)
write_json("data/exported/knowledge_base.json", {**created, "source": "create"})
print(f"WEKNORA_KB_ID={knowledge_base_id}")
print("Wrote ID to .env and data/exported/knowledge_base.json")
return 0
def _extract_knowledge_base_id(payload: dict[str, Any]) -> str | None:
candidates = [payload]
for key in ("data", "knowledge_base"):
nested = payload.get(key)
if isinstance(nested, dict):
candidates.append(nested)
for row in candidates:
for key in ("id", "knowledge_base_id", "kb_id", "uuid"):
value = row.get(key)
if value:
return str(value)
return None
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
from pathlib import Path
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
Path("data/raw_docs/xlsx").glob("*.xlsx")
)
rows = []
for path in files:
data = client.upload_file(path)
rows.append(
{
"knowledge_id": data.get("id"),
"file_name": data.get("file_name") or data.get("title") or path.name,
"file_type": data.get("file_type") or path.suffix.lstrip("."),
"parse_status": data.get("parse_status"),
"enable_status": data.get("enable_status"),
"raw": data,
}
)
write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
print(f"Uploaded {len(rows)} files")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
uploads = read_jsonl("data/exported/knowledge_uploads.jsonl", missing_ok=True)
knowledge_ids = {row["knowledge_id"] for row in uploads if row.get("knowledge_id")} or None
result = client.wait_ingestion_completed(knowledge_ids=knowledge_ids)
knowledge = client.list_knowledge()
write_jsonl("data/exported/knowledge.jsonl", knowledge)
print(
"Ingestion status: "
f"completed={len(result['completed'])} failed={len(result['failed'])} "
f"pending={len(result['pending'])}"
)
return 1 if result["failed"] or result["pending"] else 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
knowledge_rows = client.list_knowledge()
write_jsonl("data/exported/knowledge.jsonl", knowledge_rows)
knowledge_by_id = {row.get("id"): row for row in knowledge_rows}
chunk_rows = []
for knowledge in knowledge_rows:
knowledge_id = knowledge.get("id")
if not knowledge_id:
continue
if knowledge.get("parse_status") != "completed" or knowledge.get("enable_status") != "enabled":
continue
for chunk in client.list_chunks(str(knowledge_id)):
content = (chunk.get("content") or "").strip()
if not content:
continue
if chunk.get("is_enabled") is False:
continue
source = knowledge_by_id.get(chunk.get("knowledge_id")) or knowledge
chunk_rows.append(
{
"chunk_id": chunk.get("id"),
"knowledge_id": chunk.get("knowledge_id") or knowledge_id,
"knowledge_base_id": chunk.get("knowledge_base_id")
or config["weknora"]["knowledge_base_id"],
"chunk_index": chunk.get("chunk_index"),
"content": content,
"source_file": source.get("file_name") or source.get("title"),
"chunk_type": chunk.get("chunk_type"),
"raw": chunk,
}
)
write_jsonl("data/exported/chunks.jsonl", chunk_rows)
print(f"Exported {len(chunk_rows)} chunks from {len(knowledge_rows)} knowledge records")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.parsers.local import parse_raw_docs
from weknora_eval.parsers.mineru import parse_with_mineru
def main() -> int:
setup_logging()
config = load_config()
provider = config.get("parsing", {}).get("provider", "local")
if provider == "local":
rows, summary = parse_raw_docs(config)
elif provider == "mineru":
rows, summary = parse_with_mineru(config)
else:
raise ValueError(f"Unsupported parsing provider: {provider}")
print(f"Parsed {len(rows)} documents: {summary}")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.testset import generate_rule_based_testset
def main() -> int:
setup_logging()
config = load_config()
testset = config.get("testset", {})
rows = generate_rule_based_testset(
size=int(testset.get("size", 50)),
min_context_chars=int(testset.get("min_context_chars", 80)),
)
print(f"Generated {len(rows)} pending QA candidates at data/testsets/testset.raw.jsonl")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.loaders import setup_logging
from weknora_eval.testset import approve_pending_testset, validate_reviewed_testset
def main() -> int:
setup_logging()
rows = approve_pending_testset()
errors = validate_reviewed_testset()
if errors:
for error in errors:
print(error)
return 1
print(f"Wrote {len(rows)} approved QA records to data/testsets/testset.reviewed.jsonl")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
qa_config = config.get("qa", {})
rows = [row for row in read_jsonl("data/testsets/testset.reviewed.jsonl") if row.get("review_status") == "approved"]
answers = []
for index, row in enumerate(rows, start=1):
sample_id = row["sample_id"]
try:
session = client.create_session(title=f"ragas-eval-{sample_id}")
session_id = session.get("id")
if not session_id:
raise RuntimeError(f"create_session returned no id for {sample_id}")
result = client.knowledge_chat_sse(
session_id=session_id,
query=row["user_input"],
disable_title=bool(qa_config.get("disable_title", True)),
enable_memory=bool(qa_config.get("enable_memory", False)),
channel=str(qa_config.get("channel", "api")),
)
answer = {
"sample_id": sample_id,
"user_input": row["user_input"],
"session_id": session_id,
"request_id": result.get("request_id"),
"response": result.get("response") or "",
"retrieved_contexts": result.get("retrieved_contexts") or [],
"weknora_references": result.get("weknora_references") or [],
"error": None,
}
if not answer["response"]:
answer["error"] = "empty_response"
append_jsonl("data/runs/failed_requests.jsonl", answer)
elif not answer["retrieved_contexts"]:
append_jsonl("data/runs/failed_requests.jsonl", {**answer, "error": "empty_retrieval"})
answers.append(answer)
print(f"[{index}/{len(rows)}] {sample_id} response_chars={len(answer['response'])}")
except Exception as exc: # noqa: BLE001
failed = {
"sample_id": sample_id,
"user_input": row.get("user_input"),
"response": "",
"retrieved_contexts": [],
"weknora_references": [],
"session_id": None,
"request_id": None,
"error": str(exc),
}
answers.append(failed)
append_jsonl("data/runs/failed_requests.jsonl", failed)
print(f"[{index}/{len(rows)}] {sample_id} failed: {exc}")
write_jsonl("data/runs/weknora_answers.jsonl", answers)
failures = [row for row in answers if row.get("error") and row.get("error") != "empty_retrieval"]
return 1 if failures else 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
testset = {
row["sample_id"]: row
for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
if row.get("review_status") == "approved"
}
answers = {row["sample_id"]: row for row in read_jsonl("data/runs/weknora_answers.jsonl")}
ragas_rows = []
for sample_id, qa in testset.items():
answer = answers.get(sample_id)
if not answer:
append_jsonl("data/runs/failed_requests.jsonl", {"sample_id": sample_id, "error": "missing_answer"})
continue
row = {
"sample_id": sample_id,
"user_input": qa["user_input"],
"response": answer.get("response") or "",
"retrieved_contexts": answer.get("retrieved_contexts") or [],
"reference": qa["reference"],
"reference_contexts": qa.get("reference_contexts") or [],
"session_id": answer.get("session_id"),
"request_id": answer.get("request_id"),
"weknora_references": answer.get("weknora_references") or [],
"source_file": qa.get("source_file"),
"gold_chunk_ids": qa.get("gold_chunk_ids") or [],
}
missing = [
key
for key in ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")
if not row.get(key)
]
if missing:
append_jsonl(
"data/runs/failed_requests.jsonl",
{"sample_id": sample_id, "error": f"missing_ragas_fields:{','.join(missing)}"},
)
continue
ragas_rows.append(row)
write_jsonl("data/runs/ragas_input.jsonl", ragas_rows)
print(f"Built {len(ragas_rows)} Ragas input rows")
return 0 if ragas_rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.ragas_runner import run_ragas_eval
def main() -> int:
setup_logging()
config = load_config()
scores = run_ragas_eval(config)
print(f"Wrote {len(scores)} Ragas score rows to data/reports/ragas_scores.csv")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.report import generate_summary_report
def main() -> int:
setup_logging()
config = load_config()
generate_summary_report(config)
print("Wrote report to data/reports/summary.md")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
from pathlib import Path
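# Put src/ on sys.path so the scripts run from a source checkout without
# requiring `pip install -e .` first.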
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC = PROJECT_ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
"""Independent Ragas evaluation pipeline for WeKnora."""
__all__ = [
"__version__",
]
__version__ = "0.1.0"
from __future__ import annotations
import os
import re
from pathlib import Path
from typing import Any
import yaml
from dotenv import load_dotenv
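# Matches shell-style placeholders in configs/eval.yaml: ${VAR} and ${VAR:-default}.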
_ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}")
def _expand_env(value: Any) -> Any:
if isinstance(value, dict):
return {key: _expand_env(item) for key, item in value.items()}
if isinstance(value, list):
return [_expand_env(item) for item in value]
if not isinstance(value, str):
return value
def replace(match: re.Match[str]) -> str:
default = match.group(2) if match.group(2) is not None else ""
return os.getenv(match.group(1), default)
expanded = _ENV_PATTERN.sub(replace, value)
return _coerce_scalar(expanded)
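# Best-effort typing for expanded strings: "true"/"false" become bools,
# "none"/"null" become None, and digit strings become int/float. Strings that
# do not parse (URLs, model names) are returned unchanged.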
def _coerce_scalar(value: str) -> Any:
lowered = value.lower()
if lowered in {"true", "false"}:
return lowered == "true"
if lowered in {"none", "null"}:
return None
try:
if "." not in value:
return int(value)
return float(value)
except ValueError:
return value
def load_config(path: str | Path = "configs/eval.yaml") -> dict[str, Any]:
load_dotenv()
config_path = Path(path)
with config_path.open("r", encoding="utf-8") as file:
raw = yaml.safe_load(file) or {}
return _expand_env(raw)
def require_config(config: dict[str, Any], dotted_key: str) -> Any:
current: Any = config
for part in dotted_key.split("."):
if not isinstance(current, dict) or part not in current:
raise ValueError(f"Missing required config value: {dotted_key}")
value = current[part]
if value is None or value == "":
raise ValueError(f"Missing required config value: {dotted_key}")
current = value
return current
def project_path(*parts: str) -> Path:
return Path.cwd().joinpath(*parts)
from __future__ import annotations
from pathlib import Path
def set_env_value(path: str | Path, key: str, value: str) -> None:
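    """Set `key=value` in a dotenv file, replacing an existing line or appending."""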
target = Path(path)
lines = target.read_text(encoding="utf-8").splitlines() if target.exists() else []
prefix = f"{key}="
replacement = f"{key}={value}"
updated = False
output: list[str] = []
for line in lines:
if line.startswith(prefix):
output.append(replacement)
updated = True
else:
output.append(line)
if not updated:
output.append(replacement)
target.write_text("\n".join(output) + "\n", encoding="utf-8")
from __future__ import annotations
import json
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Any
def setup_logging(level: int = logging.INFO) -> None:
logging.basicConfig(
level=level,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
def ensure_parent(path: str | Path) -> Path:
target = Path(path)
target.parent.mkdir(parents=True, exist_ok=True)
return target
def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]:
target = Path(path)
if not target.exists():
if missing_ok:
return []
raise FileNotFoundError(target)
rows: list[dict[str, Any]] = []
with target.open("r", encoding="utf-8") as file:
for line_no, line in enumerate(file, start=1):
stripped = line.strip()
if not stripped:
continue
try:
rows.append(json.loads(stripped))
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
return rows
def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]:
target = Path(path)
if not target.exists():
if missing_ok:
return
raise FileNotFoundError(target)
with target.open("r", encoding="utf-8") as file:
for line_no, line in enumerate(file, start=1):
stripped = line.strip()
if not stripped:
continue
try:
yield json.loads(stripped)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int:
target = ensure_parent(path)
count = 0
with target.open("w", encoding="utf-8") as file:
for row in rows:
file.write(json.dumps(row, ensure_ascii=False) + "\n")
count += 1
return count
def append_jsonl(path: str | Path, row: dict[str, Any]) -> None:
target = ensure_parent(path)
with target.open("a", encoding="utf-8") as file:
file.write(json.dumps(row, ensure_ascii=False) + "\n")
def write_json(path: str | Path, payload: dict[str, Any]) -> None:
target = ensure_parent(path)
with target.open("w", encoding="utf-8") as file:
json.dump(payload, file, ensure_ascii=False, indent=2)
file.write("\n")
def compact_text(value: Any) -> str:
text = "" if value is None else str(value)
return "\n".join(line.strip() for line in text.splitlines() if line.strip()).strip()
"""Document parser adapters."""
from __future__ import annotations
import statistics
from pathlib import Path
from typing import Any
from openpyxl import load_workbook
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.schemas import ParsedDocument
def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
parsing = config["parsing"]
local_config = parsing.get("local", {})
min_chars = int(local_config.get("min_chars", 80))
pdf_backend = local_config.get("pdf_backend", "pypdf")
xlsx_mode = local_config.get("xlsx_mode", "row_text")
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
try:
docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
except Exception as exc: # noqa: BLE001 - parser failures must be persisted.
failures.append(
{
"source_file": pdf_path.name,
"parser": f"local:{pdf_backend}",
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
)
for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
try:
docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
except Exception as exc: # noqa: BLE001
failures.append(
{
"source_file": xlsx_path.name,
"parser": "local:openpyxl",
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
)
rows = [doc.to_dict() for doc in docs]
write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
if failures:
write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)
summary = build_parse_summary(rows, failures, parser=f"local:{pdf_backend}")
write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
return rows, summary
def parse_pdf(path: str | Path, *, backend: str = "pypdf", min_chars: int = 80) -> list[ParsedDocument]:
target = Path(path)
backend = backend.lower()
if backend == "pymupdf":
return _parse_pdf_pymupdf(target, min_chars=min_chars)
if backend == "pdfplumber":
return _parse_pdf_pdfplumber(target, min_chars=min_chars)
if backend == "pypdf":
return _parse_pdf_pypdf(target, min_chars=min_chars)
raise ValueError(f"Unsupported PDF backend: {backend}")
def _parse_pdf_pypdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
from pypdf import PdfReader
reader = PdfReader(str(path))
docs: list[ParsedDocument] = []
for index, page in enumerate(reader.pages, start=1):
content = compact_text(page.extract_text() or "")
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pypdf"))
return docs
def _parse_pdf_pymupdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
try:
import fitz
except ImportError as exc:
raise ImportError("pymupdf backend requires `pip install -e '.[pdf]'`") from exc
docs: list[ParsedDocument] = []
with fitz.open(path) as document:
for index, page in enumerate(document, start=1):
content = compact_text(page.get_text("text"))
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pymupdf"))
return docs
def _parse_pdf_pdfplumber(path: Path, *, min_chars: int) -> list[ParsedDocument]:
try:
import pdfplumber
except ImportError as exc:
raise ImportError("pdfplumber backend requires `pip install -e '.[pdf]'`") from exc
docs: list[ParsedDocument] = []
with pdfplumber.open(path) as pdf:
for index, page in enumerate(pdf.pages, start=1):
content = compact_text(page.extract_text() or "")
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pdfplumber"))
return docs
def _pdf_doc(path: Path, page: int, content: str, parser: str) -> ParsedDocument:
return ParsedDocument(
doc_id=f"{path.name}::page-{page}",
source_file=path.name,
file_type="pdf",
page=page,
content=content,
metadata={"parser": parser},
)
def parse_xlsx(path: str | Path, *, mode: str = "row_text", min_chars: int = 80) -> list[ParsedDocument]:
target = Path(path)
mode = mode.lower()
workbook = load_workbook(target, data_only=True, read_only=True)
if mode == "row_text":
return _parse_xlsx_row_text(target, workbook, min_chars=min_chars)
if mode == "markdown_table":
return _parse_xlsx_markdown_table(target, workbook, min_chars=min_chars)
raise ValueError(f"Unsupported XLSX mode: {mode}")
def _parse_xlsx_row_text(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
docs: list[ParsedDocument] = []
for sheet in workbook.worksheets:
rows = list(sheet.iter_rows(values_only=True))
if not rows:
continue
headers = [_cell_to_text(value) or f"col_{index}" for index, value in enumerate(rows[0], start=1)]
for row_index, row in enumerate(rows[1:], start=2):
pairs = []
for header, value in zip(headers, row, strict=False):
cell = _cell_to_text(value)
if cell:
pairs.append(f"{header}: {cell}")
content = "\n".join(pairs).strip()
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{path.name}::{sheet.title}::row-{row_index}",
source_file=path.name,
file_type="xlsx",
sheet=sheet.title,
row_index=row_index,
content=content,
metadata={"parser": "local:openpyxl", "columns": headers},
)
)
return docs
def _parse_xlsx_markdown_table(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
docs: list[ParsedDocument] = []
for sheet in workbook.worksheets:
rows = [
[_cell_to_text(value) for value in row]
for row in sheet.iter_rows(values_only=True)
if any(value is not None for value in row)
]
if not rows:
continue
width = max(len(row) for row in rows)
normalized = [row + [""] * (width - len(row)) for row in rows]
header = normalized[0]
separator = ["---"] * width
body = normalized[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(separator) + " |",
]
lines.extend("| " + " | ".join(row) + " |" for row in body)
content = "\n".join(lines)
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{path.name}::{sheet.title}",
source_file=path.name,
file_type="xlsx",
sheet=sheet.title,
content=content,
metadata={"parser": "local:openpyxl", "mode": "markdown_table"},
)
)
return docs
def _cell_to_text(value: Any) -> str:
if value is None:
return ""
text = str(value).strip()
return text.replace("\n", " ")
def build_parse_summary(
rows: list[dict[str, Any]],
failures: list[dict[str, Any]],
*,
parser: str,
) -> dict[str, Any]:
source_files = {row.get("source_file") for row in rows if row.get("source_file")}
failed_files = {row.get("source_file") for row in failures if row.get("source_file")}
lengths = [len(row.get("content") or "") for row in rows]
return {
"total_files": len(source_files | failed_files),
"parsed_files": len(source_files),
"failed_files": len(failed_files),
"total_documents": len(rows),
"empty_documents": sum(1 for length in lengths if length == 0),
"avg_chars": round(statistics.mean(lengths), 2) if lengths else 0,
"parser": parser,
}
from __future__ import annotations
import subprocess
from pathlib import Path
from typing import Any
import requests
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary, parse_pdf
from weknora_eval.schemas import ParsedDocument
class MinerUParseError(RuntimeError):
pass
def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
parsing = config["parsing"]
mineru = parsing.get("mineru", {})
mode = mineru.get("mode", "cli")
fallback = bool(mineru.get("fallback_to_local", True))
local_config = parsing.get("local", {})
min_chars = int(local_config.get("min_chars", 80))
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
parser_name = f"mineru:{mode}"
try:
if mode == "cli":
docs.extend(parse_pdf_with_cli(pdf_path, mineru, min_chars=min_chars))
elif mode == "http":
docs.extend(parse_pdf_with_http(pdf_path, mineru, min_chars=min_chars))
else:
raise MinerUParseError(f"Unsupported MinerU mode: {mode}")
except Exception as exc: # noqa: BLE001
failure = {
"source_file": pdf_path.name,
"parser": parser_name,
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
if fallback:
try:
backend = local_config.get("pdf_backend", "pypdf")
local_docs = parse_pdf(pdf_path, backend=backend, min_chars=min_chars)
docs.extend(local_docs)
failure["fallback_used"] = f"local:{backend}"
except Exception as fallback_exc: # noqa: BLE001
failure["fallback_error"] = str(fallback_exc)
failures.append(failure)
rows = [doc.to_dict() for doc in docs]
write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
if failures:
write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)
summary = build_parse_summary(rows, failures, parser=f"mineru:{mode}")
write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
return rows, summary
def parse_pdf_with_cli(
pdf_path: str | Path,
mineru_config: dict[str, Any],
*,
min_chars: int,
) -> list[ParsedDocument]:
target = Path(pdf_path)
output_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw"))
output_dir = output_root / target.stem
output_dir.mkdir(parents=True, exist_ok=True)
cli_bin = mineru_config.get("cli_bin", "mineru")
timeout = int(mineru_config.get("timeout_seconds", 600))
# MinerU CLI arguments vary by release. This common invocation is isolated
# here so deployments can replace it without touching pipeline scripts.
result = subprocess.run(
[cli_bin, "-p", str(target), "-o", str(output_dir)],
check=False,
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode != 0:
raise MinerUParseError(result.stderr.strip() or result.stdout.strip() or "MinerU CLI failed")
markdown_files = sorted(output_dir.rglob("*.md"))
if not markdown_files:
raise MinerUParseError(f"No Markdown output found in {output_dir}")
docs: list[ParsedDocument] = []
for index, markdown_path in enumerate(markdown_files, start=1):
content = compact_text(markdown_path.read_text(encoding="utf-8"))
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{target.name}::mineru-{index}",
source_file=target.name,
file_type="pdf",
content=content,
metadata={
"parser": "mineru:cli",
"mineru_output": str(markdown_path),
},
)
)
return docs
def parse_pdf_with_http(
pdf_path: str | Path,
mineru_config: dict[str, Any],
*,
min_chars: int,
) -> list[ParsedDocument]:
target = Path(pdf_path)
base_url = str(mineru_config.get("http_base_url") or "").rstrip("/")
if not base_url:
raise MinerUParseError("MinerU HTTP mode requires parsing.mineru.http_base_url")
headers = {}
if mineru_config.get("api_key"):
headers["Authorization"] = f"Bearer {mineru_config['api_key']}"
    # No universal MinerU HTTP contract is assumed here. This implementation
    # expects a replaceable service exposing POST /parse and returning
    # {"markdown": "..."} or {"documents": [{"content": "..."}]}.
with target.open("rb") as file:
response = requests.post(
f"{base_url}/parse",
files={"file": (target.name, file, "application/pdf")},
headers=headers,
timeout=int(mineru_config.get("timeout_seconds", 600)),
)
if response.status_code >= 400:
raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")
payload = response.json()
contents: list[str] = []
if isinstance(payload.get("documents"), list):
contents = [compact_text(item.get("content")) for item in payload["documents"]]
elif payload.get("markdown"):
contents = [compact_text(payload["markdown"])]
else:
raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`")
docs: list[ParsedDocument] = []
for index, content in enumerate(contents, start=1):
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{target.name}::mineru-http-{index}",
source_file=target.name,
file_type="pdf",
content=content,
metadata={"parser": "mineru:http"},
)
)
return docs
from __future__ import annotations
import os
from pathlib import Path
from typing import Any
import pandas as pd
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl
def run_ragas_eval(
config: dict[str, Any],
*,
input_path: str = "data/runs/ragas_input.jsonl",
output_csv_path: str = "data/reports/ragas_scores.csv",
) -> pd.DataFrame:
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.run_config import RunConfig
ragas_config = config["ragas"]
llm_api_key = _first_non_empty(ragas_config, "llm_api_key", "api_key")
llm_base_url = _first_non_empty(ragas_config, "llm_base_url", "base_url")
embedding_api_key = _first_non_empty(ragas_config, "embedding_api_key", "api_key")
embedding_base_url = _first_non_empty(ragas_config, "embedding_base_url", "base_url")
judge_model = str(require_config(config, "ragas.judge_model"))
embedding_model = str(require_config(config, "ragas.embedding_model"))
temperature = float(ragas_config.get("temperature", 0))
max_tokens = int(ragas_config.get("max_tokens", 4096))
timeout_seconds = int(ragas_config.get("timeout_seconds", 600))
max_workers = int(ragas_config.get("max_workers", 1))
os.environ["OPENAI_API_KEY"] = llm_api_key
if llm_base_url:
os.environ["OPENAI_BASE_URL"] = llm_base_url
rows = read_jsonl(input_path)
dataset = Dataset.from_list(
[
{
"user_input": row["user_input"],
"response": row["response"],
"retrieved_contexts": row["retrieved_contexts"],
"reference": row["reference"],
"reference_contexts": row.get("reference_contexts") or [],
}
for row in rows
]
)
metric_map = _metric_map()
selected_metrics = [
metric_map[name]
for name in ragas_config.get("metrics", metric_map.keys())
if name in metric_map
]
llm = ChatOpenAI(
model=judge_model,
api_key=llm_api_key,
base_url=llm_base_url or None,
temperature=temperature,
max_tokens=max_tokens,
)
embeddings = OpenAIEmbeddings(
model=embedding_model,
api_key=embedding_api_key,
base_url=embedding_base_url or None,
tiktoken_enabled=False,
check_embedding_ctx_length=False,
)
ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
run_config = RunConfig(timeout=timeout_seconds, max_workers=max_workers)
result = evaluate(
dataset,
metrics=selected_metrics,
llm=ragas_llm,
embeddings=ragas_embeddings,
run_config=run_config,
)
scores = result.to_pandas()
for index, row in enumerate(rows):
scores.loc[index, "sample_id"] = row.get("sample_id")
target = Path(output_csv_path)
target.parent.mkdir(parents=True, exist_ok=True)
scores.to_csv(target, index=False)
return scores
def _metric_map() -> dict[str, Any]:
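    # Older ragas releases expose module-level metric instances; newer releases
    # expose metric classes instead. Support both import styles.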
try:
from ragas.metrics import (
context_precision,
context_recall,
faithfulness,
factual_correctness,
response_relevancy,
)
return {
"faithfulness": faithfulness,
"response_relevancy": response_relevancy,
"context_precision": context_precision,
"context_recall": context_recall,
"factual_correctness": factual_correctness,
}
except ImportError:
from ragas.metrics import (
Faithfulness,
FactualCorrectness,
LLMContextPrecisionWithReference,
LLMContextRecall,
ResponseRelevancy,
)
return {
"faithfulness": Faithfulness(),
"response_relevancy": ResponseRelevancy(),
"context_precision": LLMContextPrecisionWithReference(),
"context_recall": LLMContextRecall(),
"factual_correctness": FactualCorrectness(),
}
def _first_non_empty(config: dict[str, Any], *keys: str) -> str:
for key in keys:
value = config.get(key)
if value not in {None, ""}:
return str(value)
raise ValueError(f"Missing required Ragas config value. Checked: {', '.join(keys)}")
def _wrap_langchain_models(llm: Any, embeddings: Any) -> tuple[Any, Any]:
try:
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
except ImportError:
return llm, embeddings
return LangchainLLMWrapper(llm), LangchainEmbeddingsWrapper(embeddings)
from __future__ import annotations
import math
from pathlib import Path
from typing import Any
import pandas as pd
from weknora_eval.loaders import read_jsonl
def retrieval_metrics(
ragas_rows: list[dict[str, Any]],
*,
ks: tuple[int, ...] = (1, 3, 5),
) -> dict[str, float]:
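    """Average hit@k, recall@k, MRR, and nDCG@5 over samples that carry
    gold_chunk_ids, comparing against the WeKnora reference chunk IDs."""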
samples = [row for row in ragas_rows if row.get("gold_chunk_ids")]
if not samples:
return {}
totals: dict[str, float] = {f"hit@{k}": 0.0 for k in ks}
totals.update({f"recall@{k}": 0.0 for k in ks})
totals["mrr"] = 0.0
totals["ndcg@5"] = 0.0
for row in samples:
gold = set(row.get("gold_chunk_ids") or [])
refs = row.get("weknora_references") or []
predicted = [str(ref.get("id")) for ref in refs if ref.get("id")]
for k in ks:
top_k = predicted[:k]
hits = len(gold.intersection(top_k))
totals[f"hit@{k}"] += 1.0 if hits else 0.0
totals[f"recall@{k}"] += hits / len(gold)
first_rank = next((idx for idx, chunk_id in enumerate(predicted, start=1) if chunk_id in gold), None)
if first_rank:
totals["mrr"] += 1 / first_rank
dcg = 0.0
for idx, chunk_id in enumerate(predicted[:5], start=1):
if chunk_id in gold:
dcg += 1 / math.log2(idx + 1)
ideal_hits = min(len(gold), 5)
idcg = sum(1 / math.log2(idx + 1) for idx in range(1, ideal_hits + 1))
totals["ndcg@5"] += dcg / idcg if idcg else 0.0
return {key: round(value / len(samples), 4) for key, value in totals.items()}
def generate_summary_report(
config: dict[str, Any],
*,
scores_csv_path: str = "data/reports/ragas_scores.csv",
ragas_input_path: str = "data/runs/ragas_input.jsonl",
answers_path: str = "data/runs/weknora_answers.jsonl",
output_path: str = "data/reports/summary.md",
) -> str:
ragas_rows = read_jsonl(ragas_input_path, missing_ok=True)
answer_rows = read_jsonl(answers_path, missing_ok=True)
scores = pd.read_csv(scores_csv_path) if Path(scores_csv_path).exists() else pd.DataFrame()
lines = [
"# Ragas 评估报告",
"",
"## 运行信息",
f"- WeKnora Base URL: {config.get('weknora', {}).get('base_url', '')}",
f"- 知识库 ID: {config.get('weknora', {}).get('knowledge_base_id', '')}",
f"- 测试集规模: {len(ragas_rows)}",
f"- 审核通过样本数: {len(ragas_rows)}",
f"- 失败样本数: {sum(1 for row in answer_rows if row.get('error'))}",
f"- Judge 模型: {config.get('ragas', {}).get('judge_model', '')}",
"",
"## 聚合指标",
"| 指标 | 平均值 | P50 | 失败阈值 |",
"| --- | --- | --- | --- |",
]
metric_columns = [
column
for column in scores.columns
if column not in {"sample_id", "user_input", "response", "reference"}
and pd.api.types.is_numeric_dtype(scores[column])
]
for column in metric_columns:
lines.append(
f"| {column} | {scores[column].mean():.4f} | {scores[column].median():.4f} | 0.50 |"
)
chunk_metrics = retrieval_metrics(ragas_rows)
if chunk_metrics:
lines.extend(["", "## Chunk ID 检索指标", "| 指标 | 平均值 |", "| --- | --- |"])
for key, value in chunk_metrics.items():
lines.append(f"| {key} | {value:.4f} |")
lines.extend(["", "## 检索失败样本", "| sample_id | 问题 | 预期文件 | 实际召回文件 | context_recall | 备注 |", "| --- | --- | --- | --- | --- | --- |"])
for row in _worst_rows(scores, "context_recall"):
sample = _sample_by_id(ragas_rows, row.get("sample_id"))
actual_files = sorted(
{
ref.get("knowledge_filename") or ""
for ref in sample.get("weknora_references", [])
if ref.get("knowledge_filename")
}
)
lines.append(
f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
f"{_cell(sample.get('source_file'))} | {_cell(', '.join(actual_files))} | "
f"{_score(row.get('context_recall'))} | |"
)
lines.extend(["", "## 生成失败样本", "| sample_id | 问题 | 模型答案 | 标准答案 | faithfulness | factual_correctness |", "| --- | --- | --- | --- | --- | --- |"])
for row in _worst_rows(scores, "faithfulness"):
sample = _sample_by_id(ragas_rows, row.get("sample_id"))
lines.append(
f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
f"{_cell(sample.get('response'))} | {_cell(sample.get('reference'))} | "
f"{_score(row.get('faithfulness'))} | {_score(row.get('factual_correctness'))} |"
)
empty_retrievals = sum(1 for row in ragas_rows if not row.get("retrieved_contexts"))
fallback_answers = sum(1 for row in answer_rows if row.get("is_fallback"))
source_counts: dict[str, int] = {}
for row in ragas_rows:
source = row.get("source_file") or "unknown"
source_counts[source] = source_counts.get(source, 0) + 1
lines.extend(
[
"",
"## 数据质量",
f"- 空检索数量: {empty_retrievals}",
f"- fallback 答案数量: {fallback_answers}",
f"- 来源文件分布: {source_counts}",
"",
"## 改进建议",
"- 优先检查 context_recall 低且 retrieved_contexts 为空的样本。",
"- 对低 faithfulness 且 context_recall 正常的样本,重点检查生成模型和提示词。",
"- 对 Chunk ID 指标低但 Ragas context 指标正常的样本,检查 chunk 切分或 gold_chunk_ids 标注。",
"",
]
)
content = "\n".join(lines)
target = Path(output_path)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(content, encoding="utf-8")
return content
def _worst_rows(scores: pd.DataFrame, column: str, *, limit: int = 10) -> list[dict[str, Any]]:
if scores.empty or column not in scores.columns:
return []
return scores.sort_values(column, ascending=True).head(limit).to_dict(orient="records")
def _sample_by_id(rows: list[dict[str, Any]], sample_id: Any) -> dict[str, Any]:
return next((row for row in rows if row.get("sample_id") == sample_id), {})
def _cell(value: Any, *, max_len: int = 120) -> str:
text = "" if value is None else " ".join(str(value).split())
text = text.replace("|", "\\|")
if len(text) <= max_len:
return text
return text[:max_len].rstrip() + "..."
def _score(value: Any) -> str:
try:
if pd.isna(value):
return ""
return f"{float(value):.4f}"
except (TypeError, ValueError):
return ""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from typing import Any
@dataclass
class ParsedDocument:
doc_id: str
source_file: str
file_type: str
content: str
page: int | None = None
sheet: str | None = None
row_index: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
return asdict(self)
@dataclass
class TestsetRecord:
sample_id: str
user_input: str
reference: str
reference_contexts: list[str]
source_file: str | None = None
gold_chunk_ids: list[str] = field(default_factory=list)
question_type: str = "single_hop"
review_status: str = "pending"
def to_dict(self) -> dict[str, Any]:
return asdict(self)
@dataclass
class WeKnoraAnswer:
sample_id: str
user_input: str
response: str
retrieved_contexts: list[str]
weknora_references: list[dict[str, Any]]
session_id: str | None = None
request_id: str | None = None
error: str | None = None
def to_dict(self) -> dict[str, Any]:
return asdict(self)
from __future__ import annotations
import json
from collections.abc import Iterable, Iterator
from typing import Any
def parse_sse_events(lines: Iterable[str | bytes]) -> Iterator[dict[str, Any]]:
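    """Parse a raw SSE line stream into {"event": ..., "data": ...} dicts.

    Handles `event:` and `data:` fields and skips ":" comment lines; multi-line
    data payloads are joined with newlines before JSON decoding is attempted.
    """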
event_name = "message"
data_lines: list[str] = []
for raw_line in lines:
line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
line = line.rstrip("\r\n")
if not line:
if data_lines:
yield _build_event(event_name, data_lines)
event_name = "message"
data_lines = []
continue
if line.startswith(":"):
continue
if line.startswith("event:"):
event_name = line.removeprefix("event:").strip()
continue
if line.startswith("data:"):
data_lines.append(line.removeprefix("data:").strip())
if data_lines:
yield _build_event(event_name, data_lines)
def _build_event(event_name: str, data_lines: list[str]) -> dict[str, Any]:
raw_data = "\n".join(data_lines)
parsed_data: Any = raw_data
if raw_data and raw_data != "[DONE]":
try:
parsed_data = json.loads(raw_data)
except json.JSONDecodeError:
parsed_data = raw_data
return {"event": event_name, "data": parsed_data}
def normalize_reference(reference: dict[str, Any]) -> dict[str, Any]:
return {
"id": reference.get("id"),
"content": reference.get("content") or "",
"knowledge_id": reference.get("knowledge_id"),
"chunk_index": reference.get("chunk_index"),
"score": reference.get("score"),
"knowledge_filename": reference.get("knowledge_filename")
or reference.get("knowledge_title"),
"match_type": reference.get("match_type"),
"chunk_type": reference.get("chunk_type"),
}
from __future__ import annotations
from typing import Any
from weknora_eval.loaders import read_jsonl, write_jsonl
from weknora_eval.schemas import TestsetRecord
def generate_rule_based_testset(
*,
documents_path: str = "data/parsed_docs/documents.jsonl",
output_path: str = "data/testsets/testset.raw.jsonl",
size: int = 50,
min_context_chars: int = 80,
) -> list[dict[str, Any]]:
documents = [
row
for row in read_jsonl(documents_path)
if len(row.get("content") or "") >= min_context_chars
]
rows: list[dict[str, Any]] = []
for index, document in enumerate(documents[:size], start=1):
context = document["content"]
source_file = document.get("source_file")
question = _default_question(document)
reference = _reference_from_context(context)
rows.append(
TestsetRecord(
sample_id=f"qa-{index:04d}",
user_input=question,
reference=reference,
reference_contexts=[context],
source_file=source_file,
question_type="single_hop",
review_status="pending",
).to_dict()
)
write_jsonl(output_path, rows)
return rows
def approve_pending_testset(
*,
input_path: str = "data/testsets/testset.raw.jsonl",
output_path: str = "data/testsets/testset.reviewed.jsonl",
) -> list[dict[str, Any]]:
rows = read_jsonl(input_path)
reviewed: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
if row.get("review_status") == "rejected":
continue
row["review_status"] = "approved"
reviewed.append(row)
write_jsonl(output_path, reviewed)
return reviewed
def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]:
errors: list[str] = []
for index, row in enumerate(read_jsonl(path), start=1):
prefix = f"{path}:{index}"
if row.get("review_status") != "approved":
errors.append(f"{prefix} review_status must be approved")
for key in ("sample_id", "user_input", "reference"):
if not row.get(key):
errors.append(f"{prefix} missing {key}")
if not row.get("reference_contexts"):
errors.append(f"{prefix} reference_contexts must be non-empty")
return errors
def _default_question(document: dict[str, Any]) -> str:
source = document.get("source_file") or "该文档"
if document.get("file_type") == "xlsx" and document.get("sheet"):
return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?"
if document.get("page"):
return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?"
return f"请根据 {source} 回答:该片段的主要内容是什么?"
def _reference_from_context(context: str, *, max_chars: int = 500) -> str:
text = " ".join(context.split())
if len(text) <= max_chars:
return text
return text[:max_chars].rstrip() + "..."