Initial WeKnora Ragas evaluation project
0 parents
Showing
34 changed files
with
1772 additions
and
0 deletions
.env.example
0 → 100644
| 1 | WEKNORA_BASE_URL=http://localhost:8080/api/v1 | ||
| 2 | WEKNORA_API_KEY= | ||
| 3 | WEKNORA_KB_ID= | ||
| 4 | WEKNORA_KB_NAME=ragas-eval-pilot | ||
| 5 | |||
| 6 | # Ragas generation and judge models. These are evaluation-side models, not the | ||
| 7 | # model configuration used by the WeKnora backend. | ||
| 8 | OPENAI_API_KEY=replace-me | ||
| 9 | OPENAI_BASE_URL=https://api.openai.com/v1 | ||
| 10 | |||
| 11 | # Optional split deployment. Use these when LLM and embedding are served by | ||
| 12 | # different OpenAI-compatible services, such as vLLM + Infinity. | ||
| 13 | RAGAS_LLM_API_KEY=replace-me | ||
| 14 | RAGAS_LLM_BASE_URL=http://localhost:8000/v1 | ||
| 15 | RAGAS_EMBEDDING_API_KEY=replace-me | ||
| 16 | RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1 | ||
| 17 | RAGAS_RERANKER_API_KEY=replace-me | ||
| 18 | RAGAS_RERANKER_BASE_URL=http://localhost:7998/v1 | ||
| 19 | RAGAS_RERANKER_MODEL=replace-me | ||
| 20 | |||
| 21 | RAGAS_GENERATOR_MODEL=gpt-4o-mini | ||
| 22 | RAGAS_JUDGE_MODEL=gpt-4o-mini | ||
| 23 | RAGAS_EMBEDDING_MODEL=text-embedding-3-small | ||
| 24 | |||
| 25 | TESTSET_SIZE=50 | ||
| 26 | REQUEST_INTERVAL_SECONDS=0.2 |
.gitignore
0 → 100644
| 1 | .env | ||
| 2 | .venv/ | ||
| 3 | __pycache__/ | ||
| 4 | *.py[cod] | ||
| 5 | *.egg-info/ | ||
| 6 | .pytest_cache/ | ||
| 7 | .ruff_cache/ | ||
| 8 | |||
| 9 | data/raw_docs/pdf/* | ||
| 10 | data/raw_docs/xlsx/* | ||
| 11 | data/parsed_docs/*.json | ||
| 12 | data/parsed_docs/*.jsonl | ||
| 13 | data/parsed_docs/mineru_raw/* | ||
| 14 | data/exported/*.json | ||
| 15 | data/exported/*.jsonl | ||
| 16 | data/testsets/*.jsonl | ||
| 17 | data/runs/*.jsonl | ||
| 18 | data/reports/*.csv | ||
| 19 | data/reports/*.md | ||
| 20 | |||
| 21 | !data/raw_docs/pdf/.gitkeep | ||
| 22 | !data/raw_docs/xlsx/.gitkeep | ||
| 23 | !data/parsed_docs/mineru_raw/.gitkeep |
RAGAS_EVALUATION_IMPLEMENTATION_CHECKLIST.md
0 → 100644
This diff is collapsed.
Click to expand it.
README.md
0 → 100644
| 1 | # WeKnora Ragas Eval | ||
| 2 | |||
| 3 | 独立的 WeKnora Ragas 评估项目。它只调用 WeKnora 公开 API,不依赖 WeKnora 内置的 `/evaluation` 接口。 | ||
| 4 | |||
| 5 | ## 安装 | ||
| 6 | |||
| 7 | ```bash | ||
| 8 | python -m venv .venv | ||
| 9 | source .venv/bin/activate | ||
| 10 | pip install -e . | ||
| 11 | ``` | ||
| 12 | |||
| 13 | 如果需要更好的 PDF 解析能力: | ||
| 14 | |||
| 15 | ```bash | ||
| 16 | pip install -e ".[pdf]" | ||
| 17 | ``` | ||
| 18 | |||
| 19 | 开发和测试工具: | ||
| 20 | |||
| 21 | ```bash | ||
| 22 | pip install -e ".[dev,pdf]" | ||
| 23 | ``` | ||
| 24 | |||
| 25 | ## 配置 | ||
| 26 | |||
| 27 | ```bash | ||
| 28 | cp .env.example .env | ||
| 29 | ``` | ||
| 30 | |||
| 31 | 编辑 `.env` 后确认: | ||
| 32 | |||
- `WEKNORA_BASE_URL` 指向 WeKnora API v1,例如 `http://localhost:8080/api/v1`(与 `.env.example` 的默认值保持一致)
| 34 | - `WEKNORA_API_KEY` 是 WeKnora API Key | ||
| 35 | - `WEKNORA_KB_ID` 是目标知识库 ID;如果还没有,先运行 `python scripts/00_create_kb.py` | ||
| 36 | - `WEKNORA_KB_NAME` 是创建知识库时使用的名称 | ||
| 37 | - `OPENAI_API_KEY`、`OPENAI_BASE_URL`、`RAGAS_*_MODEL` 是评估侧模型配置 | ||
| 38 | - 如果 LLM 和 embedding 分开部署,使用 `RAGAS_LLM_BASE_URL` 指向 vLLM 的 `/v1`,使用 `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity 的 `/v1` | ||
| 39 | |||
| 40 | ## 首轮 Pilot | ||
| 41 | |||
| 42 | 把原始文件放到: | ||
| 43 | |||
| 44 | - `data/raw_docs/pdf/` | ||
| 45 | - `data/raw_docs/xlsx/` | ||
| 46 | |||
| 47 | 按顺序执行: | ||
| 48 | |||
| 49 | ```bash | ||
| 50 | python scripts/00_create_kb.py | ||
| 51 | python scripts/01_upload_docs.py | ||
| 52 | python scripts/02_wait_ingestion.py | ||
| 53 | python scripts/03_export_chunks.py | ||
| 54 | python scripts/04_parse_docs.py | ||
| 55 | python scripts/05_generate_testset.py | ||
| 56 | python scripts/06_review_testset.py | ||
| 57 | python scripts/07_run_weknora_qa.py | ||
| 58 | python scripts/08_build_ragas_input.py | ||
| 59 | python scripts/09_run_ragas_eval.py | ||
| 60 | python scripts/10_report.py | ||
| 61 | ``` | ||
| 62 | |||
| 63 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 | ||
| 64 | |||
| 65 | ## 主要产物 | ||
| 66 | |||
| 67 | - `data/exported/knowledge.jsonl` | ||
| 68 | - `data/exported/chunks.jsonl` | ||
| 69 | - `data/parsed_docs/documents.jsonl` | ||
| 70 | - `data/parsed_docs/parse_summary.json` | ||
| 71 | - `data/testsets/testset.raw.jsonl` | ||
| 72 | - `data/testsets/testset.reviewed.jsonl` | ||
| 73 | - `data/runs/weknora_answers.jsonl` | ||
| 74 | - `data/runs/ragas_input.jsonl` | ||
| 75 | - `data/reports/ragas_scores.csv` | ||
| 76 | - `data/reports/summary.md` |
configs/eval.yaml
0 → 100644
| 1 | weknora: | ||
| 2 | base_url: "${WEKNORA_BASE_URL}" | ||
| 3 | api_key: "${WEKNORA_API_KEY}" | ||
| 4 | knowledge_base_id: "${WEKNORA_KB_ID}" | ||
| 5 | knowledge_base_name: "${WEKNORA_KB_NAME:-ragas-eval-pilot}" | ||
| 6 | knowledge_base_description: "Knowledge base for independent Ragas evaluation." | ||
| 7 | timeout_seconds: 300 | ||
| 8 | request_interval_seconds: "${REQUEST_INTERVAL_SECONDS:-0.2}" | ||
| 9 | |||
| 10 | testset: | ||
| 11 | size: "${TESTSET_SIZE:-50}" | ||
| 12 | include_pdf: true | ||
| 13 | include_xlsx: true | ||
| 14 | min_context_chars: 80 | ||
| 15 | require_manual_review: true | ||
| 16 | |||
| 17 | parsing: | ||
| 18 | provider: "local" | ||
| 19 | output_path: "data/parsed_docs/documents.jsonl" | ||
| 20 | failed_path: "data/parsed_docs/failed_parse.jsonl" | ||
| 21 | summary_path: "data/parsed_docs/parse_summary.json" | ||
| 22 | local: | ||
| 23 | pdf_backend: "pymupdf" | ||
| 24 | xlsx_mode: "row_text" | ||
| 25 | min_chars: 80 | ||
| 26 | mineru: | ||
| 27 | mode: "cli" | ||
| 28 | cli_bin: "mineru" | ||
| 29 | output_dir: "data/parsed_docs/mineru_raw" | ||
| 30 | http_base_url: "http://172.23.184.9:8002" | ||
| 31 | api_key: "mineru" | ||
| 32 | timeout_seconds: 600 | ||
| 33 | fallback_to_local: false | ||
| 34 | |||
| 35 | qa: | ||
| 36 | one_session_per_question: true | ||
| 37 | disable_title: true | ||
| 38 | enable_memory: false | ||
| 39 | channel: "api" | ||
| 40 | verify_with_messages: false | ||
| 41 | |||
| 42 | ragas: | ||
| 43 | provider: "openai-compatible" | ||
| 44 | # Backward-compatible defaults. If the split LLM/embedding values below are | ||
| 45 | # empty, these values are used for both clients. | ||
| 46 | api_key: "${OPENAI_API_KEY}" | ||
| 47 | base_url: "${OPENAI_BASE_URL}" | ||
| 48 | # vLLM OpenAI-compatible endpoint, for example http://localhost:8000/v1. | ||
| 49 | llm_api_key: "${RAGAS_LLM_API_KEY}" | ||
| 50 | llm_base_url: "${RAGAS_LLM_BASE_URL}" | ||
| 51 | # Infinity OpenAI-compatible embedding endpoint, for example | ||
| 52 | # http://localhost:7997/v1. | ||
| 53 | embedding_api_key: "${RAGAS_EMBEDDING_API_KEY}" | ||
| 54 | embedding_base_url: "${RAGAS_EMBEDDING_BASE_URL}" | ||
| 55 | # Reserved for future retrieval/rerank metrics. The current Ragas pipeline | ||
| 56 | # does not call reranker APIs. | ||
| 57 | reranker_api_key: "${RAGAS_RERANKER_API_KEY}" | ||
| 58 | reranker_base_url: "${RAGAS_RERANKER_BASE_URL}" | ||
| 59 | reranker_model: "${RAGAS_RERANKER_MODEL}" | ||
| 60 | generator_model: "${RAGAS_GENERATOR_MODEL}" | ||
| 61 | judge_model: "${RAGAS_JUDGE_MODEL}" | ||
| 62 | embedding_model: "${RAGAS_EMBEDDING_MODEL}" | ||
| 63 | temperature: 0 | ||
| 64 | max_tokens: 4096 | ||
| 65 | timeout_seconds: 600 | ||
| 66 | max_workers: 1 | ||
| 67 | metrics: | ||
| 68 | - faithfulness | ||
| 69 | - response_relevancy | ||
| 70 | - context_precision | ||
| 71 | - context_recall | ||
| 72 | - factual_correctness |
data/parsed_docs/mineru_raw/.gitkeep
0 → 100644
data/raw_docs/pdf/.gitkeep
0 → 100644
data/raw_docs/xlsx/.gitkeep
0 → 100644
pyproject.toml
0 → 100644
| 1 | [project] | ||
| 2 | name = "weknora-ragas-eval" | ||
| 3 | version = "0.1.0" | ||
| 4 | description = "Independent Ragas evaluation pipeline for WeKnora public APIs." | ||
| 5 | readme = "README.md" | ||
| 6 | requires-python = ">=3.10" | ||
| 7 | dependencies = [ | ||
| 8 | "ragas>=0.3,<0.5", | ||
| 9 | "datasets>=2.19.0", | ||
| 10 | "pandas>=2.2.0", | ||
| 11 | "openpyxl>=3.1.0", | ||
| 12 | "requests>=2.32.0", | ||
| 13 | "sseclient-py>=1.8.0", | ||
| 14 | "python-dotenv>=1.0.0", | ||
| 15 | "pyyaml>=6.0.0", | ||
| 16 | "langchain>=0.2.0", | ||
| 17 | "langchain-community>=0.2.0", | ||
| 18 | "langchain-openai>=0.1.0", | ||
| 19 | "pypdf>=4.2.0" | ||
| 20 | ] | ||
| 21 | |||
| 22 | [project.optional-dependencies] | ||
| 23 | pdf = [ | ||
| 24 | "pymupdf>=1.24.0", | ||
| 25 | "pdfplumber>=0.11.0" | ||
| 26 | ] | ||
| 27 | dev = [ | ||
| 28 | "ruff>=0.6.0", | ||
| 29 | "pytest>=8.0.0" | ||
| 30 | ] | ||
| 31 | |||
| 32 | [build-system] | ||
| 33 | requires = ["setuptools>=68"] | ||
| 34 | build-backend = "setuptools.build_meta" | ||
| 35 | |||
| 36 | [tool.setuptools.packages.find] | ||
| 37 | where = ["src"] | ||
| 38 | |||
| 39 | [tool.ruff] | ||
| 40 | line-length = 100 | ||
| 41 | target-version = "py310" | ||
| 42 | |||
| 43 | [tool.ruff.lint] | ||
| 44 | select = ["E", "F", "I", "UP", "B"] |
scripts/00_create_kb.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | from typing import Any | ||
| 5 | |||
| 6 | import _bootstrap # noqa: F401 | ||
| 7 | |||
| 8 | from weknora_eval.api import bootstrap_client_from_config | ||
| 9 | from weknora_eval.config import load_config, require_config | ||
| 10 | from weknora_eval.envfile import set_env_value | ||
| 11 | from weknora_eval.loaders import setup_logging, write_json | ||
| 12 | |||
| 13 | |||
def main() -> int:
    """Create the evaluation knowledge base unless WEKNORA_KB_ID is already set.

    Returns 0 on success (existing or newly created KB), 1 when the create
    response yields no usable ID.
    """
    setup_logging()
    config = load_config()
    client = bootstrap_client_from_config(config)
    weknora = config["weknora"]

    kb_id = str(weknora.get("knowledge_base_id") or "")
    kb_name = str(require_config(config, "weknora.knowledge_base_name"))
    if kb_id and kb_id != "replace-me":
        # Reuse the ID already present in the environment instead of creating
        # a duplicate knowledge base.
        write_json("data/exported/knowledge_base.json", {"id": kb_id, "name": kb_name, "source": "env"})
        print(f"WEKNORA_KB_ID already set: {kb_id}")
        return 0

    created = client.create_knowledge_base(name=kb_name)
    new_id = _extract_knowledge_base_id(created)
    if not new_id:
        print(f"Created knowledge base but could not extract id from response: {created}")
        return 1

    set_env_value(".env", "WEKNORA_KB_ID", new_id)
    write_json("data/exported/knowledge_base.json", {**created, "source": "create"})
    print(f"WEKNORA_KB_ID={new_id}")
    print("Wrote ID to .env and data/exported/knowledge_base.json")
    return 0
| 39 | |||
| 40 | |||
| 41 | def _extract_knowledge_base_id(payload: dict[str, Any]) -> str | None: | ||
| 42 | candidates = [payload] | ||
| 43 | for key in ("data", "knowledge_base"): | ||
| 44 | nested = payload.get(key) | ||
| 45 | if isinstance(nested, dict): | ||
| 46 | candidates.append(nested) | ||
| 47 | |||
| 48 | for row in candidates: | ||
| 49 | for key in ("id", "knowledge_base_id", "kb_id", "uuid"): | ||
| 50 | value = row.get(key) | ||
| 51 | if value: | ||
| 52 | return str(value) | ||
| 53 | return None | ||
| 54 | |||
| 55 | |||
| 56 | if __name__ == "__main__": | ||
| 57 | sys.exit(main()) |
scripts/01_upload_docs.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | from pathlib import Path | ||
| 5 | |||
| 6 | import _bootstrap # noqa: F401 | ||
| 7 | |||
| 8 | from weknora_eval.api import client_from_config | ||
| 9 | from weknora_eval.config import load_config | ||
| 10 | from weknora_eval.loaders import setup_logging, write_jsonl | ||
| 11 | |||
| 12 | |||
def main() -> int:
    """Upload every raw PDF/XLSX document to WeKnora and record upload metadata.

    Writes one JSONL row per upload to data/exported/knowledge_uploads.jsonl
    and always returns 0.
    """
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    pdf_files = sorted(Path("data/raw_docs/pdf").glob("*.pdf"))
    xlsx_files = sorted(Path("data/raw_docs/xlsx").glob("*.xlsx"))

    records = []
    for source in [*pdf_files, *xlsx_files]:
        payload = client.upload_file(source)
        records.append(
            {
                "knowledge_id": payload.get("id"),
                # Fall back through API fields to the local file name.
                "file_name": payload.get("file_name") or payload.get("title") or source.name,
                "file_type": payload.get("file_type") or source.suffix.lstrip("."),
                "parse_status": payload.get("parse_status"),
                "enable_status": payload.get("enable_status"),
                "raw": payload,
            }
        )
    write_jsonl("data/exported/knowledge_uploads.jsonl", records)
    print(f"Uploaded {len(records)} files")
    return 0


if __name__ == "__main__":
    sys.exit(main())
scripts/02_wait_ingestion.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.api import client_from_config | ||
| 8 | from weknora_eval.config import load_config | ||
| 9 | from weknora_eval.loaders import read_jsonl, setup_logging, write_jsonl | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Block until ingestion completes, then snapshot all knowledge records.

    Returns 0 only when nothing failed and nothing is still pending.
    """
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    uploads = read_jsonl("data/exported/knowledge_uploads.jsonl", missing_ok=True)
    # Track only the IDs we uploaded; None means "wait for everything".
    tracked = {row["knowledge_id"] for row in uploads if row.get("knowledge_id")}
    result = client.wait_ingestion_completed(knowledge_ids=tracked or None)
    write_jsonl("data/exported/knowledge.jsonl", client.list_knowledge())

    counts = {key: len(result[key]) for key in ("completed", "failed", "pending")}
    print(
        "Ingestion status: "
        f"completed={counts['completed']} failed={counts['failed']} "
        f"pending={counts['pending']}"
    )
    return 1 if result["failed"] or result["pending"] else 0


if __name__ == "__main__":
    sys.exit(main())
scripts/03_export_chunks.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.api import client_from_config | ||
| 8 | from weknora_eval.config import load_config | ||
| 9 | from weknora_eval.loaders import setup_logging, write_jsonl | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Export enabled chunks for all fully-ingested knowledge records.

    Writes data/exported/knowledge.jsonl (raw knowledge records) and
    data/exported/chunks.jsonl (flattened chunk rows). Always returns 0.
    """
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    knowledge_rows = client.list_knowledge()
    write_jsonl("data/exported/knowledge.jsonl", knowledge_rows)
    # Index for resolving a chunk's source file name from its knowledge record.
    knowledge_by_id = {row.get("id"): row for row in knowledge_rows}

    chunk_rows = []
    for knowledge in knowledge_rows:
        knowledge_id = knowledge.get("id")
        if not knowledge_id:
            continue
        # Only export documents that parsed successfully and are enabled.
        if knowledge.get("parse_status") != "completed" or knowledge.get("enable_status") != "enabled":
            continue
        for chunk in client.list_chunks(str(knowledge_id)):
            content = (chunk.get("content") or "").strip()
            if not content:
                continue
            # Skip chunks explicitly disabled; a missing flag counts as enabled.
            if chunk.get("is_enabled") is False:
                continue
            # Prefer the record matching the chunk's own knowledge_id; fall back
            # to the record currently being iterated.
            source = knowledge_by_id.get(chunk.get("knowledge_id")) or knowledge
            chunk_rows.append(
                {
                    "chunk_id": chunk.get("id"),
                    "knowledge_id": chunk.get("knowledge_id") or knowledge_id,
                    # Fall back to the configured KB when the chunk omits it.
                    "knowledge_base_id": chunk.get("knowledge_base_id")
                    or config["weknora"]["knowledge_base_id"],
                    "chunk_index": chunk.get("chunk_index"),
                    "content": content,
                    "source_file": source.get("file_name") or source.get("title"),
                    "chunk_type": chunk.get("chunk_type"),
                    "raw": chunk,
                }
            )
    write_jsonl("data/exported/chunks.jsonl", chunk_rows)
    print(f"Exported {len(chunk_rows)} chunks from {len(knowledge_rows)} knowledge records")
    return 0


if __name__ == "__main__":
    sys.exit(main())
scripts/04_parse_docs.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.parsers.local import parse_raw_docs | ||
| 10 | from weknora_eval.parsers.mineru import parse_with_mineru | ||
| 11 | |||
| 12 | |||
def main() -> int:
    """Parse raw documents with the configured provider (local or mineru).

    Returns 0 when at least one document parsed, 1 otherwise. Raises
    ValueError for an unknown provider.
    """
    setup_logging()
    config = load_config()
    provider = config.get("parsing", {}).get("provider", "local")
    parsers = {"local": parse_raw_docs, "mineru": parse_with_mineru}
    if provider not in parsers:
        raise ValueError(f"Unsupported parsing provider: {provider}")
    rows, summary = parsers[provider](config)
    print(f"Parsed {len(rows)} documents: {summary}")
    return 0 if rows else 1


if __name__ == "__main__":
    sys.exit(main())
scripts/05_generate_testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.testset import generate_rule_based_testset | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Generate rule-based QA candidates for manual review.

    Returns 0 when any candidates were produced, 1 otherwise.
    """
    setup_logging()
    testset_config = load_config().get("testset", {})
    candidates = generate_rule_based_testset(
        size=int(testset_config.get("size", 50)),
        min_context_chars=int(testset_config.get("min_context_chars", 80)),
    )
    print(f"Generated {len(candidates)} pending QA candidates at data/testsets/testset.raw.jsonl")
    return 0 if candidates else 1


if __name__ == "__main__":
    sys.exit(main())
scripts/06_review_testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.loaders import setup_logging | ||
| 8 | from weknora_eval.testset import approve_pending_testset, validate_reviewed_testset | ||
| 9 | |||
| 10 | |||
def main() -> int:
    """Approve the pending testset and validate the reviewed output.

    Prints every validation problem and returns 1 when validation fails or
    no records were approved; returns 0 otherwise.
    """
    setup_logging()
    approved = approve_pending_testset()
    problems = validate_reviewed_testset()
    if problems:
        for problem in problems:
            print(problem)
        return 1
    print(f"Wrote {len(approved)} approved QA records to data/testsets/testset.reviewed.jsonl")
    return 0 if approved else 1


if __name__ == "__main__":
    sys.exit(main())
scripts/07_run_weknora_qa.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.api import client_from_config | ||
| 8 | from weknora_eval.config import load_config | ||
| 9 | from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Ask WeKnora every approved question over SSE and persist the answers.

    Each question runs in its own fresh session. Empty responses and empty
    retrievals are logged to data/runs/failed_requests.jsonl; the exit code is
    non-zero only for hard errors and empty responses — an empty retrieval is
    tolerated so evaluation can still proceed.
    """
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    qa_config = config.get("qa", {})
    rows = [
        row
        for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
        if row.get("review_status") == "approved"
    ]
    answers = []

    for index, row in enumerate(rows, start=1):
        sample_id = row["sample_id"]
        try:
            session = client.create_session(title=f"ragas-eval-{sample_id}")
            session_id = session.get("id")
            if not session_id:
                raise RuntimeError(f"create_session returned no id for {sample_id}")
            result = client.knowledge_chat_sse(
                session_id=session_id,
                query=row["user_input"],
                disable_title=bool(qa_config.get("disable_title", True)),
                enable_memory=bool(qa_config.get("enable_memory", False)),
                channel=str(qa_config.get("channel", "api")),
            )
            answer = {
                "sample_id": sample_id,
                "user_input": row["user_input"],
                "session_id": session_id,
                "request_id": result.get("request_id"),
                "response": result.get("response") or "",
                "retrieved_contexts": result.get("retrieved_contexts") or [],
                "weknora_references": result.get("weknora_references") or [],
                "error": None,
            }
            if not answer["response"]:
                answer["error"] = "empty_response"
                append_jsonl("data/runs/failed_requests.jsonl", answer)
            elif not answer["retrieved_contexts"]:
                # Record the error on the stored answer as well, so that
                # weknora_answers.jsonl agrees with failed_requests.jsonl
                # (previously only the log copy carried the error). The exit
                # filter below still excludes empty_retrieval.
                answer["error"] = "empty_retrieval"
                append_jsonl("data/runs/failed_requests.jsonl", answer)
            answers.append(answer)
            print(f"[{index}/{len(rows)}] {sample_id} response_chars={len(answer['response'])}")
        except Exception as exc:  # noqa: BLE001
            failed = {
                "sample_id": sample_id,
                "user_input": row.get("user_input"),
                "response": "",
                "retrieved_contexts": [],
                "weknora_references": [],
                "session_id": None,
                "request_id": None,
                "error": str(exc),
            }
            answers.append(failed)
            append_jsonl("data/runs/failed_requests.jsonl", failed)
            print(f"[{index}/{len(rows)}] {sample_id} failed: {exc}")

    write_jsonl("data/runs/weknora_answers.jsonl", answers)
    # Empty retrieval is tolerated; empty responses and exceptions are not.
    failures = [row for row in answers if row.get("error") and row.get("error") != "empty_retrieval"]
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())
scripts/08_build_ragas_input.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl | ||
| 8 | |||
| 9 | |||
def main() -> int:
    """Join the reviewed testset with WeKnora answers into Ragas input rows.

    A sample is dropped (and logged to data/runs/failed_requests.jsonl) when
    its answer is missing or any field Ragas requires is empty. Returns 0 only
    when at least one row was built.
    """
    setup_logging()
    # Only approved QA pairs participate in the evaluation.
    testset = {
        row["sample_id"]: row
        for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
        if row.get("review_status") == "approved"
    }
    answers = {row["sample_id"]: row for row in read_jsonl("data/runs/weknora_answers.jsonl")}
    ragas_rows = []
    for sample_id, qa in testset.items():
        answer = answers.get(sample_id)
        if not answer:
            append_jsonl("data/runs/failed_requests.jsonl", {"sample_id": sample_id, "error": "missing_answer"})
            continue
        row = {
            "sample_id": sample_id,
            "user_input": qa["user_input"],
            "response": answer.get("response") or "",
            "retrieved_contexts": answer.get("retrieved_contexts") or [],
            "reference": qa["reference"],
            "reference_contexts": qa.get("reference_contexts") or [],
            "session_id": answer.get("session_id"),
            "request_id": answer.get("request_id"),
            "weknora_references": answer.get("weknora_references") or [],
            "source_file": qa.get("source_file"),
            "gold_chunk_ids": qa.get("gold_chunk_ids") or [],
        }
        # Every field Ragas consumes must be non-empty; log and skip otherwise.
        missing = [
            key
            for key in ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")
            if not row.get(key)
        ]
        if missing:
            append_jsonl(
                "data/runs/failed_requests.jsonl",
                {"sample_id": sample_id, "error": f"missing_ragas_fields:{','.join(missing)}"},
            )
            continue
        ragas_rows.append(row)

    write_jsonl("data/runs/ragas_input.jsonl", ragas_rows)
    print(f"Built {len(ragas_rows)} Ragas input rows")
    return 0 if ragas_rows else 1


if __name__ == "__main__":
    sys.exit(main())
scripts/09_run_ragas_eval.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.ragas_runner import run_ragas_eval | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Run the Ragas evaluation and write per-sample scores. Always returns 0."""
    setup_logging()
    scores = run_ragas_eval(load_config())
    print(f"Wrote {len(scores)} Ragas score rows to data/reports/ragas_scores.csv")
    return 0


if __name__ == "__main__":
    sys.exit(main())
scripts/10_report.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.report import generate_summary_report | ||
| 10 | |||
| 11 | |||
def main() -> int:
    """Render the summary markdown report from the run artifacts. Always returns 0."""
    setup_logging()
    generate_summary_report(load_config())
    print("Wrote report to data/reports/summary.md")
    return 0


if __name__ == "__main__":
    sys.exit(main())
scripts/_bootstrap.py
0 → 100644
src/weknora_eval/__init__.py
0 → 100644
src/weknora_eval/api.py
0 → 100644
This diff is collapsed.
Click to expand it.
src/weknora_eval/config.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import os | ||
| 4 | import re | ||
| 5 | from pathlib import Path | ||
| 6 | from typing import Any | ||
| 7 | |||
| 8 | import yaml | ||
| 9 | from dotenv import load_dotenv | ||
| 10 | |||
| 11 | |||
| 12 | _ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}") | ||
| 13 | |||
| 14 | |||
| 15 | def _expand_env(value: Any) -> Any: | ||
| 16 | if isinstance(value, dict): | ||
| 17 | return {key: _expand_env(item) for key, item in value.items()} | ||
| 18 | if isinstance(value, list): | ||
| 19 | return [_expand_env(item) for item in value] | ||
| 20 | if not isinstance(value, str): | ||
| 21 | return value | ||
| 22 | |||
| 23 | def replace(match: re.Match[str]) -> str: | ||
| 24 | default = match.group(2) if match.group(2) is not None else "" | ||
| 25 | return os.getenv(match.group(1), default) | ||
| 26 | |||
| 27 | expanded = _ENV_PATTERN.sub(replace, value) | ||
| 28 | return _coerce_scalar(expanded) | ||
| 29 | |||
| 30 | |||
| 31 | def _coerce_scalar(value: str) -> Any: | ||
| 32 | lowered = value.lower() | ||
| 33 | if lowered in {"true", "false"}: | ||
| 34 | return lowered == "true" | ||
| 35 | if lowered in {"none", "null"}: | ||
| 36 | return None | ||
| 37 | try: | ||
| 38 | if "." not in value: | ||
| 39 | return int(value) | ||
| 40 | return float(value) | ||
| 41 | except ValueError: | ||
| 42 | return value | ||
| 43 | |||
| 44 | |||
def load_config(path: str | Path = "configs/eval.yaml") -> dict[str, Any]:
    """Load .env, read the YAML config file, and expand ${VAR} placeholders."""
    load_dotenv()
    raw = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    return _expand_env(raw)
| 51 | |||
| 52 | |||
def require_config(config: dict[str, Any], dotted_key: str) -> Any:
    """Resolve a dotted key and fail loudly when any level is absent or blank.

    Raises ValueError when a segment is missing, the parent is not a dict,
    or the resolved value is None or "".
    """
    node: Any = config
    for segment in dotted_key.split("."):
        if not isinstance(node, dict) or segment not in node:
            raise ValueError(f"Missing required config value: {dotted_key}")
        node = node[segment]
        if node is None or node == "":
            raise ValueError(f"Missing required config value: {dotted_key}")
    return node
| 63 | |||
| 64 | |||
def project_path(*parts: str) -> Path:
    """Build a path under the current working directory from *parts*."""
    result = Path.cwd()
    for part in parts:
        result = result / part
    return result
src/weknora_eval/envfile.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from pathlib import Path | ||
| 4 | |||
| 5 | |||
| 6 | def set_env_value(path: str | Path, key: str, value: str) -> None: | ||
| 7 | target = Path(path) | ||
| 8 | lines = target.read_text(encoding="utf-8").splitlines() if target.exists() else [] | ||
| 9 | prefix = f"{key}=" | ||
| 10 | replacement = f"{key}={value}" | ||
| 11 | updated = False | ||
| 12 | output: list[str] = [] | ||
| 13 | |||
| 14 | for line in lines: | ||
| 15 | if line.startswith(prefix): | ||
| 16 | output.append(replacement) | ||
| 17 | updated = True | ||
| 18 | else: | ||
| 19 | output.append(line) | ||
| 20 | |||
| 21 | if not updated: | ||
| 22 | output.append(replacement) | ||
| 23 | |||
| 24 | target.write_text("\n".join(output) + "\n", encoding="utf-8") |
src/weknora_eval/loaders.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import json | ||
| 4 | import logging | ||
| 5 | from collections.abc import Iterable | ||
| 6 | from pathlib import Path | ||
| 7 | from typing import Any | ||
| 8 | |||
| 9 | |||
def setup_logging(level: int = logging.INFO) -> None:
    """Configure root logging with a timestamped single-line format."""
    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s: %(message)s", level=level)
| 15 | |||
| 16 | |||
| 17 | def ensure_parent(path: str | Path) -> Path: | ||
| 18 | target = Path(path) | ||
| 19 | target.parent.mkdir(parents=True, exist_ok=True) | ||
| 20 | return target | ||
| 21 | |||
| 22 | |||
| 23 | def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]: | ||
| 24 | target = Path(path) | ||
| 25 | if not target.exists(): | ||
| 26 | if missing_ok: | ||
| 27 | return [] | ||
| 28 | raise FileNotFoundError(target) | ||
| 29 | |||
| 30 | rows: list[dict[str, Any]] = [] | ||
| 31 | with target.open("r", encoding="utf-8") as file: | ||
| 32 | for line_no, line in enumerate(file, start=1): | ||
| 33 | stripped = line.strip() | ||
| 34 | if not stripped: | ||
| 35 | continue | ||
| 36 | try: | ||
| 37 | rows.append(json.loads(stripped)) | ||
| 38 | except json.JSONDecodeError as exc: | ||
| 39 | raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc | ||
| 40 | return rows | ||
| 41 | |||
| 42 | |||
| 43 | def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]: | ||
| 44 | target = Path(path) | ||
| 45 | if not target.exists(): | ||
| 46 | if missing_ok: | ||
| 47 | return | ||
| 48 | raise FileNotFoundError(target) | ||
| 49 | |||
| 50 | with target.open("r", encoding="utf-8") as file: | ||
| 51 | for line_no, line in enumerate(file, start=1): | ||
| 52 | stripped = line.strip() | ||
| 53 | if not stripped: | ||
| 54 | continue | ||
| 55 | try: | ||
| 56 | yield json.loads(stripped) | ||
| 57 | except json.JSONDecodeError as exc: | ||
| 58 | raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc | ||
| 59 | |||
| 60 | |||
def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int:
    """Overwrite *path* with one JSON object per line; return the row count."""
    destination = ensure_parent(path)
    written = 0
    with destination.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")
            written += 1
    return written
| 69 | |||
| 70 | |||
def append_jsonl(path: str | Path, row: dict[str, Any]) -> None:
    """Append a single JSON object as one line to *path* (parents created)."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(row, ensure_ascii=False) + "\n")
| 75 | |||
| 76 | |||
def write_json(path: str | Path, payload: dict[str, Any]) -> None:
    """Write *payload* as pretty-printed JSON with a trailing newline."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(rendered + "\n")
| 82 | |||
| 83 | |||
def compact_text(value: Any) -> str:
    """Collapse *value* into stripped, non-empty lines joined by newlines."""
    if value is None:
        return ""
    stripped = [segment.strip() for segment in str(value).splitlines()]
    return "\n".join(segment for segment in stripped if segment).strip()
src/weknora_eval/parsers/__init__.py
0 → 100644
| 1 | """Document parser adapters.""" |
src/weknora_eval/parsers/local.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import statistics | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | from openpyxl import load_workbook | ||
| 8 | |||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | ||
| 10 | from weknora_eval.schemas import ParsedDocument | ||
| 11 | |||
| 12 | |||
def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Parse every raw PDF/XLSX with the local parsers and persist results.

    Reads ``data/raw_docs/pdf/*.pdf`` and ``data/raw_docs/xlsx/*.xlsx``,
    writes the parsed rows, per-file failure records and a summary JSON to
    the paths configured under ``parsing``, and returns ``(rows, summary)``.
    Per-file parser exceptions are recorded, never raised.
    """
    parsing = config["parsing"]
    local_config = parsing.get("local", {})
    min_chars = int(local_config.get("min_chars", 80))
    pdf_backend = local_config.get("pdf_backend", "pypdf")
    xlsx_mode = local_config.get("xlsx_mode", "row_text")

    docs: list[ParsedDocument] = []
    failures: list[dict[str, Any]] = []

    for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
        try:
            docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
        except Exception as exc:  # noqa: BLE001 - parser failures must be persisted.
            failures.append(_failure_row(pdf_path.name, f"local:{pdf_backend}", exc))

    for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
        try:
            docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
        except Exception as exc:  # noqa: BLE001
            failures.append(_failure_row(xlsx_path.name, "local:openpyxl", exc))

    rows = [doc.to_dict() for doc in docs]
    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
    # Always (re)write the failure file: writing it only when `failures` is
    # non-empty would leave stale failures from a previous run on disk.
    write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)

    summary = build_parse_summary(rows, failures, parser=f"local:{pdf_backend}")
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return rows, summary


def _failure_row(source_file: str, parser: str, exc: Exception) -> dict[str, Any]:
    """Build the persisted failure record for one file that failed to parse."""
    return {
        "source_file": source_file,
        "parser": parser,
        "status": "failed",
        "error": str(exc),
        "fallback_used": None,
    }
| 59 | |||
| 60 | |||
| 61 | def parse_pdf(path: str | Path, *, backend: str = "pypdf", min_chars: int = 80) -> list[ParsedDocument]: | ||
| 62 | target = Path(path) | ||
| 63 | backend = backend.lower() | ||
| 64 | if backend == "pymupdf": | ||
| 65 | return _parse_pdf_pymupdf(target, min_chars=min_chars) | ||
| 66 | if backend == "pdfplumber": | ||
| 67 | return _parse_pdf_pdfplumber(target, min_chars=min_chars) | ||
| 68 | if backend == "pypdf": | ||
| 69 | return _parse_pdf_pypdf(target, min_chars=min_chars) | ||
| 70 | raise ValueError(f"Unsupported PDF backend: {backend}") | ||
| 71 | |||
| 72 | |||
def _parse_pdf_pypdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
    """Extract per-page text with pypdf, dropping pages below *min_chars*."""
    from pypdf import PdfReader

    results: list[ParsedDocument] = []
    for number, page in enumerate(PdfReader(str(path)).pages, start=1):
        text = compact_text(page.extract_text() or "")
        if len(text) >= min_chars:
            results.append(_pdf_doc(path, number, text, "local:pypdf"))
    return results
| 84 | |||
| 85 | |||
def _parse_pdf_pymupdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
    """Extract per-page text with PyMuPDF (``fitz``), an optional extra."""
    try:
        import fitz
    except ImportError as exc:
        raise ImportError("pymupdf backend requires `pip install -e '.[pdf]'`") from exc

    results: list[ParsedDocument] = []
    with fitz.open(path) as pdf_document:
        for number, page in enumerate(pdf_document, start=1):
            text = compact_text(page.get_text("text"))
            if len(text) >= min_chars:
                results.append(_pdf_doc(path, number, text, "local:pymupdf"))
    return results
| 100 | |||
| 101 | |||
def _parse_pdf_pdfplumber(path: Path, *, min_chars: int) -> list[ParsedDocument]:
    """Extract per-page text with pdfplumber, an optional extra."""
    try:
        import pdfplumber
    except ImportError as exc:
        raise ImportError("pdfplumber backend requires `pip install -e '.[pdf]'`") from exc

    results: list[ParsedDocument] = []
    with pdfplumber.open(path) as pdf_document:
        for number, page in enumerate(pdf_document.pages, start=1):
            text = compact_text(page.extract_text() or "")
            if len(text) >= min_chars:
                results.append(_pdf_doc(path, number, text, "local:pdfplumber"))
    return results
| 116 | |||
| 117 | |||
def _pdf_doc(path: Path, page: int, content: str, parser: str) -> ParsedDocument:
    """Wrap one extracted PDF page into a ParsedDocument record."""
    doc_id = f"{path.name}::page-{page}"
    return ParsedDocument(
        doc_id=doc_id,
        source_file=path.name,
        file_type="pdf",
        page=page,
        content=content,
        metadata={"parser": parser},
    )
| 127 | |||
| 128 | |||
| 129 | def parse_xlsx(path: str | Path, *, mode: str = "row_text", min_chars: int = 80) -> list[ParsedDocument]: | ||
| 130 | target = Path(path) | ||
| 131 | mode = mode.lower() | ||
| 132 | workbook = load_workbook(target, data_only=True, read_only=True) | ||
| 133 | if mode == "row_text": | ||
| 134 | return _parse_xlsx_row_text(target, workbook, min_chars=min_chars) | ||
| 135 | if mode == "markdown_table": | ||
| 136 | return _parse_xlsx_markdown_table(target, workbook, min_chars=min_chars) | ||
| 137 | raise ValueError(f"Unsupported XLSX mode: {mode}") | ||
| 138 | |||
| 139 | |||
def _parse_xlsx_row_text(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
    """Turn each data row of every sheet into one "header: value" document.

    The first row of each sheet is treated as the header row; empty header
    cells fall back to positional names (``col_1``, ``col_2``, ...). Rows
    whose rendered text is shorter than *min_chars* are dropped.
    """
    docs: list[ParsedDocument] = []
    for sheet in workbook.worksheets:
        rows = list(sheet.iter_rows(values_only=True))
        if not rows:
            continue
        headers = [_cell_to_text(value) or f"col_{index}" for index, value in enumerate(rows[0], start=1)]
        # row_index starts at 2 so it matches the spreadsheet's 1-based
        # numbering (row 1 is the header).
        for row_index, row in enumerate(rows[1:], start=2):
            pairs = []
            # strict=False: data rows may be shorter or longer than the header.
            for header, value in zip(headers, row, strict=False):
                cell = _cell_to_text(value)
                if cell:
                    pairs.append(f"{header}: {cell}")
            content = "\n".join(pairs).strip()
            if len(content) < min_chars:
                continue
            docs.append(
                ParsedDocument(
                    doc_id=f"{path.name}::{sheet.title}::row-{row_index}",
                    source_file=path.name,
                    file_type="xlsx",
                    sheet=sheet.title,
                    row_index=row_index,
                    content=content,
                    metadata={"parser": "local:openpyxl", "columns": headers},
                )
            )
    return docs
| 168 | |||
| 169 | |||
def _parse_xlsx_markdown_table(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
    """Render each non-empty sheet as a single Markdown-table document.

    Fully empty rows are dropped, remaining rows are padded to a uniform
    column count, the first surviving row becomes the table header, and
    sheets whose rendered table is shorter than *min_chars* are skipped.
    """
    docs: list[ParsedDocument] = []
    for sheet in workbook.worksheets:
        rows = [
            [_cell_to_text(value) for value in row]
            for row in sheet.iter_rows(values_only=True)
            if any(value is not None for value in row)
        ]
        if not rows:
            continue
        # Pad ragged rows so every table line has the same column count.
        width = max(len(row) for row in rows)
        normalized = [row + [""] * (width - len(row)) for row in rows]
        header = normalized[0]
        separator = ["---"] * width
        body = normalized[1:]
        lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(separator) + " |",
        ]
        lines.extend("| " + " | ".join(row) + " |" for row in body)
        content = "\n".join(lines)
        if len(content) < min_chars:
            continue
        docs.append(
            ParsedDocument(
                doc_id=f"{path.name}::{sheet.title}",
                source_file=path.name,
                file_type="xlsx",
                sheet=sheet.title,
                content=content,
                metadata={"parser": "local:openpyxl", "mode": "markdown_table"},
            )
        )
    return docs
| 204 | |||
| 205 | |||
| 206 | def _cell_to_text(value: Any) -> str: | ||
| 207 | if value is None: | ||
| 208 | return "" | ||
| 209 | text = str(value).strip() | ||
| 210 | return text.replace("\n", " ") | ||
| 211 | |||
| 212 | |||
def build_parse_summary(
    rows: list[dict[str, Any]],
    failures: list[dict[str, Any]],
    *,
    parser: str,
) -> dict[str, Any]:
    """Aggregate parse results into the summary persisted next to the docs."""
    parsed_files = {row.get("source_file") for row in rows if row.get("source_file")}
    broken_files = {row.get("source_file") for row in failures if row.get("source_file")}
    lengths = [len(row.get("content") or "") for row in rows]
    average = round(statistics.mean(lengths), 2) if lengths else 0
    return {
        "total_files": len(parsed_files | broken_files),
        "parsed_files": len(parsed_files),
        "failed_files": len(broken_files),
        "total_documents": len(rows),
        "empty_documents": sum(1 for length in lengths if length == 0),
        "avg_chars": average,
        "parser": parser,
    }
src/weknora_eval/parsers/mineru.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import subprocess | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import requests | ||
| 8 | |||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | ||
| 10 | from weknora_eval.parsers.local import build_parse_summary, parse_pdf | ||
| 11 | from weknora_eval.schemas import ParsedDocument | ||
| 12 | |||
| 13 | |||
class MinerUParseError(RuntimeError):
    """Raised when MinerU (CLI or HTTP) fails to produce usable output."""
| 16 | |||
| 17 | |||
def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Parse raw PDFs through MinerU (CLI or HTTP) with optional local fallback.

    For each PDF under ``data/raw_docs/pdf`` the configured MinerU mode is
    tried first; on any failure a record is persisted to the failure file
    and, when ``parsing.mineru.fallback_to_local`` is true (the default),
    the local PDF backend is attempted instead. Writes the parsed rows,
    failures and summary to the configured paths and returns
    ``(rows, summary)``.
    """
    parsing = config["parsing"]
    mineru = parsing.get("mineru", {})
    mode = mineru.get("mode", "cli")
    fallback = bool(mineru.get("fallback_to_local", True))
    local_config = parsing.get("local", {})
    min_chars = int(local_config.get("min_chars", 80))

    docs: list[ParsedDocument] = []
    failures: list[dict[str, Any]] = []

    for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
        parser_name = f"mineru:{mode}"
        try:
            if mode == "cli":
                docs.extend(parse_pdf_with_cli(pdf_path, mineru, min_chars=min_chars))
            elif mode == "http":
                docs.extend(parse_pdf_with_http(pdf_path, mineru, min_chars=min_chars))
            else:
                raise MinerUParseError(f"Unsupported MinerU mode: {mode}")
        except Exception as exc:  # noqa: BLE001
            failure = {
                "source_file": pdf_path.name,
                "parser": parser_name,
                "status": "failed",
                "error": str(exc),
                "fallback_used": None,
            }
            if fallback:
                # The MinerU failure is recorded even when the local fallback
                # succeeds; `fallback_used` marks which backend actually ran.
                try:
                    backend = local_config.get("pdf_backend", "pypdf")
                    local_docs = parse_pdf(pdf_path, backend=backend, min_chars=min_chars)
                    docs.extend(local_docs)
                    failure["fallback_used"] = f"local:{backend}"
                except Exception as fallback_exc:  # noqa: BLE001
                    failure["fallback_error"] = str(fallback_exc)
            failures.append(failure)

    rows = [doc.to_dict() for doc in docs]
    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
    if failures:
        write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)

    summary = build_parse_summary(rows, failures, parser=f"mineru:{mode}")
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return rows, summary
| 64 | |||
| 65 | |||
def parse_pdf_with_cli(
    pdf_path: str | Path,
    mineru_config: dict[str, Any],
    *,
    min_chars: int,
) -> list[ParsedDocument]:
    """Run the MinerU CLI on one PDF and collect its Markdown output.

    Each ``*.md`` file found under the per-PDF output directory becomes one
    ParsedDocument (fragments shorter than *min_chars* are dropped). Raises
    MinerUParseError when the CLI exits non-zero or emits no Markdown;
    ``subprocess.TimeoutExpired`` propagates if the run exceeds
    ``timeout_seconds``.
    """
    target = Path(pdf_path)
    output_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw"))
    output_dir = output_root / target.stem
    output_dir.mkdir(parents=True, exist_ok=True)
    cli_bin = mineru_config.get("cli_bin", "mineru")
    timeout = int(mineru_config.get("timeout_seconds", 600))

    # MinerU CLI arguments vary by release. This common invocation is isolated
    # here so deployments can replace it without touching pipeline scripts.
    result = subprocess.run(
        [cli_bin, "-p", str(target), "-o", str(output_dir)],
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if result.returncode != 0:
        # Prefer stderr, fall back to stdout, then a generic message.
        raise MinerUParseError(result.stderr.strip() or result.stdout.strip() or "MinerU CLI failed")

    markdown_files = sorted(output_dir.rglob("*.md"))
    if not markdown_files:
        raise MinerUParseError(f"No Markdown output found in {output_dir}")

    docs: list[ParsedDocument] = []
    for index, markdown_path in enumerate(markdown_files, start=1):
        content = compact_text(markdown_path.read_text(encoding="utf-8"))
        if len(content) < min_chars:
            continue
        docs.append(
            ParsedDocument(
                doc_id=f"{target.name}::mineru-{index}",
                source_file=target.name,
                file_type="pdf",
                content=content,
                metadata={
                    "parser": "mineru:cli",
                    # Kept so a reviewer can trace a doc back to the raw file.
                    "mineru_output": str(markdown_path),
                },
            )
        )
    return docs
| 113 | |||
| 114 | |||
def parse_pdf_with_http(
    pdf_path: str | Path,
    mineru_config: dict[str, Any],
    *,
    min_chars: int,
) -> list[ParsedDocument]:
    """Parse one PDF through a MinerU-style HTTP service.

    Uploads the file as multipart form data to ``{http_base_url}/parse``
    with optional bearer auth and converts the response into documents.
    Raises MinerUParseError on missing configuration, HTTP error status,
    or an unrecognized response shape.
    """
    target = Path(pdf_path)
    base_url = str(mineru_config.get("http_base_url") or "").rstrip("/")
    if not base_url:
        raise MinerUParseError("MinerU HTTP mode requires parsing.mineru.http_base_url")

    headers = {}
    if mineru_config.get("api_key"):
        headers["Authorization"] = f"Bearer {mineru_config['api_key']}"

    # The checklist does not define a universal MinerU HTTP contract. This
    # implementation expects a replaceable service exposing POST /parse and
    # returning {"markdown": "..."} or {"documents": [{"content": "..."}]}.
    with target.open("rb") as file:
        response = requests.post(
            f"{base_url}/parse",
            files={"file": (target.name, file, "application/pdf")},
            headers=headers,
            timeout=int(mineru_config.get("timeout_seconds", 600)),
        )
    if response.status_code >= 400:
        raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")

    payload = response.json()
    contents: list[str] = []
    # `documents` (a list of {"content": ...}) takes priority over `markdown`.
    if isinstance(payload.get("documents"), list):
        contents = [compact_text(item.get("content")) for item in payload["documents"]]
    elif payload.get("markdown"):
        contents = [compact_text(payload["markdown"])]
    else:
        raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`")

    docs: list[ParsedDocument] = []
    for index, content in enumerate(contents, start=1):
        # Short fragments are dropped, mirroring the local parsers.
        if len(content) < min_chars:
            continue
        docs.append(
            ParsedDocument(
                doc_id=f"{target.name}::mineru-http-{index}",
                source_file=target.name,
                file_type="pdf",
                content=content,
                metadata={"parser": "mineru:http"},
            )
        )
    return docs
src/weknora_eval/ragas_runner.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import os | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import pandas as pd | ||
| 8 | |||
| 9 | from weknora_eval.config import require_config | ||
| 10 | from weknora_eval.loaders import read_jsonl | ||
| 11 | |||
| 12 | |||
def run_ragas_eval(
    config: dict[str, Any],
    *,
    input_path: str = "data/runs/ragas_input.jsonl",
    output_csv_path: str = "data/reports/ragas_scores.csv",
) -> pd.DataFrame:
    """Score the prepared samples with Ragas and write a per-sample CSV.

    Reads the JSONL rows at *input_path*, evaluates the configured metrics
    with an OpenAI-compatible judge LLM and embedding model, attaches each
    row's ``sample_id`` to the score table and writes it to
    *output_csv_path*. Returns the scores DataFrame.
    """
    # Heavy third-party imports stay local so the module can be imported
    # without the evaluation dependencies installed.
    from datasets import Dataset
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    from ragas import evaluate
    from ragas.run_config import RunConfig

    ragas_config = config["ragas"]
    # Split-deployment support: dedicated llm_*/embedding_* settings take
    # priority over the shared api_key/base_url fallbacks.
    llm_api_key = _first_non_empty(ragas_config, "llm_api_key", "api_key")
    llm_base_url = _first_non_empty(ragas_config, "llm_base_url", "base_url")
    embedding_api_key = _first_non_empty(ragas_config, "embedding_api_key", "api_key")
    embedding_base_url = _first_non_empty(ragas_config, "embedding_base_url", "base_url")
    judge_model = str(require_config(config, "ragas.judge_model"))
    embedding_model = str(require_config(config, "ragas.embedding_model"))
    temperature = float(ragas_config.get("temperature", 0))
    max_tokens = int(ragas_config.get("max_tokens", 4096))
    timeout_seconds = int(ragas_config.get("timeout_seconds", 600))
    max_workers = int(ragas_config.get("max_workers", 1))

    # Exported for any component that reads OpenAI credentials from the
    # environment rather than from constructor arguments.
    os.environ["OPENAI_API_KEY"] = llm_api_key
    if llm_base_url:
        os.environ["OPENAI_BASE_URL"] = llm_base_url

    rows = read_jsonl(input_path)
    dataset = Dataset.from_list(
        [
            {
                "user_input": row["user_input"],
                "response": row["response"],
                "retrieved_contexts": row["retrieved_contexts"],
                "reference": row["reference"],
                "reference_contexts": row.get("reference_contexts") or [],
            }
            for row in rows
        ]
    )

    # Metric names not present in the map are silently dropped.
    metric_map = _metric_map()
    selected_metrics = [
        metric_map[name]
        for name in ragas_config.get("metrics", metric_map.keys())
        if name in metric_map
    ]

    llm = ChatOpenAI(
        model=judge_model,
        api_key=llm_api_key,
        base_url=llm_base_url or None,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    embeddings = OpenAIEmbeddings(
        model=embedding_model,
        api_key=embedding_api_key,
        base_url=embedding_base_url or None,
        # Disabled so OpenAI-compatible third-party embedding services work
        # without tiktoken-based context-length checks.
        tiktoken_enabled=False,
        check_embedding_ctx_length=False,
    )
    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)

    run_config = RunConfig(timeout=timeout_seconds, max_workers=max_workers)
    result = evaluate(
        dataset,
        metrics=selected_metrics,
        llm=ragas_llm,
        embeddings=ragas_embeddings,
        run_config=run_config,
    )
    scores = result.to_pandas()
    # NOTE(review): scores are mapped back to sample ids positionally,
    # assuming evaluate() preserves dataset order — confirm for the pinned
    # Ragas version.
    for index, row in enumerate(rows):
        scores.loc[index, "sample_id"] = row.get("sample_id")

    target = Path(output_csv_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    scores.to_csv(target, index=False)
    return scores
| 93 | |||
| 94 | |||
def _metric_map() -> dict[str, Any]:
    """Return name -> Ragas metric, bridging both Ragas metric APIs.

    One Ragas line exposes ready-made metric instances at module level;
    another exposes metric classes instead. Try the instance imports first
    and fall back to instantiating the class-based equivalents.
    """
    try:
        from ragas.metrics import (
            context_precision,
            context_recall,
            faithfulness,
            factual_correctness,
            response_relevancy,
        )

        return {
            "faithfulness": faithfulness,
            "response_relevancy": response_relevancy,
            "context_precision": context_precision,
            "context_recall": context_recall,
            "factual_correctness": factual_correctness,
        }
    except ImportError:
        from ragas.metrics import (
            Faithfulness,
            FactualCorrectness,
            LLMContextPrecisionWithReference,
            LLMContextRecall,
            ResponseRelevancy,
        )

        return {
            "faithfulness": Faithfulness(),
            "response_relevancy": ResponseRelevancy(),
            "context_precision": LLMContextPrecisionWithReference(),
            "context_recall": LLMContextRecall(),
            "factual_correctness": FactualCorrectness(),
        }
| 128 | |||
| 129 | |||
| 130 | def _first_non_empty(config: dict[str, Any], *keys: str) -> str: | ||
| 131 | for key in keys: | ||
| 132 | value = config.get(key) | ||
| 133 | if value not in {None, ""}: | ||
| 134 | return str(value) | ||
| 135 | raise ValueError(f"Missing required Ragas config value. Checked: {', '.join(keys)}") | ||
| 136 | |||
| 137 | |||
def _wrap_langchain_models(llm: Any, embeddings: Any) -> tuple[Any, Any]:
    """Wrap LangChain models in Ragas adapter classes when available.

    When the wrapper classes cannot be imported the inputs are returned
    unchanged — presumably those Ragas versions accept the raw LangChain
    objects directly; confirm against the pinned version.
    """
    try:
        from ragas.embeddings import LangchainEmbeddingsWrapper
        from ragas.llms import LangchainLLMWrapper
    except ImportError:
        return llm, embeddings

    return LangchainLLMWrapper(llm), LangchainEmbeddingsWrapper(embeddings)
src/weknora_eval/report.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import math | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import pandas as pd | ||
| 8 | |||
| 9 | from weknora_eval.loaders import read_jsonl | ||
| 10 | |||
| 11 | |||
def retrieval_metrics(
    ragas_rows: list[dict[str, Any]],
    *,
    ks: tuple[int, ...] = (1, 3, 5),
) -> dict[str, float]:
    """Compute hit@k / recall@k / MRR / nDCG@5 from gold chunk-id labels.

    Only rows carrying ``gold_chunk_ids`` participate; an empty dict is
    returned when no row is labelled. Predicted ids come from each row's
    ``weknora_references`` in rank order. Values are averaged over the
    labelled rows and rounded to four decimals.
    """
    labelled = [row for row in ragas_rows if row.get("gold_chunk_ids")]
    if not labelled:
        return {}

    sums: dict[str, float] = {f"hit@{k}": 0.0 for k in ks}
    sums.update({f"recall@{k}": 0.0 for k in ks})
    sums["mrr"] = 0.0
    sums["ndcg@5"] = 0.0

    for row in labelled:
        gold = set(row.get("gold_chunk_ids") or [])
        references = row.get("weknora_references") or []
        predicted = [str(ref.get("id")) for ref in references if ref.get("id")]

        for k in ks:
            overlap = len(gold.intersection(predicted[:k]))
            if overlap:
                sums[f"hit@{k}"] += 1.0
            sums[f"recall@{k}"] += overlap / len(gold)

        # Reciprocal rank of the first gold hit, if any.
        for rank, chunk_id in enumerate(predicted, start=1):
            if chunk_id in gold:
                sums["mrr"] += 1 / rank
                break

        # Binary-gain nDCG over the top five predictions.
        gains = sum(
            1 / math.log2(rank + 1)
            for rank, chunk_id in enumerate(predicted[:5], start=1)
            if chunk_id in gold
        )
        ideal = sum(1 / math.log2(rank + 1) for rank in range(1, min(len(gold), 5) + 1))
        sums["ndcg@5"] += gains / ideal if ideal else 0.0

    return {name: round(total / len(labelled), 4) for name, total in sums.items()}
| 49 | |||
| 50 | |||
def generate_summary_report(
    config: dict[str, Any],
    *,
    scores_csv_path: str = "data/reports/ragas_scores.csv",
    ragas_input_path: str = "data/runs/ragas_input.jsonl",
    answers_path: str = "data/runs/weknora_answers.jsonl",
    output_path: str = "data/reports/summary.md",
) -> str:
    """Assemble the Markdown evaluation report and write it to *output_path*.

    Combines the Ragas score CSV, the evaluation input rows and the raw
    WeKnora answers into run info, aggregate metrics, chunk-id retrieval
    metrics, worst-sample tables and data-quality notes. All input files
    are optional; missing ones simply produce empty sections. Returns the
    rendered Markdown text.
    """
    ragas_rows = read_jsonl(ragas_input_path, missing_ok=True)
    answer_rows = read_jsonl(answers_path, missing_ok=True)
    scores = pd.read_csv(scores_csv_path) if Path(scores_csv_path).exists() else pd.DataFrame()

    lines = [
        "# Ragas 评估报告",
        "",
        "## 运行信息",
        f"- WeKnora Base URL: {config.get('weknora', {}).get('base_url', '')}",
        f"- 知识库 ID: {config.get('weknora', {}).get('knowledge_base_id', '')}",
        # NOTE(review): testset size and approved-sample count both report
        # len(ragas_rows) — confirm this duplication is intended.
        f"- 测试集规模: {len(ragas_rows)}",
        f"- 审核通过样本数: {len(ragas_rows)}",
        f"- 失败样本数: {sum(1 for row in answer_rows if row.get('error'))}",
        f"- Judge 模型: {config.get('ragas', {}).get('judge_model', '')}",
        "",
        "## 聚合指标",
        "| 指标 | 平均值 | P50 | 失败阈值 |",
        "| --- | --- | --- | --- |",
    ]

    # Numeric Ragas metric columns only; id/text columns are excluded.
    metric_columns = [
        column
        for column in scores.columns
        if column not in {"sample_id", "user_input", "response", "reference"}
        and pd.api.types.is_numeric_dtype(scores[column])
    ]
    for column in metric_columns:
        # NOTE(review): the 0.50 failure threshold is hard-coded here.
        lines.append(
            f"| {column} | {scores[column].mean():.4f} | {scores[column].median():.4f} | 0.50 |"
        )

    chunk_metrics = retrieval_metrics(ragas_rows)
    if chunk_metrics:
        lines.extend(["", "## Chunk ID 检索指标", "| 指标 | 平均值 |", "| --- | --- |"])
        for key, value in chunk_metrics.items():
            lines.append(f"| {key} | {value:.4f} |")

    # Worst retrieval samples: lowest context_recall first.
    lines.extend(["", "## 检索失败样本", "| sample_id | 问题 | 预期文件 | 实际召回文件 | context_recall | 备注 |", "| --- | --- | --- | --- | --- | --- |"])
    for row in _worst_rows(scores, "context_recall"):
        sample = _sample_by_id(ragas_rows, row.get("sample_id"))
        actual_files = sorted(
            {
                ref.get("knowledge_filename") or ""
                for ref in sample.get("weknora_references", [])
                if ref.get("knowledge_filename")
            }
        )
        lines.append(
            f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
            f"{_cell(sample.get('source_file'))} | {_cell(', '.join(actual_files))} | "
            f"{_score(row.get('context_recall'))} | |"
        )

    # Worst generation samples: lowest faithfulness first.
    lines.extend(["", "## 生成失败样本", "| sample_id | 问题 | 模型答案 | 标准答案 | faithfulness | factual_correctness |", "| --- | --- | --- | --- | --- | --- |"])
    for row in _worst_rows(scores, "faithfulness"):
        sample = _sample_by_id(ragas_rows, row.get("sample_id"))
        lines.append(
            f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
            f"{_cell(sample.get('response'))} | {_cell(sample.get('reference'))} | "
            f"{_score(row.get('faithfulness'))} | {_score(row.get('factual_correctness'))} |"
        )

    empty_retrievals = sum(1 for row in ragas_rows if not row.get("retrieved_contexts"))
    fallback_answers = sum(1 for row in answer_rows if row.get("is_fallback"))
    source_counts: dict[str, int] = {}
    for row in ragas_rows:
        source = row.get("source_file") or "unknown"
        source_counts[source] = source_counts.get(source, 0) + 1

    lines.extend(
        [
            "",
            "## 数据质量",
            f"- 空检索数量: {empty_retrievals}",
            f"- fallback 答案数量: {fallback_answers}",
            f"- 来源文件分布: {source_counts}",
            "",
            "## 改进建议",
            "- 优先检查 context_recall 低且 retrieved_contexts 为空的样本。",
            "- 对低 faithfulness 且 context_recall 正常的样本,重点检查生成模型和提示词。",
            "- 对 Chunk ID 指标低但 Ragas context 指标正常的样本,检查 chunk 切分或 gold_chunk_ids 标注。",
            "",
        ]
    )

    content = "\n".join(lines)
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    return content
| 149 | |||
| 150 | |||
| 151 | def _worst_rows(scores: pd.DataFrame, column: str, *, limit: int = 10) -> list[dict[str, Any]]: | ||
| 152 | if scores.empty or column not in scores.columns: | ||
| 153 | return [] | ||
| 154 | return scores.sort_values(column, ascending=True).head(limit).to_dict(orient="records") | ||
| 155 | |||
| 156 | |||
| 157 | def _sample_by_id(rows: list[dict[str, Any]], sample_id: Any) -> dict[str, Any]: | ||
| 158 | return next((row for row in rows if row.get("sample_id") == sample_id), {}) | ||
| 159 | |||
| 160 | |||
| 161 | def _cell(value: Any, *, max_len: int = 120) -> str: | ||
| 162 | text = "" if value is None else " ".join(str(value).split()) | ||
| 163 | text = text.replace("|", "\\|") | ||
| 164 | if len(text) <= max_len: | ||
| 165 | return text | ||
| 166 | return text[:max_len].rstrip() + "..." | ||
| 167 | |||
| 168 | |||
| 169 | def _score(value: Any) -> str: | ||
| 170 | try: | ||
| 171 | if pd.isna(value): | ||
| 172 | return "" | ||
| 173 | return f"{float(value):.4f}" | ||
| 174 | except (TypeError, ValueError): | ||
| 175 | return "" |
src/weknora_eval/schemas.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from dataclasses import asdict, dataclass, field | ||
| 4 | from typing import Any | ||
| 5 | |||
| 6 | |||
@dataclass
class ParsedDocument:
    """One parsed unit of a source file (a PDF page, an XLSX row or sheet).

    ``page`` is set for PDF-derived documents; ``sheet``/``row_index`` for
    XLSX-derived ones. ``metadata`` records at least the parser name.
    """

    doc_id: str  # e.g. "file.pdf::page-3"; unique within the corpus
    source_file: str  # original filename, no directory component
    file_type: str  # "pdf" or "xlsx"
    content: str  # compacted extracted text
    page: int | None = None  # 1-based PDF page number
    sheet: str | None = None  # XLSX sheet title
    row_index: int | None = None  # 1-based spreadsheet row (row 1 is the header)
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form suitable for JSONL serialization."""
        return asdict(self)
| 20 | |||
| 21 | |||
@dataclass
class TestsetRecord:
    """One generated evaluation sample: a question with its reference answer."""

    sample_id: str  # unique sample identifier
    user_input: str  # the question posed to the RAG system
    reference: str  # gold/reference answer
    reference_contexts: list[str]  # passages backing the reference answer
    source_file: str | None = None  # originating raw document, when known
    gold_chunk_ids: list[str] = field(default_factory=list)  # labelled relevant chunk ids
    question_type: str = "single_hop"  # presumably also "multi_hop" — confirm with generator
    review_status: str = "pending"  # human-review state; new records start "pending"

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form suitable for JSONL serialization."""
        return asdict(self)
| 35 | |||
| 36 | |||
@dataclass
class WeKnoraAnswer:
    """One answer produced by the WeKnora backend for a testset sample.

    Field declaration order is part of the positional-constructor interface;
    do not reorder.
    """

    sample_id: str  # testset sample this answer belongs to
    user_input: str  # the question that was sent
    response: str  # the generated answer text
    retrieved_contexts: list[str]  # context passages retrieved for the answer
    weknora_references: list[dict[str, Any]]  # raw reference payloads from WeKnora
    session_id: str | None = None  # backend session identifier, when available
    request_id: str | None = None  # backend request identifier, when available
    error: str | None = None  # error message when the request failed, else None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of this record (recursive via ``asdict``)."""
        return asdict(self)
src/weknora_eval/sse.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import json | ||
| 4 | from collections.abc import Iterable, Iterator | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | |||
| 8 | def parse_sse_events(lines: Iterable[str | bytes]) -> Iterator[dict[str, Any]]: | ||
| 9 | event_name = "message" | ||
| 10 | data_lines: list[str] = [] | ||
| 11 | |||
| 12 | for raw_line in lines: | ||
| 13 | line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line | ||
| 14 | line = line.rstrip("\r\n") | ||
| 15 | |||
| 16 | if not line: | ||
| 17 | if data_lines: | ||
| 18 | yield _build_event(event_name, data_lines) | ||
| 19 | event_name = "message" | ||
| 20 | data_lines = [] | ||
| 21 | continue | ||
| 22 | |||
| 23 | if line.startswith(":"): | ||
| 24 | continue | ||
| 25 | if line.startswith("event:"): | ||
| 26 | event_name = line.removeprefix("event:").strip() | ||
| 27 | continue | ||
| 28 | if line.startswith("data:"): | ||
| 29 | data_lines.append(line.removeprefix("data:").strip()) | ||
| 30 | |||
| 31 | if data_lines: | ||
| 32 | yield _build_event(event_name, data_lines) | ||
| 33 | |||
| 34 | |||
| 35 | def _build_event(event_name: str, data_lines: list[str]) -> dict[str, Any]: | ||
| 36 | raw_data = "\n".join(data_lines) | ||
| 37 | parsed_data: Any = raw_data | ||
| 38 | if raw_data and raw_data != "[DONE]": | ||
| 39 | try: | ||
| 40 | parsed_data = json.loads(raw_data) | ||
| 41 | except json.JSONDecodeError: | ||
| 42 | parsed_data = raw_data | ||
| 43 | return {"event": event_name, "data": parsed_data} | ||
| 44 | |||
| 45 | |||
def normalize_reference(reference: dict[str, Any]) -> dict[str, Any]:
    """Project a raw WeKnora reference payload onto a stable, flat schema.

    Missing keys become ``None`` (``content`` becomes ``""``), and the
    filename falls back to ``knowledge_title`` when ``knowledge_filename``
    is absent or falsy.
    """
    normalized: dict[str, Any] = {}
    normalized["id"] = reference.get("id")
    normalized["content"] = reference.get("content") or ""
    normalized["knowledge_id"] = reference.get("knowledge_id")
    normalized["chunk_index"] = reference.get("chunk_index")
    normalized["score"] = reference.get("score")
    normalized["knowledge_filename"] = (
        reference.get("knowledge_filename") or reference.get("knowledge_title")
    )
    normalized["match_type"] = reference.get("match_type")
    normalized["chunk_type"] = reference.get("chunk_type")
    return normalized
src/weknora_eval/testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from typing import Any | ||
| 4 | |||
| 5 | from weknora_eval.loaders import read_jsonl, write_jsonl | ||
| 6 | from weknora_eval.schemas import TestsetRecord | ||
| 7 | |||
| 8 | |||
def generate_rule_based_testset(
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
    size: int = 50,
    min_context_chars: int = 80,
) -> list[dict[str, Any]]:
    """Build a deterministic rule-based QA testset from parsed documents.

    Documents whose content is shorter than ``min_context_chars`` are
    skipped; at most ``size`` records are produced, written as JSONL to
    ``output_path``, and returned as dicts.
    """
    eligible = []
    for document in read_jsonl(documents_path):
        if len(document.get("content") or "") >= min_context_chars:
            eligible.append(document)

    records: list[dict[str, Any]] = []
    for position, document in enumerate(eligible[:size], start=1):
        context = document["content"]
        record = TestsetRecord(
            sample_id=f"qa-{position:04d}",
            user_input=_default_question(document),
            reference=_reference_from_context(context),
            reference_contexts=[context],
            source_file=document.get("source_file"),
            question_type="single_hop",
            review_status="pending",
        )
        records.append(record.to_dict())

    write_jsonl(output_path, records)
    return records
| 40 | |||
| 41 | |||
def approve_pending_testset(
    *,
    input_path: str = "data/testsets/testset.raw.jsonl",
    output_path: str = "data/testsets/testset.reviewed.jsonl",
) -> list[dict[str, Any]]:
    """Promote every non-rejected row to ``approved`` status.

    Rows marked ``rejected`` are dropped; all remaining rows are copied with
    ``review_status`` set to ``approved``, written to ``output_path``, and
    returned. Input rows are never mutated in place.
    """
    approved: list[dict[str, Any]] = []
    for original in read_jsonl(input_path):
        if original.get("review_status") == "rejected":
            continue
        approved.append({**original, "review_status": "approved"})
    write_jsonl(output_path, approved)
    return approved
| 57 | |||
| 58 | |||
def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]:
    """Validate a reviewed testset file and return human-readable errors.

    Each row must be approved, carry non-empty ``sample_id``, ``user_input``
    and ``reference`` fields, and have at least one reference context. An
    empty list means the file is valid.
    """
    errors: list[str] = []
    for line_no, row in enumerate(read_jsonl(path), start=1):
        location = f"{path}:{line_no}"
        if row.get("review_status") != "approved":
            errors.append(f"{location} review_status must be approved")
        errors.extend(
            f"{location} missing {field_name}"
            for field_name in ("sample_id", "user_input", "reference")
            if not row.get(field_name)
        )
        if not row.get("reference_contexts"):
            errors.append(f"{location} reference_contexts must be non-empty")
    return errors
| 71 | |||
| 72 | |||
| 73 | def _default_question(document: dict[str, Any]) -> str: | ||
| 74 | source = document.get("source_file") or "该文档" | ||
| 75 | if document.get("file_type") == "xlsx" and document.get("sheet"): | ||
| 76 | return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?" | ||
| 77 | if document.get("page"): | ||
| 78 | return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?" | ||
| 79 | return f"请根据 {source} 回答:该片段的主要内容是什么?" | ||
| 80 | |||
| 81 | |||
| 82 | def _reference_from_context(context: str, *, max_chars: int = 500) -> str: | ||
| 83 | text = " ".join(context.split()) | ||
| 84 | if len(text) <= max_chars: | ||
| 85 | return text | ||
| 86 | return text[:max_chars].rstrip() + "..." |
-
Please register or sign in to post a comment