Commit 147c79e0 (147c79e07bbaa525c4f6c6ac143e9c5c109362f8) by 沈秋雨

Initial WeKnora Ragas evaluation project

WEKNORA_BASE_URL=http://localhost:8080/api/v1
WEKNORA_API_KEY=
WEKNORA_KB_ID=
WEKNORA_KB_NAME=ragas-eval-pilot
# Ragas generation and judge models. These are evaluation-side models, not the
# model configuration used by the WeKnora backend.
OPENAI_API_KEY=replace-me
OPENAI_BASE_URL=https://api.openai.com/v1
# Optional split deployment. Use these when LLM and embedding are served by
# different OpenAI-compatible services, such as vLLM + Infinity.
RAGAS_LLM_API_KEY=replace-me
RAGAS_LLM_BASE_URL=http://localhost:8000/v1
RAGAS_EMBEDDING_API_KEY=replace-me
RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
RAGAS_RERANKER_API_KEY=replace-me
RAGAS_RERANKER_BASE_URL=http://localhost:7998/v1
RAGAS_RERANKER_MODEL=replace-me
RAGAS_GENERATOR_MODEL=gpt-4o-mini
RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
TESTSET_SIZE=50
REQUEST_INTERVAL_SECONDS=0.2
.env
.venv/
__pycache__/
*.py[cod]
*.egg-info/
.pytest_cache/
.ruff_cache/
data/raw_docs/pdf/*
data/raw_docs/xlsx/*
data/parsed_docs/*.json
data/parsed_docs/*.jsonl
data/parsed_docs/mineru_raw/*
data/exported/*.json
data/exported/*.jsonl
data/testsets/*.jsonl
data/runs/*.jsonl
data/reports/*.csv
data/reports/*.md
!data/raw_docs/pdf/.gitkeep
!data/raw_docs/xlsx/.gitkeep
!data/parsed_docs/mineru_raw/.gitkeep
# WeKnora Ragas Eval
A standalone WeKnora Ragas evaluation project. It calls only WeKnora's public APIs and does not depend on WeKnora's built-in `/evaluation` endpoint.
## Installation
```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```
For stronger PDF parsing support:
```bash
pip install -e ".[pdf]"
```
Development and test tooling:
```bash
pip install -e ".[dev,pdf]"
```
## Configuration
```bash
cp .env.example .env
```
After editing `.env`, confirm that:
- `WEKNORA_BASE_URL` points at the WeKnora API v1, e.g. `http://localhost:8080/api/v1`
- `WEKNORA_API_KEY` is a valid WeKnora API key
- `WEKNORA_KB_ID` is the target knowledge base ID; if you do not have one yet, run `python scripts/00_create_kb.py` first
- `WEKNORA_KB_NAME` is the name used when creating the knowledge base
- `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `RAGAS_*_MODEL` configure the evaluation-side models
- If the LLM and embedding services are deployed separately, point `RAGAS_LLM_BASE_URL` at vLLM's `/v1` and `RAGAS_EMBEDDING_BASE_URL` at Infinity's `/v1`, as in the example below
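A split vLLM + Infinity deployment could look like this in `.env` (hosts, ports, and keys are placeholders mirroring `.env.example`):
```bash
RAGAS_LLM_API_KEY=replace-me
RAGAS_LLM_BASE_URL=http://localhost:8000/v1
RAGAS_EMBEDDING_API_KEY=replace-me
RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
```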
## First Pilot Run
Place the raw files in:
- `data/raw_docs/pdf/`
- `data/raw_docs/xlsx/`
Run the scripts in order:
```bash
python scripts/00_create_kb.py
python scripts/01_upload_docs.py
python scripts/02_wait_ingestion.py
python scripts/03_export_chunks.py
python scripts/04_parse_docs.py
python scripts/05_generate_testset.py
python scripts/06_review_testset.py
python scripts/07_run_weknora_qa.py
python scripts/08_build_ragas_input.py
python scripts/09_run_ragas_eval.py
python scripts/10_report.py
```
For the first round, use only 2 PDFs, 1 XLSX file, and 10 approved QA records; expand the sample size only after confirming that `retrieved_contexts`, `response`, and the rest of the Ragas input fields all look correct (a minimal check is sketched below).
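A minimal sanity check over the Ragas input, assuming the default `data/runs/ragas_input.jsonl` path and the required fields enforced by `scripts/08_build_ragas_input.py`:
```python
import json
from pathlib import Path

# Fields that scripts/08_build_ragas_input.py requires to be non-empty.
REQUIRED = ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")

lines = Path("data/runs/ragas_input.jsonl").read_text(encoding="utf-8").splitlines()
for line_no, line in enumerate(lines, start=1):
    row = json.loads(line)
    missing = [key for key in REQUIRED if not row.get(key)]
    if missing:
        print(f"line {line_no} ({row.get('sample_id')}): missing {', '.join(missing)}")
```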
## Key Artifacts
- `data/exported/knowledge.jsonl`
- `data/exported/chunks.jsonl`
- `data/parsed_docs/documents.jsonl`
- `data/parsed_docs/parse_summary.json`
- `data/testsets/testset.raw.jsonl`
- `data/testsets/testset.reviewed.jsonl`
- `data/runs/weknora_answers.jsonl`
- `data/runs/ragas_input.jsonl`
- `data/reports/ragas_scores.csv`
- `data/reports/summary.md`
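For reference, one `data/runs/ragas_input.jsonl` record assembled by `scripts/08_build_ragas_input.py` looks roughly like this (values are purely illustrative; field names come from the script):
```json
{"sample_id": "qa-0001", "user_input": "请根据 example.pdf 第 1 页回答:该片段的主要内容是什么?", "response": "...", "retrieved_contexts": ["..."], "reference": "...", "reference_contexts": ["..."], "session_id": "...", "request_id": "...", "weknora_references": [], "source_file": "example.pdf", "gold_chunk_ids": []}
```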
weknora:
base_url: "${WEKNORA_BASE_URL}"
api_key: "${WEKNORA_API_KEY}"
knowledge_base_id: "${WEKNORA_KB_ID}"
knowledge_base_name: "${WEKNORA_KB_NAME:-ragas-eval-pilot}"
knowledge_base_description: "Knowledge base for independent Ragas evaluation."
timeout_seconds: 300
request_interval_seconds: "${REQUEST_INTERVAL_SECONDS:-0.2}"
testset:
size: "${TESTSET_SIZE:-50}"
include_pdf: true
include_xlsx: true
min_context_chars: 80
require_manual_review: true
parsing:
provider: "local"
output_path: "data/parsed_docs/documents.jsonl"
failed_path: "data/parsed_docs/failed_parse.jsonl"
summary_path: "data/parsed_docs/parse_summary.json"
local:
pdf_backend: "pymupdf"
xlsx_mode: "row_text"
min_chars: 80
mineru:
mode: "cli"
cli_bin: "mineru"
output_dir: "data/parsed_docs/mineru_raw"
http_base_url: "http://172.23.184.9:8002"
api_key: "mineru"
timeout_seconds: 600
fallback_to_local: false
qa:
one_session_per_question: true
disable_title: true
enable_memory: false
channel: "api"
verify_with_messages: false
ragas:
provider: "openai-compatible"
# Backward-compatible defaults. If the split LLM/embedding values below are
# empty, these values are used for both clients.
api_key: "${OPENAI_API_KEY}"
base_url: "${OPENAI_BASE_URL}"
# vLLM OpenAI-compatible endpoint, for example http://localhost:8000/v1.
llm_api_key: "${RAGAS_LLM_API_KEY}"
llm_base_url: "${RAGAS_LLM_BASE_URL}"
# Infinity OpenAI-compatible embedding endpoint, for example
# http://localhost:7997/v1.
embedding_api_key: "${RAGAS_EMBEDDING_API_KEY}"
embedding_base_url: "${RAGAS_EMBEDDING_BASE_URL}"
# Reserved for future retrieval/rerank metrics. The current Ragas pipeline
# does not call reranker APIs.
reranker_api_key: "${RAGAS_RERANKER_API_KEY}"
reranker_base_url: "${RAGAS_RERANKER_BASE_URL}"
reranker_model: "${RAGAS_RERANKER_MODEL}"
generator_model: "${RAGAS_GENERATOR_MODEL}"
judge_model: "${RAGAS_JUDGE_MODEL}"
embedding_model: "${RAGAS_EMBEDDING_MODEL}"
temperature: 0
max_tokens: 4096
timeout_seconds: 600
max_workers: 1
metrics:
- faithfulness
- response_relevancy
- context_precision
- context_recall
- factual_correctness
[project]
name = "weknora-ragas-eval"
version = "0.1.0"
description = "Independent Ragas evaluation pipeline for WeKnora public APIs."
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"ragas>=0.3,<0.5",
"datasets>=2.19.0",
"pandas>=2.2.0",
"openpyxl>=3.1.0",
"requests>=2.32.0",
"sseclient-py>=1.8.0",
"python-dotenv>=1.0.0",
"pyyaml>=6.0.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
"pypdf>=4.2.0"
]
[project.optional-dependencies]
pdf = [
"pymupdf>=1.24.0",
"pdfplumber>=0.11.0"
]
dev = [
"ruff>=0.6.0",
"pytest>=8.0.0"
]
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]
[tool.ruff]
line-length = 100
target-version = "py310"
[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B"]
from __future__ import annotations
import sys
from typing import Any
import _bootstrap # noqa: F401
from weknora_eval.api import bootstrap_client_from_config
from weknora_eval.config import load_config, require_config
from weknora_eval.envfile import set_env_value
from weknora_eval.loaders import setup_logging, write_json
def main() -> int:
setup_logging()
config = load_config()
client = bootstrap_client_from_config(config)
weknora = config["weknora"]
existing_id = str(weknora.get("knowledge_base_id") or "")
name = str(require_config(config, "weknora.knowledge_base_name"))
if existing_id and existing_id != "replace-me":
record = {"id": existing_id, "name": name, "source": "env"}
write_json("data/exported/knowledge_base.json", record)
print(f"WEKNORA_KB_ID already set: {existing_id}")
return 0
created = client.create_knowledge_base(name=name)
knowledge_base_id = _extract_knowledge_base_id(created)
if not knowledge_base_id:
print(f"Created knowledge base but could not extract id from response: {created}")
return 1
set_env_value(".env", "WEKNORA_KB_ID", knowledge_base_id)
write_json("data/exported/knowledge_base.json", {**created, "source": "create"})
print(f"WEKNORA_KB_ID={knowledge_base_id}")
print("Wrote ID to .env and data/exported/knowledge_base.json")
return 0
def _extract_knowledge_base_id(payload: dict[str, Any]) -> str | None:
candidates = [payload]
for key in ("data", "knowledge_base"):
nested = payload.get(key)
if isinstance(nested, dict):
candidates.append(nested)
for row in candidates:
for key in ("id", "knowledge_base_id", "kb_id", "uuid"):
value = row.get(key)
if value:
return str(value)
return None
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
from pathlib import Path
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
Path("data/raw_docs/xlsx").glob("*.xlsx")
)
rows = []
for path in files:
data = client.upload_file(path)
rows.append(
{
"knowledge_id": data.get("id"),
"file_name": data.get("file_name") or data.get("title") or path.name,
"file_type": data.get("file_type") or path.suffix.lstrip("."),
"parse_status": data.get("parse_status"),
"enable_status": data.get("enable_status"),
"raw": data,
}
)
write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
print(f"Uploaded {len(rows)} files")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
uploads = read_jsonl("data/exported/knowledge_uploads.jsonl", missing_ok=True)
knowledge_ids = {row["knowledge_id"] for row in uploads if row.get("knowledge_id")} or None
result = client.wait_ingestion_completed(knowledge_ids=knowledge_ids)
knowledge = client.list_knowledge()
write_jsonl("data/exported/knowledge.jsonl", knowledge)
print(
"Ingestion status: "
f"completed={len(result['completed'])} failed={len(result['failed'])} "
f"pending={len(result['pending'])}"
)
return 1 if result["failed"] or result["pending"] else 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
knowledge_rows = client.list_knowledge()
write_jsonl("data/exported/knowledge.jsonl", knowledge_rows)
knowledge_by_id = {row.get("id"): row for row in knowledge_rows}
chunk_rows = []
for knowledge in knowledge_rows:
knowledge_id = knowledge.get("id")
if not knowledge_id:
continue
if knowledge.get("parse_status") != "completed" or knowledge.get("enable_status") != "enabled":
continue
for chunk in client.list_chunks(str(knowledge_id)):
content = (chunk.get("content") or "").strip()
if not content:
continue
if chunk.get("is_enabled") is False:
continue
source = knowledge_by_id.get(chunk.get("knowledge_id")) or knowledge
chunk_rows.append(
{
"chunk_id": chunk.get("id"),
"knowledge_id": chunk.get("knowledge_id") or knowledge_id,
"knowledge_base_id": chunk.get("knowledge_base_id")
or config["weknora"]["knowledge_base_id"],
"chunk_index": chunk.get("chunk_index"),
"content": content,
"source_file": source.get("file_name") or source.get("title"),
"chunk_type": chunk.get("chunk_type"),
"raw": chunk,
}
)
write_jsonl("data/exported/chunks.jsonl", chunk_rows)
print(f"Exported {len(chunk_rows)} chunks from {len(knowledge_rows)} knowledge records")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.parsers.local import parse_raw_docs
from weknora_eval.parsers.mineru import parse_with_mineru
def main() -> int:
setup_logging()
config = load_config()
provider = config.get("parsing", {}).get("provider", "local")
if provider == "local":
rows, summary = parse_raw_docs(config)
elif provider == "mineru":
rows, summary = parse_with_mineru(config)
else:
raise ValueError(f"Unsupported parsing provider: {provider}")
print(f"Parsed {len(rows)} documents: {summary}")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.testset import generate_rule_based_testset
def main() -> int:
setup_logging()
config = load_config()
testset = config.get("testset", {})
rows = generate_rule_based_testset(
size=int(testset.get("size", 50)),
min_context_chars=int(testset.get("min_context_chars", 80)),
)
print(f"Generated {len(rows)} pending QA candidates at data/testsets/testset.raw.jsonl")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.loaders import setup_logging
from weknora_eval.testset import approve_pending_testset, validate_reviewed_testset
def main() -> int:
setup_logging()
rows = approve_pending_testset()
errors = validate_reviewed_testset()
if errors:
for error in errors:
print(error)
return 1
print(f"Wrote {len(rows)} approved QA records to data/testsets/testset.reviewed.jsonl")
return 0 if rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
config = load_config()
client = client_from_config(config)
qa_config = config.get("qa", {})
rows = [row for row in read_jsonl("data/testsets/testset.reviewed.jsonl") if row.get("review_status") == "approved"]
answers = []
for index, row in enumerate(rows, start=1):
sample_id = row["sample_id"]
try:
session = client.create_session(title=f"ragas-eval-{sample_id}")
session_id = session.get("id")
if not session_id:
raise RuntimeError(f"create_session returned no id for {sample_id}")
result = client.knowledge_chat_sse(
session_id=session_id,
query=row["user_input"],
disable_title=bool(qa_config.get("disable_title", True)),
enable_memory=bool(qa_config.get("enable_memory", False)),
channel=str(qa_config.get("channel", "api")),
)
answer = {
"sample_id": sample_id,
"user_input": row["user_input"],
"session_id": session_id,
"request_id": result.get("request_id"),
"response": result.get("response") or "",
"retrieved_contexts": result.get("retrieved_contexts") or [],
"weknora_references": result.get("weknora_references") or [],
"error": None,
}
if not answer["response"]:
answer["error"] = "empty_response"
append_jsonl("data/runs/failed_requests.jsonl", answer)
elif not answer["retrieved_contexts"]:
append_jsonl("data/runs/failed_requests.jsonl", {**answer, "error": "empty_retrieval"})
answers.append(answer)
print(f"[{index}/{len(rows)}] {sample_id} response_chars={len(answer['response'])}")
except Exception as exc: # noqa: BLE001
failed = {
"sample_id": sample_id,
"user_input": row.get("user_input"),
"response": "",
"retrieved_contexts": [],
"weknora_references": [],
"session_id": None,
"request_id": None,
"error": str(exc),
}
answers.append(failed)
append_jsonl("data/runs/failed_requests.jsonl", failed)
print(f"[{index}/{len(rows)}] {sample_id} failed: {exc}")
write_jsonl("data/runs/weknora_answers.jsonl", answers)
failures = [row for row in answers if row.get("error") and row.get("error") != "empty_retrieval"]
return 1 if failures else 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
def main() -> int:
setup_logging()
testset = {
row["sample_id"]: row
for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
if row.get("review_status") == "approved"
}
answers = {row["sample_id"]: row for row in read_jsonl("data/runs/weknora_answers.jsonl")}
ragas_rows = []
for sample_id, qa in testset.items():
answer = answers.get(sample_id)
if not answer:
append_jsonl("data/runs/failed_requests.jsonl", {"sample_id": sample_id, "error": "missing_answer"})
continue
row = {
"sample_id": sample_id,
"user_input": qa["user_input"],
"response": answer.get("response") or "",
"retrieved_contexts": answer.get("retrieved_contexts") or [],
"reference": qa["reference"],
"reference_contexts": qa.get("reference_contexts") or [],
"session_id": answer.get("session_id"),
"request_id": answer.get("request_id"),
"weknora_references": answer.get("weknora_references") or [],
"source_file": qa.get("source_file"),
"gold_chunk_ids": qa.get("gold_chunk_ids") or [],
}
missing = [
key
for key in ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")
if not row.get(key)
]
if missing:
append_jsonl(
"data/runs/failed_requests.jsonl",
{"sample_id": sample_id, "error": f"missing_ragas_fields:{','.join(missing)}"},
)
continue
ragas_rows.append(row)
write_jsonl("data/runs/ragas_input.jsonl", ragas_rows)
print(f"Built {len(ragas_rows)} Ragas input rows")
return 0 if ragas_rows else 1
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.ragas_runner import run_ragas_eval
def main() -> int:
setup_logging()
config = load_config()
scores = run_ragas_eval(config)
print(f"Wrote {len(scores)} Ragas score rows to data/reports/ragas_scores.csv")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.report import generate_summary_report
def main() -> int:
setup_logging()
config = load_config()
generate_summary_report(config)
print("Wrote report to data/reports/summary.md")
return 0
if __name__ == "__main__":
sys.exit(main())
from __future__ import annotations
import sys
from pathlib import Path
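# Put src/ on sys.path so the scripts run from a source checkout without
# requiring `pip install -e .` first.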
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC = PROJECT_ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
"""Independent Ragas evaluation pipeline for WeKnora."""
__all__ = [
"__version__",
]
__version__ = "0.1.0"
from __future__ import annotations
import os
import re
from pathlib import Path
from typing import Any
import yaml
from dotenv import load_dotenv
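# Matches shell-style placeholders in configs/eval.yaml: ${VAR} and ${VAR:-default}.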
_ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}")
def _expand_env(value: Any) -> Any:
if isinstance(value, dict):
return {key: _expand_env(item) for key, item in value.items()}
if isinstance(value, list):
return [_expand_env(item) for item in value]
if not isinstance(value, str):
return value
def replace(match: re.Match[str]) -> str:
default = match.group(2) if match.group(2) is not None else ""
return os.getenv(match.group(1), default)
expanded = _ENV_PATTERN.sub(replace, value)
return _coerce_scalar(expanded)
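# Best-effort typing for expanded strings: "true"/"false" become bools,
# "none"/"null" become None, and digit strings become int/float. Strings that
# do not parse (URLs, model names) are returned unchanged.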
def _coerce_scalar(value: str) -> Any:
lowered = value.lower()
if lowered in {"true", "false"}:
return lowered == "true"
if lowered in {"none", "null"}:
return None
try:
if "." not in value:
return int(value)
return float(value)
except ValueError:
return value
def load_config(path: str | Path = "configs/eval.yaml") -> dict[str, Any]:
load_dotenv()
config_path = Path(path)
with config_path.open("r", encoding="utf-8") as file:
raw = yaml.safe_load(file) or {}
return _expand_env(raw)
def require_config(config: dict[str, Any], dotted_key: str) -> Any:
current: Any = config
for part in dotted_key.split("."):
if not isinstance(current, dict) or part not in current:
raise ValueError(f"Missing required config value: {dotted_key}")
value = current[part]
if value is None or value == "":
raise ValueError(f"Missing required config value: {dotted_key}")
current = value
return current
def project_path(*parts: str) -> Path:
return Path.cwd().joinpath(*parts)
from __future__ import annotations
from pathlib import Path
def set_env_value(path: str | Path, key: str, value: str) -> None:
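    """Set `key=value` in a dotenv file, replacing an existing line or appending."""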
target = Path(path)
lines = target.read_text(encoding="utf-8").splitlines() if target.exists() else []
prefix = f"{key}="
replacement = f"{key}={value}"
updated = False
output: list[str] = []
for line in lines:
if line.startswith(prefix):
output.append(replacement)
updated = True
else:
output.append(line)
if not updated:
output.append(replacement)
target.write_text("\n".join(output) + "\n", encoding="utf-8")
from __future__ import annotations
import json
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Any
def setup_logging(level: int = logging.INFO) -> None:
logging.basicConfig(
level=level,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
def ensure_parent(path: str | Path) -> Path:
target = Path(path)
target.parent.mkdir(parents=True, exist_ok=True)
return target
def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]:
target = Path(path)
if not target.exists():
if missing_ok:
return []
raise FileNotFoundError(target)
rows: list[dict[str, Any]] = []
with target.open("r", encoding="utf-8") as file:
for line_no, line in enumerate(file, start=1):
stripped = line.strip()
if not stripped:
continue
try:
rows.append(json.loads(stripped))
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
return rows
def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]:
target = Path(path)
if not target.exists():
if missing_ok:
return
raise FileNotFoundError(target)
with target.open("r", encoding="utf-8") as file:
for line_no, line in enumerate(file, start=1):
stripped = line.strip()
if not stripped:
continue
try:
yield json.loads(stripped)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int:
target = ensure_parent(path)
count = 0
with target.open("w", encoding="utf-8") as file:
for row in rows:
file.write(json.dumps(row, ensure_ascii=False) + "\n")
count += 1
return count
def append_jsonl(path: str | Path, row: dict[str, Any]) -> None:
target = ensure_parent(path)
with target.open("a", encoding="utf-8") as file:
file.write(json.dumps(row, ensure_ascii=False) + "\n")
def write_json(path: str | Path, payload: dict[str, Any]) -> None:
target = ensure_parent(path)
with target.open("w", encoding="utf-8") as file:
json.dump(payload, file, ensure_ascii=False, indent=2)
file.write("\n")
def compact_text(value: Any) -> str:
text = "" if value is None else str(value)
return "\n".join(line.strip() for line in text.splitlines() if line.strip()).strip()
"""Document parser adapters."""
from __future__ import annotations
import statistics
from pathlib import Path
from typing import Any
from openpyxl import load_workbook
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.schemas import ParsedDocument
def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
parsing = config["parsing"]
local_config = parsing.get("local", {})
min_chars = int(local_config.get("min_chars", 80))
pdf_backend = local_config.get("pdf_backend", "pypdf")
xlsx_mode = local_config.get("xlsx_mode", "row_text")
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
try:
docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
except Exception as exc: # noqa: BLE001 - parser failures must be persisted.
failures.append(
{
"source_file": pdf_path.name,
"parser": f"local:{pdf_backend}",
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
)
for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
try:
docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
except Exception as exc: # noqa: BLE001
failures.append(
{
"source_file": xlsx_path.name,
"parser": "local:openpyxl",
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
)
rows = [doc.to_dict() for doc in docs]
write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
if failures:
write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)
summary = build_parse_summary(rows, failures, parser=f"local:{pdf_backend}")
write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
return rows, summary
def parse_pdf(path: str | Path, *, backend: str = "pypdf", min_chars: int = 80) -> list[ParsedDocument]:
target = Path(path)
backend = backend.lower()
if backend == "pymupdf":
return _parse_pdf_pymupdf(target, min_chars=min_chars)
if backend == "pdfplumber":
return _parse_pdf_pdfplumber(target, min_chars=min_chars)
if backend == "pypdf":
return _parse_pdf_pypdf(target, min_chars=min_chars)
raise ValueError(f"Unsupported PDF backend: {backend}")
def _parse_pdf_pypdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
from pypdf import PdfReader
reader = PdfReader(str(path))
docs: list[ParsedDocument] = []
for index, page in enumerate(reader.pages, start=1):
content = compact_text(page.extract_text() or "")
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pypdf"))
return docs
def _parse_pdf_pymupdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
try:
import fitz
except ImportError as exc:
raise ImportError("pymupdf backend requires `pip install -e '.[pdf]'`") from exc
docs: list[ParsedDocument] = []
with fitz.open(path) as document:
for index, page in enumerate(document, start=1):
content = compact_text(page.get_text("text"))
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pymupdf"))
return docs
def _parse_pdf_pdfplumber(path: Path, *, min_chars: int) -> list[ParsedDocument]:
try:
import pdfplumber
except ImportError as exc:
raise ImportError("pdfplumber backend requires `pip install -e '.[pdf]'`") from exc
docs: list[ParsedDocument] = []
with pdfplumber.open(path) as pdf:
for index, page in enumerate(pdf.pages, start=1):
content = compact_text(page.extract_text() or "")
if len(content) < min_chars:
continue
docs.append(_pdf_doc(path, index, content, "local:pdfplumber"))
return docs
def _pdf_doc(path: Path, page: int, content: str, parser: str) -> ParsedDocument:
return ParsedDocument(
doc_id=f"{path.name}::page-{page}",
source_file=path.name,
file_type="pdf",
page=page,
content=content,
metadata={"parser": parser},
)
def parse_xlsx(path: str | Path, *, mode: str = "row_text", min_chars: int = 80) -> list[ParsedDocument]:
target = Path(path)
mode = mode.lower()
workbook = load_workbook(target, data_only=True, read_only=True)
if mode == "row_text":
return _parse_xlsx_row_text(target, workbook, min_chars=min_chars)
if mode == "markdown_table":
return _parse_xlsx_markdown_table(target, workbook, min_chars=min_chars)
raise ValueError(f"Unsupported XLSX mode: {mode}")
def _parse_xlsx_row_text(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
docs: list[ParsedDocument] = []
for sheet in workbook.worksheets:
rows = list(sheet.iter_rows(values_only=True))
if not rows:
continue
headers = [_cell_to_text(value) or f"col_{index}" for index, value in enumerate(rows[0], start=1)]
for row_index, row in enumerate(rows[1:], start=2):
pairs = []
for header, value in zip(headers, row, strict=False):
cell = _cell_to_text(value)
if cell:
pairs.append(f"{header}: {cell}")
content = "\n".join(pairs).strip()
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{path.name}::{sheet.title}::row-{row_index}",
source_file=path.name,
file_type="xlsx",
sheet=sheet.title,
row_index=row_index,
content=content,
metadata={"parser": "local:openpyxl", "columns": headers},
)
)
return docs
def _parse_xlsx_markdown_table(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
docs: list[ParsedDocument] = []
for sheet in workbook.worksheets:
rows = [
[_cell_to_text(value) for value in row]
for row in sheet.iter_rows(values_only=True)
if any(value is not None for value in row)
]
if not rows:
continue
width = max(len(row) for row in rows)
normalized = [row + [""] * (width - len(row)) for row in rows]
header = normalized[0]
separator = ["---"] * width
body = normalized[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(separator) + " |",
]
lines.extend("| " + " | ".join(row) + " |" for row in body)
content = "\n".join(lines)
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{path.name}::{sheet.title}",
source_file=path.name,
file_type="xlsx",
sheet=sheet.title,
content=content,
metadata={"parser": "local:openpyxl", "mode": "markdown_table"},
)
)
return docs
def _cell_to_text(value: Any) -> str:
if value is None:
return ""
text = str(value).strip()
return text.replace("\n", " ")
def build_parse_summary(
rows: list[dict[str, Any]],
failures: list[dict[str, Any]],
*,
parser: str,
) -> dict[str, Any]:
source_files = {row.get("source_file") for row in rows if row.get("source_file")}
failed_files = {row.get("source_file") for row in failures if row.get("source_file")}
lengths = [len(row.get("content") or "") for row in rows]
return {
"total_files": len(source_files | failed_files),
"parsed_files": len(source_files),
"failed_files": len(failed_files),
"total_documents": len(rows),
"empty_documents": sum(1 for length in lengths if length == 0),
"avg_chars": round(statistics.mean(lengths), 2) if lengths else 0,
"parser": parser,
}
from __future__ import annotations
import subprocess
from pathlib import Path
from typing import Any
import requests
from weknora_eval.loaders import compact_text, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary, parse_pdf
from weknora_eval.schemas import ParsedDocument
class MinerUParseError(RuntimeError):
pass
def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
parsing = config["parsing"]
mineru = parsing.get("mineru", {})
mode = mineru.get("mode", "cli")
fallback = bool(mineru.get("fallback_to_local", True))
local_config = parsing.get("local", {})
min_chars = int(local_config.get("min_chars", 80))
docs: list[ParsedDocument] = []
failures: list[dict[str, Any]] = []
for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
parser_name = f"mineru:{mode}"
try:
if mode == "cli":
docs.extend(parse_pdf_with_cli(pdf_path, mineru, min_chars=min_chars))
elif mode == "http":
docs.extend(parse_pdf_with_http(pdf_path, mineru, min_chars=min_chars))
else:
raise MinerUParseError(f"Unsupported MinerU mode: {mode}")
except Exception as exc: # noqa: BLE001
failure = {
"source_file": pdf_path.name,
"parser": parser_name,
"status": "failed",
"error": str(exc),
"fallback_used": None,
}
if fallback:
try:
backend = local_config.get("pdf_backend", "pypdf")
local_docs = parse_pdf(pdf_path, backend=backend, min_chars=min_chars)
docs.extend(local_docs)
failure["fallback_used"] = f"local:{backend}"
except Exception as fallback_exc: # noqa: BLE001
failure["fallback_error"] = str(fallback_exc)
failures.append(failure)
rows = [doc.to_dict() for doc in docs]
write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
if failures:
write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)
summary = build_parse_summary(rows, failures, parser=f"mineru:{mode}")
write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
return rows, summary
def parse_pdf_with_cli(
pdf_path: str | Path,
mineru_config: dict[str, Any],
*,
min_chars: int,
) -> list[ParsedDocument]:
target = Path(pdf_path)
output_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw"))
output_dir = output_root / target.stem
output_dir.mkdir(parents=True, exist_ok=True)
cli_bin = mineru_config.get("cli_bin", "mineru")
timeout = int(mineru_config.get("timeout_seconds", 600))
# MinerU CLI arguments vary by release. This common invocation is isolated
# here so deployments can replace it without touching pipeline scripts.
result = subprocess.run(
[cli_bin, "-p", str(target), "-o", str(output_dir)],
check=False,
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode != 0:
raise MinerUParseError(result.stderr.strip() or result.stdout.strip() or "MinerU CLI failed")
markdown_files = sorted(output_dir.rglob("*.md"))
if not markdown_files:
raise MinerUParseError(f"No Markdown output found in {output_dir}")
docs: list[ParsedDocument] = []
for index, markdown_path in enumerate(markdown_files, start=1):
content = compact_text(markdown_path.read_text(encoding="utf-8"))
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{target.name}::mineru-{index}",
source_file=target.name,
file_type="pdf",
content=content,
metadata={
"parser": "mineru:cli",
"mineru_output": str(markdown_path),
},
)
)
return docs
def parse_pdf_with_http(
pdf_path: str | Path,
mineru_config: dict[str, Any],
*,
min_chars: int,
) -> list[ParsedDocument]:
target = Path(pdf_path)
base_url = str(mineru_config.get("http_base_url") or "").rstrip("/")
if not base_url:
raise MinerUParseError("MinerU HTTP mode requires parsing.mineru.http_base_url")
headers = {}
if mineru_config.get("api_key"):
headers["Authorization"] = f"Bearer {mineru_config['api_key']}"
    # No universal MinerU HTTP contract is assumed here. This implementation
    # expects a replaceable service exposing POST /parse and returning
    # {"markdown": "..."} or {"documents": [{"content": "..."}]}.
with target.open("rb") as file:
response = requests.post(
f"{base_url}/parse",
files={"file": (target.name, file, "application/pdf")},
headers=headers,
timeout=int(mineru_config.get("timeout_seconds", 600)),
)
if response.status_code >= 400:
raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")
payload = response.json()
contents: list[str] = []
if isinstance(payload.get("documents"), list):
contents = [compact_text(item.get("content")) for item in payload["documents"]]
elif payload.get("markdown"):
contents = [compact_text(payload["markdown"])]
else:
raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`")
docs: list[ParsedDocument] = []
for index, content in enumerate(contents, start=1):
if len(content) < min_chars:
continue
docs.append(
ParsedDocument(
doc_id=f"{target.name}::mineru-http-{index}",
source_file=target.name,
file_type="pdf",
content=content,
metadata={"parser": "mineru:http"},
)
)
return docs
from __future__ import annotations
import os
from pathlib import Path
from typing import Any
import pandas as pd
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl
def run_ragas_eval(
config: dict[str, Any],
*,
input_path: str = "data/runs/ragas_input.jsonl",
output_csv_path: str = "data/reports/ragas_scores.csv",
) -> pd.DataFrame:
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.run_config import RunConfig
ragas_config = config["ragas"]
llm_api_key = _first_non_empty(ragas_config, "llm_api_key", "api_key")
llm_base_url = _first_non_empty(ragas_config, "llm_base_url", "base_url")
embedding_api_key = _first_non_empty(ragas_config, "embedding_api_key", "api_key")
embedding_base_url = _first_non_empty(ragas_config, "embedding_base_url", "base_url")
judge_model = str(require_config(config, "ragas.judge_model"))
embedding_model = str(require_config(config, "ragas.embedding_model"))
temperature = float(ragas_config.get("temperature", 0))
max_tokens = int(ragas_config.get("max_tokens", 4096))
timeout_seconds = int(ragas_config.get("timeout_seconds", 600))
max_workers = int(ragas_config.get("max_workers", 1))
os.environ["OPENAI_API_KEY"] = llm_api_key
if llm_base_url:
os.environ["OPENAI_BASE_URL"] = llm_base_url
rows = read_jsonl(input_path)
dataset = Dataset.from_list(
[
{
"user_input": row["user_input"],
"response": row["response"],
"retrieved_contexts": row["retrieved_contexts"],
"reference": row["reference"],
"reference_contexts": row.get("reference_contexts") or [],
}
for row in rows
]
)
metric_map = _metric_map()
selected_metrics = [
metric_map[name]
for name in ragas_config.get("metrics", metric_map.keys())
if name in metric_map
]
llm = ChatOpenAI(
model=judge_model,
api_key=llm_api_key,
base_url=llm_base_url or None,
temperature=temperature,
max_tokens=max_tokens,
)
embeddings = OpenAIEmbeddings(
model=embedding_model,
api_key=embedding_api_key,
base_url=embedding_base_url or None,
tiktoken_enabled=False,
check_embedding_ctx_length=False,
)
ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
run_config = RunConfig(timeout=timeout_seconds, max_workers=max_workers)
result = evaluate(
dataset,
metrics=selected_metrics,
llm=ragas_llm,
embeddings=ragas_embeddings,
run_config=run_config,
)
scores = result.to_pandas()
for index, row in enumerate(rows):
scores.loc[index, "sample_id"] = row.get("sample_id")
target = Path(output_csv_path)
target.parent.mkdir(parents=True, exist_ok=True)
scores.to_csv(target, index=False)
return scores
def _metric_map() -> dict[str, Any]:
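    # Older ragas releases expose module-level metric instances; newer releases
    # expose metric classes instead. Support both import styles.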
try:
from ragas.metrics import (
context_precision,
context_recall,
faithfulness,
factual_correctness,
response_relevancy,
)
return {
"faithfulness": faithfulness,
"response_relevancy": response_relevancy,
"context_precision": context_precision,
"context_recall": context_recall,
"factual_correctness": factual_correctness,
}
except ImportError:
from ragas.metrics import (
Faithfulness,
FactualCorrectness,
LLMContextPrecisionWithReference,
LLMContextRecall,
ResponseRelevancy,
)
return {
"faithfulness": Faithfulness(),
"response_relevancy": ResponseRelevancy(),
"context_precision": LLMContextPrecisionWithReference(),
"context_recall": LLMContextRecall(),
"factual_correctness": FactualCorrectness(),
}
def _first_non_empty(config: dict[str, Any], *keys: str) -> str:
for key in keys:
value = config.get(key)
if value not in {None, ""}:
return str(value)
raise ValueError(f"Missing required Ragas config value. Checked: {', '.join(keys)}")
def _wrap_langchain_models(llm: Any, embeddings: Any) -> tuple[Any, Any]:
try:
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
except ImportError:
return llm, embeddings
return LangchainLLMWrapper(llm), LangchainEmbeddingsWrapper(embeddings)
from __future__ import annotations
import math
from pathlib import Path
from typing import Any
import pandas as pd
from weknora_eval.loaders import read_jsonl
def retrieval_metrics(
ragas_rows: list[dict[str, Any]],
*,
ks: tuple[int, ...] = (1, 3, 5),
) -> dict[str, float]:
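    """Average hit@k, recall@k, MRR, and nDCG@5 over samples that carry
    gold_chunk_ids, comparing against the WeKnora reference chunk IDs."""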
samples = [row for row in ragas_rows if row.get("gold_chunk_ids")]
if not samples:
return {}
totals: dict[str, float] = {f"hit@{k}": 0.0 for k in ks}
totals.update({f"recall@{k}": 0.0 for k in ks})
totals["mrr"] = 0.0
totals["ndcg@5"] = 0.0
for row in samples:
gold = set(row.get("gold_chunk_ids") or [])
refs = row.get("weknora_references") or []
predicted = [str(ref.get("id")) for ref in refs if ref.get("id")]
for k in ks:
top_k = predicted[:k]
hits = len(gold.intersection(top_k))
totals[f"hit@{k}"] += 1.0 if hits else 0.0
totals[f"recall@{k}"] += hits / len(gold)
first_rank = next((idx for idx, chunk_id in enumerate(predicted, start=1) if chunk_id in gold), None)
if first_rank:
totals["mrr"] += 1 / first_rank
dcg = 0.0
for idx, chunk_id in enumerate(predicted[:5], start=1):
if chunk_id in gold:
dcg += 1 / math.log2(idx + 1)
ideal_hits = min(len(gold), 5)
idcg = sum(1 / math.log2(idx + 1) for idx in range(1, ideal_hits + 1))
totals["ndcg@5"] += dcg / idcg if idcg else 0.0
return {key: round(value / len(samples), 4) for key, value in totals.items()}
def generate_summary_report(
config: dict[str, Any],
*,
scores_csv_path: str = "data/reports/ragas_scores.csv",
ragas_input_path: str = "data/runs/ragas_input.jsonl",
answers_path: str = "data/runs/weknora_answers.jsonl",
output_path: str = "data/reports/summary.md",
) -> str:
ragas_rows = read_jsonl(ragas_input_path, missing_ok=True)
answer_rows = read_jsonl(answers_path, missing_ok=True)
scores = pd.read_csv(scores_csv_path) if Path(scores_csv_path).exists() else pd.DataFrame()
lines = [
"# Ragas 评估报告",
"",
"## 运行信息",
f"- WeKnora Base URL: {config.get('weknora', {}).get('base_url', '')}",
f"- 知识库 ID: {config.get('weknora', {}).get('knowledge_base_id', '')}",
f"- 测试集规模: {len(ragas_rows)}",
f"- 审核通过样本数: {len(ragas_rows)}",
f"- 失败样本数: {sum(1 for row in answer_rows if row.get('error'))}",
f"- Judge 模型: {config.get('ragas', {}).get('judge_model', '')}",
"",
"## 聚合指标",
"| 指标 | 平均值 | P50 | 失败阈值 |",
"| --- | --- | --- | --- |",
]
metric_columns = [
column
for column in scores.columns
if column not in {"sample_id", "user_input", "response", "reference"}
and pd.api.types.is_numeric_dtype(scores[column])
]
for column in metric_columns:
lines.append(
f"| {column} | {scores[column].mean():.4f} | {scores[column].median():.4f} | 0.50 |"
)
chunk_metrics = retrieval_metrics(ragas_rows)
if chunk_metrics:
lines.extend(["", "## Chunk ID 检索指标", "| 指标 | 平均值 |", "| --- | --- |"])
for key, value in chunk_metrics.items():
lines.append(f"| {key} | {value:.4f} |")
lines.extend(["", "## 检索失败样本", "| sample_id | 问题 | 预期文件 | 实际召回文件 | context_recall | 备注 |", "| --- | --- | --- | --- | --- | --- |"])
for row in _worst_rows(scores, "context_recall"):
sample = _sample_by_id(ragas_rows, row.get("sample_id"))
actual_files = sorted(
{
ref.get("knowledge_filename") or ""
for ref in sample.get("weknora_references", [])
if ref.get("knowledge_filename")
}
)
lines.append(
f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
f"{_cell(sample.get('source_file'))} | {_cell(', '.join(actual_files))} | "
f"{_score(row.get('context_recall'))} | |"
)
lines.extend(["", "## 生成失败样本", "| sample_id | 问题 | 模型答案 | 标准答案 | faithfulness | factual_correctness |", "| --- | --- | --- | --- | --- | --- |"])
for row in _worst_rows(scores, "faithfulness"):
sample = _sample_by_id(ragas_rows, row.get("sample_id"))
lines.append(
f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
f"{_cell(sample.get('response'))} | {_cell(sample.get('reference'))} | "
f"{_score(row.get('faithfulness'))} | {_score(row.get('factual_correctness'))} |"
)
empty_retrievals = sum(1 for row in ragas_rows if not row.get("retrieved_contexts"))
fallback_answers = sum(1 for row in answer_rows if row.get("is_fallback"))
source_counts: dict[str, int] = {}
for row in ragas_rows:
source = row.get("source_file") or "unknown"
source_counts[source] = source_counts.get(source, 0) + 1
lines.extend(
[
"",
"## 数据质量",
f"- 空检索数量: {empty_retrievals}",
f"- fallback 答案数量: {fallback_answers}",
f"- 来源文件分布: {source_counts}",
"",
"## 改进建议",
"- 优先检查 context_recall 低且 retrieved_contexts 为空的样本。",
"- 对低 faithfulness 且 context_recall 正常的样本,重点检查生成模型和提示词。",
"- 对 Chunk ID 指标低但 Ragas context 指标正常的样本,检查 chunk 切分或 gold_chunk_ids 标注。",
"",
]
)
content = "\n".join(lines)
target = Path(output_path)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(content, encoding="utf-8")
return content
def _worst_rows(scores: pd.DataFrame, column: str, *, limit: int = 10) -> list[dict[str, Any]]:
if scores.empty or column not in scores.columns:
return []
return scores.sort_values(column, ascending=True).head(limit).to_dict(orient="records")
def _sample_by_id(rows: list[dict[str, Any]], sample_id: Any) -> dict[str, Any]:
return next((row for row in rows if row.get("sample_id") == sample_id), {})
def _cell(value: Any, *, max_len: int = 120) -> str:
text = "" if value is None else " ".join(str(value).split())
text = text.replace("|", "\\|")
if len(text) <= max_len:
return text
return text[:max_len].rstrip() + "..."
def _score(value: Any) -> str:
try:
if pd.isna(value):
return ""
return f"{float(value):.4f}"
except (TypeError, ValueError):
return ""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from typing import Any
@dataclass
class ParsedDocument:
doc_id: str
source_file: str
file_type: str
content: str
page: int | None = None
sheet: str | None = None
row_index: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
return asdict(self)
@dataclass
class TestsetRecord:
sample_id: str
user_input: str
reference: str
reference_contexts: list[str]
source_file: str | None = None
gold_chunk_ids: list[str] = field(default_factory=list)
question_type: str = "single_hop"
review_status: str = "pending"
def to_dict(self) -> dict[str, Any]:
return asdict(self)
@dataclass
class WeKnoraAnswer:
sample_id: str
user_input: str
response: str
retrieved_contexts: list[str]
weknora_references: list[dict[str, Any]]
session_id: str | None = None
request_id: str | None = None
error: str | None = None
def to_dict(self) -> dict[str, Any]:
return asdict(self)
from __future__ import annotations
import json
from collections.abc import Iterable, Iterator
from typing import Any
def parse_sse_events(lines: Iterable[str | bytes]) -> Iterator[dict[str, Any]]:
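    """Parse a raw SSE line stream into {"event": ..., "data": ...} dicts.

    Handles `event:` and `data:` fields and skips ":" comment lines; multi-line
    data payloads are joined with newlines before JSON decoding is attempted.
    """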
event_name = "message"
data_lines: list[str] = []
for raw_line in lines:
line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
line = line.rstrip("\r\n")
if not line:
if data_lines:
yield _build_event(event_name, data_lines)
event_name = "message"
data_lines = []
continue
if line.startswith(":"):
continue
if line.startswith("event:"):
event_name = line.removeprefix("event:").strip()
continue
if line.startswith("data:"):
data_lines.append(line.removeprefix("data:").strip())
if data_lines:
yield _build_event(event_name, data_lines)
def _build_event(event_name: str, data_lines: list[str]) -> dict[str, Any]:
raw_data = "\n".join(data_lines)
parsed_data: Any = raw_data
if raw_data and raw_data != "[DONE]":
try:
parsed_data = json.loads(raw_data)
except json.JSONDecodeError:
parsed_data = raw_data
return {"event": event_name, "data": parsed_data}
def normalize_reference(reference: dict[str, Any]) -> dict[str, Any]:
return {
"id": reference.get("id"),
"content": reference.get("content") or "",
"knowledge_id": reference.get("knowledge_id"),
"chunk_index": reference.get("chunk_index"),
"score": reference.get("score"),
"knowledge_filename": reference.get("knowledge_filename")
or reference.get("knowledge_title"),
"match_type": reference.get("match_type"),
"chunk_type": reference.get("chunk_type"),
}
from __future__ import annotations
from typing import Any
from weknora_eval.loaders import read_jsonl, write_jsonl
from weknora_eval.schemas import TestsetRecord
def generate_rule_based_testset(
*,
documents_path: str = "data/parsed_docs/documents.jsonl",
output_path: str = "data/testsets/testset.raw.jsonl",
size: int = 50,
min_context_chars: int = 80,
) -> list[dict[str, Any]]:
documents = [
row
for row in read_jsonl(documents_path)
if len(row.get("content") or "") >= min_context_chars
]
rows: list[dict[str, Any]] = []
for index, document in enumerate(documents[:size], start=1):
context = document["content"]
source_file = document.get("source_file")
question = _default_question(document)
reference = _reference_from_context(context)
rows.append(
TestsetRecord(
sample_id=f"qa-{index:04d}",
user_input=question,
reference=reference,
reference_contexts=[context],
source_file=source_file,
question_type="single_hop",
review_status="pending",
).to_dict()
)
write_jsonl(output_path, rows)
return rows
def approve_pending_testset(
*,
input_path: str = "data/testsets/testset.raw.jsonl",
output_path: str = "data/testsets/testset.reviewed.jsonl",
) -> list[dict[str, Any]]:
rows = read_jsonl(input_path)
reviewed: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
if row.get("review_status") == "rejected":
continue
row["review_status"] = "approved"
reviewed.append(row)
write_jsonl(output_path, reviewed)
return reviewed
def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]:
errors: list[str] = []
for index, row in enumerate(read_jsonl(path), start=1):
prefix = f"{path}:{index}"
if row.get("review_status") != "approved":
errors.append(f"{prefix} review_status must be approved")
for key in ("sample_id", "user_input", "reference"):
if not row.get(key):
errors.append(f"{prefix} missing {key}")
if not row.get("reference_contexts"):
errors.append(f"{prefix} reference_contexts must be non-empty")
return errors
def _default_question(document: dict[str, Any]) -> str:
source = document.get("source_file") or "该文档"
if document.get("file_type") == "xlsx" and document.get("sheet"):
return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?"
if document.get("page"):
return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?"
return f"请根据 {source} 回答:该片段的主要内容是什么?"
def _reference_from_context(context: str, *, max_chars: int = 500) -> str:
text = " ".join(context.split())
if len(text) <= max_chars:
return text
return text[:max_chars].rstrip() + "..."