Commit 147c79e0 147c79e07bbaa525c4f6c6ac143e9c5c109362f8 by 沈秋雨

Initial WeKnora Ragas evaluation project

0 parents
1 WEKNORA_BASE_URL=http://localhost:8080/api/v1
2 WEKNORA_API_KEY=
3 WEKNORA_KB_ID=
4 WEKNORA_KB_NAME=ragas-eval-pilot
5
6 # Ragas generation and judge models. These are evaluation-side models, not the
7 # model configuration used by the WeKnora backend.
8 OPENAI_API_KEY=replace-me
9 OPENAI_BASE_URL=https://api.openai.com/v1
10
11 # Optional split deployment. Use these when LLM and embedding are served by
12 # different OpenAI-compatible services, such as vLLM + Infinity.
13 RAGAS_LLM_API_KEY=replace-me
14 RAGAS_LLM_BASE_URL=http://localhost:8000/v1
15 RAGAS_EMBEDDING_API_KEY=replace-me
16 RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
17 RAGAS_RERANKER_API_KEY=replace-me
18 RAGAS_RERANKER_BASE_URL=http://localhost:7998/v1
19 RAGAS_RERANKER_MODEL=replace-me
20
21 RAGAS_GENERATOR_MODEL=gpt-4o-mini
22 RAGAS_JUDGE_MODEL=gpt-4o-mini
23 RAGAS_EMBEDDING_MODEL=text-embedding-3-small
24
25 TESTSET_SIZE=50
26 REQUEST_INTERVAL_SECONDS=0.2
1 .env
2 .venv/
3 __pycache__/
4 *.py[cod]
5 *.egg-info/
6 .pytest_cache/
7 .ruff_cache/
8
9 data/raw_docs/pdf/*
10 data/raw_docs/xlsx/*
11 data/parsed_docs/*.json
12 data/parsed_docs/*.jsonl
13 data/parsed_docs/mineru_raw/*
14 data/exported/*.json
15 data/exported/*.jsonl
16 data/testsets/*.jsonl
17 data/runs/*.jsonl
18 data/reports/*.csv
19 data/reports/*.md
20
21 !data/raw_docs/pdf/.gitkeep
22 !data/raw_docs/xlsx/.gitkeep
23 !data/parsed_docs/mineru_raw/.gitkeep
1 # WeKnora Ragas Eval
2
3 独立的 WeKnora Ragas 评估项目。它只调用 WeKnora 公开 API,不依赖 WeKnora 内置的 `/evaluation` 接口。
4
5 ## 安装
6
7 ```bash
8 python -m venv .venv
9 source .venv/bin/activate
10 pip install -e .
11 ```
12
13 如果需要更好的 PDF 解析能力:
14
15 ```bash
16 pip install -e ".[pdf]"
17 ```
18
19 开发和测试工具:
20
21 ```bash
22 pip install -e ".[dev,pdf]"
23 ```
24
25 ## 配置
26
27 ```bash
28 cp .env.example .env
29 ```
30
31 编辑 `.env` 后确认:
32
33 - `WEKNORA_BASE_URL` 指向 WeKnora API v1,例如 `http://localhost:9090/api/v1`
34 - `WEKNORA_API_KEY` 是 WeKnora API Key
35 - `WEKNORA_KB_ID` 是目标知识库 ID;如果还没有,先运行 `python scripts/00_create_kb.py`
36 - `WEKNORA_KB_NAME` 是创建知识库时使用的名称
37 - `OPENAI_API_KEY``OPENAI_BASE_URL``RAGAS_*_MODEL` 是评估侧模型配置
38 - 如果 LLM 和 embedding 分开部署,使用 `RAGAS_LLM_BASE_URL` 指向 vLLM 的 `/v1`,使用 `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity 的 `/v1`
39
40 ## 首轮 Pilot
41
42 把原始文件放到:
43
44 - `data/raw_docs/pdf/`
45 - `data/raw_docs/xlsx/`
46
47 按顺序执行:
48
49 ```bash
50 python scripts/00_create_kb.py
51 python scripts/01_upload_docs.py
52 python scripts/02_wait_ingestion.py
53 python scripts/03_export_chunks.py
54 python scripts/04_parse_docs.py
55 python scripts/05_generate_testset.py
56 python scripts/06_review_testset.py
57 python scripts/07_run_weknora_qa.py
58 python scripts/08_build_ragas_input.py
59 python scripts/09_run_ragas_eval.py
60 python scripts/10_report.py
61 ```
62
63 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts``response`、Ragas 输入字段都正常后再扩展样本量。
64
65 ## 主要产物
66
67 - `data/exported/knowledge.jsonl`
68 - `data/exported/chunks.jsonl`
69 - `data/parsed_docs/documents.jsonl`
70 - `data/parsed_docs/parse_summary.json`
71 - `data/testsets/testset.raw.jsonl`
72 - `data/testsets/testset.reviewed.jsonl`
73 - `data/runs/weknora_answers.jsonl`
74 - `data/runs/ragas_input.jsonl`
75 - `data/reports/ragas_scores.csv`
76 - `data/reports/summary.md`
1 weknora:
2 base_url: "${WEKNORA_BASE_URL}"
3 api_key: "${WEKNORA_API_KEY}"
4 knowledge_base_id: "${WEKNORA_KB_ID}"
5 knowledge_base_name: "${WEKNORA_KB_NAME:-ragas-eval-pilot}"
6 knowledge_base_description: "Knowledge base for independent Ragas evaluation."
7 timeout_seconds: 300
8 request_interval_seconds: "${REQUEST_INTERVAL_SECONDS:-0.2}"
9
10 testset:
11 size: "${TESTSET_SIZE:-50}"
12 include_pdf: true
13 include_xlsx: true
14 min_context_chars: 80
15 require_manual_review: true
16
17 parsing:
18 provider: "local"
19 output_path: "data/parsed_docs/documents.jsonl"
20 failed_path: "data/parsed_docs/failed_parse.jsonl"
21 summary_path: "data/parsed_docs/parse_summary.json"
22 local:
23 pdf_backend: "pymupdf"
24 xlsx_mode: "row_text"
25 min_chars: 80
26 mineru:
27 mode: "cli"
28 cli_bin: "mineru"
29 output_dir: "data/parsed_docs/mineru_raw"
30 http_base_url: "http://172.23.184.9:8002"
31 api_key: "mineru"
32 timeout_seconds: 600
33 fallback_to_local: false
34
35 qa:
36 one_session_per_question: true
37 disable_title: true
38 enable_memory: false
39 channel: "api"
40 verify_with_messages: false
41
42 ragas:
43 provider: "openai-compatible"
44 # Backward-compatible defaults. If the split LLM/embedding values below are
45 # empty, these values are used for both clients.
46 api_key: "${OPENAI_API_KEY}"
47 base_url: "${OPENAI_BASE_URL}"
48 # vLLM OpenAI-compatible endpoint, for example http://localhost:8000/v1.
49 llm_api_key: "${RAGAS_LLM_API_KEY}"
50 llm_base_url: "${RAGAS_LLM_BASE_URL}"
51 # Infinity OpenAI-compatible embedding endpoint, for example
52 # http://localhost:7997/v1.
53 embedding_api_key: "${RAGAS_EMBEDDING_API_KEY}"
54 embedding_base_url: "${RAGAS_EMBEDDING_BASE_URL}"
55 # Reserved for future retrieval/rerank metrics. The current Ragas pipeline
56 # does not call reranker APIs.
57 reranker_api_key: "${RAGAS_RERANKER_API_KEY}"
58 reranker_base_url: "${RAGAS_RERANKER_BASE_URL}"
59 reranker_model: "${RAGAS_RERANKER_MODEL}"
60 generator_model: "${RAGAS_GENERATOR_MODEL}"
61 judge_model: "${RAGAS_JUDGE_MODEL}"
62 embedding_model: "${RAGAS_EMBEDDING_MODEL}"
63 temperature: 0
64 max_tokens: 4096
65 timeout_seconds: 600
66 max_workers: 1
67 metrics:
68 - faithfulness
69 - response_relevancy
70 - context_precision
71 - context_recall
72 - factual_correctness
1 [project]
2 name = "weknora-ragas-eval"
3 version = "0.1.0"
4 description = "Independent Ragas evaluation pipeline for WeKnora public APIs."
5 readme = "README.md"
6 requires-python = ">=3.10"
7 dependencies = [
8 "ragas>=0.3,<0.5",
9 "datasets>=2.19.0",
10 "pandas>=2.2.0",
11 "openpyxl>=3.1.0",
12 "requests>=2.32.0",
13 "sseclient-py>=1.8.0",
14 "python-dotenv>=1.0.0",
15 "pyyaml>=6.0.0",
16 "langchain>=0.2.0",
17 "langchain-community>=0.2.0",
18 "langchain-openai>=0.1.0",
19 "pypdf>=4.2.0"
20 ]
21
22 [project.optional-dependencies]
23 pdf = [
24 "pymupdf>=1.24.0",
25 "pdfplumber>=0.11.0"
26 ]
27 dev = [
28 "ruff>=0.6.0",
29 "pytest>=8.0.0"
30 ]
31
32 [build-system]
33 requires = ["setuptools>=68"]
34 build-backend = "setuptools.build_meta"
35
36 [tool.setuptools.packages.find]
37 where = ["src"]
38
39 [tool.ruff]
40 line-length = 100
41 target-version = "py310"
42
43 [tool.ruff.lint]
44 select = ["E", "F", "I", "UP", "B"]
1 from __future__ import annotations
2
3 import sys
4 from typing import Any
5
6 import _bootstrap # noqa: F401
7
8 from weknora_eval.api import bootstrap_client_from_config
9 from weknora_eval.config import load_config, require_config
10 from weknora_eval.envfile import set_env_value
11 from weknora_eval.loaders import setup_logging, write_json
12
13
14 def main() -> int:
15 setup_logging()
16 config = load_config()
17 client = bootstrap_client_from_config(config)
18 weknora = config["weknora"]
19
20 existing_id = str(weknora.get("knowledge_base_id") or "")
21 name = str(require_config(config, "weknora.knowledge_base_name"))
22 if existing_id and existing_id != "replace-me":
23 record = {"id": existing_id, "name": name, "source": "env"}
24 write_json("data/exported/knowledge_base.json", record)
25 print(f"WEKNORA_KB_ID already set: {existing_id}")
26 return 0
27
28 created = client.create_knowledge_base(name=name)
29 knowledge_base_id = _extract_knowledge_base_id(created)
30 if not knowledge_base_id:
31 print(f"Created knowledge base but could not extract id from response: {created}")
32 return 1
33
34 set_env_value(".env", "WEKNORA_KB_ID", knowledge_base_id)
35 write_json("data/exported/knowledge_base.json", {**created, "source": "create"})
36 print(f"WEKNORA_KB_ID={knowledge_base_id}")
37 print("Wrote ID to .env and data/exported/knowledge_base.json")
38 return 0
39
40
41 def _extract_knowledge_base_id(payload: dict[str, Any]) -> str | None:
42 candidates = [payload]
43 for key in ("data", "knowledge_base"):
44 nested = payload.get(key)
45 if isinstance(nested, dict):
46 candidates.append(nested)
47
48 for row in candidates:
49 for key in ("id", "knowledge_base_id", "kb_id", "uuid"):
50 value = row.get(key)
51 if value:
52 return str(value)
53 return None
54
55
56 if __name__ == "__main__":
57 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4 from pathlib import Path
5
6 import _bootstrap # noqa: F401
7
8 from weknora_eval.api import client_from_config
9 from weknora_eval.config import load_config
10 from weknora_eval.loaders import setup_logging, write_jsonl
11
12
13 def main() -> int:
14 setup_logging()
15 config = load_config()
16 client = client_from_config(config)
17 files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
18 Path("data/raw_docs/xlsx").glob("*.xlsx")
19 )
20 rows = []
21 for path in files:
22 data = client.upload_file(path)
23 rows.append(
24 {
25 "knowledge_id": data.get("id"),
26 "file_name": data.get("file_name") or data.get("title") or path.name,
27 "file_type": data.get("file_type") or path.suffix.lstrip("."),
28 "parse_status": data.get("parse_status"),
29 "enable_status": data.get("enable_status"),
30 "raw": data,
31 }
32 )
33 write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
34 print(f"Uploaded {len(rows)} files")
35 return 0
36
37
38 if __name__ == "__main__":
39 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.api import client_from_config
8 from weknora_eval.config import load_config
9 from weknora_eval.loaders import read_jsonl, setup_logging, write_jsonl
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 client = client_from_config(config)
16 uploads = read_jsonl("data/exported/knowledge_uploads.jsonl", missing_ok=True)
17 knowledge_ids = {row["knowledge_id"] for row in uploads if row.get("knowledge_id")} or None
18 result = client.wait_ingestion_completed(knowledge_ids=knowledge_ids)
19 knowledge = client.list_knowledge()
20 write_jsonl("data/exported/knowledge.jsonl", knowledge)
21
22 print(
23 "Ingestion status: "
24 f"completed={len(result['completed'])} failed={len(result['failed'])} "
25 f"pending={len(result['pending'])}"
26 )
27 return 1 if result["failed"] or result["pending"] else 0
28
29
30 if __name__ == "__main__":
31 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.api import client_from_config
8 from weknora_eval.config import load_config
9 from weknora_eval.loaders import setup_logging, write_jsonl
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 client = client_from_config(config)
16 knowledge_rows = client.list_knowledge()
17 write_jsonl("data/exported/knowledge.jsonl", knowledge_rows)
18 knowledge_by_id = {row.get("id"): row for row in knowledge_rows}
19
20 chunk_rows = []
21 for knowledge in knowledge_rows:
22 knowledge_id = knowledge.get("id")
23 if not knowledge_id:
24 continue
25 if knowledge.get("parse_status") != "completed" or knowledge.get("enable_status") != "enabled":
26 continue
27 for chunk in client.list_chunks(str(knowledge_id)):
28 content = (chunk.get("content") or "").strip()
29 if not content:
30 continue
31 if chunk.get("is_enabled") is False:
32 continue
33 source = knowledge_by_id.get(chunk.get("knowledge_id")) or knowledge
34 chunk_rows.append(
35 {
36 "chunk_id": chunk.get("id"),
37 "knowledge_id": chunk.get("knowledge_id") or knowledge_id,
38 "knowledge_base_id": chunk.get("knowledge_base_id")
39 or config["weknora"]["knowledge_base_id"],
40 "chunk_index": chunk.get("chunk_index"),
41 "content": content,
42 "source_file": source.get("file_name") or source.get("title"),
43 "chunk_type": chunk.get("chunk_type"),
44 "raw": chunk,
45 }
46 )
47 write_jsonl("data/exported/chunks.jsonl", chunk_rows)
48 print(f"Exported {len(chunk_rows)} chunks from {len(knowledge_rows)} knowledge records")
49 return 0
50
51
52 if __name__ == "__main__":
53 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.parsers.local import parse_raw_docs
10 from weknora_eval.parsers.mineru import parse_with_mineru
11
12
13 def main() -> int:
14 setup_logging()
15 config = load_config()
16 provider = config.get("parsing", {}).get("provider", "local")
17 if provider == "local":
18 rows, summary = parse_raw_docs(config)
19 elif provider == "mineru":
20 rows, summary = parse_with_mineru(config)
21 else:
22 raise ValueError(f"Unsupported parsing provider: {provider}")
23 print(f"Parsed {len(rows)} documents: {summary}")
24 return 0 if rows else 1
25
26
27 if __name__ == "__main__":
28 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.testset import generate_rule_based_testset
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 testset = config.get("testset", {})
16 rows = generate_rule_based_testset(
17 size=int(testset.get("size", 50)),
18 min_context_chars=int(testset.get("min_context_chars", 80)),
19 )
20 print(f"Generated {len(rows)} pending QA candidates at data/testsets/testset.raw.jsonl")
21 return 0 if rows else 1
22
23
24 if __name__ == "__main__":
25 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.loaders import setup_logging
8 from weknora_eval.testset import approve_pending_testset, validate_reviewed_testset
9
10
11 def main() -> int:
12 setup_logging()
13 rows = approve_pending_testset()
14 errors = validate_reviewed_testset()
15 if errors:
16 for error in errors:
17 print(error)
18 return 1
19 print(f"Wrote {len(rows)} approved QA records to data/testsets/testset.reviewed.jsonl")
20 return 0 if rows else 1
21
22
23 if __name__ == "__main__":
24 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.api import client_from_config
8 from weknora_eval.config import load_config
9 from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 client = client_from_config(config)
16 qa_config = config.get("qa", {})
17 rows = [row for row in read_jsonl("data/testsets/testset.reviewed.jsonl") if row.get("review_status") == "approved"]
18 answers = []
19
20 for index, row in enumerate(rows, start=1):
21 sample_id = row["sample_id"]
22 try:
23 session = client.create_session(title=f"ragas-eval-{sample_id}")
24 session_id = session.get("id")
25 if not session_id:
26 raise RuntimeError(f"create_session returned no id for {sample_id}")
27 result = client.knowledge_chat_sse(
28 session_id=session_id,
29 query=row["user_input"],
30 disable_title=bool(qa_config.get("disable_title", True)),
31 enable_memory=bool(qa_config.get("enable_memory", False)),
32 channel=str(qa_config.get("channel", "api")),
33 )
34 answer = {
35 "sample_id": sample_id,
36 "user_input": row["user_input"],
37 "session_id": session_id,
38 "request_id": result.get("request_id"),
39 "response": result.get("response") or "",
40 "retrieved_contexts": result.get("retrieved_contexts") or [],
41 "weknora_references": result.get("weknora_references") or [],
42 "error": None,
43 }
44 if not answer["response"]:
45 answer["error"] = "empty_response"
46 append_jsonl("data/runs/failed_requests.jsonl", answer)
47 elif not answer["retrieved_contexts"]:
48 append_jsonl("data/runs/failed_requests.jsonl", {**answer, "error": "empty_retrieval"})
49 answers.append(answer)
50 print(f"[{index}/{len(rows)}] {sample_id} response_chars={len(answer['response'])}")
51 except Exception as exc: # noqa: BLE001
52 failed = {
53 "sample_id": sample_id,
54 "user_input": row.get("user_input"),
55 "response": "",
56 "retrieved_contexts": [],
57 "weknora_references": [],
58 "session_id": None,
59 "request_id": None,
60 "error": str(exc),
61 }
62 answers.append(failed)
63 append_jsonl("data/runs/failed_requests.jsonl", failed)
64 print(f"[{index}/{len(rows)}] {sample_id} failed: {exc}")
65
66 write_jsonl("data/runs/weknora_answers.jsonl", answers)
67 failures = [row for row in answers if row.get("error") and row.get("error") != "empty_retrieval"]
68 return 1 if failures else 0
69
70
71 if __name__ == "__main__":
72 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl
8
9
10 def main() -> int:
11 setup_logging()
12 testset = {
13 row["sample_id"]: row
14 for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
15 if row.get("review_status") == "approved"
16 }
17 answers = {row["sample_id"]: row for row in read_jsonl("data/runs/weknora_answers.jsonl")}
18 ragas_rows = []
19 for sample_id, qa in testset.items():
20 answer = answers.get(sample_id)
21 if not answer:
22 append_jsonl("data/runs/failed_requests.jsonl", {"sample_id": sample_id, "error": "missing_answer"})
23 continue
24 row = {
25 "sample_id": sample_id,
26 "user_input": qa["user_input"],
27 "response": answer.get("response") or "",
28 "retrieved_contexts": answer.get("retrieved_contexts") or [],
29 "reference": qa["reference"],
30 "reference_contexts": qa.get("reference_contexts") or [],
31 "session_id": answer.get("session_id"),
32 "request_id": answer.get("request_id"),
33 "weknora_references": answer.get("weknora_references") or [],
34 "source_file": qa.get("source_file"),
35 "gold_chunk_ids": qa.get("gold_chunk_ids") or [],
36 }
37 missing = [
38 key
39 for key in ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts")
40 if not row.get(key)
41 ]
42 if missing:
43 append_jsonl(
44 "data/runs/failed_requests.jsonl",
45 {"sample_id": sample_id, "error": f"missing_ragas_fields:{','.join(missing)}"},
46 )
47 continue
48 ragas_rows.append(row)
49
50 write_jsonl("data/runs/ragas_input.jsonl", ragas_rows)
51 print(f"Built {len(ragas_rows)} Ragas input rows")
52 return 0 if ragas_rows else 1
53
54
55 if __name__ == "__main__":
56 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.ragas_runner import run_ragas_eval
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 scores = run_ragas_eval(config)
16 print(f"Wrote {len(scores)} Ragas score rows to data/reports/ragas_scores.csv")
17 return 0
18
19
20 if __name__ == "__main__":
21 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4
5 import _bootstrap # noqa: F401
6
7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.report import generate_summary_report
10
11
12 def main() -> int:
13 setup_logging()
14 config = load_config()
15 generate_summary_report(config)
16 print("Wrote report to data/reports/summary.md")
17 return 0
18
19
20 if __name__ == "__main__":
21 sys.exit(main())
1 from __future__ import annotations
2
3 import sys
4 from pathlib import Path
5
6
7 PROJECT_ROOT = Path(__file__).resolve().parents[1]
8 SRC = PROJECT_ROOT / "src"
9 if str(SRC) not in sys.path:
10 sys.path.insert(0, str(SRC))
1 """Independent Ragas evaluation pipeline for WeKnora."""
2
3 __all__ = [
4 "__version__",
5 ]
6
7 __version__ = "0.1.0"
1 from __future__ import annotations
2
3 import os
4 import re
5 from pathlib import Path
6 from typing import Any
7
8 import yaml
9 from dotenv import load_dotenv
10
11
12 _ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}")
13
14
15 def _expand_env(value: Any) -> Any:
16 if isinstance(value, dict):
17 return {key: _expand_env(item) for key, item in value.items()}
18 if isinstance(value, list):
19 return [_expand_env(item) for item in value]
20 if not isinstance(value, str):
21 return value
22
23 def replace(match: re.Match[str]) -> str:
24 default = match.group(2) if match.group(2) is not None else ""
25 return os.getenv(match.group(1), default)
26
27 expanded = _ENV_PATTERN.sub(replace, value)
28 return _coerce_scalar(expanded)
29
30
31 def _coerce_scalar(value: str) -> Any:
32 lowered = value.lower()
33 if lowered in {"true", "false"}:
34 return lowered == "true"
35 if lowered in {"none", "null"}:
36 return None
37 try:
38 if "." not in value:
39 return int(value)
40 return float(value)
41 except ValueError:
42 return value
43
44
45 def load_config(path: str | Path = "configs/eval.yaml") -> dict[str, Any]:
46 load_dotenv()
47 config_path = Path(path)
48 with config_path.open("r", encoding="utf-8") as file:
49 raw = yaml.safe_load(file) or {}
50 return _expand_env(raw)
51
52
53 def require_config(config: dict[str, Any], dotted_key: str) -> Any:
54 current: Any = config
55 for part in dotted_key.split("."):
56 if not isinstance(current, dict) or part not in current:
57 raise ValueError(f"Missing required config value: {dotted_key}")
58 value = current[part]
59 if value is None or value == "":
60 raise ValueError(f"Missing required config value: {dotted_key}")
61 current = value
62 return current
63
64
65 def project_path(*parts: str) -> Path:
66 return Path.cwd().joinpath(*parts)
1 from __future__ import annotations
2
3 from pathlib import Path
4
5
6 def set_env_value(path: str | Path, key: str, value: str) -> None:
7 target = Path(path)
8 lines = target.read_text(encoding="utf-8").splitlines() if target.exists() else []
9 prefix = f"{key}="
10 replacement = f"{key}={value}"
11 updated = False
12 output: list[str] = []
13
14 for line in lines:
15 if line.startswith(prefix):
16 output.append(replacement)
17 updated = True
18 else:
19 output.append(line)
20
21 if not updated:
22 output.append(replacement)
23
24 target.write_text("\n".join(output) + "\n", encoding="utf-8")
1 from __future__ import annotations
2
3 import json
4 import logging
5 from collections.abc import Iterable
6 from pathlib import Path
7 from typing import Any
8
9
10 def setup_logging(level: int = logging.INFO) -> None:
11 logging.basicConfig(
12 level=level,
13 format="%(asctime)s %(levelname)s %(name)s: %(message)s",
14 )
15
16
17 def ensure_parent(path: str | Path) -> Path:
18 target = Path(path)
19 target.parent.mkdir(parents=True, exist_ok=True)
20 return target
21
22
23 def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]:
24 target = Path(path)
25 if not target.exists():
26 if missing_ok:
27 return []
28 raise FileNotFoundError(target)
29
30 rows: list[dict[str, Any]] = []
31 with target.open("r", encoding="utf-8") as file:
32 for line_no, line in enumerate(file, start=1):
33 stripped = line.strip()
34 if not stripped:
35 continue
36 try:
37 rows.append(json.loads(stripped))
38 except json.JSONDecodeError as exc:
39 raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
40 return rows
41
42
43 def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]:
44 target = Path(path)
45 if not target.exists():
46 if missing_ok:
47 return
48 raise FileNotFoundError(target)
49
50 with target.open("r", encoding="utf-8") as file:
51 for line_no, line in enumerate(file, start=1):
52 stripped = line.strip()
53 if not stripped:
54 continue
55 try:
56 yield json.loads(stripped)
57 except json.JSONDecodeError as exc:
58 raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc
59
60
61 def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int:
62 target = ensure_parent(path)
63 count = 0
64 with target.open("w", encoding="utf-8") as file:
65 for row in rows:
66 file.write(json.dumps(row, ensure_ascii=False) + "\n")
67 count += 1
68 return count
69
70
71 def append_jsonl(path: str | Path, row: dict[str, Any]) -> None:
72 target = ensure_parent(path)
73 with target.open("a", encoding="utf-8") as file:
74 file.write(json.dumps(row, ensure_ascii=False) + "\n")
75
76
77 def write_json(path: str | Path, payload: dict[str, Any]) -> None:
78 target = ensure_parent(path)
79 with target.open("w", encoding="utf-8") as file:
80 json.dump(payload, file, ensure_ascii=False, indent=2)
81 file.write("\n")
82
83
84 def compact_text(value: Any) -> str:
85 text = "" if value is None else str(value)
86 return "\n".join(line.strip() for line in text.splitlines() if line.strip()).strip()
1 """Document parser adapters."""
1 from __future__ import annotations
2
3 import statistics
4 from pathlib import Path
5 from typing import Any
6
7 from openpyxl import load_workbook
8
9 from weknora_eval.loaders import compact_text, write_json, write_jsonl
10 from weknora_eval.schemas import ParsedDocument
11
12
13 def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
14 parsing = config["parsing"]
15 local_config = parsing.get("local", {})
16 min_chars = int(local_config.get("min_chars", 80))
17 pdf_backend = local_config.get("pdf_backend", "pypdf")
18 xlsx_mode = local_config.get("xlsx_mode", "row_text")
19
20 docs: list[ParsedDocument] = []
21 failures: list[dict[str, Any]] = []
22
23 for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
24 try:
25 docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars))
26 except Exception as exc: # noqa: BLE001 - parser failures must be persisted.
27 failures.append(
28 {
29 "source_file": pdf_path.name,
30 "parser": f"local:{pdf_backend}",
31 "status": "failed",
32 "error": str(exc),
33 "fallback_used": None,
34 }
35 )
36
37 for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")):
38 try:
39 docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars))
40 except Exception as exc: # noqa: BLE001
41 failures.append(
42 {
43 "source_file": xlsx_path.name,
44 "parser": "local:openpyxl",
45 "status": "failed",
46 "error": str(exc),
47 "fallback_used": None,
48 }
49 )
50
51 rows = [doc.to_dict() for doc in docs]
52 write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
53 if failures:
54 write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)
55
56 summary = build_parse_summary(rows, failures, parser=f"local:{pdf_backend}")
57 write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
58 return rows, summary
59
60
61 def parse_pdf(path: str | Path, *, backend: str = "pypdf", min_chars: int = 80) -> list[ParsedDocument]:
62 target = Path(path)
63 backend = backend.lower()
64 if backend == "pymupdf":
65 return _parse_pdf_pymupdf(target, min_chars=min_chars)
66 if backend == "pdfplumber":
67 return _parse_pdf_pdfplumber(target, min_chars=min_chars)
68 if backend == "pypdf":
69 return _parse_pdf_pypdf(target, min_chars=min_chars)
70 raise ValueError(f"Unsupported PDF backend: {backend}")
71
72
73 def _parse_pdf_pypdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
74 from pypdf import PdfReader
75
76 reader = PdfReader(str(path))
77 docs: list[ParsedDocument] = []
78 for index, page in enumerate(reader.pages, start=1):
79 content = compact_text(page.extract_text() or "")
80 if len(content) < min_chars:
81 continue
82 docs.append(_pdf_doc(path, index, content, "local:pypdf"))
83 return docs
84
85
86 def _parse_pdf_pymupdf(path: Path, *, min_chars: int) -> list[ParsedDocument]:
87 try:
88 import fitz
89 except ImportError as exc:
90 raise ImportError("pymupdf backend requires `pip install -e '.[pdf]'`") from exc
91
92 docs: list[ParsedDocument] = []
93 with fitz.open(path) as document:
94 for index, page in enumerate(document, start=1):
95 content = compact_text(page.get_text("text"))
96 if len(content) < min_chars:
97 continue
98 docs.append(_pdf_doc(path, index, content, "local:pymupdf"))
99 return docs
100
101
102 def _parse_pdf_pdfplumber(path: Path, *, min_chars: int) -> list[ParsedDocument]:
103 try:
104 import pdfplumber
105 except ImportError as exc:
106 raise ImportError("pdfplumber backend requires `pip install -e '.[pdf]'`") from exc
107
108 docs: list[ParsedDocument] = []
109 with pdfplumber.open(path) as pdf:
110 for index, page in enumerate(pdf.pages, start=1):
111 content = compact_text(page.extract_text() or "")
112 if len(content) < min_chars:
113 continue
114 docs.append(_pdf_doc(path, index, content, "local:pdfplumber"))
115 return docs
116
117
118 def _pdf_doc(path: Path, page: int, content: str, parser: str) -> ParsedDocument:
119 return ParsedDocument(
120 doc_id=f"{path.name}::page-{page}",
121 source_file=path.name,
122 file_type="pdf",
123 page=page,
124 content=content,
125 metadata={"parser": parser},
126 )
127
128
129 def parse_xlsx(path: str | Path, *, mode: str = "row_text", min_chars: int = 80) -> list[ParsedDocument]:
130 target = Path(path)
131 mode = mode.lower()
132 workbook = load_workbook(target, data_only=True, read_only=True)
133 if mode == "row_text":
134 return _parse_xlsx_row_text(target, workbook, min_chars=min_chars)
135 if mode == "markdown_table":
136 return _parse_xlsx_markdown_table(target, workbook, min_chars=min_chars)
137 raise ValueError(f"Unsupported XLSX mode: {mode}")
138
139
140 def _parse_xlsx_row_text(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
141 docs: list[ParsedDocument] = []
142 for sheet in workbook.worksheets:
143 rows = list(sheet.iter_rows(values_only=True))
144 if not rows:
145 continue
146 headers = [_cell_to_text(value) or f"col_{index}" for index, value in enumerate(rows[0], start=1)]
147 for row_index, row in enumerate(rows[1:], start=2):
148 pairs = []
149 for header, value in zip(headers, row, strict=False):
150 cell = _cell_to_text(value)
151 if cell:
152 pairs.append(f"{header}: {cell}")
153 content = "\n".join(pairs).strip()
154 if len(content) < min_chars:
155 continue
156 docs.append(
157 ParsedDocument(
158 doc_id=f"{path.name}::{sheet.title}::row-{row_index}",
159 source_file=path.name,
160 file_type="xlsx",
161 sheet=sheet.title,
162 row_index=row_index,
163 content=content,
164 metadata={"parser": "local:openpyxl", "columns": headers},
165 )
166 )
167 return docs
168
169
170 def _parse_xlsx_markdown_table(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]:
171 docs: list[ParsedDocument] = []
172 for sheet in workbook.worksheets:
173 rows = [
174 [_cell_to_text(value) for value in row]
175 for row in sheet.iter_rows(values_only=True)
176 if any(value is not None for value in row)
177 ]
178 if not rows:
179 continue
180 width = max(len(row) for row in rows)
181 normalized = [row + [""] * (width - len(row)) for row in rows]
182 header = normalized[0]
183 separator = ["---"] * width
184 body = normalized[1:]
185 lines = [
186 "| " + " | ".join(header) + " |",
187 "| " + " | ".join(separator) + " |",
188 ]
189 lines.extend("| " + " | ".join(row) + " |" for row in body)
190 content = "\n".join(lines)
191 if len(content) < min_chars:
192 continue
193 docs.append(
194 ParsedDocument(
195 doc_id=f"{path.name}::{sheet.title}",
196 source_file=path.name,
197 file_type="xlsx",
198 sheet=sheet.title,
199 content=content,
200 metadata={"parser": "local:openpyxl", "mode": "markdown_table"},
201 )
202 )
203 return docs
204
205
206 def _cell_to_text(value: Any) -> str:
207 if value is None:
208 return ""
209 text = str(value).strip()
210 return text.replace("\n", " ")
211
212
def build_parse_summary(
    rows: list[dict[str, Any]],
    failures: list[dict[str, Any]],
    *,
    parser: str,
) -> dict[str, Any]:
    """Aggregate per-document parse rows and failure records into one summary.

    File counts are derived from the distinct ``source_file`` values on each
    side; a file may appear in both sets when a fallback partially succeeded.
    """
    parsed = {item.get("source_file") for item in rows if item.get("source_file")}
    failed = {item.get("source_file") for item in failures if item.get("source_file")}
    char_counts = [len(item.get("content") or "") for item in rows]
    avg_chars = round(statistics.mean(char_counts), 2) if char_counts else 0
    return {
        "total_files": len(parsed | failed),
        "parsed_files": len(parsed),
        "failed_files": len(failed),
        "total_documents": len(rows),
        "empty_documents": sum(1 for count in char_counts if not count),
        "avg_chars": avg_chars,
        "parser": parser,
    }
1 from __future__ import annotations
2
3 import subprocess
4 from pathlib import Path
5 from typing import Any
6
7 import requests
8
9 from weknora_eval.loaders import compact_text, write_json, write_jsonl
10 from weknora_eval.parsers.local import build_parse_summary, parse_pdf
11 from weknora_eval.schemas import ParsedDocument
12
13
class MinerUParseError(RuntimeError):
    """Raised when MinerU parsing (CLI or HTTP mode) fails or yields no usable output."""

    pass
16
17
def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Parse every PDF under data/raw_docs/pdf with MinerU and persist results.

    Honours ``parsing.mineru.mode`` ("cli" or "http"). On a per-file failure,
    optionally retries with the local parser when
    ``parsing.mineru.fallback_to_local`` is enabled. Writes the parsed rows,
    the failure records, and a summary JSON to the paths configured under
    ``parsing``; returns ``(rows, summary)``.
    """
    parsing = config["parsing"]
    mineru = parsing.get("mineru", {})
    mode = mineru.get("mode", "cli")
    fallback = bool(mineru.get("fallback_to_local", True))
    local_config = parsing.get("local", {})
    # min_chars comes from the *local* parser config and applies to MinerU
    # output too, so both paths filter fragments identically.
    min_chars = int(local_config.get("min_chars", 80))

    docs: list[ParsedDocument] = []
    failures: list[dict[str, Any]] = []

    for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")):
        parser_name = f"mineru:{mode}"
        try:
            if mode == "cli":
                docs.extend(parse_pdf_with_cli(pdf_path, mineru, min_chars=min_chars))
            elif mode == "http":
                docs.extend(parse_pdf_with_http(pdf_path, mineru, min_chars=min_chars))
            else:
                raise MinerUParseError(f"Unsupported MinerU mode: {mode}")
        except Exception as exc:  # noqa: BLE001
            # A failure record is kept even when the local fallback succeeds,
            # so the summary still reflects that MinerU itself failed for
            # this file; fallback_used records which backend rescued it.
            failure = {
                "source_file": pdf_path.name,
                "parser": parser_name,
                "status": "failed",
                "error": str(exc),
                "fallback_used": None,
            }
            if fallback:
                try:
                    backend = local_config.get("pdf_backend", "pypdf")
                    local_docs = parse_pdf(pdf_path, backend=backend, min_chars=min_chars)
                    docs.extend(local_docs)
                    failure["fallback_used"] = f"local:{backend}"
                except Exception as fallback_exc:  # noqa: BLE001
                    failure["fallback_error"] = str(fallback_exc)
            failures.append(failure)

    rows = [doc.to_dict() for doc in docs]
    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
    if failures:
        # The failure file is only written when something actually failed.
        write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures)

    summary = build_parse_summary(rows, failures, parser=f"mineru:{mode}")
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return rows, summary
64
65
def parse_pdf_with_cli(
    pdf_path: str | Path,
    mineru_config: dict[str, Any],
    *,
    min_chars: int,
) -> list[ParsedDocument]:
    """Run the MinerU CLI on one PDF and wrap its Markdown output as documents.

    Raises MinerUParseError when the CLI exits non-zero or produces no
    Markdown files; fragments shorter than *min_chars* are dropped.
    """
    source = Path(pdf_path)
    raw_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw"))
    work_dir = raw_root / source.stem
    work_dir.mkdir(parents=True, exist_ok=True)

    # MinerU CLI arguments vary by release. This common invocation is isolated
    # here so deployments can replace it without touching pipeline scripts.
    completed = subprocess.run(
        [mineru_config.get("cli_bin", "mineru"), "-p", str(source), "-o", str(work_dir)],
        check=False,
        capture_output=True,
        text=True,
        timeout=int(mineru_config.get("timeout_seconds", 600)),
    )
    if completed.returncode != 0:
        message = completed.stderr.strip() or completed.stdout.strip() or "MinerU CLI failed"
        raise MinerUParseError(message)

    markdown_paths = sorted(work_dir.rglob("*.md"))
    if not markdown_paths:
        raise MinerUParseError(f"No Markdown output found in {work_dir}")

    documents: list[ParsedDocument] = []
    for ordinal, md_path in enumerate(markdown_paths, start=1):
        text = compact_text(md_path.read_text(encoding="utf-8"))
        if len(text) < min_chars:
            continue
        documents.append(
            ParsedDocument(
                doc_id=f"{source.name}::mineru-{ordinal}",
                source_file=source.name,
                file_type="pdf",
                content=text,
                metadata={
                    "parser": "mineru:cli",
                    "mineru_output": str(md_path),
                },
            )
        )
    return documents
113
114
def parse_pdf_with_http(
    pdf_path: str | Path,
    mineru_config: dict[str, Any],
    *,
    min_chars: int,
) -> list[ParsedDocument]:
    """Parse one PDF through a MinerU-compatible HTTP service.

    Posts the file to ``{http_base_url}/parse`` and accepts either a
    ``{"markdown": "..."}`` or a ``{"documents": [{"content": "..."}]}``
    response. Fragments shorter than *min_chars* are dropped.

    Raises:
        MinerUParseError: on missing configuration, an HTTP error status, or
            an unrecognized response payload.
    """
    target = Path(pdf_path)
    base_url = str(mineru_config.get("http_base_url") or "").rstrip("/")
    if not base_url:
        raise MinerUParseError("MinerU HTTP mode requires parsing.mineru.http_base_url")

    headers = {}
    if mineru_config.get("api_key"):
        headers["Authorization"] = f"Bearer {mineru_config['api_key']}"

    # The checklist does not define a universal MinerU HTTP contract. This
    # implementation expects a replaceable service exposing POST /parse and
    # returning {"markdown": "..."} or {"documents": [{"content": "..."}]}.
    with target.open("rb") as file:
        response = requests.post(
            f"{base_url}/parse",
            files={"file": (target.name, file, "application/pdf")},
            headers=headers,
            timeout=int(mineru_config.get("timeout_seconds", 600)),
        )
    if response.status_code >= 400:
        raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}")

    payload = response.json()
    contents: list[str] = []
    if isinstance(payload.get("documents"), list):
        # Guard against entries missing "content": compact_text would
        # otherwise receive None for a partially-filled response item.
        contents = [compact_text(item.get("content") or "") for item in payload["documents"]]
    elif payload.get("markdown"):
        contents = [compact_text(payload["markdown"])]
    else:
        raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`")

    docs: list[ParsedDocument] = []
    for index, content in enumerate(contents, start=1):
        if len(content) < min_chars:
            continue
        docs.append(
            ParsedDocument(
                doc_id=f"{target.name}::mineru-http-{index}",
                source_file=target.name,
                file_type="pdf",
                content=content,
                metadata={"parser": "mineru:http"},
            )
        )
    return docs
1 from __future__ import annotations
2
3 import os
4 from pathlib import Path
5 from typing import Any
6
7 import pandas as pd
8
9 from weknora_eval.config import require_config
10 from weknora_eval.loaders import read_jsonl
11
12
def run_ragas_eval(
    config: dict[str, Any],
    *,
    input_path: str = "data/runs/ragas_input.jsonl",
    output_csv_path: str = "data/reports/ragas_scores.csv",
) -> pd.DataFrame:
    """Evaluate prepared samples with Ragas and write per-sample scores to CSV.

    Reads JSONL rows (user_input / response / retrieved_contexts / reference
    / optional reference_contexts), runs the configured subset of Ragas
    metrics with an OpenAI-compatible judge LLM and embedding model, attaches
    each row's sample_id to the score table, writes the table to
    *output_csv_path*, and returns the scores DataFrame.
    """
    # Deferred imports: these heavy optional dependencies are only required
    # when an evaluation is actually run.
    from datasets import Dataset
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    from ragas import evaluate
    from ragas.run_config import RunConfig

    ragas_config = config["ragas"]
    # Split-deployment support: dedicated llm_* / embedding_* settings take
    # precedence over the shared api_key / base_url values.
    llm_api_key = _first_non_empty(ragas_config, "llm_api_key", "api_key")
    llm_base_url = _first_non_empty(ragas_config, "llm_base_url", "base_url")
    embedding_api_key = _first_non_empty(ragas_config, "embedding_api_key", "api_key")
    embedding_base_url = _first_non_empty(ragas_config, "embedding_base_url", "base_url")
    judge_model = str(require_config(config, "ragas.judge_model"))
    embedding_model = str(require_config(config, "ragas.embedding_model"))
    temperature = float(ragas_config.get("temperature", 0))
    max_tokens = int(ragas_config.get("max_tokens", 4096))
    timeout_seconds = int(ragas_config.get("timeout_seconds", 600))
    max_workers = int(ragas_config.get("max_workers", 1))

    # NOTE(review): mutates process-wide environment for libraries that read
    # OPENAI_* directly; this clobbers any pre-existing values.
    os.environ["OPENAI_API_KEY"] = llm_api_key
    if llm_base_url:
        os.environ["OPENAI_BASE_URL"] = llm_base_url

    rows = read_jsonl(input_path)
    dataset = Dataset.from_list(
        [
            {
                "user_input": row["user_input"],
                "response": row["response"],
                "retrieved_contexts": row["retrieved_contexts"],
                "reference": row["reference"],
                "reference_contexts": row.get("reference_contexts") or [],
            }
            for row in rows
        ]
    )

    # Unknown metric names in the config are silently dropped; when no
    # metric list is configured, every known metric is used.
    metric_map = _metric_map()
    selected_metrics = [
        metric_map[name]
        for name in ragas_config.get("metrics", metric_map.keys())
        if name in metric_map
    ]

    llm = ChatOpenAI(
        model=judge_model,
        api_key=llm_api_key,
        base_url=llm_base_url or None,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    embeddings = OpenAIEmbeddings(
        model=embedding_model,
        api_key=embedding_api_key,
        base_url=embedding_base_url or None,
        # Disable tiktoken-based context checks so non-OpenAI embedding
        # servers accept requests unchanged — presumably for vLLM/Infinity
        # style deployments; confirm against the target service.
        tiktoken_enabled=False,
        check_embedding_ctx_length=False,
    )
    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)

    run_config = RunConfig(timeout=timeout_seconds, max_workers=max_workers)
    result = evaluate(
        dataset,
        metrics=selected_metrics,
        llm=ragas_llm,
        embeddings=ragas_embeddings,
        run_config=run_config,
    )
    scores = result.to_pandas()
    # NOTE(review): assumes evaluate() preserves input row order so that
    # positional alignment with `rows` is valid — TODO confirm across
    # Ragas versions.
    for index, row in enumerate(rows):
        scores.loc[index, "sample_id"] = row.get("sample_id")

    target = Path(output_csv_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    scores.to_csv(target, index=False)
    return scores
93
94
def _metric_map() -> dict[str, Any]:
    """Return a name → Ragas metric mapping, tolerant of Ragas API variants.

    One Ragas lineage exposes module-level metric singletons; when those
    imports fail, fall back to the class-based API and instantiate each
    metric. Keys are the names accepted in the `ragas.metrics` config list.
    """
    try:
        from ragas.metrics import (
            context_precision,
            context_recall,
            faithfulness,
            factual_correctness,
            response_relevancy,
        )

        return {
            "faithfulness": faithfulness,
            "response_relevancy": response_relevancy,
            "context_precision": context_precision,
            "context_recall": context_recall,
            "factual_correctness": factual_correctness,
        }
    except ImportError:
        from ragas.metrics import (
            Faithfulness,
            FactualCorrectness,
            LLMContextPrecisionWithReference,
            LLMContextRecall,
            ResponseRelevancy,
        )

        return {
            "faithfulness": Faithfulness(),
            "response_relevancy": ResponseRelevancy(),
            "context_precision": LLMContextPrecisionWithReference(),
            "context_recall": LLMContextRecall(),
            "factual_correctness": FactualCorrectness(),
        }
128
129
130 def _first_non_empty(config: dict[str, Any], *keys: str) -> str:
131 for key in keys:
132 value = config.get(key)
133 if value not in {None, ""}:
134 return str(value)
135 raise ValueError(f"Missing required Ragas config value. Checked: {', '.join(keys)}")
136
137
def _wrap_langchain_models(llm: Any, embeddings: Any) -> tuple[Any, Any]:
    """Adapt LangChain model objects to Ragas wrapper types when available.

    When the wrapper classes cannot be imported (a Ragas build without
    them), the inputs are returned unchanged.
    """
    try:
        from ragas.embeddings import LangchainEmbeddingsWrapper
        from ragas.llms import LangchainLLMWrapper
    except ImportError:
        # No wrapper classes in this Ragas version: pass models through as-is.
        return llm, embeddings

    return LangchainLLMWrapper(llm), LangchainEmbeddingsWrapper(embeddings)
1 from __future__ import annotations
2
3 import math
4 from pathlib import Path
5 from typing import Any
6
7 import pandas as pd
8
9 from weknora_eval.loaders import read_jsonl
10
11
def retrieval_metrics(
    ragas_rows: list[dict[str, Any]],
    *,
    ks: tuple[int, ...] = (1, 3, 5),
) -> dict[str, float]:
    """Compute hit@k / recall@k / MRR / nDCG@5 against gold chunk ids.

    Rows without ``gold_chunk_ids`` are ignored; returns {} when no row
    carries gold labels. All values are averaged over the labeled rows and
    rounded to four decimals.
    """
    labeled = [row for row in ragas_rows if row.get("gold_chunk_ids")]
    if not labeled:
        return {}

    sums: dict[str, float] = {f"hit@{k}": 0.0 for k in ks}
    sums.update({f"recall@{k}": 0.0 for k in ks})
    sums["mrr"] = 0.0
    sums["ndcg@5"] = 0.0

    for row in labeled:
        gold_ids = set(row.get("gold_chunk_ids") or [])
        references = row.get("weknora_references") or []
        ranked = [str(ref.get("id")) for ref in references if ref.get("id")]

        for k in ks:
            matched = len(gold_ids & set(ranked[:k]))
            if matched:
                sums[f"hit@{k}"] += 1.0
            sums[f"recall@{k}"] += matched / len(gold_ids)

        # Reciprocal rank of the first gold hit; contributes 0 on a miss.
        for position, chunk_id in enumerate(ranked, start=1):
            if chunk_id in gold_ids:
                sums["mrr"] += 1 / position
                break

        dcg = sum(
            1 / math.log2(position + 1)
            for position, chunk_id in enumerate(ranked[:5], start=1)
            if chunk_id in gold_ids
        )
        ideal_count = min(len(gold_ids), 5)
        idcg = sum(1 / math.log2(position + 1) for position in range(1, ideal_count + 1))
        if idcg:
            sums["ndcg@5"] += dcg / idcg

    sample_count = len(labeled)
    return {name: round(total / sample_count, 4) for name, total in sums.items()}
49
50
def generate_summary_report(
    config: dict[str, Any],
    *,
    scores_csv_path: str = "data/reports/ragas_scores.csv",
    ragas_input_path: str = "data/runs/ragas_input.jsonl",
    answers_path: str = "data/runs/weknora_answers.jsonl",
    output_path: str = "data/reports/summary.md",
) -> str:
    """Render the Markdown evaluation report and write it to *output_path*.

    Combines the Ragas score CSV, the prepared Ragas input rows, and the raw
    WeKnora answers into run info, aggregate metric tables, worst-sample
    tables, and data-quality notes. Missing inputs degrade gracefully to
    empty sections. Returns the rendered Markdown text.
    """
    ragas_rows = read_jsonl(ragas_input_path, missing_ok=True)
    answer_rows = read_jsonl(answers_path, missing_ok=True)
    scores = pd.read_csv(scores_csv_path) if Path(scores_csv_path).exists() else pd.DataFrame()

    lines = [
        "# Ragas 评估报告",
        "",
        "## 运行信息",
        f"- WeKnora Base URL: {config.get('weknora', {}).get('base_url', '')}",
        f"- 知识库 ID: {config.get('weknora', {}).get('knowledge_base_id', '')}",
        f"- 测试集规模: {len(ragas_rows)}",
        # NOTE(review): this repeats len(ragas_rows); if "approved samples"
        # should differ from testset size, it needs its own count — confirm.
        f"- 审核通过样本数: {len(ragas_rows)}",
        f"- 失败样本数: {sum(1 for row in answer_rows if row.get('error'))}",
        f"- Judge 模型: {config.get('ragas', {}).get('judge_model', '')}",
        "",
        "## 聚合指标",
        "| 指标 | 平均值 | P50 | 失败阈值 |",
        "| --- | --- | --- | --- |",
    ]

    # Only numeric, non-identity columns count as metric columns.
    metric_columns = [
        column
        for column in scores.columns
        if column not in {"sample_id", "user_input", "response", "reference"}
        and pd.api.types.is_numeric_dtype(scores[column])
    ]
    for column in metric_columns:
        lines.append(
            f"| {column} | {scores[column].mean():.4f} | {scores[column].median():.4f} | 0.50 |"
        )

    # Chunk-id retrieval metrics only appear when gold chunk ids exist.
    chunk_metrics = retrieval_metrics(ragas_rows)
    if chunk_metrics:
        lines.extend(["", "## Chunk ID 检索指标", "| 指标 | 平均值 |", "| --- | --- |"])
        for key, value in chunk_metrics.items():
            lines.append(f"| {key} | {value:.4f} |")

    # Worst context_recall rows: candidates for retrieval failures.
    lines.extend(["", "## 检索失败样本", "| sample_id | 问题 | 预期文件 | 实际召回文件 | context_recall | 备注 |", "| --- | --- | --- | --- | --- | --- |"])
    for row in _worst_rows(scores, "context_recall"):
        sample = _sample_by_id(ragas_rows, row.get("sample_id"))
        actual_files = sorted(
            {
                ref.get("knowledge_filename") or ""
                for ref in sample.get("weknora_references", [])
                if ref.get("knowledge_filename")
            }
        )
        lines.append(
            f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
            f"{_cell(sample.get('source_file'))} | {_cell(', '.join(actual_files))} | "
            f"{_score(row.get('context_recall'))} | |"
        )

    # Worst faithfulness rows: candidates for generation failures.
    lines.extend(["", "## 生成失败样本", "| sample_id | 问题 | 模型答案 | 标准答案 | faithfulness | factual_correctness |", "| --- | --- | --- | --- | --- | --- |"])
    for row in _worst_rows(scores, "faithfulness"):
        sample = _sample_by_id(ragas_rows, row.get("sample_id"))
        lines.append(
            f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | "
            f"{_cell(sample.get('response'))} | {_cell(sample.get('reference'))} | "
            f"{_score(row.get('faithfulness'))} | {_score(row.get('factual_correctness'))} |"
        )

    empty_retrievals = sum(1 for row in ragas_rows if not row.get("retrieved_contexts"))
    fallback_answers = sum(1 for row in answer_rows if row.get("is_fallback"))
    source_counts: dict[str, int] = {}
    for row in ragas_rows:
        source = row.get("source_file") or "unknown"
        source_counts[source] = source_counts.get(source, 0) + 1

    lines.extend(
        [
            "",
            "## 数据质量",
            f"- 空检索数量: {empty_retrievals}",
            f"- fallback 答案数量: {fallback_answers}",
            f"- 来源文件分布: {source_counts}",
            "",
            "## 改进建议",
            "- 优先检查 context_recall 低且 retrieved_contexts 为空的样本。",
            "- 对低 faithfulness 且 context_recall 正常的样本,重点检查生成模型和提示词。",
            "- 对 Chunk ID 指标低但 Ragas context 指标正常的样本,检查 chunk 切分或 gold_chunk_ids 标注。",
            "",
        ]
    )

    content = "\n".join(lines)
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    return content
149
150
151 def _worst_rows(scores: pd.DataFrame, column: str, *, limit: int = 10) -> list[dict[str, Any]]:
152 if scores.empty or column not in scores.columns:
153 return []
154 return scores.sort_values(column, ascending=True).head(limit).to_dict(orient="records")
155
156
157 def _sample_by_id(rows: list[dict[str, Any]], sample_id: Any) -> dict[str, Any]:
158 return next((row for row in rows if row.get("sample_id") == sample_id), {})
159
160
161 def _cell(value: Any, *, max_len: int = 120) -> str:
162 text = "" if value is None else " ".join(str(value).split())
163 text = text.replace("|", "\\|")
164 if len(text) <= max_len:
165 return text
166 return text[:max_len].rstrip() + "..."
167
168
169 def _score(value: Any) -> str:
170 try:
171 if pd.isna(value):
172 return ""
173 return f"{float(value):.4f}"
174 except (TypeError, ValueError):
175 return ""
1 from __future__ import annotations
2
3 from dataclasses import asdict, dataclass, field
4 from typing import Any
5
6
@dataclass
class ParsedDocument:
    """A single parsed text unit extracted from a source file.

    Depending on the parser, one instance maps to a PDF page, an xlsx sheet
    row, a whole Markdown-rendered sheet, or a MinerU output fragment.
    """

    doc_id: str  # unique id, e.g. "file.xlsx::Sheet1::row-3" or "file.pdf::mineru-1"
    source_file: str  # original file name (no directory component)
    file_type: str  # e.g. "pdf" or "xlsx"
    content: str  # normalized text content of this unit
    page: int | None = None  # PDF page number, when applicable
    sheet: str | None = None  # worksheet title, when applicable
    row_index: int | None = None  # row number within the sheet, when applicable
    metadata: dict[str, Any] = field(default_factory=dict)  # parser-specific extras

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
20
21
@dataclass
class TestsetRecord:
    """One QA sample in the generated / reviewed testset."""

    sample_id: str  # stable sample id, e.g. "qa-0001"
    user_input: str  # the question posed to WeKnora
    reference: str  # expected (gold) answer text
    reference_contexts: list[str]  # gold context passages backing the answer
    source_file: str | None = None  # originating document file name
    gold_chunk_ids: list[str] = field(default_factory=list)  # optional gold chunk ids for retrieval metrics
    question_type: str = "single_hop"  # question category label
    review_status: str = "pending"  # "pending" / "approved" / "rejected"

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
35
36
@dataclass
class WeKnoraAnswer:
    """WeKnora's answer to one testset question, plus retrieval references."""

    sample_id: str  # matches TestsetRecord.sample_id
    user_input: str  # the question that was asked
    response: str  # model answer text
    retrieved_contexts: list[str]  # retrieved passage texts
    weknora_references: list[dict[str, Any]]  # reference metadata dicts
    session_id: str | None = None  # WeKnora session id, if any
    request_id: str | None = None  # WeKnora request id, if any
    error: str | None = None  # error message when the request failed

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
1 from __future__ import annotations
2
3 import json
4 from collections.abc import Iterable, Iterator
5 from typing import Any
6
7
8 def parse_sse_events(lines: Iterable[str | bytes]) -> Iterator[dict[str, Any]]:
9 event_name = "message"
10 data_lines: list[str] = []
11
12 for raw_line in lines:
13 line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
14 line = line.rstrip("\r\n")
15
16 if not line:
17 if data_lines:
18 yield _build_event(event_name, data_lines)
19 event_name = "message"
20 data_lines = []
21 continue
22
23 if line.startswith(":"):
24 continue
25 if line.startswith("event:"):
26 event_name = line.removeprefix("event:").strip()
27 continue
28 if line.startswith("data:"):
29 data_lines.append(line.removeprefix("data:").strip())
30
31 if data_lines:
32 yield _build_event(event_name, data_lines)
33
34
35 def _build_event(event_name: str, data_lines: list[str]) -> dict[str, Any]:
36 raw_data = "\n".join(data_lines)
37 parsed_data: Any = raw_data
38 if raw_data and raw_data != "[DONE]":
39 try:
40 parsed_data = json.loads(raw_data)
41 except json.JSONDecodeError:
42 parsed_data = raw_data
43 return {"event": event_name, "data": parsed_data}
44
45
def normalize_reference(reference: dict[str, Any]) -> dict[str, Any]:
    """Project a raw WeKnora reference onto the fixed key set used downstream.

    Key order is kept stable because these dicts are serialized to JSONL.
    """
    filename = reference.get("knowledge_filename") or reference.get("knowledge_title")
    return {
        "id": reference.get("id"),
        "content": reference.get("content") or "",
        "knowledge_id": reference.get("knowledge_id"),
        "chunk_index": reference.get("chunk_index"),
        "score": reference.get("score"),
        "knowledge_filename": filename,
        "match_type": reference.get("match_type"),
        "chunk_type": reference.get("chunk_type"),
    }
1 from __future__ import annotations
2
3 from typing import Any
4
5 from weknora_eval.loaders import read_jsonl, write_jsonl
6 from weknora_eval.schemas import TestsetRecord
7
8
def generate_rule_based_testset(
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
    size: int = 50,
    min_context_chars: int = 80,
) -> list[dict[str, Any]]:
    """Build a deterministic QA testset from parsed documents.

    Takes the first *size* documents whose content reaches
    *min_context_chars*, derives a templated question and a truncated
    reference answer from each, writes the records to *output_path*, and
    returns them as dicts.
    """
    eligible = []
    for document in read_jsonl(documents_path):
        if len(document.get("content") or "") >= min_context_chars:
            eligible.append(document)

    records: list[dict[str, Any]] = []
    for number, document in enumerate(eligible[:size], start=1):
        body = document["content"]
        record = TestsetRecord(
            sample_id=f"qa-{number:04d}",
            user_input=_default_question(document),
            reference=_reference_from_context(body),
            reference_contexts=[body],
            source_file=document.get("source_file"),
            question_type="single_hop",
            review_status="pending",
        )
        records.append(record.to_dict())
    write_jsonl(output_path, records)
    return records
40
41
def approve_pending_testset(
    *,
    input_path: str = "data/testsets/testset.raw.jsonl",
    output_path: str = "data/testsets/testset.reviewed.jsonl",
) -> list[dict[str, Any]]:
    """Mark every non-rejected record approved and write the reviewed set.

    Input rows are not mutated; each surviving record is copied with its
    review_status forced to "approved".
    """
    approved: list[dict[str, Any]] = []
    for record in read_jsonl(input_path):
        if record.get("review_status") == "rejected":
            continue
        approved.append({**record, "review_status": "approved"})
    write_jsonl(output_path, approved)
    return approved
57
58
def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]:
    """Return human-readable validation errors for the reviewed testset.

    Each error is prefixed with "<path>:<line>" so it can be traced back to
    the offending JSONL record; an empty list means the file is valid.
    """
    problems: list[str] = []
    for line_no, record in enumerate(read_jsonl(path), start=1):
        location = f"{path}:{line_no}"
        if record.get("review_status") != "approved":
            problems.append(f"{location} review_status must be approved")
        for key in ("sample_id", "user_input", "reference"):
            if not record.get(key):
                problems.append(f"{location} missing {key}")
        if not record.get("reference_contexts"):
            problems.append(f"{location} reference_contexts must be non-empty")
    return problems
71
72
73 def _default_question(document: dict[str, Any]) -> str:
74 source = document.get("source_file") or "该文档"
75 if document.get("file_type") == "xlsx" and document.get("sheet"):
76 return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?"
77 if document.get("page"):
78 return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?"
79 return f"请根据 {source} 回答:该片段的主要内容是什么?"
80
81
82 def _reference_from_context(context: str, *, max_chars: int = 500) -> str:
83 text = " ".join(context.split())
84 if len(text) <= max_chars:
85 return text
86 return text[:max_chars].rstrip() + "..."