Initial WeKnora Ragas evaluation project

Showing 34 changed files with 3220 additions and 0 deletions.
.env.example
0 → 100644
WEKNORA_BASE_URL=http://localhost:8080/api/v1
WEKNORA_API_KEY=
WEKNORA_KB_ID=
WEKNORA_KB_NAME=ragas-eval-pilot

# Ragas generation and judge models. These are evaluation-side models, not the
# model configuration used by the WeKnora backend.
OPENAI_API_KEY=replace-me
OPENAI_BASE_URL=https://api.openai.com/v1

# Optional split deployment. Use these when LLM and embedding are served by
# different OpenAI-compatible services, such as vLLM + Infinity.
RAGAS_LLM_API_KEY=replace-me
RAGAS_LLM_BASE_URL=http://localhost:8000/v1
RAGAS_EMBEDDING_API_KEY=replace-me
RAGAS_EMBEDDING_BASE_URL=http://localhost:7997/v1
RAGAS_RERANKER_API_KEY=replace-me
RAGAS_RERANKER_BASE_URL=http://localhost:7998/v1
RAGAS_RERANKER_MODEL=replace-me

RAGAS_GENERATOR_MODEL=gpt-4o-mini
RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small

TESTSET_SIZE=50
REQUEST_INTERVAL_SECONDS=0.2

.gitignore
0 → 100644
.env
.venv/
__pycache__/
*.py[cod]
*.egg-info/
.pytest_cache/
.ruff_cache/

data/raw_docs/pdf/*
data/raw_docs/xlsx/*
data/parsed_docs/*.json
data/parsed_docs/*.jsonl
data/parsed_docs/mineru_raw/*
data/exported/*.json
data/exported/*.jsonl
data/testsets/*.jsonl
data/runs/*.jsonl
data/reports/*.csv
data/reports/*.md

!data/raw_docs/pdf/.gitkeep
!data/raw_docs/xlsx/.gitkeep
!data/parsed_docs/mineru_raw/.gitkeep

RAGAS_EVALUATION_IMPLEMENTATION_CHECKLIST.md
0 → 100644
# Ragas Standalone Evaluation Project Implementation Checklist

## 1. Goals

Build a standalone evaluation project on top of WeKnora's public APIs, without depending on WeKnora's built-in `/evaluation` endpoint.

The project evaluates:

- Retrieval quality: whether WeKnora recalls the correct chunks.
- Generation quality: whether WeKnora answers correctly and faithfully based on the retrieved context.
- End-to-end RAG performance: question -> retrieval -> answer -> Ragas metrics.

The per-record format ultimately fed into Ragas:

```json
{
  "user_input": "the question",
  "response": "the answer generated by WeKnora",
  "retrieved_contexts": ["retrieved chunk text 1", "retrieved chunk text 2"],
  "reference": "the ground-truth answer",
  "reference_contexts": ["the source passages the ground-truth answer is based on"]
}
```

## 2. Recommended Standalone Project Layout

Create the standalone project with the following structure:

```text
README.md
pyproject.toml
.env.example
configs/
  eval.yaml
data/
  raw_docs/
    pdf/
    xlsx/
  parsed_docs/
    documents.jsonl
    parse_summary.json
    mineru_raw/
  exported/
    knowledge.jsonl
    chunks.jsonl
  testsets/
    testset.raw.jsonl
    testset.reviewed.jsonl
  runs/
    weknora_answers.jsonl
    ragas_input.jsonl
  reports/
    ragas_scores.csv
    summary.md
scripts/
  01_upload_docs.py
  02_wait_ingestion.py
  03_export_chunks.py
  04_parse_docs.py
  05_generate_testset.py
  06_review_testset.py
  07_run_weknora_qa.py
  08_build_ragas_input.py
  09_run_ragas_eval.py
  10_report.py
src/
  weknora_eval/
    api.py
    schemas.py
    loaders.py
    parsers/
      local.py
      mineru.py
    testset.py
    sse.py
    ragas_runner.py
    report.py
```

Example `pyproject.toml`:

```toml
[project]
name = "weknora-ragas-eval"
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "ragas>=0.3,<0.5",
    "datasets>=2.19.0",
    "pandas>=2.2.0",
    "openpyxl>=3.1.0",
    "requests>=2.32.0",
    "sseclient-py>=1.8.0",
    "python-dotenv>=1.0.0",
    "pyyaml>=6.0.0",
    "langchain>=0.2.0",
    "langchain-community>=0.2.0",
    "langchain-openai>=0.1.0",
    "pypdf>=4.2.0"
]

[project.optional-dependencies]
pdf = [
    "pymupdf>=1.24.0",
    "pdfplumber>=0.11.0"
]
dev = [
    "ruff>=0.6.0",
    "pytest>=8.0.0"
]
```

Installation:

```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```

If PDF parsing quality is poor, install the PDF extras:

```bash
pip install -e ".[pdf]"
```

For development and test tooling:

```bash
pip install -e ".[dev,pdf]"
```

## 3. Environment Configuration

`.env.example`:

```bash
WEKNORA_BASE_URL=http://localhost:8080/api/v1
WEKNORA_API_KEY=replace-me
WEKNORA_KB_ID=replace-me

# Model services used by Ragas for testset generation and metric scoring.
# This is not the WeKnora backend model configuration; it is the LLM/embedding
# stack called by the evaluation project itself.
# When using the official OpenAI API:
OPENAI_API_KEY=replace-me
OPENAI_BASE_URL=https://api.openai.com/v1

# When using an OpenAI-compatible service such as a self-hosted gateway,
# OneAPI, LiteLLM, SiliconFlow, or OpenRouter:
# OPENAI_API_KEY=replace-me
# OPENAI_BASE_URL=https://your-openai-compatible-endpoint/v1

# LLM used by Ragas to auto-generate the QA testset.
RAGAS_GENERATOR_MODEL=gpt-4o-mini

# LLM used by Ragas for metric scoring, i.e. the judge/evaluator.
RAGAS_JUDGE_MODEL=gpt-4o-mini

# Embedding model used by Ragas steps that need semantic similarity or
# question generation.
RAGAS_EMBEDDING_MODEL=text-embedding-3-small

TESTSET_SIZE=50
REQUEST_INTERVAL_SECONDS=0.2
```

Where the model variables come from:

| Variable | Purpose | Where it comes from |
| --- | --- | --- |
| `RAGAS_GENERATOR_MODEL` | Generates the QA testset | A model name from the evaluation-side LLM service you choose |
| `RAGAS_JUDGE_MODEL` | Scores Ragas metrics such as faithfulness and context recall | A model name from the evaluation-side LLM service you choose |
| `RAGAS_EMBEDDING_MODEL` | Ragas generation/evaluation steps that need embeddings | A model name from the evaluation-side embedding service you choose |
| `OPENAI_API_KEY` | API key for the evaluation-side model service | Provided by OpenAI or the OpenAI-compatible service |
| `OPENAI_BASE_URL` | Base URL for the evaluation-side model service | The official OpenAI endpoint or the compatible service address |

Notes:

- When WeKnora answers questions, it uses the models already configured in the WeKnora backend.
- `RAGAS_GENERATOR_MODEL`, `RAGAS_JUDGE_MODEL`, and `RAGAS_EMBEDDING_MODEL` are separate "evaluation-side" models called by the Ragas project (a minimal wiring sketch follows below).
- Both sides may share one model service or be split. To keep the system under test and the judge from skewing each other, the evaluation-side judge model should be no weaker than the model WeKnora uses to answer.
- If you do not use the official OpenAI service, any target service compatible with the OpenAI Chat Completions and Embeddings APIs can usually be plugged in through `OPENAI_BASE_URL`.

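As a reference, here is a minimal sketch of how these environment variables might be wired into the evaluation-side clients, assuming langchain-openai and Ragas's LangChain wrappers; the fallback logic from the split `RAGAS_*` variables to the shared `OPENAI_*` pair is this sketch's assumption, not mandated by the checklist:

```python
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

# Assumption: split-deployment variables fall back to the shared OPENAI_* pair.
llm_base = os.getenv("RAGAS_LLM_BASE_URL") or os.environ["OPENAI_BASE_URL"]
llm_key = os.getenv("RAGAS_LLM_API_KEY") or os.environ["OPENAI_API_KEY"]
emb_base = os.getenv("RAGAS_EMBEDDING_BASE_URL") or os.environ["OPENAI_BASE_URL"]
emb_key = os.getenv("RAGAS_EMBEDDING_API_KEY") or os.environ["OPENAI_API_KEY"]

# Judge LLM used for metric scoring; temperature 0 keeps scores reproducible.
judge_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model=os.environ["RAGAS_JUDGE_MODEL"],
        base_url=llm_base,
        api_key=llm_key,
        temperature=0,
    )
)

# Embeddings used by similarity-based metrics such as response_relevancy.
judge_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(
        model=os.environ["RAGAS_EMBEDDING_MODEL"],
        base_url=emb_base,
        api_key=emb_key,
    )
)
```
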
`configs/eval.yaml`:

```yaml
weknora:
  base_url: "${WEKNORA_BASE_URL}"
  api_key: "${WEKNORA_API_KEY}"
  knowledge_base_id: "${WEKNORA_KB_ID}"
  timeout_seconds: 300
  request_interval_seconds: 0.2

testset:
  size: 50
  include_pdf: true
  include_xlsx: true
  min_context_chars: 80
  require_manual_review: true

parsing:
  # Options: local or mineru
  provider: "local"
  output_path: "data/parsed_docs/documents.jsonl"
  local:
    pdf_backend: "pymupdf"  # options: pypdf, pymupdf, pdfplumber
    xlsx_mode: "row_text"   # options: row_text, markdown_table
    min_chars: 80
  mineru:
    mode: "cli"             # options: cli, http
    cli_bin: "mineru"
    output_dir: "data/parsed_docs/mineru_raw"
    http_base_url: ""
    api_key: ""
    timeout_seconds: 600
    fallback_to_local: true

qa:
  one_session_per_question: true
  disable_title: true
  enable_memory: false
  channel: "api"

ragas:
  provider: "openai-compatible"
  api_key: "${OPENAI_API_KEY}"
  base_url: "${OPENAI_BASE_URL}"
  generator_model: "${RAGAS_GENERATOR_MODEL}"
  judge_model: "${RAGAS_JUDGE_MODEL}"
  embedding_model: "${RAGAS_EMBEDDING_MODEL}"
  metrics:
    - faithfulness
    - response_relevancy
    - context_precision
    - context_recall
    - factual_correctness
```

## 4. Document Parsing on the Ragas Side

Before Ragas can generate a QA testset, the raw PDF/XLSX files must be converted into uniform text Documents. Do not hand file paths to Ragas directly; run a separate parsing step first and produce a standardized `documents.jsonl`.

Two parsing approaches are supported:

- Local parsing: suited to quick validation, text-based PDFs, and structurally simple XLSX files.
- MinerU parsing: suited to complex PDFs, scans, and documents heavy on tables, formulas, or multi-column layouts.

### 4.1 Unified Parse Output

Whether parsing locally or with MinerU, the final output is `data/parsed_docs/documents.jsonl`, one Document per line:

```json
{
  "doc_id": "contract.pdf::page-1",
  "source_file": "contract.pdf",
  "file_type": "pdf",
  "page": 1,
  "sheet": null,
  "row_index": null,
  "content": "parsed body text of page 1...",
  "metadata": {
    "parser": "local:pymupdf"
  }
}
```

An XLSX row-level record example:

```json
{
  "doc_id": "sales.xlsx::Sheet1::row-12",
  "source_file": "sales.xlsx",
  "file_type": "xlsx",
  "page": null,
  "sheet": "Sheet1",
  "row_index": 12,
  "content": "Product: Product A\nYear: 2024\nSales: CNY 1,200,000",
  "metadata": {
    "parser": "local:openpyxl",
    "columns": ["Product", "Year", "Sales"]
  }
}
```

Downstream Ragas testset generation reads only `documents.jsonl`; it never reads the raw PDF/XLSX files directly (a loader sketch follows below).

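For example, a minimal loader sketch that turns `documents.jsonl` into LangChain `Document` objects for Ragas testset generation; the function name `load_documents_jsonl` is illustrative, not part of the project code shown here:

```python
import json
from pathlib import Path

from langchain_core.documents import Document


def load_documents_jsonl(path: str = "data/parsed_docs/documents.jsonl") -> list[Document]:
    """Read one parsed record per line and keep provenance in metadata."""
    documents = []
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        row = json.loads(line)
        documents.append(
            Document(
                page_content=row["content"],
                metadata={
                    "doc_id": row["doc_id"],
                    "source_file": row["source_file"],
                    "page": row.get("page"),
                    "sheet": row.get("sheet"),
                },
            )
        )
    return documents
```
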
### 4.2 Local Parsing

Local parsing is the lowest-dependency, fastest way to get the pipeline running.

Available PDF backends:

- `pypdf`: lightweight dependency, suited to text-based PDFs.
- `pymupdf`: fast, and usually more robust than pypdf.
- `pdfplumber`: suited to PDFs where some table/layout information must be preserved.

XLSX parsing modes:

- `row_text`: converts each row into `column: value` text, suited to QA and retrieval (see the sketch at the end of this subsection).
- `markdown_table`: converts each sheet into a Markdown table, which preserves the overall table structure but easily becomes too long for large tables.

Local parsing configuration:

```yaml
parsing:
  provider: "local"
  output_path: "data/parsed_docs/documents.jsonl"
  local:
    pdf_backend: "pymupdf"
    xlsx_mode: "row_text"
    min_chars: 80
```

Responsibilities of `scripts/04_parse_docs.py` in local mode:

- Walk `data/raw_docs/pdf` and `data/raw_docs/xlsx`.
- Emit one Document per PDF page or paragraph.
- Emit one Document per XLSX row or sheet.
- Filter out overly short text.
- Write `data/parsed_docs/documents.jsonl`.
- Preserve metadata such as `source_file`, `page`, `sheet`, and `row_index`.

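A minimal sketch of the `row_text` conversion with openpyxl; the helper name `xlsx_rows_to_text` is illustrative, and the real implementation lives in `src/weknora_eval/parsers/local.py`, which is not shown here:

```python
from openpyxl import load_workbook


def xlsx_rows_to_text(path: str) -> list[dict]:
    """Convert each data row into 'column: value' text, one record per row."""
    workbook = load_workbook(path, read_only=True, data_only=True)
    records = []
    for sheet in workbook.worksheets:
        rows = sheet.iter_rows(values_only=True)
        header = [str(cell) for cell in next(rows, ())]
        for index, row in enumerate(rows, start=1):
            pairs = [
                f"{name}: {value}"
                for name, value in zip(header, row)
                if value is not None
            ]
            if not pairs:
                continue  # skip fully empty rows
            records.append(
                {
                    "doc_id": f"{path}::{sheet.title}::row-{index}",
                    "sheet": sheet.title,
                    "row_index": index,
                    "content": "\n".join(pairs),
                }
            )
    return records
```
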
### 4.3 MinerU Parsing

MinerU is an optional, more capable parser. It is suited to:

- PDFs with complex layouts.
- PDFs containing tables, formulas, or multi-column typesetting.
- Scanned or image-based PDFs.
- Cases where Markdown output is wanted as QA-generation context.

MinerU supports two integration modes.

#### 4.3.1 MinerU CLI Mode

Configuration:

```yaml
parsing:
  provider: "mineru"
  mineru:
    mode: "cli"
    cli_bin: "mineru"
    output_dir: "data/parsed_docs/mineru_raw"
    timeout_seconds: 600
    fallback_to_local: true
```

Expected behavior:

- `scripts/04_parse_docs.py` invokes the MinerU CLI.
- Each PDF is parsed into `data/parsed_docs/mineru_raw/{file_stem}/`.
- Markdown or JSON is read from the MinerU output.
- The output is converted into the unified `documents.jsonl`.

The CLI invocation must be adapted to the MinerU version actually installed. In the standalone project, wrap the MinerU CLI call inside `src/weknora_eval/parsers/mineru.py` rather than scattering concrete commands across business scripts, as sketched below.

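A minimal wrapper sketch; the `-p`/`-o` flags are placeholders that must be checked against `mineru --help` for the installed version, and the timeout mirrors the `timeout_seconds` config key:

```python
import subprocess
from pathlib import Path


def run_mineru_cli(
    pdf_path: Path,
    output_dir: Path,
    cli_bin: str = "mineru",
    timeout_seconds: int = 600,
) -> Path:
    """Parse one PDF with the MinerU CLI and return its output directory."""
    target_dir = output_dir / pdf_path.stem
    target_dir.mkdir(parents=True, exist_ok=True)
    # Placeholder flags: adapt to the installed MinerU version.
    command = [cli_bin, "-p", str(pdf_path), "-o", str(target_dir)]
    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout_seconds,
    )
    if result.returncode != 0:
        raise RuntimeError(f"MinerU failed for {pdf_path.name}: {result.stderr[:500]}")
    return target_dir
```
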
#### 4.3.2 MinerU HTTP Service Mode

If a MinerU service is already running, it can be called over HTTP.

Configuration:

```yaml
parsing:
  provider: "mineru"
  mineru:
    mode: "http"
    http_base_url: "http://mineru.example.com"
    api_key: "replace-me"
    output_dir: "data/parsed_docs/mineru_raw"
    timeout_seconds: 600
    fallback_to_local: true
```

Expected behavior:

- Upload the PDF to the MinerU HTTP service.
- Wait for the parse job to finish.
- Download the Markdown/JSON result.
- Convert it into the unified `documents.jsonl`.

The HTTP endpoint paths must be implemented against the conventions of the actual MinerU deployment, so the MinerU HTTP client must be built as a swappable module.

### 4.4 Parse Fallback Strategy

The recommended strategy (a sketch follows below):

1. Use the provider specified in the config by default.
2. If `provider=mineru` and a file fails to parse:
   - Record it in `data/parsed_docs/failed_parse.jsonl`.
   - If `fallback_to_local=true`, fall back to local parsing.
3. If the local parse result is empty or too short:
   - Mark the file as a low-quality parse.
   - Exclude it from automatic QA generation and leave it for manual handling.

Failure record format:

```json
{
  "source_file": "contract.pdf",
  "parser": "mineru:cli",
  "status": "failed",
  "error": "timeout",
  "fallback_used": "local:pymupdf"
}
```

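A minimal orchestration sketch of that strategy; `parse_with_mineru_single` and `parse_locally` are assumed stand-ins for the real parser functions in `src/weknora_eval/parsers/`:

```python
import json
from pathlib import Path

FAILED_PATH = Path("data/parsed_docs/failed_parse.jsonl")


def parse_with_fallback(pdf_path, parse_with_mineru_single, parse_locally, fallback_to_local=True):
    """Try MinerU first; on failure log the error and optionally fall back."""
    try:
        return parse_with_mineru_single(pdf_path)
    except Exception as exc:  # any parser failure, including timeouts
        record = {
            "source_file": pdf_path.name,
            "parser": "mineru:cli",
            "status": "failed",
            "error": str(exc),
            "fallback_used": "local:pymupdf" if fallback_to_local else None,
        }
        with FAILED_PATH.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
        if not fallback_to_local:
            return []
        return parse_locally(pdf_path)
```
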
### 4.5 Parse Quality Checks

After parsing, generate `data/parsed_docs/parse_summary.json`:

```json
{
  "total_files": 3,
  "parsed_files": 3,
  "failed_files": 0,
  "total_documents": 128,
  "empty_documents": 0,
  "avg_chars": 512.4,
  "parser": "local:pymupdf"
}
```

Minimum quality requirements:

- Every file produces at least one Document.
- `content` is non-empty.
- Most Documents are at least `min_chars` long.
- `source_file` is preserved in the metadata.

## 5. WeKnora API Contract

### 5.1 Upload a Document

If the standalone evaluation project is responsible for uploading raw PDF/XLSX files to WeKnora, use this endpoint.

Request:

```http
POST /api/v1/knowledge-bases/{knowledge_base_id}/knowledge/file
X-API-Key: <api-key>
Content-Type: multipart/form-data
```

Multipart fields:

```text
file=@/path/to/file.pdf
enable_multimodel=false
```

Sample response:

```json
{
  "success": true,
  "data": {
    "id": "knowledge-0001",
    "knowledge_base_id": "kb-0001",
    "type": "file",
    "title": "contract.pdf",
    "parse_status": "processing",
    "enable_status": "disabled",
    "file_name": "contract.pdf",
    "file_type": "pdf",
    "error_message": ""
  }
}
```

Persist:

```json
{
  "knowledge_id": "knowledge-0001",
  "file_name": "contract.pdf",
  "file_type": "pdf",
  "parse_status": "processing"
}
```

### 5.2 Poll Document Ingestion Status

Request:

```http
GET /api/v1/knowledge-bases/{knowledge_base_id}/knowledge?page=1&page_size=100
X-API-Key: <api-key>
```

Sample response:

```json
{
  "success": true,
  "data": [
    {
      "id": "knowledge-0001",
      "title": "contract.pdf",
      "parse_status": "completed",
      "enable_status": "enabled",
      "file_name": "contract.pdf",
      "file_type": "pdf",
      "processed_at": "2026-04-20T10:03:00+08:00",
      "error_message": ""
    }
  ],
  "page": 1,
  "page_size": 100,
  "total": 1
}
```

Completion condition:

```text
parse_status == "completed"
enable_status == "enabled"
```

Failure condition:

```text
parse_status == "failed"
```

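A minimal polling sketch over this endpoint; the `list_knowledge` callable is assumed to wrap the GET request above and return the `data` array, and the return shape mirrors what `scripts/02_wait_ingestion.py` expects:

```python
import time


def wait_ingestion_completed(list_knowledge, poll_seconds=5.0, timeout_seconds=1800):
    """Poll until every knowledge record is completed or failed, or time out."""
    deadline = time.monotonic() + timeout_seconds
    while True:
        completed, failed, pending = [], [], []
        for row in list_knowledge():
            if row.get("parse_status") == "failed":
                failed.append(row)
            elif row.get("parse_status") == "completed" and row.get("enable_status") == "enabled":
                completed.append(row)
            else:
                pending.append(row)
        if not pending or time.monotonic() > deadline:
            return {"completed": completed, "failed": failed, "pending": pending}
        time.sleep(poll_seconds)
```
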
### 5.3 Export Chunks

Request:

```http
GET /api/v1/chunks/{knowledge_id}?page=1&page_size=100
X-API-Key: <api-key>
```

Sample response:

```json
{
  "success": true,
  "data": [
    {
      "id": "chunk-0001",
      "knowledge_id": "knowledge-0001",
      "knowledge_base_id": "kb-0001",
      "content": "chunk text...",
      "chunk_index": 0,
      "is_enabled": true,
      "status": 2,
      "start_at": 0,
      "end_at": 500,
      "chunk_type": "text",
      "parent_chunk_id": "",
      "metadata": null,
      "image_info": ""
    }
  ],
  "page": 1,
  "page_size": 100,
  "total": 35
}
```

Save as `data/exported/chunks.jsonl` (a pagination sketch follows below):

```json
{
  "chunk_id": "chunk-0001",
  "knowledge_id": "knowledge-0001",
  "knowledge_base_id": "kb-0001",
  "chunk_index": 0,
  "content": "chunk text...",
  "source_file": "contract.pdf",
  "chunk_type": "text"
}
```

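Because the endpoint is paginated, the export must loop over pages until `total` is exhausted. A minimal pagination sketch; the `get_json` callable is an assumed thin wrapper around `requests.get` that sets the `X-API-Key` header and returns the decoded JSON body:

```python
def list_chunks(get_json, knowledge_id, page_size=100):
    """Yield every chunk for one knowledge record across all pages."""
    page = 1
    while True:
        payload = get_json(
            f"/api/v1/chunks/{knowledge_id}",
            params={"page": page, "page_size": page_size},
        )
        rows = payload.get("data") or []
        yield from rows
        if not rows or page * page_size >= int(payload.get("total") or 0):
            return
        page += 1
```
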
### 5.4 Create a Session

Create one dedicated session per evaluation question so that no conversation history can influence the answer.

Request:

```http
POST /api/v1/sessions
X-API-Key: <api-key>
Content-Type: application/json
```

Request body:

```json
{
  "title": "ragas-eval-qa-0001",
  "description": "Ragas evaluation session"
}
```

Sample response:

```json
{
  "success": true,
  "data": {
    "id": "session-0001",
    "title": "ragas-eval-qa-0001",
    "description": "Ragas evaluation session"
  }
}
```

### 5.5 Run Knowledge-Base QA

Request:

```http
POST /api/v1/knowledge-chat/{session_id}
X-API-Key: <api-key>
Content-Type: application/json
```

Request body:

```json
{
  "query": "What is the payment deadline in the contract?",
  "knowledge_base_ids": ["kb-0001"],
  "disable_title": true,
  "enable_memory": false,
  "channel": "api"
}
```

To restrict retrieval to specific files, pass:

```json
{
  "query": "What is the payment deadline in the contract?",
  "knowledge_ids": ["knowledge-0001"],
  "disable_title": true,
  "enable_memory": false,
  "channel": "api"
}
```

Response type: Server-Sent Events (a stream-parsing sketch follows at the end of this subsection).

References event:

```text
event: message
data: {
  "id": "request-0001",
  "response_type": "references",
  "content": "",
  "done": false,
  "knowledge_references": [
    {
      "id": "chunk-0012",
      "content": "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice.",
      "knowledge_id": "knowledge-0001",
      "chunk_index": 12,
      "knowledge_title": "contract.pdf",
      "start_at": 1200,
      "end_at": 1480,
      "seq": 12,
      "score": 0.92,
      "match_type": 3,
      "metadata": {},
      "chunk_type": "text",
      "parent_chunk_id": "",
      "image_info": "",
      "knowledge_filename": "contract.pdf",
      "knowledge_source": "file"
    }
  ]
}
```

Answer event:

```text
event: message
data: {
  "id": "request-0001",
  "response_type": "answer",
  "content": "Per the contract, the payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "done": false,
  "knowledge_references": null
}
```

End event:

```text
event: message
data: {
  "id": "request-0001",
  "response_type": "answer",
  "content": "",
  "done": true,
  "knowledge_references": null
}
```

Fields to extract:

```json
{
  "request_id": "request-0001",
  "response": "Per the contract, the payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "retrieved_contexts": [
    "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice."
  ],
  "weknora_references": [
    {
      "id": "chunk-0012",
      "content": "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice.",
      "knowledge_id": "knowledge-0001",
      "chunk_index": 12,
      "score": 0.92,
      "knowledge_filename": "contract.pdf"
    }
  ]
}
```

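A minimal sketch of consuming the SSE stream and assembling those fields, assuming `sseclient-py` (already in the project dependencies); the real wiring lives in `src/weknora_eval/sse.py`, which is not shown here:

```python
import json

import requests
import sseclient


def knowledge_chat_sse(base_url, api_key, session_id, body):
    """Stream one knowledge-chat call and collect the answer plus references."""
    response = requests.post(
        f"{base_url}/knowledge-chat/{session_id}",
        headers={"X-API-Key": api_key, "Accept": "text/event-stream"},
        json=body,
        stream=True,
        timeout=300,
    )
    response.raise_for_status()
    answer_parts, references, request_id = [], [], None
    for event in sseclient.SSEClient(response).events():
        payload = json.loads(event.data)
        request_id = payload.get("id") or request_id
        if payload.get("response_type") == "references":
            references.extend(payload.get("knowledge_references") or [])
        elif payload.get("response_type") == "answer":
            if payload.get("done"):
                break  # the terminal event carries empty content
            answer_parts.append(payload.get("content") or "")
    return {
        "request_id": request_id,
        "response": "".join(answer_parts),
        "retrieved_contexts": [ref.get("content", "") for ref in references],
        "weknora_references": references,
    }
```
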
### 5.6 Optional: Read Persisted Messages

Used after the SSE stream completes to verify or re-fetch the final assistant answer.

Request:

```http
GET /api/v1/messages/{session_id}/load?limit=10
X-API-Key: <api-key>
```

Sample response:

```json
{
  "success": true,
  "data": [
    {
      "id": "assistant-message-0001",
      "session_id": "session-0001",
      "request_id": "request-0001",
      "content": "Per the contract, the payment deadline is within 30 days of receiving a lawful and valid invoice.",
      "role": "assistant",
      "knowledge_references": [
        {
          "id": "chunk-0012",
          "content": "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice.",
          "knowledge_id": "knowledge-0001",
          "chunk_index": 12,
          "knowledge_title": "contract.pdf",
          "score": 0.92,
          "match_type": 3,
          "chunk_type": "text",
          "knowledge_filename": "contract.pdf"
        }
      ],
      "is_completed": true,
      "is_fallback": false
    }
  ]
}
```

### 5.7 Optional: Pure Retrieval Endpoint

Used to evaluate retrieval only, without generation.

Request:

```http
POST /api/v1/knowledge-search
X-API-Key: <api-key>
Content-Type: application/json
```

Request body:

```json
{
  "query": "What is the payment deadline in the contract?",
  "knowledge_base_ids": ["kb-0001"]
}
```

Sample response:

```json
{
  "success": true,
  "data": [
    {
      "id": "chunk-0012",
      "content": "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice.",
      "knowledge_id": "knowledge-0001",
      "chunk_index": 12,
      "knowledge_title": "contract.pdf",
      "start_at": 1200,
      "end_at": 1480,
      "seq": 12,
      "score": 0.92,
      "match_type": 3,
      "chunk_type": "text",
      "metadata": {},
      "knowledge_filename": "contract.pdf",
      "knowledge_source": "file"
    }
  ]
}
```

## 6. QA Testset Generation

### 6.1 Input Data

Keep both input sources around:

- The standardized parse results in `data/parsed_docs/documents.jsonl`.
- The chunks exported from WeKnora in `data/exported/chunks.jsonl`.

Recommended order:

1. Generate candidate QA pairs from `documents.jsonl` first.
2. Store each QA's source file and supporting passages.
3. Optionally, match `reference_contexts` back to WeKnora chunk IDs to compute non-LLM retrieval metrics such as hit@k, recall@k, and MRR (a matching sketch follows below).

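A minimal matching sketch that maps each reference passage to exported chunk IDs by whitespace-insensitive substring containment; this is a deliberately simple heuristic, and noisy parses may need fuzzy or embedding-based matching instead:

```python
def match_gold_chunk_ids(reference_contexts, chunks):
    """Return IDs of chunks whose content contains (or is contained by) a passage.

    `chunks` are rows from data/exported/chunks.jsonl with `chunk_id` and `content`.
    """
    gold_ids = []
    for passage in reference_contexts:
        needle = "".join(passage.split())  # ignore whitespace differences
        for chunk in chunks:
            haystack = "".join(chunk["content"].split())
            if needle and (needle in haystack or haystack in needle):
                gold_ids.append(chunk["chunk_id"])
    return sorted(set(gold_ids))
```
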
### 6.2 Testset Record Format

`data/testsets/testset.raw.jsonl`:

```json
{
  "sample_id": "qa-0001",
  "user_input": "What is the payment deadline in the contract?",
  "reference": "The payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "reference_contexts": [
    "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice."
  ],
  "source_file": "contract.pdf",
  "gold_chunk_ids": ["chunk-0012"],
  "question_type": "single_hop",
  "review_status": "pending"
}
```

The manually reviewed `data/testsets/testset.reviewed.jsonl`:

```json
{
  "sample_id": "qa-0001",
  "user_input": "What is the payment deadline in the contract?",
  "reference": "The payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "reference_contexts": [
    "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice."
  ],
  "source_file": "contract.pdf",
  "gold_chunk_ids": ["chunk-0012"],
  "question_type": "single_hop",
  "review_status": "approved"
}
```

### 6.3 Suggested Question Types

Include:

- Single-hop factual QA over PDFs.
- Multi-hop QA over PDFs, e.g. synthesis across adjacent sections.
- PDF definition, condition, and exception-clause questions.
- Single-row lookup questions over XLSX.
- Conditional filtering questions over XLSX.

Avoid in the first phase:

- Complex table aggregation questions, unless WeKnora itself is expected to support table computation.
- Questions answerable only from image content.
- Questions requiring external knowledge.
- Ambiguous questions with multiple reasonable answers.

## 7. Building the Ragas Input

For each approved QA record:

1. Create a clean session.
2. Call `POST /knowledge-chat/{session_id}`.
3. Parse the references events from the SSE stream.
4. Parse the answer events from the SSE stream.
5. Assemble one Ragas input record.

`data/runs/ragas_input.jsonl`:

```json
{
  "sample_id": "qa-0001",
  "user_input": "What is the payment deadline in the contract?",
  "response": "Per the contract, the payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "retrieved_contexts": [
    "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice."
  ],
  "reference": "The payment deadline is within 30 days of receiving a lawful and valid invoice.",
  "reference_contexts": [
    "The buyer shall complete payment within 30 days of receiving a lawful and valid invoice."
  ],
  "session_id": "session-0001",
  "request_id": "request-0001",
  "weknora_references": [
    {
      "id": "chunk-0012",
      "knowledge_id": "knowledge-0001",
      "chunk_index": 12,
      "score": 0.92,
      "knowledge_filename": "contract.pdf"
    }
  ]
}
```

## 8. Ragas Metrics

Recommended for the first phase (a runnable sketch follows the table):

| Metric | Required fields | Purpose |
| --- | --- | --- |
| faithfulness | response, retrieved_contexts | Checks whether the answer is supported by the retrieved content. |
| response_relevancy | user_input, response | Checks whether the answer addresses the question. |
| context_precision | user_input, retrieved_contexts, reference | Checks whether the top-ranked retrieved contexts are relevant. |
| context_recall | retrieved_contexts, reference | Checks whether the retrieved contexts contain enough evidence. |
| factual_correctness | response, reference | Checks whether the answer is factually consistent with the ground truth. |

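A minimal evaluation sketch for these five metrics, assuming ragas 0.2+ class-based metrics and the `EvaluationDataset` API; `judge_llm` and `judge_embeddings` are the wrappers sketched in section 3:

```python
import json
from pathlib import Path

from ragas import EvaluationDataset, evaluate
from ragas.metrics import (
    FactualCorrectness,
    Faithfulness,
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ResponseRelevancy,
)

rows = [
    json.loads(line)
    for line in Path("data/runs/ragas_input.jsonl").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
# Keep only the fields the Ragas sample schema expects.
fields = ("user_input", "retrieved_contexts", "response", "reference", "reference_contexts")
dataset = EvaluationDataset.from_list([{key: row[key] for key in fields} for row in rows])

result = evaluate(
    dataset=dataset,
    metrics=[
        Faithfulness(),
        ResponseRelevancy(),
        LLMContextPrecisionWithReference(),
        LLMContextRecall(),
        FactualCorrectness(),
    ],
    llm=judge_llm,
    embeddings=judge_embeddings,
)
result.to_pandas().to_csv("data/reports/ragas_scores.csv", index=False)
```
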
If the testset contains `gold_chunk_ids`, additionally compute non-LLM retrieval metrics (a hit@k/MRR sketch follows this list):

- hit@1
- hit@3
- hit@5
- recall@k
- mrr
- ndcg@k

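A minimal sketch for hit@k and MRR over the rank-ordered `weknora_references`; recall@k and ndcg@k follow the same pattern:

```python
def hit_at_k(gold_chunk_ids, ranked_chunk_ids, k):
    """1.0 if any gold chunk appears in the top-k retrieved chunks."""
    gold = set(gold_chunk_ids)
    return float(any(chunk_id in gold for chunk_id in ranked_chunk_ids[:k]))


def mrr(gold_chunk_ids, ranked_chunk_ids):
    """Reciprocal rank of the first gold chunk, 0.0 if none is retrieved."""
    gold = set(gold_chunk_ids)
    for rank, chunk_id in enumerate(ranked_chunk_ids, start=1):
        if chunk_id in gold:
            return 1.0 / rank
    return 0.0


# Example: one row of data/runs/ragas_input.jsonl (references are rank-ordered).
sample = {
    "gold_chunk_ids": ["chunk-0012"],
    "weknora_references": [{"id": "chunk-0007"}, {"id": "chunk-0012"}],
}
ranked = [ref["id"] for ref in sample["weknora_references"]]
print(hit_at_k(sample["gold_chunk_ids"], ranked, 1))  # 0.0
print(hit_at_k(sample["gold_chunk_ids"], ranked, 5))  # 1.0
print(mrr(sample["gold_chunk_ids"], ranked))          # 0.5
```
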
## 9. Reporting

Generate `data/reports/summary.md`:

```markdown
# Ragas Evaluation Report

## Run Info
- WeKnora Base URL:
- Knowledge base ID:
- Testset size:
- Approved samples:
- Failed samples:
- Judge model:

## Aggregate Metrics
| Metric | Mean | P50 | Failure threshold |
| --- | --- | --- | --- |

## Retrieval Failure Samples
| sample_id | Question | Expected file | Actually recalled files | context_recall | Notes |

## Generation Failure Samples
| sample_id | Question | Model answer | Ground-truth answer | faithfulness | factual_correctness |

## Improvement Suggestions
- ...
```

Also save:

- `ragas_scores.csv`: per-sample metric scores.
- `weknora_answers.jsonl`: raw WeKnora output.
- `ragas_input.jsonl`: the data actually fed into Ragas.
- `failed_requests.jsonl`: API failures and SSE parsing failures.

## 10. Implementation Checklist

### Phase 1: Project Scaffolding

- [ ] Create a standalone repo or directory `weknora-ragas-eval`.
- [ ] Add Python project metadata and dependency pinning.
- [ ] Add `.env.example`.
- [ ] Add `configs/eval.yaml`.
- [ ] Create the directory tree under `data/`.
- [ ] Add structured logging.
- [ ] Add retry and timeout policies.

### Phase 2: WeKnora API Client

- [ ] Implement `create_session`.
- [ ] Implement `upload_file`.
- [ ] Implement `list_knowledge`.
- [ ] Implement `wait_ingestion_completed`.
- [ ] Implement `list_chunks`.
- [ ] Implement `knowledge_chat_sse`.
- [ ] Implement `load_messages`.
- [ ] Implement `knowledge_search`.
- [ ] Persist response bodies on API errors.
- [ ] Implement pagination helpers.

### Phase 3: Document and Chunk Export

- [ ] Upload PDF files.
- [ ] Upload XLSX files.
- [ ] Poll until every document is completed or failed.
- [ ] Export all knowledge metadata.
- [ ] Export all chunks.
- [ ] Filter out disabled chunks.
- [ ] Filter out empty chunks.
- [ ] Preserve source-file metadata.

### Phase 4: Ragas-Side Document Parsing

- [ ] Implement local PDF parsing.
- [ ] Implement local XLSX parsing.
- [ ] Implement the MinerU CLI parsing adapter.
- [ ] Implement the MinerU HTTP parsing adapter.
- [ ] Convert all parse output into `documents.jsonl`.
- [ ] Record files that failed to parse.
- [ ] Generate `parse_summary.json`.
- [ ] Support falling back to local parsing when MinerU fails.

### Phase 5: Testset Generation

- [ ] Load `data/parsed_docs/documents.jsonl`.
- [ ] Generate candidate QA pairs with Ragas.
- [ ] Save the raw candidate testset.
- [ ] Add manual review fields.
- [ ] Produce the reviewed testset.
- [ ] Run minimum quality checks:
  - [ ] The question is answerable from the given context.
  - [ ] The ground-truth answer is supported by evidence.
  - [ ] `reference_contexts` is non-empty.
  - [ ] The source file is recorded.

### Phase 6: Running WeKnora QA

- [ ] Create a clean session per QA record.
- [ ] Call `knowledge-chat`.
- [ ] Parse SSE references events.
- [ ] Parse SSE answer events.
- [ ] Deduplicate references by chunk ID.
- [ ] Save raw answers and references.
- [ ] Record empty-answer failures.
- [ ] Record empty-retrieval failures.
- [ ] Optional: verify the final answer via the message load API.

### Phase 7: Building the Ragas Input

- [ ] Join the reviewed QA records with the WeKnora output.
- [ ] Build `user_input`.
- [ ] Build `response`.
- [ ] Build `retrieved_contexts`.
- [ ] Build `reference`.
- [ ] Build `reference_contexts`.
- [ ] Keep `sample_id`, `session_id`, `request_id`, and the references for debugging.
- [ ] Validate that no required field is missing.

### Phase 8: Running the Ragas Evaluation

- [ ] Configure the judge LLM.
- [ ] Configure the embedding model.
- [ ] Run faithfulness.
- [ ] Run response relevancy.
- [ ] Run context precision.
- [ ] Run context recall.
- [ ] Run factual correctness.
- [ ] Save per-sample scores.
- [ ] Save aggregate scores.
- [ ] Catch Ragas exceptions per sample.

### Phase 9: Chunk-ID-Based Retrieval Metrics

- [ ] Compute hit@k when `gold_chunk_ids` exists.
- [ ] Compute recall@k.
- [ ] Compute MRR.
- [ ] Compute ndcg@k.
- [ ] Compare chunk-ID metrics against Ragas's LLM-based context metrics.

### Phase 10: Reporting

- [ ] Generate the Markdown report.
- [ ] Write run info.
- [ ] Write aggregate metrics.
- [ ] Write the worst retrieval samples.
- [ ] Write the worst generation samples.
- [ ] Write the empty-retrieval count.
- [ ] Write the fallback-answer count.
- [ ] Write the source-file distribution.
- [ ] Write improvement suggestions.

## 11. Acceptance Criteria

The standalone evaluation project is considered usable once it:

- [ ] Can upload a small batch of PDF/XLSX files to WeKnora.
- [ ] Can detect that document ingestion has completed.
- [ ] Can export chunks from WeKnora.
- [ ] Can produce `documents.jsonl` via local or MinerU parsing.
- [ ] Can create or import at least 10 approved QA records.
- [ ] Can call WeKnora for every QA record.
- [ ] Can parse `response` and `retrieved_contexts`.
- [ ] Can build valid Ragas input JSONL.
- [ ] Can produce per-sample Ragas scores.
- [ ] Can produce a readable summary report.
- [ ] Saves every intermediate artifact for review and debugging.

## 12. First Pilot Run

Run the full loop on a tiny dataset first:

- 2 PDF files.
- 1 XLSX file.
- 10 manually approved QA records.
- One dedicated session per sample.
- Metrics:
  - faithfulness
  - response_relevancy
  - context_precision
  - context_recall
  - factual_correctness

Expected artifacts:

```text
data/exported/knowledge.jsonl
data/exported/chunks.jsonl
data/parsed_docs/documents.jsonl
data/parsed_docs/parse_summary.json
data/testsets/testset.reviewed.jsonl
data/runs/weknora_answers.jsonl
data/runs/ragas_input.jsonl
data/reports/ragas_scores.csv
data/reports/summary.md
```

Only scale up to 50-300 samples after the pilot confirms all of the following:

- `retrieved_contexts` is not systematically empty.
- `response` is captured correctly.
- The Ragas input fields are valid.
- Manual review confirms the QA set is meaningful for evaluation.

README.md
0 → 100644
# WeKnora Ragas Eval

A standalone WeKnora Ragas evaluation project. It only calls WeKnora's public APIs and does not depend on WeKnora's built-in `/evaluation` endpoint.

## Installation

```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```

For better PDF parsing:

```bash
pip install -e ".[pdf]"
```

Development and test tooling:

```bash
pip install -e ".[dev,pdf]"
```

## Configuration

```bash
cp .env.example .env
```

After editing `.env`, confirm that:

- `WEKNORA_BASE_URL` points to the WeKnora API v1, e.g. `http://localhost:9090/api/v1`
- `WEKNORA_API_KEY` is the WeKnora API key
- `WEKNORA_KB_ID` is the target knowledge base ID; if you do not have one yet, run `python scripts/00_create_kb.py` first
- `WEKNORA_KB_NAME` is the name used when creating the knowledge base
- `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and the `RAGAS_*_MODEL` variables configure the evaluation-side models
- If the LLM and embeddings are deployed separately, point `RAGAS_LLM_BASE_URL` at vLLM's `/v1` and `RAGAS_EMBEDDING_BASE_URL` at Infinity's `/v1`

## First Pilot

Put the raw files in:

- `data/raw_docs/pdf/`
- `data/raw_docs/xlsx/`

Run in order:

```bash
python scripts/00_create_kb.py
python scripts/01_upload_docs.py
python scripts/02_wait_ingestion.py
python scripts/03_export_chunks.py
python scripts/04_parse_docs.py
python scripts/05_generate_testset.py
python scripts/06_review_testset.py
python scripts/07_run_weknora_qa.py
python scripts/08_build_ragas_input.py
python scripts/09_run_ragas_eval.py
python scripts/10_report.py
```

For the first pilot, use only 2 PDFs, 1 XLSX, and 10 approved QA records; scale up the sample count only after confirming that `retrieved_contexts`, `response`, and the Ragas input fields all look correct.

## Main Artifacts

- `data/exported/knowledge.jsonl`
- `data/exported/chunks.jsonl`
- `data/parsed_docs/documents.jsonl`
- `data/parsed_docs/parse_summary.json`
- `data/testsets/testset.raw.jsonl`
- `data/testsets/testset.reviewed.jsonl`
- `data/runs/weknora_answers.jsonl`
- `data/runs/ragas_input.jsonl`
- `data/reports/ragas_scores.csv`
- `data/reports/summary.md`

configs/eval.yaml
0 → 100644
weknora:
  base_url: "${WEKNORA_BASE_URL}"
  api_key: "${WEKNORA_API_KEY}"
  knowledge_base_id: "${WEKNORA_KB_ID}"
  knowledge_base_name: "${WEKNORA_KB_NAME:-ragas-eval-pilot}"
  knowledge_base_description: "Knowledge base for independent Ragas evaluation."
  timeout_seconds: 300
  request_interval_seconds: "${REQUEST_INTERVAL_SECONDS:-0.2}"

testset:
  size: "${TESTSET_SIZE:-50}"
  include_pdf: true
  include_xlsx: true
  min_context_chars: 80
  require_manual_review: true

parsing:
  provider: "local"
  output_path: "data/parsed_docs/documents.jsonl"
  failed_path: "data/parsed_docs/failed_parse.jsonl"
  summary_path: "data/parsed_docs/parse_summary.json"
  local:
    pdf_backend: "pymupdf"
    xlsx_mode: "row_text"
    min_chars: 80
  mineru:
    mode: "cli"
    cli_bin: "mineru"
    output_dir: "data/parsed_docs/mineru_raw"
    http_base_url: "http://172.23.184.9:8002"
    api_key: "mineru"
    timeout_seconds: 600
    fallback_to_local: false

qa:
  one_session_per_question: true
  disable_title: true
  enable_memory: false
  channel: "api"
  verify_with_messages: false

ragas:
  provider: "openai-compatible"
  # Backward-compatible defaults. If the split LLM/embedding values below are
  # empty, these values are used for both clients.
  api_key: "${OPENAI_API_KEY}"
  base_url: "${OPENAI_BASE_URL}"
  # vLLM OpenAI-compatible endpoint, for example http://localhost:8000/v1.
  llm_api_key: "${RAGAS_LLM_API_KEY}"
  llm_base_url: "${RAGAS_LLM_BASE_URL}"
  # Infinity OpenAI-compatible embedding endpoint, for example
  # http://localhost:7997/v1.
  embedding_api_key: "${RAGAS_EMBEDDING_API_KEY}"
  embedding_base_url: "${RAGAS_EMBEDDING_BASE_URL}"
  # Reserved for future retrieval/rerank metrics. The current Ragas pipeline
  # does not call reranker APIs.
  reranker_api_key: "${RAGAS_RERANKER_API_KEY}"
  reranker_base_url: "${RAGAS_RERANKER_BASE_URL}"
  reranker_model: "${RAGAS_RERANKER_MODEL}"
  generator_model: "${RAGAS_GENERATOR_MODEL}"
  judge_model: "${RAGAS_JUDGE_MODEL}"
  embedding_model: "${RAGAS_EMBEDDING_MODEL}"
  temperature: 0
  max_tokens: 4096
  timeout_seconds: 600
  max_workers: 1
  metrics:
    - faithfulness
    - response_relevancy
    - context_precision
    - context_recall
    - factual_correctness

data/parsed_docs/mineru_raw/.gitkeep
0 → 100644
data/raw_docs/pdf/.gitkeep
0 → 100644
data/raw_docs/xlsx/.gitkeep
0 → 100644
pyproject.toml
0 → 100644
[project]
name = "weknora-ragas-eval"
version = "0.1.0"
description = "Independent Ragas evaluation pipeline for WeKnora public APIs."
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "ragas>=0.3,<0.5",
    "datasets>=2.19.0",
    "pandas>=2.2.0",
    "openpyxl>=3.1.0",
    "requests>=2.32.0",
    "sseclient-py>=1.8.0",
    "python-dotenv>=1.0.0",
    "pyyaml>=6.0.0",
    "langchain>=0.2.0",
    "langchain-community>=0.2.0",
    "langchain-openai>=0.1.0",
    "pypdf>=4.2.0"
]

[project.optional-dependencies]
pdf = [
    "pymupdf>=1.24.0",
    "pdfplumber>=0.11.0"
]
dev = [
    "ruff>=0.6.0",
    "pytest>=8.0.0"
]

[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]

[tool.ruff]
line-length = 100
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B"]

scripts/00_create_kb.py
0 → 100644
from __future__ import annotations

import sys
from typing import Any

import _bootstrap  # noqa: F401

from weknora_eval.api import bootstrap_client_from_config
from weknora_eval.config import load_config, require_config
from weknora_eval.envfile import set_env_value
from weknora_eval.loaders import setup_logging, write_json


def main() -> int:
    setup_logging()
    config = load_config()
    client = bootstrap_client_from_config(config)
    weknora = config["weknora"]

    existing_id = str(weknora.get("knowledge_base_id") or "")
    name = str(require_config(config, "weknora.knowledge_base_name"))
    if existing_id and existing_id != "replace-me":
        record = {"id": existing_id, "name": name, "source": "env"}
        write_json("data/exported/knowledge_base.json", record)
        print(f"WEKNORA_KB_ID already set: {existing_id}")
        return 0

    created = client.create_knowledge_base(name=name)
    knowledge_base_id = _extract_knowledge_base_id(created)
    if not knowledge_base_id:
        print(f"Created knowledge base but could not extract id from response: {created}")
        return 1

    set_env_value(".env", "WEKNORA_KB_ID", knowledge_base_id)
    write_json("data/exported/knowledge_base.json", {**created, "source": "create"})
    print(f"WEKNORA_KB_ID={knowledge_base_id}")
    print("Wrote ID to .env and data/exported/knowledge_base.json")
    return 0


def _extract_knowledge_base_id(payload: dict[str, Any]) -> str | None:
    candidates = [payload]
    for key in ("data", "knowledge_base"):
        nested = payload.get(key)
        if isinstance(nested, dict):
            candidates.append(nested)

    for row in candidates:
        for key in ("id", "knowledge_base_id", "kb_id", "uuid"):
            value = row.get(key)
            if value:
                return str(value)
    return None


if __name__ == "__main__":
    sys.exit(main())

scripts/01_upload_docs.py
0 → 100644
from __future__ import annotations

import sys
from pathlib import Path

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl


def main() -> int:
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    files = sorted(Path("data/raw_docs/pdf").glob("*.pdf")) + sorted(
        Path("data/raw_docs/xlsx").glob("*.xlsx")
    )
    rows = []
    for path in files:
        data = client.upload_file(path)
        rows.append(
            {
                "knowledge_id": data.get("id"),
                "file_name": data.get("file_name") or data.get("title") or path.name,
                "file_type": data.get("file_type") or path.suffix.lstrip("."),
                "parse_status": data.get("parse_status"),
                "enable_status": data.get("enable_status"),
                "raw": data,
            }
        )
    write_jsonl("data/exported/knowledge_uploads.jsonl", rows)
    print(f"Uploaded {len(rows)} files")
    return 0


if __name__ == "__main__":
    sys.exit(main())

scripts/02_wait_ingestion.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import read_jsonl, setup_logging, write_jsonl


def main() -> int:
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    uploads = read_jsonl("data/exported/knowledge_uploads.jsonl", missing_ok=True)
    knowledge_ids = {row["knowledge_id"] for row in uploads if row.get("knowledge_id")} or None
    result = client.wait_ingestion_completed(knowledge_ids=knowledge_ids)
    knowledge = client.list_knowledge()
    write_jsonl("data/exported/knowledge.jsonl", knowledge)

    print(
        "Ingestion status: "
        f"completed={len(result['completed'])} failed={len(result['failed'])} "
        f"pending={len(result['pending'])}"
    )
    return 1 if result["failed"] or result["pending"] else 0


if __name__ == "__main__":
    sys.exit(main())

scripts/03_export_chunks.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging, write_jsonl


def main() -> int:
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    knowledge_rows = client.list_knowledge()
    write_jsonl("data/exported/knowledge.jsonl", knowledge_rows)
    knowledge_by_id = {row.get("id"): row for row in knowledge_rows}

    chunk_rows = []
    for knowledge in knowledge_rows:
        knowledge_id = knowledge.get("id")
        if not knowledge_id:
            continue
        if knowledge.get("parse_status") != "completed" or knowledge.get("enable_status") != "enabled":
            continue
        for chunk in client.list_chunks(str(knowledge_id)):
            content = (chunk.get("content") or "").strip()
            if not content:
                continue
            if chunk.get("is_enabled") is False:
                continue
            source = knowledge_by_id.get(chunk.get("knowledge_id")) or knowledge
            chunk_rows.append(
                {
                    "chunk_id": chunk.get("id"),
                    "knowledge_id": chunk.get("knowledge_id") or knowledge_id,
                    "knowledge_base_id": chunk.get("knowledge_base_id")
                    or config["weknora"]["knowledge_base_id"],
                    "chunk_index": chunk.get("chunk_index"),
                    "content": content,
                    "source_file": source.get("file_name") or source.get("title"),
                    "chunk_type": chunk.get("chunk_type"),
                    "raw": chunk,
                }
            )
    write_jsonl("data/exported/chunks.jsonl", chunk_rows)
    print(f"Exported {len(chunk_rows)} chunks from {len(knowledge_rows)} knowledge records")
    return 0


if __name__ == "__main__":
    sys.exit(main())

scripts/04_parse_docs.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.parsers.local import parse_raw_docs
from weknora_eval.parsers.mineru import parse_with_mineru


def main() -> int:
    setup_logging()
    config = load_config()
    provider = config.get("parsing", {}).get("provider", "local")
    if provider == "local":
        rows, summary = parse_raw_docs(config)
    elif provider == "mineru":
        rows, summary = parse_with_mineru(config)
    else:
        raise ValueError(f"Unsupported parsing provider: {provider}")
    print(f"Parsed {len(rows)} documents: {summary}")
    return 0 if rows else 1


if __name__ == "__main__":
    sys.exit(main())

scripts/05_generate_testset.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.testset import generate_rule_based_testset


def main() -> int:
    setup_logging()
    config = load_config()
    testset = config.get("testset", {})
    rows = generate_rule_based_testset(
        size=int(testset.get("size", 50)),
        min_context_chars=int(testset.get("min_context_chars", 80)),
    )
    print(f"Generated {len(rows)} pending QA candidates at data/testsets/testset.raw.jsonl")
    return 0 if rows else 1


if __name__ == "__main__":
    sys.exit(main())

scripts/06_review_testset.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.loaders import setup_logging
from weknora_eval.testset import approve_pending_testset, validate_reviewed_testset


def main() -> int:
    setup_logging()
    rows = approve_pending_testset()
    errors = validate_reviewed_testset()
    if errors:
        for error in errors:
            print(error)
        return 1
    print(f"Wrote {len(rows)} approved QA records to data/testsets/testset.reviewed.jsonl")
    return 0 if rows else 1


if __name__ == "__main__":
    sys.exit(main())

scripts/07_run_weknora_qa.py
0 → 100644
from __future__ import annotations

import sys

import _bootstrap  # noqa: F401

from weknora_eval.api import client_from_config
from weknora_eval.config import load_config
from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl


def main() -> int:
    setup_logging()
    config = load_config()
    client = client_from_config(config)
    qa_config = config.get("qa", {})
    rows = [
        row
        for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
        if row.get("review_status") == "approved"
    ]
    answers = []

    for index, row in enumerate(rows, start=1):
        sample_id = row["sample_id"]
        try:
            session = client.create_session(title=f"ragas-eval-{sample_id}")
            session_id = session.get("id")
            if not session_id:
                raise RuntimeError(f"create_session returned no id for {sample_id}")
            result = client.knowledge_chat_sse(
                session_id=session_id,
                query=row["user_input"],
                disable_title=bool(qa_config.get("disable_title", True)),
                enable_memory=bool(qa_config.get("enable_memory", False)),
                channel=str(qa_config.get("channel", "api")),
            )
            answer = {
                "sample_id": sample_id,
                "user_input": row["user_input"],
                "session_id": session_id,
                "request_id": result.get("request_id"),
                "response": result.get("response") or "",
                "retrieved_contexts": result.get("retrieved_contexts") or [],
                "weknora_references": result.get("weknora_references") or [],
                "error": None,
            }
            if not answer["response"]:
                answer["error"] = "empty_response"
                append_jsonl("data/runs/failed_requests.jsonl", answer)
            elif not answer["retrieved_contexts"]:
                # Record the soft failure on the answer itself so that
                # weknora_answers.jsonl and failed_requests.jsonl stay consistent.
                answer["error"] = "empty_retrieval"
                append_jsonl("data/runs/failed_requests.jsonl", answer)
            answers.append(answer)
            print(f"[{index}/{len(rows)}] {sample_id} response_chars={len(answer['response'])}")
        except Exception as exc:  # noqa: BLE001
            failed = {
                "sample_id": sample_id,
                "user_input": row.get("user_input"),
                "response": "",
                "retrieved_contexts": [],
                "weknora_references": [],
                "session_id": None,
                "request_id": None,
                "error": str(exc),
            }
            answers.append(failed)
            append_jsonl("data/runs/failed_requests.jsonl", failed)
            print(f"[{index}/{len(rows)}] {sample_id} failed: {exc}")

    write_jsonl("data/runs/weknora_answers.jsonl", answers)
    # Empty retrieval is logged but treated as a soft failure for the exit code.
    failures = [
        row for row in answers if row.get("error") and row.get("error") != "empty_retrieval"
    ]
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())

scripts/08_build_ragas_input.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.loaders import append_jsonl, read_jsonl, setup_logging, write_jsonl | ||
| 8 | |||
| 9 | |||
| 10 | def main() -> int: | ||
| 11 | setup_logging() | ||
| 12 | testset = { | ||
| 13 | row["sample_id"]: row | ||
| 14 | for row in read_jsonl("data/testsets/testset.reviewed.jsonl") | ||
| 15 | if row.get("review_status") == "approved" | ||
| 16 | } | ||
| 17 | answers = {row["sample_id"]: row for row in read_jsonl("data/runs/weknora_answers.jsonl")} | ||
| 18 | ragas_rows = [] | ||
| 19 | for sample_id, qa in testset.items(): | ||
| 20 | answer = answers.get(sample_id) | ||
| 21 | if not answer: | ||
| 22 | append_jsonl("data/runs/failed_requests.jsonl", {"sample_id": sample_id, "error": "missing_answer"}) | ||
| 23 | continue | ||
| 24 | row = { | ||
| 25 | "sample_id": sample_id, | ||
| 26 | "user_input": qa["user_input"], | ||
| 27 | "response": answer.get("response") or "", | ||
| 28 | "retrieved_contexts": answer.get("retrieved_contexts") or [], | ||
| 29 | "reference": qa["reference"], | ||
| 30 | "reference_contexts": qa.get("reference_contexts") or [], | ||
| 31 | "session_id": answer.get("session_id"), | ||
| 32 | "request_id": answer.get("request_id"), | ||
| 33 | "weknora_references": answer.get("weknora_references") or [], | ||
| 34 | "source_file": qa.get("source_file"), | ||
| 35 | "gold_chunk_ids": qa.get("gold_chunk_ids") or [], | ||
| 36 | } | ||
| 37 | missing = [ | ||
| 38 | key | ||
| 39 | for key in ("user_input", "response", "retrieved_contexts", "reference", "reference_contexts") | ||
| 40 | if not row.get(key) | ||
| 41 | ] | ||
| 42 | if missing: | ||
| 43 | append_jsonl( | ||
| 44 | "data/runs/failed_requests.jsonl", | ||
| 45 | {"sample_id": sample_id, "error": f"missing_ragas_fields:{','.join(missing)}"}, | ||
| 46 | ) | ||
| 47 | continue | ||
| 48 | ragas_rows.append(row) | ||
| 49 | |||
| 50 | write_jsonl("data/runs/ragas_input.jsonl", ragas_rows) | ||
| 51 | print(f"Built {len(ragas_rows)} Ragas input rows") | ||
| 52 | return 0 if ragas_rows else 1 | ||
| 53 | |||
| 54 | |||
| 55 | if __name__ == "__main__": | ||
| 56 | sys.exit(main()) |
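Before moving on to the Ragas run, it is cheap to confirm that every approved sample actually made it into the input file. A small check, assuming the paths used above:

```python
# Sketch: compare approved testset ids against the built Ragas input rows.
from weknora_eval.loaders import read_jsonl

approved = {
    row["sample_id"]
    for row in read_jsonl("data/testsets/testset.reviewed.jsonl")
    if row.get("review_status") == "approved"
}
built = {row["sample_id"] for row in read_jsonl("data/runs/ragas_input.jsonl")}
print(f"approved={len(approved)} built={len(built)} dropped={sorted(approved - built)[:10]}")
```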
scripts/09_run_ragas_eval.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.ragas_runner import run_ragas_eval | ||
| 10 | |||
| 11 | |||
| 12 | def main() -> int: | ||
| 13 | setup_logging() | ||
| 14 | config = load_config() | ||
| 15 | scores = run_ragas_eval(config) | ||
| 16 | print(f"Wrote {len(scores)} Ragas score rows to data/reports/ragas_scores.csv") | ||
| 17 | return 0 | ||
| 18 | |||
| 19 | |||
| 20 | if __name__ == "__main__": | ||
| 21 | sys.exit(main()) |
scripts/10_report.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | import _bootstrap # noqa: F401 | ||
| 6 | |||
| 7 | from weknora_eval.config import load_config | ||
| 8 | from weknora_eval.loaders import setup_logging | ||
| 9 | from weknora_eval.report import generate_summary_report | ||
| 10 | |||
| 11 | |||
| 12 | def main() -> int: | ||
| 13 | setup_logging() | ||
| 14 | config = load_config() | ||
| 15 | generate_summary_report(config) | ||
| 16 | print("Wrote report to data/reports/summary.md") | ||
| 17 | return 0 | ||
| 18 | |||
| 19 | |||
| 20 | if __name__ == "__main__": | ||
| 21 | sys.exit(main()) |
scripts/_bootstrap.py
0 → 100644
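The body of `scripts/_bootstrap.py` is not shown in this view. Every script imports it purely for its side effect (`import _bootstrap  # noqa: F401`), so it presumably makes the `src/` layout importable without an install; a minimal sketch consistent with that usage (an assumption, not the actual file content):

```python
# Sketch of scripts/_bootstrap.py: put src/ on sys.path so that
# `from weknora_eval ...` imports work when scripts run from the repo root.
from __future__ import annotations

import sys
from pathlib import Path

SRC = Path(__file__).resolve().parent.parent / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
```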
src/weknora_eval/__init__.py
0 → 100644
src/weknora_eval/api.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import logging | ||
| 4 | import time | ||
| 5 | from pathlib import Path | ||
| 6 | from typing import Any | ||
| 7 | from urllib.parse import urljoin | ||
| 8 | |||
| 9 | import requests | ||
| 10 | |||
| 11 | from weknora_eval.config import require_config | ||
| 12 | from weknora_eval.loaders import append_jsonl | ||
| 13 | from weknora_eval.sse import normalize_reference, parse_sse_events | ||
| 14 | |||
| 15 | logger = logging.getLogger(__name__) | ||
| 16 | |||
| 17 | |||
| 18 | class WeKnoraApiError(RuntimeError): | ||
| 19 | pass | ||
| 20 | |||
| 21 | |||
| 22 | class WeKnoraClient: | ||
| 23 | def __init__( | ||
| 24 | self, | ||
| 25 | *, | ||
| 26 | base_url: str, | ||
| 27 | api_key: str, | ||
| 28 | knowledge_base_id: str, | ||
| 29 | timeout_seconds: int = 300, | ||
| 30 | request_interval_seconds: float = 0.2, | ||
| 31 | error_log_path: str | Path = "data/runs/api_errors.jsonl", | ||
| 32 | max_retries: int = 3, | ||
| 33 | ) -> None: | ||
| 34 | self.base_url = base_url.rstrip("/") + "/" | ||
| 35 | self.api_key = api_key | ||
| 36 | self.knowledge_base_id = knowledge_base_id | ||
| 37 | self.timeout_seconds = timeout_seconds | ||
| 38 | self.request_interval_seconds = request_interval_seconds | ||
| 39 | self.error_log_path = Path(error_log_path) | ||
| 40 | self.max_retries = max_retries | ||
| 41 | self.session = requests.Session() | ||
| 42 | self.session.headers.update({"X-API-Key": api_key}) | ||
| 43 | |||
| 44 | def create_knowledge_base(self, *, name: str) -> dict[str, Any]: | ||
| 45 | return self._json_request("POST", "knowledge-bases", json={"name": name}) | ||
| 46 | |||
| 47 | def create_session( | ||
| 48 | self, | ||
| 49 | title: str, | ||
| 50 | description: str = "Ragas evaluation session", | ||
| 51 | ) -> dict[str, Any]: | ||
| 52 | payload = {"title": title, "description": description} | ||
| 53 | return self._json_request("POST", "sessions", json=payload) | ||
| 54 | |||
| 55 | def upload_file(self, file_path: str | Path, *, enable_multimodel: bool = False) -> dict[str, Any]: | ||
| 56 | self._ensure_knowledge_base_id() | ||
| 57 | target = Path(file_path) | ||
| 58 | with target.open("rb") as file: | ||
| 59 | files = {"file": (target.name, file)} | ||
| 60 | data = {"enable_multimodel": str(enable_multimodel).lower()} | ||
| 61 | return self._json_request( | ||
| 62 | "POST", | ||
| 63 | f"knowledge-bases/{self.knowledge_base_id}/knowledge/file", | ||
| 64 | files=files, | ||
| 65 | data=data, | ||
| 66 | ) | ||
| 67 | |||
| 68 | def list_knowledge(self, *, page_size: int = 100) -> list[dict[str, Any]]: | ||
| 69 | self._ensure_knowledge_base_id() | ||
| 70 | return self._paginate( | ||
| 71 | f"knowledge-bases/{self.knowledge_base_id}/knowledge", | ||
| 72 | page_size=page_size, | ||
| 73 | ) | ||
| 74 | |||
| 75 | def wait_ingestion_completed( | ||
| 76 | self, | ||
| 77 | *, | ||
| 78 | knowledge_ids: set[str] | None = None, | ||
| 79 | timeout_seconds: int | None = None, | ||
| 80 | poll_interval_seconds: float = 5.0, | ||
| 81 | ) -> dict[str, list[dict[str, Any]]]: | ||
| 82 | deadline = time.monotonic() + (timeout_seconds or self.timeout_seconds) | ||
| 83 | target_ids = knowledge_ids or set() | ||
| 84 | |||
| 85 | while time.monotonic() < deadline: | ||
| 86 | rows = self.list_knowledge() | ||
| 87 | if target_ids: | ||
| 88 | rows = [row for row in rows if row.get("id") in target_ids] | ||
| 89 | |||
| 90 | completed = [ | ||
| 91 | row | ||
| 92 | for row in rows | ||
| 93 | if row.get("parse_status") == "completed" and row.get("enable_status") == "enabled" | ||
| 94 | ] | ||
| 95 | failed = [row for row in rows if row.get("parse_status") == "failed"] | ||
| 96 | |||
| 97 | if failed: | ||
| 98 | return {"completed": completed, "failed": failed, "pending": []} | ||
| 99 | if rows and len(completed) == len(rows): | ||
| 100 | return {"completed": completed, "failed": [], "pending": []} | ||
| 101 | |||
| 102 | pending = [row for row in rows if row not in completed] | ||
| 103 | logger.info("Waiting for ingestion: completed=%s pending=%s", len(completed), len(pending)) | ||
| 104 | time.sleep(poll_interval_seconds) | ||
| 105 | |||
| 106 | rows = self.list_knowledge() | ||
| 107 | if target_ids: | ||
| 108 | rows = [row for row in rows if row.get("id") in target_ids] | ||
| 109 | completed = [ | ||
| 110 | row | ||
| 111 | for row in rows | ||
| 112 | if row.get("parse_status") == "completed" and row.get("enable_status") == "enabled" | ||
| 113 | ] | ||
| 114 | failed = [row for row in rows if row.get("parse_status") == "failed"] | ||
| 115 | pending = [row for row in rows if row not in completed and row not in failed] | ||
| 116 | return {"completed": completed, "failed": failed, "pending": pending} | ||
| 117 | |||
| 118 | def list_chunks(self, knowledge_id: str, *, page_size: int = 100) -> list[dict[str, Any]]: | ||
| 119 | return self._paginate(f"chunks/{knowledge_id}", page_size=page_size) | ||
| 120 | |||
| 121 | def knowledge_chat_sse( | ||
| 122 | self, | ||
| 123 | *, | ||
| 124 | session_id: str, | ||
| 125 | query: str, | ||
| 126 | knowledge_ids: list[str] | None = None, | ||
| 127 | knowledge_base_ids: list[str] | None = None, | ||
| 128 | disable_title: bool = True, | ||
| 129 | enable_memory: bool = False, | ||
| 130 | channel: str = "api", | ||
| 131 | ) -> dict[str, Any]: | ||
| 132 | payload: dict[str, Any] = { | ||
| 133 | "query": query, | ||
| 134 | "disable_title": disable_title, | ||
| 135 | "enable_memory": enable_memory, | ||
| 136 | "channel": channel, | ||
| 137 | } | ||
| 138 | if knowledge_ids: | ||
| 139 | payload["knowledge_ids"] = knowledge_ids | ||
| 140 | else: | ||
| 141 | self._ensure_knowledge_base_id() | ||
| 142 | payload["knowledge_base_ids"] = knowledge_base_ids or [self.knowledge_base_id] | ||
| 143 | |||
| 144 | url = self._url(f"knowledge-chat/{session_id}") | ||
| 145 | response = self.session.post( | ||
| 146 | url, | ||
| 147 | json=payload, | ||
| 148 | timeout=self.timeout_seconds, | ||
| 149 | stream=True, | ||
| 150 | headers={"Accept": "text/event-stream"}, | ||
| 151 | ) | ||
| 152 | if response.status_code >= 400: | ||
| 153 | self._log_error("POST", url, response) | ||
| 154 | raise WeKnoraApiError(f"POST {url} failed with HTTP {response.status_code}") | ||
| 155 | |||
| 156 | answer_parts: list[str] = [] | ||
| 157 | references: list[dict[str, Any]] = [] | ||
| 158 | raw_events: list[dict[str, Any]] = [] | ||
| 159 | request_id: str | None = None | ||
| 160 | seen_reference_ids: set[str] = set() | ||
| 161 | |||
| 162 | for event in parse_sse_events(response.iter_lines(decode_unicode=True)): | ||
| 163 | raw_events.append(event) | ||
| 164 | data = event.get("data") | ||
| 165 | if not isinstance(data, dict): | ||
| 166 | continue | ||
| 167 | request_id = request_id or data.get("id") | ||
| 168 | response_type = data.get("response_type") | ||
| 169 | if response_type == "references": | ||
| 170 | for reference in data.get("knowledge_references") or []: | ||
| 171 | normalized = normalize_reference(reference) | ||
| 172 | reference_id = str(normalized.get("id") or "") | ||
| 173 | if reference_id and reference_id in seen_reference_ids: | ||
| 174 | continue | ||
| 175 | if reference_id: | ||
| 176 | seen_reference_ids.add(reference_id) | ||
| 177 | references.append(normalized) | ||
| 178 | elif response_type == "answer" and not data.get("done"): | ||
| 179 | answer_parts.append(data.get("content") or "") | ||
| 180 | |||
| 181 | retrieved_contexts = [ref["content"] for ref in references if ref.get("content")] | ||
| 182 | return { | ||
| 183 | "request_id": request_id, | ||
| 184 | "response": "".join(answer_parts).strip(), | ||
| 185 | "retrieved_contexts": retrieved_contexts, | ||
| 186 | "weknora_references": references, | ||
| 187 | "raw_events": raw_events, | ||
| 188 | } | ||
| 189 | |||
| 190 | def load_messages(self, session_id: str, *, limit: int = 10) -> list[dict[str, Any]]: | ||
| 191 | payload = self._json_request("GET", f"messages/{session_id}/load", params={"limit": limit}) | ||
| 192 | if isinstance(payload, list): | ||
| 193 | return payload | ||
| 194 | return [] | ||
| 195 | |||
| 196 | def knowledge_search( | ||
| 197 | self, | ||
| 198 | query: str, | ||
| 199 | *, | ||
| 200 | knowledge_ids: list[str] | None = None, | ||
| 201 | knowledge_base_ids: list[str] | None = None, | ||
| 202 | ) -> list[dict[str, Any]]: | ||
| 203 | payload: dict[str, Any] = {"query": query} | ||
| 204 | if knowledge_ids: | ||
| 205 | payload["knowledge_ids"] = knowledge_ids | ||
| 206 | else: | ||
| 207 | self._ensure_knowledge_base_id() | ||
| 208 | payload["knowledge_base_ids"] = knowledge_base_ids or [self.knowledge_base_id] | ||
| 209 | data = self._json_request("POST", "knowledge-search", json=payload) | ||
| 210 | return data if isinstance(data, list) else [] | ||
| 211 | |||
| 212 | def _paginate(self, path: str, *, page_size: int = 100) -> list[dict[str, Any]]: | ||
| 213 | page = 1 | ||
| 214 | rows: list[dict[str, Any]] = [] | ||
| 215 | while True: | ||
| 216 | envelope = self._request("GET", path, params={"page": page, "page_size": page_size}) | ||
| 217 | payload = self._decode_envelope(envelope) | ||
| 218 | if not isinstance(payload, list): | ||
| 219 | raise WeKnoraApiError(f"Expected list response for {path}, got {type(payload).__name__}") | ||
| 220 | rows.extend(payload) | ||
| 221 | |||
| 222 | total = int(envelope.get("total") or len(rows)) | ||
| 223 | if len(rows) >= total or not payload: | ||
| 224 | return rows | ||
| 225 | page += 1 | ||
| 226 | |||
| 227 | def _json_request(self, method: str, path: str, **kwargs: Any) -> Any: | ||
| 228 | envelope = self._request(method, path, **kwargs) | ||
| 229 | return self._decode_envelope(envelope) | ||
| 230 | |||
| 231 | def _request(self, method: str, path: str, **kwargs: Any) -> dict[str, Any]: | ||
| 232 | url = self._url(path) | ||
| 233 | last_error: Exception | None = None | ||
| 234 | for attempt in range(1, self.max_retries + 1): | ||
| 235 | try: | ||
| 236 | response = self.session.request( | ||
| 237 | method, | ||
| 238 | url, | ||
| 239 | timeout=self.timeout_seconds, | ||
| 240 | **kwargs, | ||
| 241 | ) | ||
| 242 | if response.status_code in {429, 500, 502, 503, 504} and attempt < self.max_retries: | ||
| 243 | time.sleep(attempt) | ||
| 244 | continue | ||
| 245 | if response.status_code >= 400: | ||
| 246 | self._log_error(method, url, response) | ||
| 247 | raise WeKnoraApiError(f"{method} {url} failed with HTTP {response.status_code}") | ||
| 248 | time.sleep(self.request_interval_seconds) | ||
| 249 | return response.json() | ||
| 250 | except (requests.RequestException, ValueError) as exc:  # WeKnoraApiError from the 4xx branch above is not retried | ||
| 251 | last_error = exc | ||
| 252 | if attempt >= self.max_retries: | ||
| 253 | break | ||
| 254 | time.sleep(attempt) | ||
| 255 | |||
| 256 | raise WeKnoraApiError(f"{method} {url} failed: {last_error}") from last_error | ||
| 257 | |||
| 258 | def _decode_envelope(self, envelope: dict[str, Any]) -> Any: | ||
| 259 | if envelope.get("success") is False: | ||
| 260 | raise WeKnoraApiError(str(envelope)) | ||
| 261 | return envelope.get("data", envelope) | ||
| 262 | |||
| 263 | def _url(self, path: str) -> str: | ||
| 264 | return urljoin(self.base_url, path.lstrip("/")) | ||
| 265 | |||
| 266 | def _ensure_knowledge_base_id(self) -> None: | ||
| 267 | if not self.knowledge_base_id: | ||
| 268 | raise WeKnoraApiError("Missing knowledge_base_id. Run scripts/00_create_kb.py first.") | ||
| 269 | |||
| 270 | def _log_error(self, method: str, url: str, response: requests.Response) -> None: | ||
| 271 | body = response.text[:5000] | ||
| 272 | append_jsonl( | ||
| 273 | self.error_log_path, | ||
| 274 | { | ||
| 275 | "method": method, | ||
| 276 | "url": url, | ||
| 277 | "status_code": response.status_code, | ||
| 278 | "response_body": body, | ||
| 279 | }, | ||
| 280 | ) | ||
| 281 | |||
| 282 | |||
| 283 | def client_from_config(config: dict[str, Any]) -> WeKnoraClient: | ||
| 284 | weknora = config["weknora"] | ||
| 285 | return WeKnoraClient( | ||
| 286 | base_url=require_config(config, "weknora.base_url"), | ||
| 287 | api_key=require_config(config, "weknora.api_key"), | ||
| 288 | knowledge_base_id=require_config(config, "weknora.knowledge_base_id"), | ||
| 289 | timeout_seconds=int(weknora.get("timeout_seconds", 300)), | ||
| 290 | request_interval_seconds=float(weknora.get("request_interval_seconds", 0.2)), | ||
| 291 | ) | ||
| 292 | |||
| 293 | |||
| 294 | def bootstrap_client_from_config(config: dict[str, Any]) -> WeKnoraClient: | ||
| 295 | weknora = config["weknora"] | ||
| 296 | return WeKnoraClient( | ||
| 297 | base_url=require_config(config, "weknora.base_url"), | ||
| 298 | api_key=require_config(config, "weknora.api_key"), | ||
| 299 | knowledge_base_id=str(weknora.get("knowledge_base_id") or ""), | ||
| 300 | timeout_seconds=int(weknora.get("timeout_seconds", 300)), | ||
| 301 | request_interval_seconds=float(weknora.get("request_interval_seconds", 0.2)), | ||
| 302 | ) |
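Together these methods cover the whole pipeline surface: knowledge base setup, upload, ingestion polling, chunk export, and SSE chat. A short usage sketch, assuming a populated `.env`/`configs/eval.yaml` and a reachable WeKnora instance:

```python
# Sketch: drive a single ad-hoc question through WeKnoraClient.
from weknora_eval.api import client_from_config
from weknora_eval.config import load_config

client = client_from_config(load_config())
session = client.create_session(title="api-smoke-test")
result = client.knowledge_chat_sse(
    session_id=session["id"],
    query="What does this knowledge base cover?",
)
print(result["request_id"], len(result["retrieved_contexts"]))
print(result["response"][:200])
```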
src/weknora_eval/config.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import os | ||
| 4 | import re | ||
| 5 | from pathlib import Path | ||
| 6 | from typing import Any | ||
| 7 | |||
| 8 | import yaml | ||
| 9 | from dotenv import load_dotenv | ||
| 10 | |||
| 11 | |||
| 12 | _ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}") | ||
| 13 | |||
| 14 | |||
| 15 | def _expand_env(value: Any) -> Any: | ||
| 16 | if isinstance(value, dict): | ||
| 17 | return {key: _expand_env(item) for key, item in value.items()} | ||
| 18 | if isinstance(value, list): | ||
| 19 | return [_expand_env(item) for item in value] | ||
| 20 | if not isinstance(value, str): | ||
| 21 | return value | ||
| 22 | |||
| 23 | def replace(match: re.Match[str]) -> str: | ||
| 24 | default = match.group(2) if match.group(2) is not None else "" | ||
| 25 | return os.getenv(match.group(1), default) | ||
| 26 | |||
| 27 | expanded = _ENV_PATTERN.sub(replace, value) | ||
| 28 | return _coerce_scalar(expanded) | ||
| 29 | |||
| 30 | |||
| 31 | def _coerce_scalar(value: str) -> Any: | ||
| 32 | lowered = value.lower() | ||
| 33 | if lowered in {"true", "false"}: | ||
| 34 | return lowered == "true" | ||
| 35 | if lowered in {"none", "null"}: | ||
| 36 | return None | ||
| 37 | try: | ||
| 38 | if "." not in value: | ||
| 39 | return int(value) | ||
| 40 | return float(value) | ||
| 41 | except ValueError: | ||
| 42 | return value | ||
| 43 | |||
| 44 | |||
| 45 | def load_config(path: str | Path = "configs/eval.yaml") -> dict[str, Any]: | ||
| 46 | load_dotenv() | ||
| 47 | config_path = Path(path) | ||
| 48 | with config_path.open("r", encoding="utf-8") as file: | ||
| 49 | raw = yaml.safe_load(file) or {} | ||
| 50 | return _expand_env(raw) | ||
| 51 | |||
| 52 | |||
| 53 | def require_config(config: dict[str, Any], dotted_key: str) -> Any: | ||
| 54 | current: Any = config | ||
| 55 | for part in dotted_key.split("."): | ||
| 56 | if not isinstance(current, dict) or part not in current: | ||
| 57 | raise ValueError(f"Missing required config value: {dotted_key}") | ||
| 58 | value = current[part] | ||
| 59 | if value is None or value == "": | ||
| 60 | raise ValueError(f"Missing required config value: {dotted_key}") | ||
| 61 | current = value | ||
| 62 | return current | ||
| 63 | |||
| 64 | |||
| 65 | def project_path(*parts: str) -> Path: | ||
| 66 | return Path.cwd().joinpath(*parts) |
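`_expand_env` resolves `${VAR}` and `${VAR:-default}` references and then coerces whole-string scalars, so YAML values arrive as the right Python types. A small illustration using the internal helper directly (the env var names are arbitrary):

```python
# Sketch: ${VAR:-default} expansion plus scalar coercion.
import os

from weknora_eval.config import _expand_env

os.environ["DEMO_TIMEOUT"] = "300"
raw = {
    "timeout": "${DEMO_TIMEOUT:-60}",     # set   -> 300 (int)
    "interval": "${DEMO_INTERVAL:-0.2}",  # unset -> 0.2 (float)
    "debug": "${DEMO_DEBUG:-false}",      # unset -> False (bool)
}
print(_expand_env(raw))  # {'timeout': 300, 'interval': 0.2, 'debug': False}
```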
src/weknora_eval/envfile.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from pathlib import Path | ||
| 4 | |||
| 5 | |||
| 6 | def set_env_value(path: str | Path, key: str, value: str) -> None: | ||
| 7 | target = Path(path) | ||
| 8 | lines = target.read_text(encoding="utf-8").splitlines() if target.exists() else [] | ||
| 9 | prefix = f"{key}=" | ||
| 10 | replacement = f"{key}={value}" | ||
| 11 | updated = False | ||
| 12 | output: list[str] = [] | ||
| 13 | |||
| 14 | for line in lines: | ||
| 15 | if line.startswith(prefix): | ||
| 16 | output.append(replacement) | ||
| 17 | updated = True | ||
| 18 | else: | ||
| 19 | output.append(line) | ||
| 20 | |||
| 21 | if not updated: | ||
| 22 | output.append(replacement) | ||
| 23 | |||
| 24 | target.write_text("\n".join(output) + "\n", encoding="utf-8") |
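This helper is what lets bootstrap scripts persist server-generated ids back into `.env` (for example the knowledge base id created by `scripts/00_create_kb.py`). Usage sketch:

```python
# Sketch: write a freshly created knowledge base id into .env.
from weknora_eval.envfile import set_env_value

set_env_value(".env", "WEKNORA_KB_ID", "kb-1234")
# A second call rewrites the existing line instead of appending a duplicate.
set_env_value(".env", "WEKNORA_KB_ID", "kb-5678")
```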
src/weknora_eval/loaders.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import json | ||
| 4 | import logging | ||
| 5 | from collections.abc import Iterable | ||
| 6 | from pathlib import Path | ||
| 7 | from typing import Any | ||
| 8 | |||
| 9 | |||
| 10 | def setup_logging(level: int = logging.INFO) -> None: | ||
| 11 | logging.basicConfig( | ||
| 12 | level=level, | ||
| 13 | format="%(asctime)s %(levelname)s %(name)s: %(message)s", | ||
| 14 | ) | ||
| 15 | |||
| 16 | |||
| 17 | def ensure_parent(path: str | Path) -> Path: | ||
| 18 | target = Path(path) | ||
| 19 | target.parent.mkdir(parents=True, exist_ok=True) | ||
| 20 | return target | ||
| 21 | |||
| 22 | |||
| 23 | def read_jsonl(path: str | Path, *, missing_ok: bool = False) -> list[dict[str, Any]]: | ||
| 24 | target = Path(path) | ||
| 25 | if not target.exists(): | ||
| 26 | if missing_ok: | ||
| 27 | return [] | ||
| 28 | raise FileNotFoundError(target) | ||
| 29 | |||
| 30 | rows: list[dict[str, Any]] = [] | ||
| 31 | with target.open("r", encoding="utf-8") as file: | ||
| 32 | for line_no, line in enumerate(file, start=1): | ||
| 33 | stripped = line.strip() | ||
| 34 | if not stripped: | ||
| 35 | continue | ||
| 36 | try: | ||
| 37 | rows.append(json.loads(stripped)) | ||
| 38 | except json.JSONDecodeError as exc: | ||
| 39 | raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc | ||
| 40 | return rows | ||
| 41 | |||
| 42 | |||
| 43 | def iter_jsonl(path: str | Path, *, missing_ok: bool = False) -> Iterable[dict[str, Any]]: | ||
| 44 | target = Path(path) | ||
| 45 | if not target.exists(): | ||
| 46 | if missing_ok: | ||
| 47 | return | ||
| 48 | raise FileNotFoundError(target) | ||
| 49 | |||
| 50 | with target.open("r", encoding="utf-8") as file: | ||
| 51 | for line_no, line in enumerate(file, start=1): | ||
| 52 | stripped = line.strip() | ||
| 53 | if not stripped: | ||
| 54 | continue | ||
| 55 | try: | ||
| 56 | yield json.loads(stripped) | ||
| 57 | except json.JSONDecodeError as exc: | ||
| 58 | raise ValueError(f"Invalid JSONL at {target}:{line_no}: {exc}") from exc | ||
| 59 | |||
| 60 | |||
| 61 | def write_jsonl(path: str | Path, rows: Iterable[dict[str, Any]]) -> int: | ||
| 62 | target = ensure_parent(path) | ||
| 63 | count = 0 | ||
| 64 | with target.open("w", encoding="utf-8") as file: | ||
| 65 | for row in rows: | ||
| 66 | file.write(json.dumps(row, ensure_ascii=False) + "\n") | ||
| 67 | count += 1 | ||
| 68 | return count | ||
| 69 | |||
| 70 | |||
| 71 | def append_jsonl(path: str | Path, row: dict[str, Any]) -> None: | ||
| 72 | target = ensure_parent(path) | ||
| 73 | with target.open("a", encoding="utf-8") as file: | ||
| 74 | file.write(json.dumps(row, ensure_ascii=False) + "\n") | ||
| 75 | |||
| 76 | |||
| 77 | def write_json(path: str | Path, payload: dict[str, Any]) -> None: | ||
| 78 | target = ensure_parent(path) | ||
| 79 | with target.open("w", encoding="utf-8") as file: | ||
| 80 | json.dump(payload, file, ensure_ascii=False, indent=2) | ||
| 81 | file.write("\n") | ||
| 82 | |||
| 83 | |||
| 84 | def compact_text(value: Any) -> str: | ||
| 85 | text = "" if value is None else str(value) | ||
| 86 | return "\n".join(line.strip() for line in text.splitlines() if line.strip()).strip() |
src/weknora_eval/parsers/__init__.py
0 → 100644
| 1 | """Document parser adapters.""" |
src/weknora_eval/parsers/local.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import statistics | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | from openpyxl import load_workbook | ||
| 8 | |||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | ||
| 10 | from weknora_eval.schemas import ParsedDocument | ||
| 11 | |||
| 12 | |||
| 13 | def parse_raw_docs(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]: | ||
| 14 | parsing = config["parsing"] | ||
| 15 | local_config = parsing.get("local", {}) | ||
| 16 | min_chars = int(local_config.get("min_chars", 80)) | ||
| 17 | pdf_backend = local_config.get("pdf_backend", "pypdf") | ||
| 18 | xlsx_mode = local_config.get("xlsx_mode", "row_text") | ||
| 19 | |||
| 20 | docs: list[ParsedDocument] = [] | ||
| 21 | failures: list[dict[str, Any]] = [] | ||
| 22 | |||
| 23 | for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): | ||
| 24 | try: | ||
| 25 | docs.extend(parse_pdf(pdf_path, backend=pdf_backend, min_chars=min_chars)) | ||
| 26 | except Exception as exc: # noqa: BLE001 - parser failures must be persisted. | ||
| 27 | failures.append( | ||
| 28 | { | ||
| 29 | "source_file": pdf_path.name, | ||
| 30 | "parser": f"local:{pdf_backend}", | ||
| 31 | "status": "failed", | ||
| 32 | "error": str(exc), | ||
| 33 | "fallback_used": None, | ||
| 34 | } | ||
| 35 | ) | ||
| 36 | |||
| 37 | for xlsx_path in sorted(Path("data/raw_docs/xlsx").glob("*.xlsx")): | ||
| 38 | try: | ||
| 39 | docs.extend(parse_xlsx(xlsx_path, mode=xlsx_mode, min_chars=min_chars)) | ||
| 40 | except Exception as exc: # noqa: BLE001 | ||
| 41 | failures.append( | ||
| 42 | { | ||
| 43 | "source_file": xlsx_path.name, | ||
| 44 | "parser": "local:openpyxl", | ||
| 45 | "status": "failed", | ||
| 46 | "error": str(exc), | ||
| 47 | "fallback_used": None, | ||
| 48 | } | ||
| 49 | ) | ||
| 50 | |||
| 51 | rows = [doc.to_dict() for doc in docs] | ||
| 52 | write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows) | ||
| 53 | if failures: | ||
| 54 | write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures) | ||
| 55 | |||
| 56 | summary = build_parse_summary(rows, failures, parser=f"local:{pdf_backend}") | ||
| 57 | write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary) | ||
| 58 | return rows, summary | ||
| 59 | |||
| 60 | |||
| 61 | def parse_pdf(path: str | Path, *, backend: str = "pypdf", min_chars: int = 80) -> list[ParsedDocument]: | ||
| 62 | target = Path(path) | ||
| 63 | backend = backend.lower() | ||
| 64 | if backend == "pymupdf": | ||
| 65 | return _parse_pdf_pymupdf(target, min_chars=min_chars) | ||
| 66 | if backend == "pdfplumber": | ||
| 67 | return _parse_pdf_pdfplumber(target, min_chars=min_chars) | ||
| 68 | if backend == "pypdf": | ||
| 69 | return _parse_pdf_pypdf(target, min_chars=min_chars) | ||
| 70 | raise ValueError(f"Unsupported PDF backend: {backend}") | ||
| 71 | |||
| 72 | |||
| 73 | def _parse_pdf_pypdf(path: Path, *, min_chars: int) -> list[ParsedDocument]: | ||
| 74 | from pypdf import PdfReader | ||
| 75 | |||
| 76 | reader = PdfReader(str(path)) | ||
| 77 | docs: list[ParsedDocument] = [] | ||
| 78 | for index, page in enumerate(reader.pages, start=1): | ||
| 79 | content = compact_text(page.extract_text() or "") | ||
| 80 | if len(content) < min_chars: | ||
| 81 | continue | ||
| 82 | docs.append(_pdf_doc(path, index, content, "local:pypdf")) | ||
| 83 | return docs | ||
| 84 | |||
| 85 | |||
| 86 | def _parse_pdf_pymupdf(path: Path, *, min_chars: int) -> list[ParsedDocument]: | ||
| 87 | try: | ||
| 88 | import fitz | ||
| 89 | except ImportError as exc: | ||
| 90 | raise ImportError("pymupdf backend requires `pip install -e '.[pdf]'`") from exc | ||
| 91 | |||
| 92 | docs: list[ParsedDocument] = [] | ||
| 93 | with fitz.open(path) as document: | ||
| 94 | for index, page in enumerate(document, start=1): | ||
| 95 | content = compact_text(page.get_text("text")) | ||
| 96 | if len(content) < min_chars: | ||
| 97 | continue | ||
| 98 | docs.append(_pdf_doc(path, index, content, "local:pymupdf")) | ||
| 99 | return docs | ||
| 100 | |||
| 101 | |||
| 102 | def _parse_pdf_pdfplumber(path: Path, *, min_chars: int) -> list[ParsedDocument]: | ||
| 103 | try: | ||
| 104 | import pdfplumber | ||
| 105 | except ImportError as exc: | ||
| 106 | raise ImportError("pdfplumber backend requires `pip install -e '.[pdf]'`") from exc | ||
| 107 | |||
| 108 | docs: list[ParsedDocument] = [] | ||
| 109 | with pdfplumber.open(path) as pdf: | ||
| 110 | for index, page in enumerate(pdf.pages, start=1): | ||
| 111 | content = compact_text(page.extract_text() or "") | ||
| 112 | if len(content) < min_chars: | ||
| 113 | continue | ||
| 114 | docs.append(_pdf_doc(path, index, content, "local:pdfplumber")) | ||
| 115 | return docs | ||
| 116 | |||
| 117 | |||
| 118 | def _pdf_doc(path: Path, page: int, content: str, parser: str) -> ParsedDocument: | ||
| 119 | return ParsedDocument( | ||
| 120 | doc_id=f"{path.name}::page-{page}", | ||
| 121 | source_file=path.name, | ||
| 122 | file_type="pdf", | ||
| 123 | page=page, | ||
| 124 | content=content, | ||
| 125 | metadata={"parser": parser}, | ||
| 126 | ) | ||
| 127 | |||
| 128 | |||
| 129 | def parse_xlsx(path: str | Path, *, mode: str = "row_text", min_chars: int = 80) -> list[ParsedDocument]: | ||
| 130 | target = Path(path) | ||
| 131 | mode = mode.lower() | ||
| 132 | workbook = load_workbook(target, data_only=True, read_only=True) | ||
| 133 | if mode == "row_text": | ||
| 134 | return _parse_xlsx_row_text(target, workbook, min_chars=min_chars) | ||
| 135 | if mode == "markdown_table": | ||
| 136 | return _parse_xlsx_markdown_table(target, workbook, min_chars=min_chars) | ||
| 137 | raise ValueError(f"Unsupported XLSX mode: {mode}") | ||
| 138 | |||
| 139 | |||
| 140 | def _parse_xlsx_row_text(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]: | ||
| 141 | docs: list[ParsedDocument] = [] | ||
| 142 | for sheet in workbook.worksheets: | ||
| 143 | rows = list(sheet.iter_rows(values_only=True)) | ||
| 144 | if not rows: | ||
| 145 | continue | ||
| 146 | headers = [_cell_to_text(value) or f"col_{index}" for index, value in enumerate(rows[0], start=1)] | ||
| 147 | for row_index, row in enumerate(rows[1:], start=2): | ||
| 148 | pairs = [] | ||
| 149 | for header, value in zip(headers, row, strict=False): | ||
| 150 | cell = _cell_to_text(value) | ||
| 151 | if cell: | ||
| 152 | pairs.append(f"{header}: {cell}") | ||
| 153 | content = "\n".join(pairs).strip() | ||
| 154 | if len(content) < min_chars: | ||
| 155 | continue | ||
| 156 | docs.append( | ||
| 157 | ParsedDocument( | ||
| 158 | doc_id=f"{path.name}::{sheet.title}::row-{row_index}", | ||
| 159 | source_file=path.name, | ||
| 160 | file_type="xlsx", | ||
| 161 | sheet=sheet.title, | ||
| 162 | row_index=row_index, | ||
| 163 | content=content, | ||
| 164 | metadata={"parser": "local:openpyxl", "columns": headers}, | ||
| 165 | ) | ||
| 166 | ) | ||
| 167 | return docs | ||
| 168 | |||
| 169 | |||
| 170 | def _parse_xlsx_markdown_table(path: Path, workbook: Any, *, min_chars: int) -> list[ParsedDocument]: | ||
| 171 | docs: list[ParsedDocument] = [] | ||
| 172 | for sheet in workbook.worksheets: | ||
| 173 | rows = [ | ||
| 174 | [_cell_to_text(value) for value in row] | ||
| 175 | for row in sheet.iter_rows(values_only=True) | ||
| 176 | if any(value is not None for value in row) | ||
| 177 | ] | ||
| 178 | if not rows: | ||
| 179 | continue | ||
| 180 | width = max(len(row) for row in rows) | ||
| 181 | normalized = [row + [""] * (width - len(row)) for row in rows] | ||
| 182 | header = normalized[0] | ||
| 183 | separator = ["---"] * width | ||
| 184 | body = normalized[1:] | ||
| 185 | lines = [ | ||
| 186 | "| " + " | ".join(header) + " |", | ||
| 187 | "| " + " | ".join(separator) + " |", | ||
| 188 | ] | ||
| 189 | lines.extend("| " + " | ".join(row) + " |" for row in body) | ||
| 190 | content = "\n".join(lines) | ||
| 191 | if len(content) < min_chars: | ||
| 192 | continue | ||
| 193 | docs.append( | ||
| 194 | ParsedDocument( | ||
| 195 | doc_id=f"{path.name}::{sheet.title}", | ||
| 196 | source_file=path.name, | ||
| 197 | file_type="xlsx", | ||
| 198 | sheet=sheet.title, | ||
| 199 | content=content, | ||
| 200 | metadata={"parser": "local:openpyxl", "mode": "markdown_table"}, | ||
| 201 | ) | ||
| 202 | ) | ||
| 203 | return docs | ||
| 204 | |||
| 205 | |||
| 206 | def _cell_to_text(value: Any) -> str: | ||
| 207 | if value is None: | ||
| 208 | return "" | ||
| 209 | text = str(value).strip() | ||
| 210 | return text.replace("\n", " ") | ||
| 211 | |||
| 212 | |||
| 213 | def build_parse_summary( | ||
| 214 | rows: list[dict[str, Any]], | ||
| 215 | failures: list[dict[str, Any]], | ||
| 216 | *, | ||
| 217 | parser: str, | ||
| 218 | ) -> dict[str, Any]: | ||
| 219 | source_files = {row.get("source_file") for row in rows if row.get("source_file")} | ||
| 220 | failed_files = {row.get("source_file") for row in failures if row.get("source_file")} | ||
| 221 | lengths = [len(row.get("content") or "") for row in rows] | ||
| 222 | return { | ||
| 223 | "total_files": len(source_files | failed_files), | ||
| 224 | "parsed_files": len(source_files), | ||
| 225 | "failed_files": len(failed_files), | ||
| 226 | "total_documents": len(rows), | ||
| 227 | "empty_documents": sum(1 for length in lengths if length == 0), | ||
| 228 | "avg_chars": round(statistics.mean(lengths), 2) if lengths else 0, | ||
| 229 | "parser": parser, | ||
| 230 | } |
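The parsers can also be exercised outside the pipeline, which is useful when tuning `min_chars` or comparing backends. A sketch, assuming a PDF dropped into `data/raw_docs/pdf/` (the filename is a placeholder):

```python
# Sketch: run the local PDF parser on one file and inspect the output.
from weknora_eval.parsers.local import parse_pdf

docs = parse_pdf("data/raw_docs/pdf/sample.pdf", backend="pypdf", min_chars=80)
for doc in docs[:3]:
    print(doc.doc_id, doc.page, len(doc.content))
```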
src/weknora_eval/parsers/mineru.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import subprocess | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import requests | ||
| 8 | |||
| 9 | from weknora_eval.loaders import compact_text, write_json, write_jsonl | ||
| 10 | from weknora_eval.parsers.local import build_parse_summary, parse_pdf | ||
| 11 | from weknora_eval.schemas import ParsedDocument | ||
| 12 | |||
| 13 | |||
| 14 | class MinerUParseError(RuntimeError): | ||
| 15 | pass | ||
| 16 | |||
| 17 | |||
| 18 | def parse_with_mineru(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]: | ||
| 19 | parsing = config["parsing"] | ||
| 20 | mineru = parsing.get("mineru", {}) | ||
| 21 | mode = mineru.get("mode", "cli") | ||
| 22 | fallback = bool(mineru.get("fallback_to_local", True)) | ||
| 23 | local_config = parsing.get("local", {}) | ||
| 24 | min_chars = int(local_config.get("min_chars", 80)) | ||
| 25 | |||
| 26 | docs: list[ParsedDocument] = [] | ||
| 27 | failures: list[dict[str, Any]] = [] | ||
| 28 | |||
| 29 | for pdf_path in sorted(Path("data/raw_docs/pdf").glob("*.pdf")): | ||
| 30 | parser_name = f"mineru:{mode}" | ||
| 31 | try: | ||
| 32 | if mode == "cli": | ||
| 33 | docs.extend(parse_pdf_with_cli(pdf_path, mineru, min_chars=min_chars)) | ||
| 34 | elif mode == "http": | ||
| 35 | docs.extend(parse_pdf_with_http(pdf_path, mineru, min_chars=min_chars)) | ||
| 36 | else: | ||
| 37 | raise MinerUParseError(f"Unsupported MinerU mode: {mode}") | ||
| 38 | except Exception as exc: # noqa: BLE001 | ||
| 39 | failure = { | ||
| 40 | "source_file": pdf_path.name, | ||
| 41 | "parser": parser_name, | ||
| 42 | "status": "failed", | ||
| 43 | "error": str(exc), | ||
| 44 | "fallback_used": None, | ||
| 45 | } | ||
| 46 | if fallback: | ||
| 47 | try: | ||
| 48 | backend = local_config.get("pdf_backend", "pypdf") | ||
| 49 | local_docs = parse_pdf(pdf_path, backend=backend, min_chars=min_chars) | ||
| 50 | docs.extend(local_docs) | ||
| 51 | failure["fallback_used"] = f"local:{backend}" | ||
| 52 | except Exception as fallback_exc: # noqa: BLE001 | ||
| 53 | failure["fallback_error"] = str(fallback_exc) | ||
| 54 | failures.append(failure) | ||
| 55 | |||
| 56 | rows = [doc.to_dict() for doc in docs] | ||
| 57 | write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows) | ||
| 58 | if failures: | ||
| 59 | write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), failures) | ||
| 60 | |||
| 61 | summary = build_parse_summary(rows, failures, parser=f"mineru:{mode}") | ||
| 62 | write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary) | ||
| 63 | return rows, summary | ||
| 64 | |||
| 65 | |||
| 66 | def parse_pdf_with_cli( | ||
| 67 | pdf_path: str | Path, | ||
| 68 | mineru_config: dict[str, Any], | ||
| 69 | *, | ||
| 70 | min_chars: int, | ||
| 71 | ) -> list[ParsedDocument]: | ||
| 72 | target = Path(pdf_path) | ||
| 73 | output_root = Path(mineru_config.get("output_dir", "data/parsed_docs/mineru_raw")) | ||
| 74 | output_dir = output_root / target.stem | ||
| 75 | output_dir.mkdir(parents=True, exist_ok=True) | ||
| 76 | cli_bin = mineru_config.get("cli_bin", "mineru") | ||
| 77 | timeout = int(mineru_config.get("timeout_seconds", 600)) | ||
| 78 | |||
| 79 | # MinerU CLI arguments vary by release. This common invocation is isolated | ||
| 80 | # here so deployments can replace it without touching pipeline scripts. | ||
| 81 | result = subprocess.run( | ||
| 82 | [cli_bin, "-p", str(target), "-o", str(output_dir)], | ||
| 83 | check=False, | ||
| 84 | capture_output=True, | ||
| 85 | text=True, | ||
| 86 | timeout=timeout, | ||
| 87 | ) | ||
| 88 | if result.returncode != 0: | ||
| 89 | raise MinerUParseError(result.stderr.strip() or result.stdout.strip() or "MinerU CLI failed") | ||
| 90 | |||
| 91 | markdown_files = sorted(output_dir.rglob("*.md")) | ||
| 92 | if not markdown_files: | ||
| 93 | raise MinerUParseError(f"No Markdown output found in {output_dir}") | ||
| 94 | |||
| 95 | docs: list[ParsedDocument] = [] | ||
| 96 | for index, markdown_path in enumerate(markdown_files, start=1): | ||
| 97 | content = compact_text(markdown_path.read_text(encoding="utf-8")) | ||
| 98 | if len(content) < min_chars: | ||
| 99 | continue | ||
| 100 | docs.append( | ||
| 101 | ParsedDocument( | ||
| 102 | doc_id=f"{target.name}::mineru-{index}", | ||
| 103 | source_file=target.name, | ||
| 104 | file_type="pdf", | ||
| 105 | content=content, | ||
| 106 | metadata={ | ||
| 107 | "parser": "mineru:cli", | ||
| 108 | "mineru_output": str(markdown_path), | ||
| 109 | }, | ||
| 110 | ) | ||
| 111 | ) | ||
| 112 | return docs | ||
| 113 | |||
| 114 | |||
| 115 | def parse_pdf_with_http( | ||
| 116 | pdf_path: str | Path, | ||
| 117 | mineru_config: dict[str, Any], | ||
| 118 | *, | ||
| 119 | min_chars: int, | ||
| 120 | ) -> list[ParsedDocument]: | ||
| 121 | target = Path(pdf_path) | ||
| 122 | base_url = str(mineru_config.get("http_base_url") or "").rstrip("/") | ||
| 123 | if not base_url: | ||
| 124 | raise MinerUParseError("MinerU HTTP mode requires parsing.mineru.http_base_url") | ||
| 125 | |||
| 126 | headers = {} | ||
| 127 | if mineru_config.get("api_key"): | ||
| 128 | headers["Authorization"] = f"Bearer {mineru_config['api_key']}" | ||
| 129 | |||
| 130 | # The checklist does not define a universal MinerU HTTP contract. This | ||
| 131 | # implementation expects a replaceable service exposing POST /parse and | ||
| 132 | # returning {"markdown": "..."} or {"documents": [{"content": "..."}]}. | ||
| 133 | with target.open("rb") as file: | ||
| 134 | response = requests.post( | ||
| 135 | f"{base_url}/parse", | ||
| 136 | files={"file": (target.name, file, "application/pdf")}, | ||
| 137 | headers=headers, | ||
| 138 | timeout=int(mineru_config.get("timeout_seconds", 600)), | ||
| 139 | ) | ||
| 140 | if response.status_code >= 400: | ||
| 141 | raise MinerUParseError(f"MinerU HTTP failed with {response.status_code}: {response.text[:500]}") | ||
| 142 | |||
| 143 | payload = response.json() | ||
| 144 | contents: list[str] = [] | ||
| 145 | if isinstance(payload.get("documents"), list): | ||
| 146 | contents = [compact_text(item.get("content")) for item in payload["documents"]] | ||
| 147 | elif payload.get("markdown"): | ||
| 148 | contents = [compact_text(payload["markdown"])] | ||
| 149 | else: | ||
| 150 | raise MinerUParseError("MinerU HTTP response must include `markdown` or `documents`") | ||
| 151 | |||
| 152 | docs: list[ParsedDocument] = [] | ||
| 153 | for index, content in enumerate(contents, start=1): | ||
| 154 | if len(content) < min_chars: | ||
| 155 | continue | ||
| 156 | docs.append( | ||
| 157 | ParsedDocument( | ||
| 158 | doc_id=f"{target.name}::mineru-http-{index}", | ||
| 159 | source_file=target.name, | ||
| 160 | file_type="pdf", | ||
| 161 | content=content, | ||
| 162 | metadata={"parser": "mineru:http"}, | ||
| 163 | ) | ||
| 164 | ) | ||
| 165 | return docs |
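As the in-code comments note, there is no universal MinerU HTTP contract; the adapter expects a replaceable service. A sketch of calling it directly, with a placeholder endpoint and the assumed contract spelled out:

```python
# Sketch: invoke the HTTP adapter against an assumed-compatible service.
# The service must expose POST /parse and return either
# {"markdown": "..."} or {"documents": [{"content": "..."}]}.
from weknora_eval.parsers.mineru import parse_pdf_with_http

mineru_config = {
    "http_base_url": "http://localhost:8888",  # placeholder endpoint
    "timeout_seconds": 600,
}
docs = parse_pdf_with_http("data/raw_docs/pdf/sample.pdf", mineru_config, min_chars=80)
print(len(docs))
```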
src/weknora_eval/ragas_runner.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import os | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import pandas as pd | ||
| 8 | |||
| 9 | from weknora_eval.config import require_config | ||
| 10 | from weknora_eval.loaders import read_jsonl | ||
| 11 | |||
| 12 | |||
| 13 | def run_ragas_eval( | ||
| 14 | config: dict[str, Any], | ||
| 15 | *, | ||
| 16 | input_path: str = "data/runs/ragas_input.jsonl", | ||
| 17 | output_csv_path: str = "data/reports/ragas_scores.csv", | ||
| 18 | ) -> pd.DataFrame: | ||
| 19 | from datasets import Dataset | ||
| 20 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings | ||
| 21 | from ragas import evaluate | ||
| 22 | from ragas.run_config import RunConfig | ||
| 23 | |||
| 24 | ragas_config = config["ragas"] | ||
| 25 | llm_api_key = _first_non_empty(ragas_config, "llm_api_key", "api_key") | ||
| 26 | llm_base_url = _first_non_empty(ragas_config, "llm_base_url", "base_url") | ||
| 27 | embedding_api_key = _first_non_empty(ragas_config, "embedding_api_key", "api_key") | ||
| 28 | embedding_base_url = _first_non_empty(ragas_config, "embedding_base_url", "base_url") | ||
| 29 | judge_model = str(require_config(config, "ragas.judge_model")) | ||
| 30 | embedding_model = str(require_config(config, "ragas.embedding_model")) | ||
| 31 | temperature = float(ragas_config.get("temperature", 0)) | ||
| 32 | max_tokens = int(ragas_config.get("max_tokens", 4096)) | ||
| 33 | timeout_seconds = int(ragas_config.get("timeout_seconds", 600)) | ||
| 34 | max_workers = int(ragas_config.get("max_workers", 1)) | ||
| 35 | |||
| 36 | os.environ["OPENAI_API_KEY"] = llm_api_key | ||
| 37 | if llm_base_url: | ||
| 38 | os.environ["OPENAI_BASE_URL"] = llm_base_url | ||
| 39 | |||
| 40 | rows = read_jsonl(input_path) | ||
| 41 | dataset = Dataset.from_list( | ||
| 42 | [ | ||
| 43 | { | ||
| 44 | "user_input": row["user_input"], | ||
| 45 | "response": row["response"], | ||
| 46 | "retrieved_contexts": row["retrieved_contexts"], | ||
| 47 | "reference": row["reference"], | ||
| 48 | "reference_contexts": row.get("reference_contexts") or [], | ||
| 49 | } | ||
| 50 | for row in rows | ||
| 51 | ] | ||
| 52 | ) | ||
| 53 | |||
| 54 | metric_map = _metric_map() | ||
| 55 | selected_metrics = [ | ||
| 56 | metric_map[name] | ||
| 57 | for name in ragas_config.get("metrics", metric_map.keys()) | ||
| 58 | if name in metric_map | ||
| 59 | ] | ||
| 60 | |||
| 61 | llm = ChatOpenAI( | ||
| 62 | model=judge_model, | ||
| 63 | api_key=llm_api_key, | ||
| 64 | base_url=llm_base_url or None, | ||
| 65 | temperature=temperature, | ||
| 66 | max_tokens=max_tokens, | ||
| 67 | ) | ||
| 68 | embeddings = OpenAIEmbeddings( | ||
| 69 | model=embedding_model, | ||
| 70 | api_key=embedding_api_key, | ||
| 71 | base_url=embedding_base_url or None, | ||
| 72 | tiktoken_enabled=False, | ||
| 73 | check_embedding_ctx_length=False, | ||
| 74 | ) | ||
| 75 | ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings) | ||
| 76 | |||
| 77 | run_config = RunConfig(timeout=timeout_seconds, max_workers=max_workers) | ||
| 78 | result = evaluate( | ||
| 79 | dataset, | ||
| 80 | metrics=selected_metrics, | ||
| 81 | llm=ragas_llm, | ||
| 82 | embeddings=ragas_embeddings, | ||
| 83 | run_config=run_config, | ||
| 84 | ) | ||
| 85 | scores = result.to_pandas() | ||
| 86 | for index, row in enumerate(rows): | ||
| 87 | scores.loc[index, "sample_id"] = row.get("sample_id") | ||
| 88 | |||
| 89 | target = Path(output_csv_path) | ||
| 90 | target.parent.mkdir(parents=True, exist_ok=True) | ||
| 91 | scores.to_csv(target, index=False) | ||
| 92 | return scores | ||
| 93 | |||
| 94 | |||
| 95 | def _metric_map() -> dict[str, Any]: | ||
| 96 | try: | ||
| 97 | from ragas.metrics import ( | ||
| 98 | context_precision, | ||
| 99 | context_recall, | ||
| 100 | faithfulness, | ||
| 101 | factual_correctness, | ||
| 102 | response_relevancy, | ||
| 103 | ) | ||
| 104 | |||
| 105 | return { | ||
| 106 | "faithfulness": faithfulness, | ||
| 107 | "response_relevancy": response_relevancy, | ||
| 108 | "context_precision": context_precision, | ||
| 109 | "context_recall": context_recall, | ||
| 110 | "factual_correctness": factual_correctness, | ||
| 111 | } | ||
| 112 | except ImportError: | ||
| 113 | from ragas.metrics import ( | ||
| 114 | Faithfulness, | ||
| 115 | FactualCorrectness, | ||
| 116 | LLMContextPrecisionWithReference, | ||
| 117 | LLMContextRecall, | ||
| 118 | ResponseRelevancy, | ||
| 119 | ) | ||
| 120 | |||
| 121 | return { | ||
| 122 | "faithfulness": Faithfulness(), | ||
| 123 | "response_relevancy": ResponseRelevancy(), | ||
| 124 | "context_precision": LLMContextPrecisionWithReference(), | ||
| 125 | "context_recall": LLMContextRecall(), | ||
| 126 | "factual_correctness": FactualCorrectness(), | ||
| 127 | } | ||
| 128 | |||
| 129 | |||
| 130 | def _first_non_empty(config: dict[str, Any], *keys: str) -> str: | ||
| 131 | for key in keys: | ||
| 132 | value = config.get(key) | ||
| 133 | if value not in {None, ""}: | ||
| 134 | return str(value) | ||
| 135 | raise ValueError(f"Missing required Ragas config value. Checked: {', '.join(keys)}") | ||
| 136 | |||
| 137 | |||
| 138 | def _wrap_langchain_models(llm: Any, embeddings: Any) -> tuple[Any, Any]: | ||
| 139 | try: | ||
| 140 | from ragas.embeddings import LangchainEmbeddingsWrapper | ||
| 141 | from ragas.llms import LangchainLLMWrapper | ||
| 142 | except ImportError: | ||
| 143 | return llm, embeddings | ||
| 144 | |||
| 145 | return LangchainLLMWrapper(llm), LangchainEmbeddingsWrapper(embeddings) |
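The runner is driven entirely by the `ragas` section of the config. A sketch of the minimal dict it needs, with keys mirroring `configs/eval.yaml` and placeholder values (assumes `ragas`, `datasets`, and `langchain-openai` are installed):

```python
# Sketch: minimal config accepted by run_ragas_eval;
# it reads data/runs/ragas_input.jsonl by default.
from weknora_eval.ragas_runner import run_ragas_eval

config = {
    "ragas": {
        "llm_api_key": "replace-me",
        "llm_base_url": "http://localhost:8000/v1",
        "embedding_api_key": "replace-me",
        "embedding_base_url": "http://localhost:7997/v1",
        "judge_model": "gpt-4o-mini",
        "embedding_model": "text-embedding-3-small",
        "metrics": ["faithfulness", "context_recall"],  # subset of _metric_map()
        "max_workers": 1,
    }
}
scores = run_ragas_eval(config)
```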
src/weknora_eval/report.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import math | ||
| 4 | from pathlib import Path | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | import pandas as pd | ||
| 8 | |||
| 9 | from weknora_eval.loaders import read_jsonl | ||
| 10 | |||
| 11 | |||
| 12 | def retrieval_metrics( | ||
| 13 | ragas_rows: list[dict[str, Any]], | ||
| 14 | *, | ||
| 15 | ks: tuple[int, ...] = (1, 3, 5), | ||
| 16 | ) -> dict[str, float]: | ||
| 17 | samples = [row for row in ragas_rows if row.get("gold_chunk_ids")] | ||
| 18 | if not samples: | ||
| 19 | return {} | ||
| 20 | |||
| 21 | totals: dict[str, float] = {f"hit@{k}": 0.0 for k in ks} | ||
| 22 | totals.update({f"recall@{k}": 0.0 for k in ks}) | ||
| 23 | totals["mrr"] = 0.0 | ||
| 24 | totals["ndcg@5"] = 0.0 | ||
| 25 | |||
| 26 | for row in samples: | ||
| 27 | gold = set(row.get("gold_chunk_ids") or []) | ||
| 28 | refs = row.get("weknora_references") or [] | ||
| 29 | predicted = [str(ref.get("id")) for ref in refs if ref.get("id")] | ||
| 30 | for k in ks: | ||
| 31 | top_k = predicted[:k] | ||
| 32 | hits = len(gold.intersection(top_k)) | ||
| 33 | totals[f"hit@{k}"] += 1.0 if hits else 0.0 | ||
| 34 | totals[f"recall@{k}"] += hits / len(gold) | ||
| 35 | |||
| 36 | first_rank = next((idx for idx, chunk_id in enumerate(predicted, start=1) if chunk_id in gold), None) | ||
| 37 | if first_rank: | ||
| 38 | totals["mrr"] += 1 / first_rank | ||
| 39 | |||
| 40 | dcg = 0.0 | ||
| 41 | for idx, chunk_id in enumerate(predicted[:5], start=1): | ||
| 42 | if chunk_id in gold: | ||
| 43 | dcg += 1 / math.log2(idx + 1) | ||
| 44 | ideal_hits = min(len(gold), 5) | ||
| 45 | idcg = sum(1 / math.log2(idx + 1) for idx in range(1, ideal_hits + 1)) | ||
| 46 | totals["ndcg@5"] += dcg / idcg if idcg else 0.0 | ||
| 47 | |||
| 48 | return {key: round(value / len(samples), 4) for key, value in totals.items()} | ||
| 49 | |||
| 50 | |||
| 51 | def generate_summary_report( | ||
| 52 | config: dict[str, Any], | ||
| 53 | *, | ||
| 54 | scores_csv_path: str = "data/reports/ragas_scores.csv", | ||
| 55 | ragas_input_path: str = "data/runs/ragas_input.jsonl", | ||
| 56 | answers_path: str = "data/runs/weknora_answers.jsonl", | ||
| 57 | output_path: str = "data/reports/summary.md", | ||
| 58 | ) -> str: | ||
| 59 | ragas_rows = read_jsonl(ragas_input_path, missing_ok=True) | ||
| 60 | answer_rows = read_jsonl(answers_path, missing_ok=True) | ||
| 61 | scores = pd.read_csv(scores_csv_path) if Path(scores_csv_path).exists() else pd.DataFrame() | ||
| 62 | |||
| 63 | lines = [ | ||
| 64 | "# Ragas 评估报告", | ||
| 65 | "", | ||
| 66 | "## 运行信息", | ||
| 67 | f"- WeKnora Base URL: {config.get('weknora', {}).get('base_url', '')}", | ||
| 68 | f"- 知识库 ID: {config.get('weknora', {}).get('knowledge_base_id', '')}", | ||
| 69 | f"- 测试集规模: {len(ragas_rows)}", | ||
| 70 | f"- 审核通过样本数: {len(ragas_rows)}", | ||
| 71 | f"- 失败样本数: {sum(1 for row in answer_rows if row.get('error'))}", | ||
| 72 | f"- Judge 模型: {config.get('ragas', {}).get('judge_model', '')}", | ||
| 73 | "", | ||
| 74 | "## 聚合指标", | ||
| 75 | "| 指标 | 平均值 | P50 | 失败阈值 |", | ||
| 76 | "| --- | --- | --- | --- |", | ||
| 77 | ] | ||
| 78 | |||
| 79 | metric_columns = [ | ||
| 80 | column | ||
| 81 | for column in scores.columns | ||
| 82 | if column not in {"sample_id", "user_input", "response", "reference"} | ||
| 83 | and pd.api.types.is_numeric_dtype(scores[column]) | ||
| 84 | ] | ||
| 85 | for column in metric_columns: | ||
| 86 | lines.append( | ||
| 87 | f"| {column} | {scores[column].mean():.4f} | {scores[column].median():.4f} | 0.50 |" | ||
| 88 | ) | ||
| 89 | |||
| 90 | chunk_metrics = retrieval_metrics(ragas_rows) | ||
| 91 | if chunk_metrics: | ||
| 92 | lines.extend(["", "## Chunk ID 检索指标", "| 指标 | 平均值 |", "| --- | --- |"]) | ||
| 93 | for key, value in chunk_metrics.items(): | ||
| 94 | lines.append(f"| {key} | {value:.4f} |") | ||
| 95 | |||
| 96 | lines.extend(["", "## 检索失败样本", "| sample_id | 问题 | 预期文件 | 实际召回文件 | context_recall | 备注 |", "| --- | --- | --- | --- | --- | --- |"]) | ||
| 97 | for row in _worst_rows(scores, "context_recall"): | ||
| 98 | sample = _sample_by_id(ragas_rows, row.get("sample_id")) | ||
| 99 | actual_files = sorted( | ||
| 100 | { | ||
| 101 | ref.get("knowledge_filename") or "" | ||
| 102 | for ref in sample.get("weknora_references", []) | ||
| 103 | if ref.get("knowledge_filename") | ||
| 104 | } | ||
| 105 | ) | ||
| 106 | lines.append( | ||
| 107 | f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | " | ||
| 108 | f"{_cell(sample.get('source_file'))} | {_cell(', '.join(actual_files))} | " | ||
| 109 | f"{_score(row.get('context_recall'))} | |" | ||
| 110 | ) | ||
| 111 | |||
| 112 | lines.extend(["", "## 生成失败样本", "| sample_id | 问题 | 模型答案 | 标准答案 | faithfulness | factual_correctness |", "| --- | --- | --- | --- | --- | --- |"]) | ||
| 113 | for row in _worst_rows(scores, "faithfulness"): | ||
| 114 | sample = _sample_by_id(ragas_rows, row.get("sample_id")) | ||
| 115 | lines.append( | ||
| 116 | f"| {row.get('sample_id', '')} | {_cell(sample.get('user_input'))} | " | ||
| 117 | f"{_cell(sample.get('response'))} | {_cell(sample.get('reference'))} | " | ||
| 118 | f"{_score(row.get('faithfulness'))} | {_score(row.get('factual_correctness'))} |" | ||
| 119 | ) | ||
| 120 | |||
| 121 | empty_retrievals = sum(1 for row in answer_rows if not row.get("retrieved_contexts"))  # ragas_rows already excludes empty retrievals | ||
| 122 | fallback_answers = sum(1 for row in answer_rows if row.get("is_fallback")) | ||
| 123 | source_counts: dict[str, int] = {} | ||
| 124 | for row in ragas_rows: | ||
| 125 | source = row.get("source_file") or "unknown" | ||
| 126 | source_counts[source] = source_counts.get(source, 0) + 1 | ||
| 127 | |||
| 128 | lines.extend( | ||
| 129 | [ | ||
| 130 | "", | ||
| 131 | "## 数据质量", | ||
| 132 | f"- 空检索数量: {empty_retrievals}", | ||
| 133 | f"- fallback 答案数量: {fallback_answers}", | ||
| 134 | f"- 来源文件分布: {source_counts}", | ||
| 135 | "", | ||
| 136 | "## 改进建议", | ||
| 137 | "- 优先检查 context_recall 低且 retrieved_contexts 为空的样本。", | ||
| 138 | "- 对低 faithfulness 且 context_recall 正常的样本,重点检查生成模型和提示词。", | ||
| 139 | "- 对 Chunk ID 指标低但 Ragas context 指标正常的样本,检查 chunk 切分或 gold_chunk_ids 标注。", | ||
| 140 | "", | ||
| 141 | ] | ||
| 142 | ) | ||
| 143 | |||
| 144 | content = "\n".join(lines) | ||
| 145 | target = Path(output_path) | ||
| 146 | target.parent.mkdir(parents=True, exist_ok=True) | ||
| 147 | target.write_text(content, encoding="utf-8") | ||
| 148 | return content | ||
| 149 | |||
| 150 | |||
| 151 | def _worst_rows(scores: pd.DataFrame, column: str, *, limit: int = 10) -> list[dict[str, Any]]: | ||
| 152 | if scores.empty or column not in scores.columns: | ||
| 153 | return [] | ||
| 154 | return scores.sort_values(column, ascending=True).head(limit).to_dict(orient="records") | ||
| 155 | |||
| 156 | |||
| 157 | def _sample_by_id(rows: list[dict[str, Any]], sample_id: Any) -> dict[str, Any]: | ||
| 158 | return next((row for row in rows if row.get("sample_id") == sample_id), {}) | ||
| 159 | |||
| 160 | |||
| 161 | def _cell(value: Any, *, max_len: int = 120) -> str: | ||
| 162 | text = "" if value is None else " ".join(str(value).split()) | ||
| 163 | text = text.replace("|", "\\|") | ||
| 164 | if len(text) <= max_len: | ||
| 165 | return text | ||
| 166 | return text[:max_len].rstrip() + "..." | ||
| 167 | |||
| 168 | |||
| 169 | def _score(value: Any) -> str: | ||
| 170 | try: | ||
| 171 | if pd.isna(value): | ||
| 172 | return "" | ||
| 173 | return f"{float(value):.4f}" | ||
| 174 | except (TypeError, ValueError): | ||
| 175 | return "" |
src/weknora_eval/schemas.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from dataclasses import asdict, dataclass, field | ||
| 4 | from typing import Any | ||
| 5 | |||
| 6 | |||
| 7 | @dataclass | ||
| 8 | class ParsedDocument: | ||
| 9 | doc_id: str | ||
| 10 | source_file: str | ||
| 11 | file_type: str | ||
| 12 | content: str | ||
| 13 | page: int | None = None | ||
| 14 | sheet: str | None = None | ||
| 15 | row_index: int | None = None | ||
| 16 | metadata: dict[str, Any] = field(default_factory=dict) | ||
| 17 | |||
| 18 | def to_dict(self) -> dict[str, Any]: | ||
| 19 | return asdict(self) | ||
| 20 | |||
| 21 | |||
| 22 | @dataclass | ||
| 23 | class TestsetRecord: | ||
| 24 | sample_id: str | ||
| 25 | user_input: str | ||
| 26 | reference: str | ||
| 27 | reference_contexts: list[str] | ||
| 28 | source_file: str | None = None | ||
| 29 | gold_chunk_ids: list[str] = field(default_factory=list) | ||
| 30 | question_type: str = "single_hop" | ||
| 31 | review_status: str = "pending" | ||
| 32 | |||
| 33 | def to_dict(self) -> dict[str, Any]: | ||
| 34 | return asdict(self) | ||
| 35 | |||
| 36 | |||
| 37 | @dataclass | ||
| 38 | class WeKnoraAnswer: | ||
| 39 | sample_id: str | ||
| 40 | user_input: str | ||
| 41 | response: str | ||
| 42 | retrieved_contexts: list[str] | ||
| 43 | weknora_references: list[dict[str, Any]] | ||
| 44 | session_id: str | None = None | ||
| 45 | request_id: str | None = None | ||
| 46 | error: str | None = None | ||
| 47 | |||
| 48 | def to_dict(self) -> dict[str, Any]: | ||
| 49 | return asdict(self) |
src/weknora_eval/sse.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import json | ||
| 4 | from collections.abc import Iterable, Iterator | ||
| 5 | from typing import Any | ||
| 6 | |||
| 7 | |||
| 8 | def parse_sse_events(lines: Iterable[str | bytes]) -> Iterator[dict[str, Any]]: | ||
| 9 | event_name = "message" | ||
| 10 | data_lines: list[str] = [] | ||
| 11 | |||
| 12 | for raw_line in lines: | ||
| 13 | line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line | ||
| 14 | line = line.rstrip("\r\n") | ||
| 15 | |||
| 16 | if not line: | ||
| 17 | if data_lines: | ||
| 18 | yield _build_event(event_name, data_lines) | ||
| 19 | event_name = "message" | ||
| 20 | data_lines = [] | ||
| 21 | continue | ||
| 22 | |||
| 23 | if line.startswith(":"): | ||
| 24 | continue | ||
| 25 | if line.startswith("event:"): | ||
| 26 | event_name = line.removeprefix("event:").strip() | ||
| 27 | continue | ||
| 28 | if line.startswith("data:"): | ||
| 29 | data_lines.append(line.removeprefix("data:").strip()) | ||
| 30 | |||
| 31 | if data_lines: | ||
| 32 | yield _build_event(event_name, data_lines) | ||
| 33 | |||
| 34 | |||
| 35 | def _build_event(event_name: str, data_lines: list[str]) -> dict[str, Any]: | ||
| 36 | raw_data = "\n".join(data_lines) | ||
| 37 | parsed_data: Any = raw_data | ||
| 38 | if raw_data and raw_data != "[DONE]": | ||
| 39 | try: | ||
| 40 | parsed_data = json.loads(raw_data) | ||
| 41 | except json.JSONDecodeError: | ||
| 42 | parsed_data = raw_data | ||
| 43 | return {"event": event_name, "data": parsed_data} | ||
| 44 | |||
| 45 | |||
| 46 | def normalize_reference(reference: dict[str, Any]) -> dict[str, Any]: | ||
| 47 | return { | ||
| 48 | "id": reference.get("id"), | ||
| 49 | "content": reference.get("content") or "", | ||
| 50 | "knowledge_id": reference.get("knowledge_id"), | ||
| 51 | "chunk_index": reference.get("chunk_index"), | ||
| 52 | "score": reference.get("score"), | ||
| 53 | "knowledge_filename": reference.get("knowledge_filename") | ||
| 54 | or reference.get("knowledge_title"), | ||
| 55 | "match_type": reference.get("match_type"), | ||
| 56 | "chunk_type": reference.get("chunk_type"), | ||
| 57 | } |
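The parser is a plain SSE state machine: blank lines flush an event, comment lines are skipped, and `data:` payloads are JSON-decoded when possible while the `[DONE]` sentinel stays raw. A self-contained check:

```python
# Sketch: feed canned SSE lines through parse_sse_events.
from weknora_eval.sse import parse_sse_events

lines = [
    'event:message',
    'data:{"id":"req-1","response_type":"answer","content":"hello","done":false}',
    '',
    'data:[DONE]',
    '',
]
for event in parse_sse_events(lines):
    print(event["event"], type(event["data"]).__name__)
# message dict  <- JSON payload decoded
# message str   <- "[DONE]" kept as raw text
```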
src/weknora_eval/testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from typing import Any | ||
| 4 | |||
| 5 | from weknora_eval.loaders import read_jsonl, write_jsonl | ||
| 6 | from weknora_eval.schemas import TestsetRecord | ||
| 7 | |||
| 8 | |||
| 9 | def generate_rule_based_testset( | ||
| 10 | *, | ||
| 11 | documents_path: str = "data/parsed_docs/documents.jsonl", | ||
| 12 | output_path: str = "data/testsets/testset.raw.jsonl", | ||
| 13 | size: int = 50, | ||
| 14 | min_context_chars: int = 80, | ||
| 15 | ) -> list[dict[str, Any]]: | ||
| 16 | documents = [ | ||
| 17 | row | ||
| 18 | for row in read_jsonl(documents_path) | ||
| 19 | if len(row.get("content") or "") >= min_context_chars | ||
| 20 | ] | ||
| 21 | rows: list[dict[str, Any]] = [] | ||
| 22 | for index, document in enumerate(documents[:size], start=1): | ||
| 23 | context = document["content"] | ||
| 24 | source_file = document.get("source_file") | ||
| 25 | question = _default_question(document) | ||
| 26 | reference = _reference_from_context(context) | ||
| 27 | rows.append( | ||
| 28 | TestsetRecord( | ||
| 29 | sample_id=f"qa-{index:04d}", | ||
| 30 | user_input=question, | ||
| 31 | reference=reference, | ||
| 32 | reference_contexts=[context], | ||
| 33 | source_file=source_file, | ||
| 34 | question_type="single_hop", | ||
| 35 | review_status="pending", | ||
| 36 | ).to_dict() | ||
| 37 | ) | ||
| 38 | write_jsonl(output_path, rows) | ||
| 39 | return rows | ||
| 40 | |||
| 41 | |||
| 42 | def approve_pending_testset( | ||
| 43 | *, | ||
| 44 | input_path: str = "data/testsets/testset.raw.jsonl", | ||
| 45 | output_path: str = "data/testsets/testset.reviewed.jsonl", | ||
| 46 | ) -> list[dict[str, Any]]: | ||
| 47 | rows = read_jsonl(input_path) | ||
| 48 | reviewed: list[dict[str, Any]] = [] | ||
| 49 | for row in rows: | ||
| 50 | row = dict(row) | ||
| 51 | if row.get("review_status") == "rejected": | ||
| 52 | continue | ||
| 53 | row["review_status"] = "approved" | ||
| 54 | reviewed.append(row) | ||
| 55 | write_jsonl(output_path, reviewed) | ||
| 56 | return reviewed | ||
| 57 | |||
| 58 | |||
| 59 | def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]: | ||
| 60 | errors: list[str] = [] | ||
| 61 | for index, row in enumerate(read_jsonl(path), start=1): | ||
| 62 | prefix = f"{path}:{index}" | ||
| 63 | if row.get("review_status") != "approved": | ||
| 64 | errors.append(f"{prefix} review_status must be approved") | ||
| 65 | for key in ("sample_id", "user_input", "reference"): | ||
| 66 | if not row.get(key): | ||
| 67 | errors.append(f"{prefix} missing {key}") | ||
| 68 | if not row.get("reference_contexts"): | ||
| 69 | errors.append(f"{prefix} reference_contexts must be non-empty") | ||
| 70 | return errors | ||
| 71 | |||
| 72 | |||
| 73 | def _default_question(document: dict[str, Any]) -> str: | ||
| 74 | source = document.get("source_file") or "该文档" | ||
| 75 | if document.get("file_type") == "xlsx" and document.get("sheet"): | ||
| 76 | return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?" | ||
| 77 | if document.get("page"): | ||
| 78 | return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?" | ||
| 79 | return f"请根据 {source} 回答:该片段的主要内容是什么?" | ||
| 80 | |||
| 81 | |||
| 82 | def _reference_from_context(context: str, *, max_chars: int = 500) -> str: | ||
| 83 | text = " ".join(context.split()) | ||
| 84 | if len(text) <= max_chars: | ||
| 85 | return text | ||
| 86 | return text[:max_chars].rstrip() + "..." |
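End to end, the rule-based generator plus bulk approval yields a first-pass testset; real runs should hand-review `testset.raw.jsonl` before approving. Sketch:

```python
# Sketch: build, bulk-approve, and validate a small rule-based testset.
from weknora_eval.testset import (
    approve_pending_testset,
    generate_rule_based_testset,
    validate_reviewed_testset,
)

generate_rule_based_testset(size=10)
approve_pending_testset()
print("validation errors:", validate_reviewed_testset() or "none")
```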