Build evaluation documents from WeKnora chunks by default
Showing 5 changed files with 72 additions and 5 deletions
| ... | @@ -68,6 +68,8 @@ python scripts/10_report.py | ... | @@ -68,6 +68,8 @@ python scripts/10_report.py |
| 68 | 68 | ||
| 69 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 | 69 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 |
| 70 | 70 | ||
| 71 | 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`local` 和 `mineru` 解析只作为可选实验配置保留。 | ||
| 72 | |||
| 71 | ## 主要产物 | 73 | ## 主要产物 |
| 72 | 74 | ||
| 73 | - `data/exported/knowledge.jsonl` | 75 | - `data/exported/knowledge.jsonl` | ... | ... |
| ... | @@ -164,10 +164,10 @@ python scripts/10_report.py | ... | @@ -164,10 +164,10 @@ python scripts/10_report.py |
| 164 | 164 | ||
| 165 | 说明: | 165 | 说明: |
| 166 | 166 | ||
| 167 | - `01_upload_docs.py` 上传 `data/raw_docs/pdf` 和 `data/raw_docs/xlsx`。 | 167 | - `01_upload_docs.py` 上传 `data/raw_docs/` 下的 PDF/XLSX,也兼容 `pdf/`、`xlsx/` 子目录。 |
| 168 | - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 | 168 | - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 |
| 169 | - `03_export_chunks.py` 导出 WeKnora chunks。 | 169 | - `03_export_chunks.py` 导出 WeKnora chunks。 |
| 170 | - `04_parse_docs.py` 在评估侧解析原始文档,生成 Ragas 测试集来源。 | 170 | - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。 |
| 171 | - `05_generate_testset.py` 生成候选 QA。 | 171 | - `05_generate_testset.py` 生成候选 QA。 |
| 172 | - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 | 172 | - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 |
| 173 | - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 | 173 | - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 | ... | ... |
| ... | @@ -15,10 +15,16 @@ testset: | ... | @@ -15,10 +15,16 @@ testset: |
| 15 | require_manual_review: true | 15 | require_manual_review: true |
| 16 | 16 | ||
| 17 | parsing: | 17 | parsing: |
| 18 | provider: "mineru" | 18 | # chunks evaluates WeKnora as deployed: documents.jsonl is built from |
| 19 | # data/exported/chunks.jsonl. local/mineru remain available for optional | ||
| 20 | # parser-specific experiments. | ||
| 21 | provider: "chunks" | ||
| 19 | output_path: "data/parsed_docs/documents.jsonl" | 22 | output_path: "data/parsed_docs/documents.jsonl" |
| 20 | failed_path: "data/parsed_docs/failed_parse.jsonl" | 23 | failed_path: "data/parsed_docs/failed_parse.jsonl" |
| 21 | summary_path: "data/parsed_docs/parse_summary.json" | 24 | summary_path: "data/parsed_docs/parse_summary.json" |
| 25 | chunks: | ||
| 26 | input_path: "data/exported/chunks.jsonl" | ||
| 27 | min_chars: 80 | ||
| 22 | local: | 28 | local: |
| 23 | pdf_backend: "pymupdf" | 29 | pdf_backend: "pymupdf" |
| 24 | xlsx_mode: "row_text" | 30 | xlsx_mode: "row_text" | ... | ... |
| ... | @@ -6,6 +6,7 @@ import _bootstrap # noqa: F401 | ... | @@ -6,6 +6,7 @@ import _bootstrap # noqa: F401 |
| 6 | 6 | ||
| 7 | from weknora_eval.config import load_config | 7 | from weknora_eval.config import load_config |
| 8 | from weknora_eval.loaders import setup_logging | 8 | from weknora_eval.loaders import setup_logging |
| 9 | from weknora_eval.parsers.chunks import parse_chunks | ||
| 9 | from weknora_eval.parsers.local import parse_raw_docs | 10 | from weknora_eval.parsers.local import parse_raw_docs |
| 10 | from weknora_eval.parsers.mineru import parse_with_mineru | 11 | from weknora_eval.parsers.mineru import parse_with_mineru |
| 11 | 12 | ||
| ... | @@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru | ... | @@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru |
| 13 | def main() -> int: | 14 | def main() -> int: |
| 14 | setup_logging() | 15 | setup_logging() |
| 15 | config = load_config() | 16 | config = load_config() |
| 16 | provider = config.get("parsing", {}).get("provider", "local") | 17 | provider = config.get("parsing", {}).get("provider", "chunks") |
| 17 | if provider == "local": | 18 | if provider == "chunks": |
| 19 | rows, summary = parse_chunks(config) | ||
| 20 | elif provider == "local": | ||
| 18 | rows, summary = parse_raw_docs(config) | 21 | rows, summary = parse_raw_docs(config) |
| 19 | elif provider == "mineru": | 22 | elif provider == "mineru": |
| 20 | rows, summary = parse_with_mineru(config) | 23 | rows, summary = parse_with_mineru(config) | ... | ... |
src/weknora_eval/parsers/chunks.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from typing import Any | ||
| 4 | |||
| 5 | from weknora_eval.loaders import compact_text, read_jsonl, write_json, write_jsonl | ||
| 6 | from weknora_eval.parsers.local import build_parse_summary | ||
| 7 | from weknora_eval.schemas import ParsedDocument | ||
| 8 | |||
| 9 | |||
def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Build evaluation documents from WeKnora-exported chunks.

    Reads the chunks JSONL exported by the pipeline (``parsing.chunks.input_path``),
    discards chunks whose compacted content is missing or shorter than
    ``min_chars``, and writes:

    - accepted rows to ``parsing.output_path`` (``documents.jsonl``),
    - skipped chunks (if any) to ``parsing.failed_path``,
    - a parse summary to ``parsing.summary_path``.

    Returns the ``(rows, summary)`` pair, matching the interface of the other
    parser providers (``local`` / ``mineru``).
    """
    parsing_cfg = config["parsing"]
    chunk_opts = parsing_cfg.get("chunks", {})
    source_path = chunk_opts.get("input_path", "data/exported/chunks.jsonl")
    # Fall back to the local parser's min_chars so both providers filter alike
    # unless the chunks section overrides it explicitly.
    threshold = int(chunk_opts.get("min_chars", parsing_cfg.get("local", {}).get("min_chars", 80)))

    accepted: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for record in read_jsonl(source_path):
        text = compact_text(record.get("content"))
        cid = record.get("chunk_id") or record.get("id")
        origin = record.get("source_file") or record.get("knowledge_filename") or "unknown"
        if not text or len(text) < threshold:
            skipped.append(
                {
                    "source_file": origin,
                    "parser": "weknora:chunks",
                    "status": "skipped",
                    "error": f"chunk content shorter than min_chars={threshold}",
                    "fallback_used": None,
                    "chunk_id": cid,
                }
            )
            continue

        # doc_id falls back to a 1-based position among *accepted* rows when the
        # export carries no chunk id.
        parsed = ParsedDocument(
            doc_id=str(cid or f"{origin}::chunk-{len(accepted) + 1}"),
            source_file=str(origin),
            file_type=str(record.get("chunk_type") or "chunk"),
            content=text,
            metadata={
                "parser": "weknora:chunks",
                "chunk_id": cid,
                "knowledge_id": record.get("knowledge_id"),
                "knowledge_base_id": record.get("knowledge_base_id"),
                "chunk_index": record.get("chunk_index"),
            },
        )
        accepted.append(parsed.to_dict())

    write_jsonl(parsing_cfg.get("output_path", "data/parsed_docs/documents.jsonl"), accepted)
    # NOTE(review): the failures file is only written when this run produced
    # failures — a stale file from an earlier run is left in place.
    if skipped:
        write_jsonl(parsing_cfg.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), skipped)

    summary = build_parse_summary(accepted, skipped, parser="weknora:chunks")
    write_json(parsing_cfg.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return accepted, summary
-
Please register or sign in to post a comment