Commit feeaba05 feeaba057f54e31449bdf37ded063d12cf44c1fd by 沈秋雨

Build evaluation documents from WeKnora chunks by default

1 parent 8096ca31
...@@ -68,6 +68,8 @@ python scripts/10_report.py ...@@ -68,6 +68,8 @@ python scripts/10_report.py
68 68
69 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 69 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。
70 70
71 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`local``mineru` 解析只作为可选实验配置保留。
72
71 ## 主要产物 73 ## 主要产物
72 74
73 - `data/exported/knowledge.jsonl` 75 - `data/exported/knowledge.jsonl`
......
...@@ -164,10 +164,10 @@ python scripts/10_report.py ...@@ -164,10 +164,10 @@ python scripts/10_report.py
164 164
165 说明: 165 说明:
166 166
167 - `01_upload_docs.py` 上传 `data/raw_docs/pdf`、`data/raw_docs/xlsx`。 167 - `01_upload_docs.py` 上传 `data/raw_docs/` 下的 PDF/XLSX,也兼容 `pdf/`、`xlsx/` 子目录。
168 - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 168 - `02_wait_ingestion.py` 等待 WeKnora 解析完成。
169 - `03_export_chunks.py` 导出 WeKnora chunks。 169 - `03_export_chunks.py` 导出 WeKnora chunks。
170 - `04_parse_docs.py` 在评估侧解析原始文档,生成 Ragas 测试集来源 170 - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF
171 - `05_generate_testset.py` 生成候选 QA。 171 - `05_generate_testset.py` 生成候选 QA。
172 - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 172 - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。
173 - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 173 - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。
......
...@@ -15,10 +15,16 @@ testset: ...@@ -15,10 +15,16 @@ testset:
15 require_manual_review: true 15 require_manual_review: true
16 16
17 parsing: 17 parsing:
18 provider: "mineru" 18 # chunks evaluates WeKnora as deployed: documents.jsonl is built from
19 # data/exported/chunks.jsonl. local/mineru remain available for optional
20 # parser-specific experiments.
21 provider: "chunks"
19 output_path: "data/parsed_docs/documents.jsonl" 22 output_path: "data/parsed_docs/documents.jsonl"
20 failed_path: "data/parsed_docs/failed_parse.jsonl" 23 failed_path: "data/parsed_docs/failed_parse.jsonl"
21 summary_path: "data/parsed_docs/parse_summary.json" 24 summary_path: "data/parsed_docs/parse_summary.json"
25 chunks:
26 input_path: "data/exported/chunks.jsonl"
27 min_chars: 80
22 local: 28 local:
23 pdf_backend: "pymupdf" 29 pdf_backend: "pymupdf"
24 xlsx_mode: "row_text" 30 xlsx_mode: "row_text"
......
...@@ -6,6 +6,7 @@ import _bootstrap # noqa: F401 ...@@ -6,6 +6,7 @@ import _bootstrap # noqa: F401
6 6
7 from weknora_eval.config import load_config 7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging 8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.parsers.chunks import parse_chunks
9 from weknora_eval.parsers.local import parse_raw_docs 10 from weknora_eval.parsers.local import parse_raw_docs
10 from weknora_eval.parsers.mineru import parse_with_mineru 11 from weknora_eval.parsers.mineru import parse_with_mineru
11 12
...@@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru ...@@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru
13 def main() -> int: 14 def main() -> int:
14 setup_logging() 15 setup_logging()
15 config = load_config() 16 config = load_config()
16 provider = config.get("parsing", {}).get("provider", "local") 17 provider = config.get("parsing", {}).get("provider", "chunks")
17 if provider == "local": 18 if provider == "chunks":
19 rows, summary = parse_chunks(config)
20 elif provider == "local":
18 rows, summary = parse_raw_docs(config) 21 rows, summary = parse_raw_docs(config)
19 elif provider == "mineru": 22 elif provider == "mineru":
20 rows, summary = parse_with_mineru(config) 23 rows, summary = parse_with_mineru(config)
......
1 from __future__ import annotations
2
3 from typing import Any
4
5 from weknora_eval.loaders import compact_text, read_jsonl, write_json, write_jsonl
6 from weknora_eval.parsers.local import build_parse_summary
7 from weknora_eval.schemas import ParsedDocument
8
9
def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Build evaluation documents from WeKnora-exported chunks.

    Reads the JSONL file at ``parsing.chunks.input_path``, drops chunks whose
    compacted content is empty or shorter than ``min_chars``, and writes the
    surviving documents, the skipped entries, and a parse summary to the
    paths configured under ``parsing``.

    Args:
        config: Loaded evaluation config; must contain a ``parsing`` section.

    Returns:
        A ``(rows, summary)`` tuple — the parsed-document dicts that were
        written out, and the summary dict from ``build_parse_summary``.
    """
    parsing = config["parsing"]
    chunk_options = parsing.get("chunks", {})
    source_path = chunk_options.get("input_path", "data/exported/chunks.jsonl")
    # Fall back to the local parser's min_chars so both providers agree on
    # the same minimum-content threshold when chunks.min_chars is unset.
    threshold = int(chunk_options.get("min_chars", parsing.get("local", {}).get("min_chars", 80)))

    documents: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for record in read_jsonl(source_path):
        text = compact_text(record.get("content"))
        chunk_id = record.get("chunk_id") or record.get("id")
        origin = record.get("source_file") or record.get("knowledge_filename") or "unknown"

        if not text or len(text) < threshold:
            # Too-short chunks are recorded as skips, not hard failures.
            skipped.append(
                {
                    "source_file": origin,
                    "parser": "weknora:chunks",
                    "status": "skipped",
                    "error": f"chunk content shorter than min_chars={threshold}",
                    "fallback_used": None,
                    "chunk_id": chunk_id,
                }
            )
            continue

        # Synthesize a positional doc id only when the chunk carries none.
        fallback_id = f"{origin}::chunk-{len(documents) + 1}"
        parsed = ParsedDocument(
            doc_id=str(chunk_id or fallback_id),
            source_file=str(origin),
            file_type=str(record.get("chunk_type") or "chunk"),
            content=text,
            metadata={
                "parser": "weknora:chunks",
                "chunk_id": chunk_id,
                "knowledge_id": record.get("knowledge_id"),
                "knowledge_base_id": record.get("knowledge_base_id"),
                "chunk_index": record.get("chunk_index"),
            },
        )
        documents.append(parsed.to_dict())

    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), documents)
    if skipped:
        # NOTE(review): a previous run's failed_parse.jsonl is left in place
        # when this run has no skips — confirm that is intended.
        write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), skipped)

    summary = build_parse_summary(documents, skipped, parser="weknora:chunks")
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return documents, summary