Commit feeaba05 feeaba057f54e31449bdf37ded063d12cf44c1fd by 沈秋雨

Build evaluation documents from WeKnora chunks by default

1 parent 8096ca31
......@@ -68,6 +68,8 @@ python scripts/10_report.py
首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。
默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`local` 与 `mineru` 解析只作为可选实验配置保留。
## 主要产物
- `data/exported/knowledge.jsonl`
......
......@@ -164,10 +164,10 @@ python scripts/10_report.py
说明:
- `01_upload_docs.py` 上传 `data/raw_docs/pdf``data/raw_docs/xlsx`
- `01_upload_docs.py` 上传 `data/raw_docs/` 下的 PDF/XLSX,也兼容 `pdf/``xlsx/` 子目录
- `02_wait_ingestion.py` 等待 WeKnora 解析完成。
- `03_export_chunks.py` 导出 WeKnora chunks。
- `04_parse_docs.py` 在评估侧解析原始文档,生成 Ragas 测试集来源
- `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF
- `05_generate_testset.py` 生成候选 QA。
- `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。
- `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。
......
......@@ -15,10 +15,16 @@ testset:
require_manual_review: true
parsing:
provider: "mineru"
# chunks evaluates WeKnora as deployed: documents.jsonl is built from
# data/exported/chunks.jsonl. local/mineru remain available for optional
# parser-specific experiments.
provider: "chunks"
output_path: "data/parsed_docs/documents.jsonl"
failed_path: "data/parsed_docs/failed_parse.jsonl"
summary_path: "data/parsed_docs/parse_summary.json"
chunks:
input_path: "data/exported/chunks.jsonl"
min_chars: 80
local:
pdf_backend: "pymupdf"
xlsx_mode: "row_text"
......
......@@ -6,6 +6,7 @@ import _bootstrap # noqa: F401
from weknora_eval.config import load_config
from weknora_eval.loaders import setup_logging
from weknora_eval.parsers.chunks import parse_chunks
from weknora_eval.parsers.local import parse_raw_docs
from weknora_eval.parsers.mineru import parse_with_mineru
......@@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru
def main() -> int:
setup_logging()
config = load_config()
provider = config.get("parsing", {}).get("provider", "local")
if provider == "local":
provider = config.get("parsing", {}).get("provider", "chunks")
if provider == "chunks":
rows, summary = parse_chunks(config)
elif provider == "local":
rows, summary = parse_raw_docs(config)
elif provider == "mineru":
rows, summary = parse_with_mineru(config)
......
from __future__ import annotations
from typing import Any
from weknora_eval.loaders import compact_text, read_jsonl, write_json, write_jsonl
from weknora_eval.parsers.local import build_parse_summary
from weknora_eval.schemas import ParsedDocument
def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Build Ragas evaluation documents from WeKnora-exported chunks.

    Reads the exported ``chunks.jsonl``, drops every chunk whose compacted
    content is empty or shorter than ``min_chars`` (recording it as a skipped
    failure), and writes the accepted documents, the failure list (only when
    non-empty), and a parse summary to the paths configured under ``parsing``.

    Args:
        config: Full evaluation config; only the ``parsing`` section is read.

    Returns:
        A tuple of (accepted document rows, parse summary dict).
    """
    parsing = config["parsing"]
    chunk_options = parsing.get("chunks", {})
    source_path = chunk_options.get("input_path", "data/exported/chunks.jsonl")
    # Fall back to the local parser's min_chars so both providers share one knob.
    min_chars = int(chunk_options.get("min_chars", parsing.get("local", {}).get("min_chars", 80)))

    accepted: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for record in read_jsonl(source_path):
        text = compact_text(record.get("content"))
        identifier = record.get("chunk_id") or record.get("id")
        origin = record.get("source_file") or record.get("knowledge_filename") or "unknown"
        if not text or len(text) < min_chars:
            skipped.append(
                {
                    "source_file": origin,
                    "parser": "weknora:chunks",
                    "status": "skipped",
                    "error": f"chunk content shorter than min_chars={min_chars}",
                    "fallback_used": None,
                    "chunk_id": identifier,
                }
            )
            continue
        # Synthesize a stable positional id only when the chunk carries none.
        fallback_id = f"{origin}::chunk-{len(accepted) + 1}"
        parsed = ParsedDocument(
            doc_id=str(identifier or fallback_id),
            source_file=str(origin),
            file_type=str(record.get("chunk_type") or "chunk"),
            content=text,
            metadata={
                "parser": "weknora:chunks",
                "chunk_id": identifier,
                "knowledge_id": record.get("knowledge_id"),
                "knowledge_base_id": record.get("knowledge_base_id"),
                "chunk_index": record.get("chunk_index"),
            },
        )
        accepted.append(parsed.to_dict())

    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), accepted)
    if skipped:
        write_jsonl(parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"), skipped)
    summary = build_parse_summary(accepted, skipped, parser="weknora:chunks")
    write_json(parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary)
    return accepted, summary