Commit c1d6e33a c1d6e33abe7d9ef8425eb6417cf0655a4a6a4557 by 沈秋雨

Generate QA testsets with Ragas by default

1 parent feeaba05
...@@ -68,7 +68,7 @@ python scripts/10_report.py ...@@ -68,7 +68,7 @@ python scripts/10_report.py
68 68
69 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 69 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。
70 70
71 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`local`、`mineru` 解析只作为可选实验配置保留。 71 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 自动生成 QA;`local`、`mineru`、`rule_based` 只作为可选实验/兜底配置保留。
72 72
73 ## 主要产物 73 ## 主要产物
74 74
......
...@@ -168,7 +168,7 @@ python scripts/10_report.py ...@@ -168,7 +168,7 @@ python scripts/10_report.py
168 - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 168 - `02_wait_ingestion.py` 等待 WeKnora 解析完成。
169 - `03_export_chunks.py` 导出 WeKnora chunks。 169 - `03_export_chunks.py` 导出 WeKnora chunks。
170 - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。 170 - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。
171 - `05_generate_testset.py` 生成候选 QA。 171 - `05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 生成候选 QA。
172 - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 172 - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。
173 - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 173 - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。
174 - `08_build_ragas_input.py` 合并 QA 和 WeKnora 输出。 174 - `08_build_ragas_input.py` 合并 QA 和 WeKnora 输出。
......
...@@ -9,6 +9,7 @@ weknora: ...@@ -9,6 +9,7 @@ weknora:
9 9
10 testset: 10 testset:
11 size: "${TESTSET_SIZE:-50}" 11 size: "${TESTSET_SIZE:-50}"
12 generator: "ragas" # ragas or rule_based
12 include_pdf: true 13 include_pdf: true
13 include_xlsx: true 14 include_xlsx: true
14 min_context_chars: 80 15 min_context_chars: 80
......
...@@ -6,17 +6,13 @@ import _bootstrap # noqa: F401 ...@@ -6,17 +6,13 @@ import _bootstrap # noqa: F401
6 6
7 from weknora_eval.config import load_config 7 from weknora_eval.config import load_config
8 from weknora_eval.loaders import setup_logging 8 from weknora_eval.loaders import setup_logging
9 from weknora_eval.testset import generate_rule_based_testset 9 from weknora_eval.testset import generate_testset
10 10
11 11
def main() -> int:
    """Script entry point: generate the raw QA testset.

    Configures logging, loads the pipeline config, runs the configured
    testset generator, and reports how many candidates were produced.

    Returns:
        0 when at least one candidate QA row was generated, 1 otherwise.
    """
    setup_logging()
    cfg = load_config()
    candidates = generate_testset(cfg)
    print(f"Generated {len(candidates)} pending QA candidates at data/testsets/testset.raw.jsonl")
    return 0 if candidates else 1
22 18
......
1 from __future__ import annotations 1 from __future__ import annotations
2 2
3 import json
3 from typing import Any 4 from typing import Any
4 5
6 from langchain_core.documents import Document
7 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
8 from ragas.run_config import RunConfig
9 from ragas.testset import TestsetGenerator
10
11 from weknora_eval.config import require_config
5 from weknora_eval.loaders import read_jsonl, write_jsonl 12 from weknora_eval.loaders import read_jsonl, write_jsonl
13 from weknora_eval.ragas_runner import _wrap_langchain_models
6 from weknora_eval.schemas import TestsetRecord 14 from weknora_eval.schemas import TestsetRecord
7 15
8 16
def generate_testset(config: dict[str, Any]) -> list[dict[str, Any]]:
    """Dispatch testset generation to the configured backend.

    Reads ``testset.generator`` from *config* ("ragas" by default) and
    delegates to the matching generator function.

    Raises:
        ValueError: if the configured generator name is not supported.
    """
    options = config.get("testset", {})
    backend = str(options.get("generator", "ragas"))
    if backend == "rule_based":
        size = int(options.get("size", 50))
        min_chars = int(options.get("min_context_chars", 80))
        return generate_rule_based_testset(size=size, min_context_chars=min_chars)
    if backend == "ragas":
        return generate_ragas_testset(config)
    raise ValueError(f"Unsupported testset.generator: {backend}")
28
29
def generate_ragas_testset(
    config: dict[str, Any],
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
) -> list[dict[str, Any]]:
    """Generate a raw QA testset with Ragas and write it to *output_path*.

    Source rows whose content is shorter than ``testset.min_context_chars``
    are skipped. When no usable rows remain, an empty testset file is still
    written and an empty list returned. Generated samples are normalized
    into pending ``TestsetRecord`` dicts before being persisted.
    """
    options = config.get("testset", {})
    ragas_cfg = config["ragas"]
    target_size = int(options.get("size", 50))
    min_chars = int(options.get("min_context_chars", 80))
    # One shared timeout for the LLM, embeddings, and the Ragas run itself.
    timeout = int(ragas_cfg.get("timeout_seconds", 600))

    usable_rows = []
    for row in read_jsonl(documents_path):
        if len(row.get("content") or "") >= min_chars:
            usable_rows.append(row)
    if not usable_rows:
        # Nothing to generate from: still emit an (empty) output file so
        # downstream steps find a consistent artifact.
        write_jsonl(output_path, [])
        return []

    langchain_docs = []
    for row in usable_rows:
        meta = {
            "source_file": row.get("source_file"),
            "doc_id": row.get("doc_id"),
            **(row.get("metadata") or {}),
        }
        langchain_docs.append(Document(page_content=row["content"], metadata=meta))

    chat_model = ChatOpenAI(
        model=str(require_config(config, "ragas.generator_model")),
        api_key=_required_ragas_value(ragas_cfg, "llm_api_key"),
        base_url=_required_ragas_value(ragas_cfg, "llm_base_url"),
        temperature=float(ragas_cfg.get("temperature", 0)),
        max_tokens=int(ragas_cfg.get("max_tokens", 4096)),
        timeout=timeout,
    )
    embedding_model = OpenAIEmbeddings(
        model=str(require_config(config, "ragas.embedding_model")),
        api_key=_required_ragas_value(ragas_cfg, "embedding_api_key"),
        base_url=_required_ragas_value(ragas_cfg, "embedding_base_url"),
        # Disable tiktoken/context-length handling for non-OpenAI endpoints.
        tiktoken_enabled=False,
        check_embedding_ctx_length=False,
        request_timeout=timeout,
    )
    wrapped_llm, wrapped_embeddings = _wrap_langchain_models(chat_model, embedding_model)
    testset_generator = TestsetGenerator(llm=wrapped_llm, embedding_model=wrapped_embeddings)
    result = testset_generator.generate_with_langchain_docs(
        # Cap the number of source documents at the requested testset size
        # (at least one document is always passed).
        langchain_docs[: max(target_size, 1)],
        testset_size=target_size,
        run_config=RunConfig(
            timeout=timeout,
            max_workers=int(ragas_cfg.get("max_workers", 1)),
        ),
        raise_exceptions=False,
    )

    normalized = _normalize_ragas_rows(result.to_list(), usable_rows)
    write_jsonl(output_path, normalized)
    return normalized
94
95
def _normalize_ragas_rows(
    ragas_rows: list[dict[str, Any]],
    source_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Convert raw Ragas samples into pending ``TestsetRecord`` dicts.

    Each sample is matched back to its most likely source row to recover
    ``source_file`` and gold chunk ids. Rows missing a question, reference
    answer, or reference contexts are dropped from the result.
    """
    by_doc_id = {str(r.get("doc_id")): r for r in source_rows if r.get("doc_id")}
    records: list[dict[str, Any]] = []
    for position, sample in enumerate(ragas_rows, start=1):
        contexts = _as_string_list(sample.get("reference_contexts"))
        if not contexts and sample.get("reference_context"):
            # Older/singular field name used as a fallback.
            contexts = _as_string_list(sample.get("reference_context"))
        matched = _match_source_row(sample, source_rows, by_doc_id, contexts)

        gold_ids: list[str] = []
        if matched:
            chunk_id = (matched.get("metadata") or {}).get("chunk_id") or matched.get("doc_id")
            if chunk_id:
                gold_ids = [str(chunk_id)]

        record = TestsetRecord(
            sample_id=f"qa-{position:04d}",
            user_input=str(sample.get("user_input") or sample.get("query") or "").strip(),
            reference=str(sample.get("reference") or sample.get("answer") or "").strip(),
            reference_contexts=contexts or ([matched["content"]] if matched else []),
            source_file=matched.get("source_file") if matched else None,
            gold_chunk_ids=gold_ids,
            question_type=str(sample.get("synthesizer_name") or "ragas"),
            review_status="pending",
        )
        records.append(record.to_dict())
    # Keep only fully-populated rows.
    return [
        r
        for r in records
        if r.get("user_input") and r.get("reference") and r.get("reference_contexts")
    ]
129
130
def _match_source_row(
    ragas_row: dict[str, Any],
    source_rows: list[dict[str, Any]],
    source_by_doc_id: dict[str, dict[str, Any]],
    reference_contexts: list[str],
) -> dict[str, Any] | None:
    """Best-effort mapping from a Ragas sample back to its source row.

    Tries, in order: explicit context-id fields on the sample, substring
    overlap between the sample's contexts and the source contents, and
    finally the first source row as a fallback. Returns ``None`` only when
    *source_rows* is empty.
    """
    # 1) Direct id match via the id fields Ragas may attach to a sample.
    for id_field in ("reference_context_ids", "retrieved_context_ids"):
        candidates = _as_string_list(ragas_row.get(id_field))
        hit = next((source_by_doc_id[c] for c in candidates if c in source_by_doc_id), None)
        if hit is not None:
            return hit
    # 2) Fuzzy match: either string containing the other counts as a hit.
    for snippet in reference_contexts:
        if not snippet:
            continue
        for candidate in source_rows:
            text = candidate.get("content") or ""
            if snippet in text or text in snippet:
                return candidate
    # 3) Fallback: arbitrary (first) source row, or None when there are none.
    return source_rows[0] if source_rows else None
147
148
149 def _as_string_list(value: Any) -> list[str]:
150 if value is None:
151 return []
152 if isinstance(value, str):
153 try:
154 parsed = json.loads(value)
155 if parsed != value:
156 return _as_string_list(parsed)
157 except json.JSONDecodeError:
158 pass
159 return [value.strip()] if value.strip() else []
160 if isinstance(value, list):
161 result: list[str] = []
162 for item in value:
163 result.extend(_as_string_list(item))
164 return result
165 if isinstance(value, dict):
166 for key in ("content", "text", "page_content"):
167 if key in value:
168 return _as_string_list(value[key])
169 return []
170 return [str(value)]
171
172
173 def _required_ragas_value(config: dict[str, Any], key: str) -> str:
174 value = config.get(key)
175 if value in {None, ""}:
176 raise ValueError(f"Missing required Ragas config value: ragas.{key}")
177 return str(value)
178
179
9 def generate_rule_based_testset( 180 def generate_rule_based_testset(
10 *, 181 *,
11 documents_path: str = "data/parsed_docs/documents.jsonl", 182 documents_path: str = "data/parsed_docs/documents.jsonl",
......