Generate QA testsets with Ragas by default
Showing
5 changed files
with
176 additions
and
8 deletions
| ... | @@ -68,7 +68,7 @@ python scripts/10_report.py | ... | @@ -68,7 +68,7 @@ python scripts/10_report.py |
| 68 | 68 | ||
| 69 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 | 69 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 |
| 70 | 70 | ||
| 71 | 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`local` 和 `mineru` 解析只作为可选实验配置保留。 | 71 | 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 自动生成 QA;`local`、`mineru` 和 `rule_based` 只作为可选实验/兜底配置保留。 |
| 72 | 72 | ||
| 73 | ## 主要产物 | 73 | ## 主要产物 |
| 74 | 74 | ... | ... |
| ... | @@ -168,7 +168,7 @@ python scripts/10_report.py | ... | @@ -168,7 +168,7 @@ python scripts/10_report.py |
| 168 | - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 | 168 | - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 |
| 169 | - `03_export_chunks.py` 导出 WeKnora chunks。 | 169 | - `03_export_chunks.py` 导出 WeKnora chunks。 |
| 170 | - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。 | 170 | - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。 |
| 171 | - `05_generate_testset.py` 生成候选 QA。 | 171 | - `05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 生成候选 QA。 |
| 172 | - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 | 172 | - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 |
| 173 | - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 | 173 | - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 |
| 174 | - `08_build_ragas_input.py` 合并 QA 和 WeKnora 输出。 | 174 | - `08_build_ragas_input.py` 合并 QA 和 WeKnora 输出。 | ... | ... |
| ... | @@ -9,6 +9,7 @@ weknora: | ... | @@ -9,6 +9,7 @@ weknora: |
| 9 | 9 | ||
| 10 | testset: | 10 | testset: |
| 11 | size: "${TESTSET_SIZE:-50}" | 11 | size: "${TESTSET_SIZE:-50}" |
| 12 | generator: "ragas" # ragas or rule_based | ||
| 12 | include_pdf: true | 13 | include_pdf: true |
| 13 | include_xlsx: true | 14 | include_xlsx: true |
| 14 | min_context_chars: 80 | 15 | min_context_chars: 80 | ... | ... |
| ... | @@ -6,17 +6,13 @@ import _bootstrap # noqa: F401 | ... | @@ -6,17 +6,13 @@ import _bootstrap # noqa: F401 |
| 6 | 6 | ||
| 7 | from weknora_eval.config import load_config | 7 | from weknora_eval.config import load_config |
| 8 | from weknora_eval.loaders import setup_logging | 8 | from weknora_eval.loaders import setup_logging |
| 9 | from weknora_eval.testset import generate_rule_based_testset | 9 | from weknora_eval.testset import generate_testset |
| 10 | 10 | ||
| 11 | 11 | ||
def main() -> int:
    """Run the testset-generation step and report how many rows it produced.

    Configures logging, loads the project config, and delegates to
    ``generate_testset``.

    Returns:
        Process exit code: ``0`` when at least one candidate row was
        generated, ``1`` when the result is empty.
    """
    setup_logging()
    candidates = generate_testset(load_config())
    print(f"Generated {len(candidates)} pending QA candidates at data/testsets/testset.raw.jsonl")
    # An empty result is reported as a failing exit code.
    if candidates:
        return 0
    return 1
| 22 | 18 | ... | ... |
| 1 | from __future__ import annotations | 1 | from __future__ import annotations |
| 2 | 2 | ||
| 3 | import json | ||
| 3 | from typing import Any | 4 | from typing import Any |
| 4 | 5 | ||
| 6 | from langchain_core.documents import Document | ||
| 7 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings | ||
| 8 | from ragas.run_config import RunConfig | ||
| 9 | from ragas.testset import TestsetGenerator | ||
| 10 | |||
| 11 | from weknora_eval.config import require_config | ||
| 5 | from weknora_eval.loaders import read_jsonl, write_jsonl | 12 | from weknora_eval.loaders import read_jsonl, write_jsonl |
| 13 | from weknora_eval.ragas_runner import _wrap_langchain_models | ||
| 6 | from weknora_eval.schemas import TestsetRecord | 14 | from weknora_eval.schemas import TestsetRecord |
| 7 | 15 | ||
| 8 | 16 | ||
def generate_testset(config: dict[str, Any]) -> list[dict[str, Any]]:
    """Generate candidate QA rows with the generator named in the config.

    Reads ``testset.generator`` (default ``"ragas"``) and dispatches to the
    matching implementation. The name is compared after trimming whitespace
    and lowercasing, so ``"Ragas"`` or ``" rule_based "`` are accepted.

    Args:
        config: Loaded project configuration. A missing or null ``testset``
            section is treated as empty, so all defaults apply.

    Returns:
        The generated candidate rows, as returned by the selected generator.

    Raises:
        ValueError: If ``testset.generator`` names an unknown generator.
    """
    # `or {}` also covers a section that exists but is null in the config
    # file; `.get("testset", {})` would return None there.
    testset = config.get("testset") or {}
    # A blank/None generator falls back to the default instead of becoming
    # the literal string "None".
    generator = str(testset.get("generator") or "ragas")
    normalized = generator.strip().lower()

    if normalized == "ragas":
        return generate_ragas_testset(config)
    if normalized == "rule_based":
        return generate_rule_based_testset(
            size=int(testset.get("size", 50)),
            min_context_chars=int(testset.get("min_context_chars", 80)),
        )
    # Report the value exactly as configured so it is easy to locate.
    raise ValueError(f"Unsupported testset.generator: {generator}")
| 28 | |||
| 29 | |||
def generate_ragas_testset(
    config: dict[str, Any],
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
) -> list[dict[str, Any]]:
    """Generate candidate QA rows with Ragas and write them to ``output_path``.

    Source rows are read from ``documents_path``; rows whose ``content`` is
    empty or shorter than ``testset.min_context_chars`` are ignored. The
    remaining rows are wrapped as LangChain ``Document`` objects, passed to a
    Ragas ``TestsetGenerator`` backed by OpenAI-compatible chat and embedding
    clients, and the generated samples are normalized before being written.

    Args:
        config: Loaded project configuration. Uses the ``testset`` section
            (``size``, ``min_context_chars``) and the ``ragas`` section
            (model names, API keys, base URLs, ``temperature``,
            ``max_tokens``, ``timeout_seconds``, ``max_workers``).
        documents_path: JSONL file with the parsed source documents.
        output_path: JSONL file that receives the normalized rows.

    Returns:
        The normalized rows that were written. Empty when no source row is
        long enough (an empty file is still written in that case).

    Raises:
        ValueError: If a required API key or base URL is missing from the
            ``ragas`` section.
    """
    # `or {}` tolerates sections that are absent or null, so a missing
    # `ragas` section surfaces through the required-value checks below rather
    # than as a bare KeyError.
    testset_config = config.get("testset") or {}
    ragas_config = config.get("ragas") or {}
    size = int(testset_config.get("size", 50))
    min_context_chars = int(testset_config.get("min_context_chars", 80))

    # Require non-empty content explicitly: with min_context_chars <= 0 a row
    # without content would otherwise pass the length filter and fail later
    # on row["content"].
    source_rows = [
        row
        for row in read_jsonl(documents_path)
        if row.get("content") and len(row["content"]) >= min_context_chars
    ]
    if not source_rows:
        write_jsonl(output_path, [])
        return []

    documents = [
        Document(
            page_content=row["content"],
            metadata={
                "source_file": row.get("source_file"),
                "doc_id": row.get("doc_id"),
                # Row-level metadata is merged last, so it wins on key clashes.
                **(row.get("metadata") or {}),
            },
        )
        for row in source_rows
    ]

    # One timeout value is shared by the chat client, the embedding client
    # and the Ragas run configuration.
    timeout_seconds = int(ragas_config.get("timeout_seconds", 600))

    llm = ChatOpenAI(
        model=str(require_config(config, "ragas.generator_model")),
        api_key=_required_ragas_value(ragas_config, "llm_api_key"),
        base_url=_required_ragas_value(ragas_config, "llm_base_url"),
        temperature=float(ragas_config.get("temperature", 0)),
        max_tokens=int(ragas_config.get("max_tokens", 4096)),
        timeout=timeout_seconds,
    )
    embeddings = OpenAIEmbeddings(
        model=str(require_config(config, "ragas.embedding_model")),
        api_key=_required_ragas_value(ragas_config, "embedding_api_key"),
        base_url=_required_ragas_value(ragas_config, "embedding_base_url"),
        tiktoken_enabled=False,
        check_embedding_ctx_length=False,
        request_timeout=timeout_seconds,
    )
    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
    generator = TestsetGenerator(llm=ragas_llm, embedding_model=ragas_embeddings)
    result = generator.generate_with_langchain_docs(
        # At most `size` documents are sent, but never fewer than one.
        # NOTE(review): with size <= 0 this still sends one document while
        # requesting testset_size=size - confirm the intended behavior.
        documents[: max(size, 1)],
        testset_size=size,
        run_config=RunConfig(
            timeout=timeout_seconds,
            max_workers=int(ragas_config.get("max_workers", 1)),
        ),
        raise_exceptions=False,
    )

    ragas_rows = result.to_list()
    # All eligible source rows are offered for matching, not only the slice
    # that was sent to the generator.
    rows = _normalize_ragas_rows(ragas_rows, source_rows)
    write_jsonl(output_path, rows)
    return rows
| 94 | |||
| 95 | |||
def _normalize_ragas_rows(
    ragas_rows: list[dict[str, Any]],
    source_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Convert raw Ragas samples into ``TestsetRecord`` dictionaries.

    Each sample is matched to a source row so that ``source_file`` and
    ``gold_chunk_ids`` can be filled in. Samples that end up without a
    question, a reference answer, or any reference context are dropped.
    ``sample_id`` follows the sample's position in ``ragas_rows``, so dropped
    samples leave gaps in the numbering.

    Args:
        ragas_rows: Samples produced by the Ragas generator.
        source_rows: Source document rows the samples were generated from.

    Returns:
        The kept records, in input order, all marked ``review_status="pending"``.
    """
    # Lookup table for rows that carry a doc_id.
    doc_index: dict[str, dict[str, Any]] = {}
    for candidate in source_rows:
        if candidate.get("doc_id"):
            doc_index[str(candidate.get("doc_id"))] = candidate

    required_fields = ("user_input", "reference", "reference_contexts")
    kept: list[dict[str, Any]] = []
    for position, sample in enumerate(ragas_rows, start=1):
        # Prefer the plural field; fall back to the singular one.
        contexts = _as_string_list(sample.get("reference_contexts"))
        if not contexts and sample.get("reference_context"):
            contexts = _as_string_list(sample.get("reference_context"))

        matched = _match_source_row(sample, source_rows, doc_index, contexts)

        chunk_ids: list[str] = []
        if matched:
            # The chunk id from metadata wins; doc_id is the fallback.
            chunk_ref = (matched.get("metadata") or {}).get("chunk_id") or matched.get("doc_id")
            if chunk_ref:
                chunk_ids = [str(chunk_ref)]

        # Without generated contexts, use the matched source text instead.
        if contexts:
            final_contexts = contexts
        elif matched:
            final_contexts = [matched["content"]]
        else:
            final_contexts = []

        record = TestsetRecord(
            sample_id=f"qa-{position:04d}",
            user_input=str(sample.get("user_input") or sample.get("query") or "").strip(),
            reference=str(sample.get("reference") or sample.get("answer") or "").strip(),
            reference_contexts=final_contexts,
            source_file=matched.get("source_file") if matched else None,
            gold_chunk_ids=chunk_ids,
            question_type=str(sample.get("synthesizer_name") or "ragas"),
            review_status="pending",
        ).to_dict()

        if all(record.get(field) for field in required_fields):
            kept.append(record)
    return kept
| 129 | |||
| 130 | |||
| 131 | def _match_source_row( | ||
| 132 | ragas_row: dict[str, Any], | ||
| 133 | source_rows: list[dict[str, Any]], | ||
| 134 | source_by_doc_id: dict[str, dict[str, Any]], | ||
| 135 | reference_contexts: list[str], | ||
| 136 | ) -> dict[str, Any] | None: | ||
| 137 | for key in ("reference_context_ids", "retrieved_context_ids"): | ||
| 138 | for doc_id in _as_string_list(ragas_row.get(key)): | ||
| 139 | if doc_id in source_by_doc_id: | ||
| 140 | return source_by_doc_id[doc_id] | ||
| 141 | for context in reference_contexts: | ||
| 142 | for source in source_rows: | ||
| 143 | content = source.get("content") or "" | ||
| 144 | if context and (context in content or content in context): | ||
| 145 | return source | ||
| 146 | return source_rows[0] if source_rows else None | ||
| 147 | |||
| 148 | |||
| 149 | def _as_string_list(value: Any) -> list[str]: | ||
| 150 | if value is None: | ||
| 151 | return [] | ||
| 152 | if isinstance(value, str): | ||
| 153 | try: | ||
| 154 | parsed = json.loads(value) | ||
| 155 | if parsed != value: | ||
| 156 | return _as_string_list(parsed) | ||
| 157 | except json.JSONDecodeError: | ||
| 158 | pass | ||
| 159 | return [value.strip()] if value.strip() else [] | ||
| 160 | if isinstance(value, list): | ||
| 161 | result: list[str] = [] | ||
| 162 | for item in value: | ||
| 163 | result.extend(_as_string_list(item)) | ||
| 164 | return result | ||
| 165 | if isinstance(value, dict): | ||
| 166 | for key in ("content", "text", "page_content"): | ||
| 167 | if key in value: | ||
| 168 | return _as_string_list(value[key]) | ||
| 169 | return [] | ||
| 170 | return [str(value)] | ||
| 171 | |||
| 172 | |||
| 173 | def _required_ragas_value(config: dict[str, Any], key: str) -> str: | ||
| 174 | value = config.get(key) | ||
| 175 | if value in {None, ""}: | ||
| 176 | raise ValueError(f"Missing required Ragas config value: ragas.{key}") | ||
| 177 | return str(value) | ||
| 178 | |||
| 179 | |||
| 9 | def generate_rule_based_testset( | 180 | def generate_rule_based_testset( |
| 10 | *, | 181 | *, |
| 11 | documents_path: str = "data/parsed_docs/documents.jsonl", | 182 | documents_path: str = "data/parsed_docs/documents.jsonl", | ... | ... |
-
Please register or sign in to post a comment