Generate Ragas QA directly from WeKnora chunks

Showing 5 changed files with 163 additions and 20 deletions
@@ -22,6 +22,7 @@ RAGAS_JUDGE_MODEL=gpt-4o-mini
 RAGAS_EMBEDDING_MODEL=text-embedding-3-small
 
 TESTSET_SIZE=50
+TESTSET_RAGAS_MODE=direct
 TESTSET_MAX_DOCUMENT_CHARS=2000
 TESTSET_SOURCE_MULTIPLIER=3
 TESTSET_GENERATOR_MAX_TOKENS=4096
@@ -68,7 +68,7 @@ python scripts/10_report.py
 
 For the first round, use only 2 PDFs, 1 XLSX, and 10 reviewed-and-approved QA pairs; confirm that `retrieved_contexts`, `response`, and the Ragas input fields all look correct before scaling up the sample size.
 
-By default `04_parse_docs.py` builds the testset sources from the WeKnora-exported `data/exported/chunks.jsonl` rather than calling an external PDF parser again. `05_generate_testset.py` defaults to generating QA automatically with Ragas plus the evaluation-side LLM; during generation, `TESTSET_MAX_DOCUMENT_CHARS` caps the length of each source context and `TESTSET_GENERATOR_MAX_TOKENS` controls the generation output budget, keeping it separate from the `ragas.max_tokens` used later for evaluation. `local`, `mineru`, and `rule_based` are kept only as optional experimental/fallback configurations.
+By default `04_parse_docs.py` builds the testset sources from the WeKnora-exported `data/exported/chunks.jsonl` rather than calling an external PDF parser again. `05_generate_testset.py` defaults to generating QA automatically with Ragas plus the evaluation-side LLM; the generation stage uses `TESTSET_RAGAS_MODE=direct`, which assembles the WeKnora chunks directly into a Ragas KnowledgeGraph and generates single-hop QA, avoiding the default Ragas document-preprocessing pipeline that would re-extract headlines, summaries, and entities. The generation stage also caps the length of each source context with `TESTSET_MAX_DOCUMENT_CHARS` and controls the generation output budget with `TESTSET_GENERATOR_MAX_TOKENS`, keeping it separate from the `ragas.max_tokens` used later for evaluation. `local`, `mineru`, and `rule_based` are kept only as optional experimental/fallback configurations.
 
 ## Main artifacts
 
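Reviewer note: for readers who want the gist of `direct` mode without walking the full script diff below, here is a minimal, self-contained sketch of the chunk-to-KnowledgeGraph flow using the same Ragas APIs this change imports (`KnowledgeGraph`, `Node`, `NodeType`, `Persona`, `SingleHopSpecificQuerySynthesizer`). The sample chunk, model name, and persona values are illustrative only, not the project's actual loader code.

```python
# Illustrative sketch of the "direct" flow: WeKnora chunks become Ragas CHUNK
# nodes, and a single-hop synthesizer generates QA straight from them.
# The chunk dict, model name, and persona below are made-up example values.
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer

chunks = [
    {"content": "甲方授予乙方录音制品的邻接权,授权范围为全球。", "source_file": "授权合同.pdf"},
]
ragas_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

# One CHUNK node per exported chunk; entities/themes are pre-filled so the
# synthesizer has terms to sample without running Ragas's extractor pipeline.
kg = KnowledgeGraph(
    nodes=[
        Node(
            type=NodeType.CHUNK,
            properties={
                "page_content": chunk["content"],
                "document_metadata": {"source_file": chunk["source_file"]},
                "entities": ["邻接权", "授权范围"],
                "themes": ["合同条款"],
            },
        )
        for chunk in chunks
    ]
)

generator = TestsetGenerator(
    llm=ragas_llm,
    embedding_model=None,  # direct mode in this change passes None here as well
    knowledge_graph=kg,
    persona_list=[Persona(name="合同审核人员", role_description="关注合同条款、权利归属和授权范围。")],
)
testset = generator.generate(
    testset_size=5,
    query_distribution=[(SingleHopSpecificQuerySynthesizer(llm=ragas_llm), 1.0)],
    raise_exceptions=False,
)
print(testset.to_list())
```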
@@ -293,10 +293,13 @@ max_tokens: 4096
 If `05_generate_testset.py` raises `LLMDidNotFinishException` while generating QA, do not keep blindly increasing `ragas.max_tokens`. Script `05` has its own generation budget and input-length limits:
 
 ```bash
+TESTSET_RAGAS_MODE=direct
 TESTSET_GENERATOR_MAX_TOKENS=4096
 TESTSET_MAX_DOCUMENT_CHARS=2000
 ```
 
+`direct` mode skips Ragas's default `HeadlinesExtractor`, `SummaryExtractor`, and `NERExtractor` document-preprocessing chain, assembling the WeKnora chunks directly into a Ragas KnowledgeGraph and generating single-hop QA. `prechunked` and `langchain_docs` exist only for comparison experiments and are not recommended when local vLLM structured output is unstable.
+
 If vLLM still reports unfinished generation, first drop `TESTSET_SIZE` to 3 and set `TESTSET_MAX_DOCUMENT_CHARS` to 1000-1500 to verify the pipeline; `ragas.max_tokens` is mainly for the later evaluation stage and should not be used to endlessly inflate the output length of testset generation.
 
 ### WeKnora answers are missing retrieved_contexts
@@ -10,6 +10,7 @@ weknora:
 testset:
   size: "${TESTSET_SIZE:-50}"
   generator: "ragas" # ragas or rule_based
+  ragas_mode: "${TESTSET_RAGAS_MODE:-direct}" # direct, prechunked, or langchain_docs
   include_pdf: true
   include_xlsx: true
   min_context_chars: 80
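Reviewer note: the `"${TESTSET_RAGAS_MODE:-direct}"` value uses shell-style `${VAR:-default}` placeholders. The config loader itself is not part of this diff, so the following is only a hypothetical sketch of how such placeholders are commonly expanded; the `expand_env` helper is invented for illustration and is not the project's `weknora_eval.config` code.

```python
# Hypothetical sketch of shell-style "${VAR:-default}" expansion for YAML
# values like "${TESTSET_RAGAS_MODE:-direct}"; not the project's real loader.
import os
import re

_PLACEHOLDER = re.compile(r"\$\{(?P<name>[A-Z0-9_]+)(?::-(?P<default>[^}]*))?\}")


def expand_env(value: str) -> str:
    """Replace ${NAME:-default} with os.environ[NAME], falling back to default."""

    def _sub(match: re.Match) -> str:
        name = match.group("name")
        default = match.group("default") or ""
        return os.environ.get(name, default)

    return _PLACEHOLDER.sub(_sub, value)


assert expand_env("${TESTSET_RAGAS_MODE:-direct}") == os.environ.get("TESTSET_RAGAS_MODE", "direct")
```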
@@ -8,7 +8,10 @@ from typing import Any
 from langchain_core.documents import Document
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from ragas.run_config import RunConfig
+from ragas.testset.graph import KnowledgeGraph, Node, NodeType
+from ragas.testset.persona import Persona
 from ragas.testset import TestsetGenerator
+from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
 
 from weknora_eval.config import require_config
 from weknora_eval.loaders import read_jsonl, write_jsonl
@@ -46,6 +49,7 @@ def generate_ragas_testset(
     generator_max_tokens = int(
         testset_config.get("generator_max_tokens", ragas_config.get("max_tokens", 4096))
     )
+    ragas_mode = str(testset_config.get("ragas_mode", "direct"))
 
     source_rows = [
         row
@@ -71,11 +75,12 @@ def generate_ragas_testset(
         for row in selected_source_rows
     ]
     logger.info(
-        "Generating Ragas testset: target_size=%s source_documents=%s max_document_chars=%s generator_max_tokens=%s",
+        "Generating Ragas testset: target_size=%s source_documents=%s max_document_chars=%s generator_max_tokens=%s ragas_mode=%s",
         size,
         len(documents),
         max_document_chars,
         generator_max_tokens,
+        ragas_mode,
     )
 
     llm = ChatOpenAI(
@@ -86,32 +91,165 @@ def generate_ragas_testset(
         max_tokens=generator_max_tokens,
         timeout=int(ragas_config.get("timeout_seconds", 600)),
     )
-    embeddings = OpenAIEmbeddings(
-        model=str(require_config(config, "ragas.embedding_model")),
-        api_key=_required_ragas_value(ragas_config, "embedding_api_key"),
-        base_url=_required_ragas_value(ragas_config, "embedding_base_url"),
-        tiktoken_enabled=False,
-        check_embedding_ctx_length=False,
-        request_timeout=int(ragas_config.get("timeout_seconds", 600)),
+    run_config = RunConfig(
+        timeout=int(ragas_config.get("timeout_seconds", 600)),
+        max_workers=int(ragas_config.get("max_workers", 1)),
+    )
+    if ragas_mode == "direct":
+        result = _generate_ragas_direct(llm, documents, size, run_config)
+    elif ragas_mode == "prechunked":
+        result = _generate_ragas_prechunked(
+            config, ragas_config, llm, documents, size, run_config
+        )
+    elif ragas_mode == "langchain_docs":
+        result = _generate_ragas_langchain_docs(
+            config, ragas_config, llm, documents, size, run_config
+        )
+    else:
+        raise ValueError(f"Unsupported testset.ragas_mode: {ragas_mode}")
+
+    ragas_rows = result.to_list()
+    rows = _normalize_ragas_rows(ragas_rows, selected_source_rows)
+    write_jsonl(output_path, rows)
+    return rows
+
+
+def _generate_ragas_direct(
+    llm: ChatOpenAI,
+    documents: list[Document],
+    size: int,
+    run_config: RunConfig,
+) -> Any:
+    ragas_llm = _wrap_langchain_llm(llm)
+    kg = KnowledgeGraph(
+        nodes=[
+            Node(
+                type=NodeType.CHUNK,
+                properties={
+                    "page_content": document.page_content,
+                    "document_metadata": document.metadata,
+                    "entities": _generation_terms(document),
+                    "themes": _generation_terms(document),
+                },
+            )
+            for document in documents
+            if document.page_content.strip()
+        ]
+    )
+    generator = TestsetGenerator(
+        llm=ragas_llm,
+        embedding_model=None,
+        knowledge_graph=kg,
+        persona_list=[
+            Persona(
+                name="合同审核人员",
+                role_description="关注合同条款、权利归属、授权范围和履约义务。",
+            ),
+            Persona(
+                name="业务运营人员",
+                role_description="关注文档中可用于业务执行和信息核验的事实。",
+            ),
+            Persona(
+                name="法务合规人员",
+                role_description="关注协议、版权、授权、责任和风险表述。",
+            ),
+        ],
+    )
+    generate_kwargs: dict[str, Any] = {
+        "testset_size": size,
+        "query_distribution": [(SingleHopSpecificQuerySynthesizer(llm=ragas_llm), 1.0)],
+        "num_personas": 3,
+        "run_config": run_config,
+        "raise_exceptions": False,
+    }
+    if "batch_size" in inspect.signature(generator.generate).parameters:
+        generate_kwargs["batch_size"] = 1
+    return generator.generate(**generate_kwargs)
+
+
+def _generate_ragas_prechunked(
+    config: dict[str, Any],
+    ragas_config: dict[str, Any],
+    llm: ChatOpenAI,
+    documents: list[Document],
+    size: int,
+    run_config: RunConfig,
+) -> Any:
+    embeddings = _build_embeddings(config, ragas_config)
+    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
+    generator = TestsetGenerator(llm=ragas_llm, embedding_model=ragas_embeddings)
+    return generator.generate_with_chunks(
+        documents,
+        testset_size=size,
+        run_config=run_config,
+        raise_exceptions=False,
     )
+
+
+def _generate_ragas_langchain_docs(
+    config: dict[str, Any],
+    ragas_config: dict[str, Any],
+    llm: ChatOpenAI,
+    documents: list[Document],
+    size: int,
+    run_config: RunConfig,
+) -> Any:
+    embeddings = _build_embeddings(config, ragas_config)
     ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
     generator = TestsetGenerator(llm=ragas_llm, embedding_model=ragas_embeddings)
     generate_kwargs: dict[str, Any] = {
         "testset_size": size,
-        "run_config": RunConfig(
-            timeout=int(ragas_config.get("timeout_seconds", 600)),
-            max_workers=int(ragas_config.get("max_workers", 1)),
-        ),
+        "run_config": run_config,
         "raise_exceptions": False,
     }
-    if "batch_size" in inspect.signature(generator.generate_with_langchain_docs).parameters:
-        generate_kwargs["batch_size"] = 1
-    result = generator.generate_with_langchain_docs(documents, **generate_kwargs)
+    return generator.generate_with_langchain_docs(documents, **generate_kwargs)
 
-    ragas_rows = result.to_list()
-    rows = _normalize_ragas_rows(ragas_rows, selected_source_rows)
-    write_jsonl(output_path, rows)
-    return rows
+
+def _build_embeddings(
+    config: dict[str, Any], ragas_config: dict[str, Any]
+) -> OpenAIEmbeddings:
+    return OpenAIEmbeddings(
+        model=str(require_config(config, "ragas.embedding_model")),
+        api_key=_required_ragas_value(ragas_config, "embedding_api_key"),
+        base_url=_required_ragas_value(ragas_config, "embedding_base_url"),
+        tiktoken_enabled=False,
+        check_embedding_ctx_length=False,
+        request_timeout=int(ragas_config.get("timeout_seconds", 600)),
+    )
+
+
+def _wrap_langchain_llm(llm: Any) -> Any:
+    try:
+        from ragas.llms import LangchainLLMWrapper
+    except ImportError:
+        return llm
+    return LangchainLLMWrapper(llm)
+
+
+def _generation_terms(document: Document) -> list[str]:
+    text = f"{document.metadata.get('source_file') or ''} {document.page_content}"
+    candidates = [
+        "合同条款",
+        "权利归属",
+        "著作权",
+        "邻接权",
+        "录音权利",
+        "词权利",
+        "曲权利",
+        "授权范围",
+        "作品信息",
+        "甲方",
+        "乙方",
+        "协议",
+        "付款",
+        "违约责任",
+        "期限",
+    ]
+    terms = [term for term in candidates if term in text]
+    source_file = str(document.metadata.get("source_file") or "").strip()
+    if source_file:
+        terms.append(source_file.rsplit(".", 1)[0][:40])
+    return terms[:6] or ["文档内容"]
 
 
 def _truncate_for_generation(content: str, max_chars: int) -> str:
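Reviewer note: a quick, illustrative check of the new `_generation_terms` helper (the document values below are invented). It keyword-matches a fixed candidate list against the chunk text plus the source filename, appends the filename stem, and caps the result at six terms, so every CHUNK node ends up with non-empty `entities`/`themes` for `SingleHopSpecificQuerySynthesizer` to sample.

```python
# Illustrative call of _generation_terms from the diff above; the Document
# values are made up. Run inside the same module (or after copying the helper).
from langchain_core.documents import Document

doc = Document(
    page_content="甲方授予乙方录音制品的邻接权。",
    metadata={"source_file": "授权协议.pdf"},
)
# Keyword hits from the candidate list ("邻接权", "甲方", "乙方", "协议"),
# plus the filename stem "授权协议", capped at six terms overall.
print(_generation_terms(doc))
# -> ['邻接权', '甲方', '乙方', '协议', '授权协议']
```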