设置 RAGAS 思考模式开关
Showing
9 changed files
with
49 additions
and
0 deletions
| ... | @@ -20,6 +20,7 @@ RAGAS_RERANKER_MODEL=replace-me | ... | @@ -20,6 +20,7 @@ RAGAS_RERANKER_MODEL=replace-me |
| 20 | RAGAS_GENERATOR_MODEL=gpt-4o-mini | 20 | RAGAS_GENERATOR_MODEL=gpt-4o-mini |
| 21 | RAGAS_JUDGE_MODEL=gpt-4o-mini | 21 | RAGAS_JUDGE_MODEL=gpt-4o-mini |
| 22 | RAGAS_EMBEDDING_MODEL=text-embedding-3-small | 22 | RAGAS_EMBEDDING_MODEL=text-embedding-3-small |
| 23 | RAGAS_ENABLE_THINKING=false | ||
| 23 | 24 | ||
| 24 | TESTSET_SIZE=50 | 25 | TESTSET_SIZE=50 |
| 25 | TESTSET_RAGAS_MODE=direct | 26 | TESTSET_RAGAS_MODE=direct | ... | ... |
| ... | @@ -39,6 +39,7 @@ cp .env.example .env | ... | @@ -39,6 +39,7 @@ cp .env.example .env |
| 39 | - `RAGAS_LLM_BASE_URL` 指向 vLLM 的 OpenAI-compatible `/v1` | 39 | - `RAGAS_LLM_BASE_URL` 指向 vLLM 的 OpenAI-compatible `/v1` |
| 40 | - `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity embedding 的 OpenAI-compatible `/v1` | 40 | - `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity embedding 的 OpenAI-compatible `/v1` |
| 41 | - `RAGAS_*_MODEL` 是评估侧模型名称 | 41 | - `RAGAS_*_MODEL` 是评估侧模型名称 |
| 42 | - `RAGAS_ENABLE_THINKING=false` 只会在本评估项目的 RAGAS LLM 请求中发送 `chat_template_kwargs.enable_thinking=false`,不会改变 WeKnora 检索/问答服务的模型配置;设为 `true` 时不会附加该字段,沿用模型服务端的默认行为 | ||
| 42 | 43 | ||
| 43 | ## 首轮 Pilot | 44 | ## 首轮 Pilot |
| 44 | 45 | ... | ... |
| ... | @@ -296,10 +296,13 @@ max_tokens: 4096 | ... | @@ -296,10 +296,13 @@ max_tokens: 4096 |
| 296 | TESTSET_RAGAS_MODE=direct | 296 | TESTSET_RAGAS_MODE=direct |
| 297 | TESTSET_GENERATOR_MAX_TOKENS=4096 | 297 | TESTSET_GENERATOR_MAX_TOKENS=4096 |
| 298 | TESTSET_MAX_DOCUMENT_CHARS=2000 | 298 | TESTSET_MAX_DOCUMENT_CHARS=2000 |
| 299 | RAGAS_ENABLE_THINKING=false | ||
| 299 | ``` | 300 | ``` |
| 300 | 301 | ||
| 301 | `direct` 模式会跳过 Ragas 默认的 `HeadlinesExtractor`、`SummaryExtractor`、`NERExtractor` 文档预处理链路,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA。`prechunked` 和 `langchain_docs` 仅用于对比实验,遇到本地 vLLM 结构化输出不稳定时不建议使用。 | 302 | `direct` 模式会跳过 Ragas 默认的 `HeadlinesExtractor`、`SummaryExtractor`、`NERExtractor` 文档预处理链路,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA。`prechunked` 和 `langchain_docs` 仅用于对比实验,遇到本地 vLLM 结构化输出不稳定时不建议使用。 |
| 302 | 303 | ||
| 304 | 如果使用 Qwen thinking 模型,`RAGAS_ENABLE_THINKING=false` 只会在 RAGAS 请求里附加 `chat_template_kwargs.enable_thinking=false`,避免 RAGAS 的 JSON/Pydantic 结构化输出被 `Thinking Process` 前缀破坏;WeKnora 本身的检索问答链路不经过这些脚本,不会受影响。 | ||
| 305 | |||
| 303 | 如果 vLLM 仍然报生成未完成,先把 `TESTSET_SIZE` 降到 3,再把 `TESTSET_MAX_DOCUMENT_CHARS` 调到 1000-1500 验证链路;`ragas.max_tokens` 主要用于后续评测阶段,不应该拿来无限放大测试集生成阶段的输出长度。 | 306 | 如果 vLLM 仍然报生成未完成,先把 `TESTSET_SIZE` 降到 3,再把 `TESTSET_MAX_DOCUMENT_CHARS` 调到 1000-1500 验证链路;`ragas.max_tokens` 主要用于后续评测阶段,不应该拿来无限放大测试集生成阶段的输出长度。 |
| 304 | 307 | ||
| 305 | ### WeKnora 问答没有 retrieved_contexts | 308 | ### WeKnora 问答没有 retrieved_contexts | ... | ... |
| ... | @@ -72,6 +72,7 @@ ragas: | ... | @@ -72,6 +72,7 @@ ragas: |
| 72 | generator_model: "${RAGAS_GENERATOR_MODEL}" | 72 | generator_model: "${RAGAS_GENERATOR_MODEL}" |
| 73 | judge_model: "${RAGAS_JUDGE_MODEL}" | 73 | judge_model: "${RAGAS_JUDGE_MODEL}" |
| 74 | embedding_model: "${RAGAS_EMBEDDING_MODEL}" | 74 | embedding_model: "${RAGAS_EMBEDDING_MODEL}" |
| 75 | enable_thinking: "${RAGAS_ENABLE_THINKING:-false}" | ||
| 75 | temperature: 0 | 76 | temperature: 0 |
| 76 | max_tokens: 4096 | 77 | max_tokens: 4096 |
| 77 | timeout_seconds: 600 | 78 | timeout_seconds: 600 | ... | ... |
| ... | @@ -10,6 +10,7 @@ import requests | ... | @@ -10,6 +10,7 @@ import requests |
| 10 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings | 10 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings |
| 11 | 11 | ||
| 12 | from weknora_eval.config import load_config | 12 | from weknora_eval.config import load_config |
| 13 | from weknora_eval.llm_options import chat_openai_kwargs | ||
| 13 | 14 | ||
| 14 | 15 | ||
| 15 | def main() -> int: | 16 | def main() -> int: |
| ... | @@ -27,6 +28,7 @@ def main() -> int: | ... | @@ -27,6 +28,7 @@ def main() -> int: |
| 27 | model=require_value(ragas, "generator_model"), | 28 | model=require_value(ragas, "generator_model"), |
| 28 | temperature=float(ragas.get("temperature", 0)), | 29 | temperature=float(ragas.get("temperature", 0)), |
| 29 | max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024), | 30 | max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024), |
| 31 | extra_kwargs=chat_openai_kwargs(ragas), | ||
| 30 | ) | 32 | ) |
| 31 | ) | 33 | ) |
| 32 | failures.extend( | 34 | failures.extend( |
| ... | @@ -37,6 +39,7 @@ def main() -> int: | ... | @@ -37,6 +39,7 @@ def main() -> int: |
| 37 | model=require_value(ragas, "judge_model"), | 39 | model=require_value(ragas, "judge_model"), |
| 38 | temperature=float(ragas.get("temperature", 0)), | 40 | temperature=float(ragas.get("temperature", 0)), |
| 39 | max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024), | 41 | max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024), |
| 42 | extra_kwargs=chat_openai_kwargs(ragas), | ||
| 40 | ) | 43 | ) |
| 41 | ) | 44 | ) |
| 42 | failures.extend( | 45 | failures.extend( |
| ... | @@ -78,6 +81,7 @@ def check_chat_model( | ... | @@ -78,6 +81,7 @@ def check_chat_model( |
| 78 | model: str, | 81 | model: str, |
| 79 | temperature: float, | 82 | temperature: float, |
| 80 | max_tokens: int, | 83 | max_tokens: int, |
| 84 | extra_kwargs: dict[str, Any], | ||
| 81 | ) -> list[str]: | 85 | ) -> list[str]: |
| 82 | print(f"[CHECK] {title}: model={model} base_url={base_url}") | 86 | print(f"[CHECK] {title}: model={model} base_url={base_url}") |
| 83 | started = time.monotonic() | 87 | started = time.monotonic() |
| ... | @@ -89,6 +93,7 @@ def check_chat_model( | ... | @@ -89,6 +93,7 @@ def check_chat_model( |
| 89 | temperature=temperature, | 93 | temperature=temperature, |
| 90 | max_tokens=max_tokens, | 94 | max_tokens=max_tokens, |
| 91 | timeout=120, | 95 | timeout=120, |
| 96 | **extra_kwargs, | ||
| 92 | ) | 97 | ) |
| 93 | response = llm.invoke("Reply with exactly: OK") | 98 | response = llm.invoke("Reply with exactly: OK") |
| 94 | content = str(response.content or "").strip() | 99 | content = str(response.content or "").strip() | ... | ... |
| ... | @@ -21,6 +21,7 @@ from ragas.testset.synthesizers.single_hop.prompts import ( | ... | @@ -21,6 +21,7 @@ from ragas.testset.synthesizers.single_hop.prompts import ( |
| 21 | ) | 21 | ) |
| 22 | 22 | ||
| 23 | from weknora_eval.config import load_config | 23 | from weknora_eval.config import load_config |
| 24 | from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs | ||
| 24 | 25 | ||
| 25 | 26 | ||
| 26 | class SimpleQA(BaseModel): | 27 | class SimpleQA(BaseModel): |
| ... | @@ -38,6 +39,7 @@ def main() -> int: | ... | @@ -38,6 +39,7 @@ def main() -> int: |
| 38 | max_tokens = int(testset.get("generator_max_tokens", ragas.get("max_tokens", 4096))) | 39 | max_tokens = int(testset.get("generator_max_tokens", ragas.get("max_tokens", 4096))) |
| 39 | temperature = float(ragas.get("temperature", 0)) | 40 | temperature = float(ragas.get("temperature", 0)) |
| 40 | timeout = int(ragas.get("timeout_seconds", 600)) | 41 | timeout = int(ragas.get("timeout_seconds", 600)) |
| 42 | extra_body = chat_extra_body(ragas) | ||
| 41 | 43 | ||
| 42 | print("Diagnosing Ragas generator LLM compatibility\n") | 44 | print("Diagnosing Ragas generator LLM compatibility\n") |
| 43 | print(f"model={model}") | 45 | print(f"model={model}") |
| ... | @@ -54,6 +56,7 @@ def main() -> int: | ... | @@ -54,6 +56,7 @@ def main() -> int: |
| 54 | max_tokens=min(max_tokens, 256), | 56 | max_tokens=min(max_tokens, 256), |
| 55 | temperature=temperature, | 57 | temperature=temperature, |
| 56 | timeout=timeout, | 58 | timeout=timeout, |
| 59 | extra_body=extra_body, | ||
| 57 | ) | 60 | ) |
| 58 | 61 | ||
| 59 | json_prompt = ( | 62 | json_prompt = ( |
| ... | @@ -70,6 +73,7 @@ def main() -> int: | ... | @@ -70,6 +73,7 @@ def main() -> int: |
| 70 | max_tokens=max_tokens, | 73 | max_tokens=max_tokens, |
| 71 | temperature=temperature, | 74 | temperature=temperature, |
| 72 | timeout=timeout, | 75 | timeout=timeout, |
| 76 | extra_body=extra_body, | ||
| 73 | ) | 77 | ) |
| 74 | validate_json_payload(structured.get("content") or "") | 78 | validate_json_payload(structured.get("content") or "") |
| 75 | 79 | ||
| ... | @@ -81,6 +85,7 @@ def main() -> int: | ... | @@ -81,6 +85,7 @@ def main() -> int: |
| 81 | max_tokens=max_tokens, | 85 | max_tokens=max_tokens, |
| 82 | temperature=temperature, | 86 | temperature=temperature, |
| 83 | timeout=timeout, | 87 | timeout=timeout, |
| 88 | extra_kwargs=chat_openai_kwargs(ragas), | ||
| 84 | ) | 89 | ) |
| 85 | 90 | ||
| 86 | run_ragas_prompt_probe( | 91 | run_ragas_prompt_probe( |
| ... | @@ -90,6 +95,7 @@ def main() -> int: | ... | @@ -90,6 +95,7 @@ def main() -> int: |
| 90 | max_tokens=max_tokens, | 95 | max_tokens=max_tokens, |
| 91 | temperature=temperature, | 96 | temperature=temperature, |
| 92 | timeout=timeout, | 97 | timeout=timeout, |
| 98 | extra_kwargs=chat_openai_kwargs(ragas), | ||
| 93 | ) | 99 | ) |
| 94 | 100 | ||
| 95 | explain_result(plain, structured, langchain_result) | 101 | explain_result(plain, structured, langchain_result) |
| ... | @@ -106,6 +112,7 @@ def run_raw_chat( | ... | @@ -106,6 +112,7 @@ def run_raw_chat( |
| 106 | max_tokens: int, | 112 | max_tokens: int, |
| 107 | temperature: float, | 113 | temperature: float, |
| 108 | timeout: int, | 114 | timeout: int, |
| 115 | extra_body: dict[str, Any], | ||
| 109 | ) -> dict[str, Any]: | 116 | ) -> dict[str, Any]: |
| 110 | print(f"[RAW] {title}") | 117 | print(f"[RAW] {title}") |
| 111 | started = time.monotonic() | 118 | started = time.monotonic() |
| ... | @@ -120,6 +127,7 @@ def run_raw_chat( | ... | @@ -120,6 +127,7 @@ def run_raw_chat( |
| 120 | "messages": messages, | 127 | "messages": messages, |
| 121 | "temperature": temperature, | 128 | "temperature": temperature, |
| 122 | "max_tokens": max_tokens, | 129 | "max_tokens": max_tokens, |
| 130 | **extra_body, | ||
| 123 | }, | 131 | }, |
| 124 | timeout=timeout, | 132 | timeout=timeout, |
| 125 | ) | 133 | ) |
| ... | @@ -169,6 +177,7 @@ def run_langchain_probe( | ... | @@ -169,6 +177,7 @@ def run_langchain_probe( |
| 169 | max_tokens: int, | 177 | max_tokens: int, |
| 170 | temperature: float, | 178 | temperature: float, |
| 171 | timeout: int, | 179 | timeout: int, |
| 180 | extra_kwargs: dict[str, Any], | ||
| 172 | ) -> dict[str, Any]: | 181 | ) -> dict[str, Any]: |
| 173 | print("[LANGCHAIN] generation metadata") | 182 | print("[LANGCHAIN] generation metadata") |
| 174 | llm = ChatOpenAI( | 183 | llm = ChatOpenAI( |
| ... | @@ -178,6 +187,7 @@ def run_langchain_probe( | ... | @@ -178,6 +187,7 @@ def run_langchain_probe( |
| 178 | temperature=temperature, | 187 | temperature=temperature, |
| 179 | max_tokens=max_tokens, | 188 | max_tokens=max_tokens, |
| 180 | timeout=timeout, | 189 | timeout=timeout, |
| 190 | **extra_kwargs, | ||
| 181 | ) | 191 | ) |
| 182 | prompt_value = StringPromptValue(text=prompt) | 192 | prompt_value = StringPromptValue(text=prompt) |
| 183 | result = llm.generate_prompt([prompt_value]) | 193 | result = llm.generate_prompt([prompt_value]) |
| ... | @@ -206,6 +216,7 @@ def run_ragas_prompt_probe( | ... | @@ -206,6 +216,7 @@ def run_ragas_prompt_probe( |
| 206 | max_tokens: int, | 216 | max_tokens: int, |
| 207 | temperature: float, | 217 | temperature: float, |
| 208 | timeout: int, | 218 | timeout: int, |
| 219 | extra_kwargs: dict[str, Any], | ||
| 209 | ) -> None: | 220 | ) -> None: |
| 210 | print("[RAGAS] QueryAnswerGenerationPrompt") | 221 | print("[RAGAS] QueryAnswerGenerationPrompt") |
| 211 | llm = ChatOpenAI( | 222 | llm = ChatOpenAI( |
| ... | @@ -215,6 +226,7 @@ def run_ragas_prompt_probe( | ... | @@ -215,6 +226,7 @@ def run_ragas_prompt_probe( |
| 215 | temperature=temperature, | 226 | temperature=temperature, |
| 216 | max_tokens=max_tokens, | 227 | max_tokens=max_tokens, |
| 217 | timeout=timeout, | 228 | timeout=timeout, |
| 229 | **extra_kwargs, | ||
| 218 | ) | 230 | ) |
| 219 | ragas_llm = LangchainLLMWrapper(llm) | 231 | ragas_llm = LangchainLLMWrapper(llm) |
| 220 | ragas_llm.set_run_config(RunConfig(timeout=timeout, max_workers=1)) | 232 | ragas_llm.set_run_config(RunConfig(timeout=timeout, max_workers=1)) | ... | ... |
src/weknora_eval/llm_options.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | from typing import Any | ||
| 4 | |||
| 5 | |||
| 6 | def chat_extra_body(config: dict[str, Any]) -> dict[str, Any]: | ||
| 7 | if not _as_bool(config.get("enable_thinking", False)): | ||
| 8 | return {"chat_template_kwargs": {"enable_thinking": False}} | ||
| 9 | return {} | ||
| 10 | |||
| 11 | |||
| 12 | def chat_openai_kwargs(config: dict[str, Any]) -> dict[str, Any]: | ||
| 13 | extra_body = chat_extra_body(config) | ||
| 14 | return {"extra_body": extra_body} if extra_body else {} | ||
| 15 | |||
| 16 | |||
| 17 | def _as_bool(value: Any) -> bool: | ||
| 18 | if isinstance(value, bool): | ||
| 19 | return value | ||
| 20 | if isinstance(value, str): | ||
| 21 | return value.strip().lower() in {"1", "true", "yes", "on"} | ||
| 22 | return bool(value) |
| ... | @@ -8,6 +8,7 @@ import pandas as pd | ... | @@ -8,6 +8,7 @@ import pandas as pd |
| 8 | 8 | ||
| 9 | from weknora_eval.config import require_config | 9 | from weknora_eval.config import require_config |
| 10 | from weknora_eval.loaders import read_jsonl | 10 | from weknora_eval.loaders import read_jsonl |
| 11 | from weknora_eval.llm_options import chat_openai_kwargs | ||
| 11 | 12 | ||
| 12 | 13 | ||
| 13 | def run_ragas_eval( | 14 | def run_ragas_eval( |
| ... | @@ -64,6 +65,7 @@ def run_ragas_eval( | ... | @@ -64,6 +65,7 @@ def run_ragas_eval( |
| 64 | base_url=llm_base_url or None, | 65 | base_url=llm_base_url or None, |
| 65 | temperature=temperature, | 66 | temperature=temperature, |
| 66 | max_tokens=max_tokens, | 67 | max_tokens=max_tokens, |
| 68 | **chat_openai_kwargs(ragas_config), | ||
| 67 | ) | 69 | ) |
| 68 | embeddings = OpenAIEmbeddings( | 70 | embeddings = OpenAIEmbeddings( |
| 69 | model=embedding_model, | 71 | model=embedding_model, | ... | ... |
| ... | @@ -20,6 +20,7 @@ from ragas.testset.synthesizers.single_hop.base import ( | ... | @@ -20,6 +20,7 @@ from ragas.testset.synthesizers.single_hop.base import ( |
| 20 | 20 | ||
| 21 | from weknora_eval.config import require_config | 21 | from weknora_eval.config import require_config |
| 22 | from weknora_eval.loaders import read_jsonl, write_jsonl | 22 | from weknora_eval.loaders import read_jsonl, write_jsonl |
| 23 | from weknora_eval.llm_options import chat_openai_kwargs | ||
| 23 | from weknora_eval.ragas_runner import _wrap_langchain_models | 24 | from weknora_eval.ragas_runner import _wrap_langchain_models |
| 24 | from weknora_eval.schemas import TestsetRecord | 25 | from weknora_eval.schemas import TestsetRecord |
| 25 | 26 | ||
| ... | @@ -95,6 +96,7 @@ def generate_ragas_testset( | ... | @@ -95,6 +96,7 @@ def generate_ragas_testset( |
| 95 | temperature=float(ragas_config.get("temperature", 0)), | 96 | temperature=float(ragas_config.get("temperature", 0)), |
| 96 | max_tokens=generator_max_tokens, | 97 | max_tokens=generator_max_tokens, |
| 97 | timeout=int(ragas_config.get("timeout_seconds", 600)), | 98 | timeout=int(ragas_config.get("timeout_seconds", 600)), |
| 99 | **chat_openai_kwargs(ragas_config), | ||
| 98 | ) | 100 | ) |
| 99 | run_config = RunConfig( | 101 | run_config = RunConfig( |
| 100 | timeout=int(ragas_config.get("timeout_seconds", 600)), | 102 | timeout=int(ragas_config.get("timeout_seconds", 600)), | ... | ... |
-
Please register or sign in to post a comment