Commit f0c4e2ec8072c4eeba5b2bb327e19a32b9867980 by 沈秋雨

Add a switch for the ragas thinking mode

1 parent 6c7e5043
@@ -20,6 +20,7 @@ RAGAS_RERANKER_MODEL=replace-me
 RAGAS_GENERATOR_MODEL=gpt-4o-mini
 RAGAS_JUDGE_MODEL=gpt-4o-mini
 RAGAS_EMBEDDING_MODEL=text-embedding-3-small
+RAGAS_ENABLE_THINKING=false
 
 TESTSET_SIZE=50
 TESTSET_RAGAS_MODE=direct
......
@@ -39,6 +39,7 @@ cp .env.example .env
 - `RAGAS_LLM_BASE_URL` points to vLLM's OpenAI-compatible `/v1`
 - `RAGAS_EMBEDDING_BASE_URL` points to the Infinity embedding server's OpenAI-compatible `/v1`
 - `RAGAS_*_MODEL` are the model names on the evaluation side
+- `RAGAS_ENABLE_THINKING=false` only adds `chat_template_kwargs.enable_thinking=false` to this evaluation project's RAGAS LLM requests; it does not change the model configuration of the WeKnora retrieval/QA service (see the request sketch after this hunk)
 
 ## First Pilot Round
 
......
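For reference, a minimal sketch of the chat-completion body this flag produces on the RAGAS side; `chat_template_kwargs` is a vLLM extension to the OpenAI request schema, and the endpoint and model names below are placeholders:

```python
# Hedged sketch: what a RAGAS-side request looks like with
# RAGAS_ENABLE_THINKING=false; endpoint and model are hypothetical.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical vLLM endpoint
    json={
        "model": "qwen3-8b",  # hypothetical model name
        "messages": [{"role": "user", "content": "Reply with exactly: OK"}],
        "temperature": 0,
        "max_tokens": 64,
        # Added by the flag; without it a thinking model may emit a reasoning prefix.
        "chat_template_kwargs": {"enable_thinking": False},
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```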
@@ -296,10 +296,13 @@ max_tokens: 4096
 TESTSET_RAGAS_MODE=direct
 TESTSET_GENERATOR_MAX_TOKENS=4096
 TESTSET_MAX_DOCUMENT_CHARS=2000
+RAGAS_ENABLE_THINKING=false
 ```
 
 `direct` mode skips Ragas's default `HeadlinesExtractor` / `SummaryExtractor` / `NERExtractor` document-preprocessing chain and assembles the WeKnora chunks directly into a Ragas KnowledgeGraph to generate single-hop QA. `prechunked` and `langchain_docs` exist only for comparison experiments and are not recommended when local vLLM structured output is unstable.
 
+If you use a Qwen thinking model, `RAGAS_ENABLE_THINKING=false` appends `chat_template_kwargs.enable_thinking=false` to RAGAS requests only, which keeps a `Thinking Process` prefix from breaking Ragas's JSON/Pydantic structured output; WeKnora's own retrieval/QA pipeline does not go through these scripts and is unaffected.
+
 If vLLM still reports that generation did not finish, first lower `TESTSET_SIZE` to 3 and set `TESTSET_MAX_DOCUMENT_CHARS` to 1000-1500 to validate the pipeline; `ragas.max_tokens` mainly applies to the later evaluation stage and should not be used to inflate the output length of the testset-generation stage without bound.
 
 ### WeKnora QA has no retrieved_contexts
......
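To see the failure mode this guards against in miniature: a thinking-style preamble in front of the JSON body makes `json.loads` (and hence Ragas's Pydantic parsing) fail, while the clean completion parses fine.

```python
# Minimal demonstration of why a reasoning prefix breaks structured output.
import json

with_thinking = 'Thinking Process: the user wants one QA pair...\n{"question": "q", "answer": "a"}'
without_thinking = '{"question": "q", "answer": "a"}'

try:
    json.loads(with_thinking)
except json.JSONDecodeError as exc:
    print(f"broken: {exc}")  # Expecting value: line 1 column 1 (char 0)

print(json.loads(without_thinking))  # parses cleanly: {'question': 'q', 'answer': 'a'}
```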
@@ -72,6 +72,7 @@ ragas:
   generator_model: "${RAGAS_GENERATOR_MODEL}"
   judge_model: "${RAGAS_JUDGE_MODEL}"
   embedding_model: "${RAGAS_EMBEDDING_MODEL}"
+  enable_thinking: "${RAGAS_ENABLE_THINKING:-false}"
   temperature: 0
   max_tokens: 4096
   timeout_seconds: 600
......
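The `${RAGAS_ENABLE_THINKING:-false}` placeholder follows shell-style default syntax. The real `load_config` implementation is not part of this diff, so the expander below is a hypothetical sketch of how such placeholders could resolve:

```python
# Hypothetical sketch of ${VAR} / ${VAR:-default} expansion; not the project's
# actual load_config code.
import os
import re

_PLACEHOLDER = re.compile(r"\$\{(?P<name>[A-Z0-9_]+)(?::-(?P<default>[^}]*))?\}")

def expand_env(value: str) -> str:
    """Replace ${VAR} with the environment value, else the :-default, else ''."""
    def repl(match: re.Match) -> str:
        return os.environ.get(match.group("name"), match.group("default") or "")
    return _PLACEHOLDER.sub(repl, value)

# With RAGAS_ENABLE_THINKING unset:
# expand_env('${RAGAS_ENABLE_THINKING:-false}') -> 'false'
```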
@@ -10,6 +10,7 @@ import requests
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
 from weknora_eval.config import load_config
+from weknora_eval.llm_options import chat_openai_kwargs
 
 
 def main() -> int:
@@ -27,6 +28,7 @@ def main() -> int:
             model=require_value(ragas, "generator_model"),
             temperature=float(ragas.get("temperature", 0)),
             max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
+            extra_kwargs=chat_openai_kwargs(ragas),
         )
     )
     failures.extend(
@@ -37,6 +39,7 @@ def main() -> int:
             model=require_value(ragas, "judge_model"),
             temperature=float(ragas.get("temperature", 0)),
             max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
+            extra_kwargs=chat_openai_kwargs(ragas),
         )
     )
     failures.extend(
@@ -78,6 +81,7 @@ def check_chat_model(
     model: str,
     temperature: float,
     max_tokens: int,
+    extra_kwargs: dict[str, Any],
 ) -> list[str]:
     print(f"[CHECK] {title}: model={model} base_url={base_url}")
     started = time.monotonic()
@@ -89,6 +93,7 @@ def check_chat_model(
         temperature=temperature,
         max_tokens=max_tokens,
         timeout=120,
+        **extra_kwargs,
     )
     response = llm.invoke("Reply with exactly: OK")
     content = str(response.content or "").strip()
......
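Putting it together, a minimal sketch of what the patched check now builds; API key, endpoint, and model name are placeholders. `chat_openai_kwargs()` contributes `extra_body` only when thinking is disabled, so the call otherwise degrades to a plain `ChatOpenAI`:

```python
# Hedged usage sketch for the patched check; credentials, endpoint, and model
# are hypothetical placeholders.
from langchain_openai import ChatOpenAI
from weknora_eval.llm_options import chat_openai_kwargs

ragas = {"enable_thinking": "false"}

llm = ChatOpenAI(
    api_key="sk-placeholder",              # hypothetical
    base_url="http://localhost:8000/v1",   # hypothetical vLLM endpoint
    model="qwen3-8b",                      # hypothetical model name
    temperature=0,
    max_tokens=64,
    timeout=120,
    # -> extra_body={"chat_template_kwargs": {"enable_thinking": False}}
    **chat_openai_kwargs(ragas),
)
print(llm.invoke("Reply with exactly: OK").content)
```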
@@ -21,6 +21,7 @@ from ragas.testset.synthesizers.single_hop.prompts import (
 )
 
 from weknora_eval.config import load_config
+from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs
 
 
 class SimpleQA(BaseModel):
@@ -38,6 +39,7 @@ def main() -> int:
     max_tokens = int(testset.get("generator_max_tokens", ragas.get("max_tokens", 4096)))
     temperature = float(ragas.get("temperature", 0))
     timeout = int(ragas.get("timeout_seconds", 600))
+    extra_body = chat_extra_body(ragas)
 
     print("Diagnosing Ragas generator LLM compatibility\n")
     print(f"model={model}")
@@ -54,6 +56,7 @@ def main() -> int:
         max_tokens=min(max_tokens, 256),
         temperature=temperature,
         timeout=timeout,
+        extra_body=extra_body,
     )
 
     json_prompt = (
@@ -70,6 +73,7 @@ def main() -> int:
         max_tokens=max_tokens,
         temperature=temperature,
         timeout=timeout,
+        extra_body=extra_body,
     )
     validate_json_payload(structured.get("content") or "")
 
@@ -81,6 +85,7 @@ def main() -> int:
         max_tokens=max_tokens,
         temperature=temperature,
         timeout=timeout,
+        extra_kwargs=chat_openai_kwargs(ragas),
     )
 
     run_ragas_prompt_probe(
@@ -90,6 +95,7 @@ def main() -> int:
         max_tokens=max_tokens,
         temperature=temperature,
         timeout=timeout,
+        extra_kwargs=chat_openai_kwargs(ragas),
     )
 
     explain_result(plain, structured, langchain_result)
@@ -106,6 +112,7 @@ def run_raw_chat(
     max_tokens: int,
     temperature: float,
     timeout: int,
+    extra_body: dict[str, Any],
 ) -> dict[str, Any]:
     print(f"[RAW] {title}")
     started = time.monotonic()
@@ -120,6 +127,7 @@ def run_raw_chat(
             "messages": messages,
             "temperature": temperature,
             "max_tokens": max_tokens,
+            **extra_body,
         },
         timeout=timeout,
     )
@@ -169,6 +177,7 @@ def run_langchain_probe(
     max_tokens: int,
     temperature: float,
     timeout: int,
+    extra_kwargs: dict[str, Any],
 ) -> dict[str, Any]:
     print("[LANGCHAIN] generation metadata")
     llm = ChatOpenAI(
@@ -178,6 +187,7 @@ def run_langchain_probe(
         temperature=temperature,
         max_tokens=max_tokens,
         timeout=timeout,
+        **extra_kwargs,
     )
     prompt_value = StringPromptValue(text=prompt)
     result = llm.generate_prompt([prompt_value])
@@ -206,6 +216,7 @@ def run_ragas_prompt_probe(
     max_tokens: int,
     temperature: float,
     timeout: int,
+    extra_kwargs: dict[str, Any],
 ) -> None:
     print("[RAGAS] QueryAnswerGenerationPrompt")
     llm = ChatOpenAI(
@@ -215,6 +226,7 @@ def run_ragas_prompt_probe(
         temperature=temperature,
         max_tokens=max_tokens,
         timeout=timeout,
+        **extra_kwargs,
     )
     ragas_llm = LangchainLLMWrapper(llm)
     ragas_llm.set_run_config(RunConfig(timeout=timeout, max_workers=1))
......
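Both plumbing styles from this script in one place (model name is a placeholder): the raw probe spreads `chat_extra_body()` straight into the JSON payload, while the LangChain and Ragas probes pass the same dict through `ChatOpenAI`'s `extra_body` keyword.

```python
# Side-by-side sketch of the two call styles used above; the model name is a
# hypothetical placeholder.
from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs

ragas = {"enable_thinking": False}

payload = {
    "model": "qwen3-8b",  # hypothetical
    "messages": [{"role": "user", "content": 'Return {"ok": true} as JSON.'}],
    "temperature": 0,
    "max_tokens": 256,
    **chat_extra_body(ragas),  # raw HTTP path: merged into the request body
}
kwargs = chat_openai_kwargs(ragas)  # LangChain path: {"extra_body": {...}} or {}

print(payload["chat_template_kwargs"])  # {'enable_thinking': False}
print(kwargs)  # {'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}}
```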
from __future__ import annotations

from typing import Any


def chat_extra_body(config: dict[str, Any]) -> dict[str, Any]:
    """Build the extra request body for the RAGAS-side chat models.

    Unless thinking is explicitly enabled, send
    `chat_template_kwargs.enable_thinking=false` so thinking-mode models
    (e.g. Qwen) return plain completions instead of a reasoning prefix.
    """
    if not _as_bool(config.get("enable_thinking", False)):
        return {"chat_template_kwargs": {"enable_thinking": False}}
    return {}


def chat_openai_kwargs(config: dict[str, Any]) -> dict[str, Any]:
    """Wrap the extra body as ChatOpenAI keyword arguments (empty if unused)."""
    extra_body = chat_extra_body(config)
    return {"extra_body": extra_body} if extra_body else {}


def _as_bool(value: Any) -> bool:
    """Interpret env-style strings ("1", "true", "yes", "on") as booleans."""
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in {"1", "true", "yes", "on"}
    return bool(value)
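A quick behavioural check of the helpers above (the module path `weknora_eval.llm_options` follows the imports in the other hunks). Note that when `enable_thinking` is absent the default is to disable thinking, matching `RAGAS_ENABLE_THINKING=false` in `.env.example`:

```python
from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs

# Disabled (string form, as it arrives from the environment):
assert chat_extra_body({"enable_thinking": "false"}) == {
    "chat_template_kwargs": {"enable_thinking": False}
}
# Explicitly enabled: send nothing and keep the model's default behaviour.
assert chat_extra_body({"enable_thinking": "true"}) == {}
# Absent: defaults to disabled.
assert chat_extra_body({}) == {"chat_template_kwargs": {"enable_thinking": False}}
# The ChatOpenAI kwargs wrapper collapses to {} when there is no extra body.
assert chat_openai_kwargs({"enable_thinking": True}) == {}
```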
@@ -8,6 +8,7 @@ import pandas as pd
 
 from weknora_eval.config import require_config
 from weknora_eval.loaders import read_jsonl
+from weknora_eval.llm_options import chat_openai_kwargs
 
 
 def run_ragas_eval(
@@ -64,6 +65,7 @@ def run_ragas_eval(
         base_url=llm_base_url or None,
         temperature=temperature,
         max_tokens=max_tokens,
+        **chat_openai_kwargs(ragas_config),
     )
     embeddings = OpenAIEmbeddings(
         model=embedding_model,
......
@@ -20,6 +20,7 @@ from ragas.testset.synthesizers.single_hop.base import (
 
 from weknora_eval.config import require_config
 from weknora_eval.loaders import read_jsonl, write_jsonl
+from weknora_eval.llm_options import chat_openai_kwargs
 from weknora_eval.ragas_runner import _wrap_langchain_models
 from weknora_eval.schemas import TestsetRecord
 
@@ -95,6 +96,7 @@ def generate_ragas_testset(
         temperature=float(ragas_config.get("temperature", 0)),
         max_tokens=generator_max_tokens,
         timeout=int(ragas_config.get("timeout_seconds", 600)),
+        **chat_openai_kwargs(ragas_config),
     )
     run_config = RunConfig(
         timeout=int(ragas_config.get("timeout_seconds", 600)),
......
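For orientation, a minimal sketch of the generator/judge wiring after this change (endpoints, model names, and the api_key are placeholders). As the `run_ragas_eval` hunk shows, `OpenAIEmbeddings` does not receive the extra body: embedding requests go through no chat template, so the thinking switch applies only to chat completions.

```python
# Hedged end-to-end sketch; endpoints, model names, and the api_key are
# hypothetical placeholders.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from weknora_eval.llm_options import chat_openai_kwargs

ragas_config = {"enable_thinking": "false", "temperature": 0, "timeout_seconds": 600}

generator = ChatOpenAI(
    model="qwen3-8b",                      # hypothetical
    base_url="http://localhost:8000/v1",   # hypothetical vLLM endpoint
    api_key="sk-placeholder",              # hypothetical
    temperature=float(ragas_config.get("temperature", 0)),
    max_tokens=4096,
    timeout=int(ragas_config.get("timeout_seconds", 600)),
    **chat_openai_kwargs(ragas_config),
)
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    base_url="http://localhost:7997/v1",   # hypothetical Infinity endpoint
    api_key="sk-placeholder",              # hypothetical
)
```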