Commit f0c4e2ec f0c4e2ec8072c4eeba5b2bb327e19a32b9867980 by 沈秋雨

设置ragas思考模式开关

1 parent 6c7e5043
......@@ -20,6 +20,7 @@ RAGAS_RERANKER_MODEL=replace-me
RAGAS_GENERATOR_MODEL=gpt-4o-mini
RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
RAGAS_ENABLE_THINKING=false
TESTSET_SIZE=50
TESTSET_RAGAS_MODE=direct
......
......@@ -39,6 +39,7 @@ cp .env.example .env
- `RAGAS_LLM_BASE_URL` 指向 vLLM 的 OpenAI-compatible `/v1`
- `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity embedding 的 OpenAI-compatible `/v1`
- `RAGAS_*_MODEL` 是评估侧模型名称
- `RAGAS_ENABLE_THINKING=false` 只会在本评估项目的 RAGAS LLM 请求中发送 `chat_template_kwargs.enable_thinking=false`,不会改变 WeKnora 检索/问答服务的模型配置
## 首轮 Pilot
......
......@@ -296,10 +296,13 @@ max_tokens: 4096
TESTSET_RAGAS_MODE=direct
TESTSET_GENERATOR_MAX_TOKENS=4096
TESTSET_MAX_DOCUMENT_CHARS=2000
RAGAS_ENABLE_THINKING=false
```
`direct` 模式会跳过 Ragas 默认的 `HeadlinesExtractor`、`SummaryExtractor`、`NERExtractor` 文档预处理链路,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA。`prechunked` 与 `langchain_docs` 仅用于对比实验,遇到本地 vLLM 结构化输出不稳定时不建议使用。
如果使用 Qwen thinking 模型,`RAGAS_ENABLE_THINKING=false` 会只在 RAGAS 请求里附加 `chat_template_kwargs.enable_thinking=false`,避免 RAGAS 的 JSON/Pydantic 结构化输出被 `Thinking Process` 前缀破坏;WeKnora 本身的检索问答链路不经过这些脚本,不会受影响。
如果 vLLM 仍然报生成未完成,先把 `TESTSET_SIZE` 降到 3,再把 `TESTSET_MAX_DOCUMENT_CHARS` 调到 1000-1500 验证链路;`ragas.max_tokens` 主要用于后续评测阶段,不应该拿来无限放大测试集生成阶段的输出长度。
### WeKnora 问答没有 retrieved_contexts
......
......@@ -72,6 +72,7 @@ ragas:
generator_model: "${RAGAS_GENERATOR_MODEL}"
judge_model: "${RAGAS_JUDGE_MODEL}"
embedding_model: "${RAGAS_EMBEDDING_MODEL}"
enable_thinking: "${RAGAS_ENABLE_THINKING:-false}"
temperature: 0
max_tokens: 4096
timeout_seconds: 600
......
......@@ -10,6 +10,7 @@ import requests
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from weknora_eval.config import load_config
from weknora_eval.llm_options import chat_openai_kwargs
def main() -> int:
......@@ -27,6 +28,7 @@ def main() -> int:
model=require_value(ragas, "generator_model"),
temperature=float(ragas.get("temperature", 0)),
max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
extra_kwargs=chat_openai_kwargs(ragas),
)
)
failures.extend(
......@@ -37,6 +39,7 @@ def main() -> int:
model=require_value(ragas, "judge_model"),
temperature=float(ragas.get("temperature", 0)),
max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
extra_kwargs=chat_openai_kwargs(ragas),
)
)
failures.extend(
......@@ -78,6 +81,7 @@ def check_chat_model(
model: str,
temperature: float,
max_tokens: int,
extra_kwargs: dict[str, Any],
) -> list[str]:
print(f"[CHECK] {title}: model={model} base_url={base_url}")
started = time.monotonic()
......@@ -89,6 +93,7 @@ def check_chat_model(
temperature=temperature,
max_tokens=max_tokens,
timeout=120,
**extra_kwargs,
)
response = llm.invoke("Reply with exactly: OK")
content = str(response.content or "").strip()
......
......@@ -21,6 +21,7 @@ from ragas.testset.synthesizers.single_hop.prompts import (
)
from weknora_eval.config import load_config
from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs
class SimpleQA(BaseModel):
......@@ -38,6 +39,7 @@ def main() -> int:
max_tokens = int(testset.get("generator_max_tokens", ragas.get("max_tokens", 4096)))
temperature = float(ragas.get("temperature", 0))
timeout = int(ragas.get("timeout_seconds", 600))
extra_body = chat_extra_body(ragas)
print("Diagnosing Ragas generator LLM compatibility\n")
print(f"model={model}")
......@@ -54,6 +56,7 @@ def main() -> int:
max_tokens=min(max_tokens, 256),
temperature=temperature,
timeout=timeout,
extra_body=extra_body,
)
json_prompt = (
......@@ -70,6 +73,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_body=extra_body,
)
validate_json_payload(structured.get("content") or "")
......@@ -81,6 +85,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_kwargs=chat_openai_kwargs(ragas),
)
run_ragas_prompt_probe(
......@@ -90,6 +95,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_kwargs=chat_openai_kwargs(ragas),
)
explain_result(plain, structured, langchain_result)
......@@ -106,6 +112,7 @@ def run_raw_chat(
max_tokens: int,
temperature: float,
timeout: int,
extra_body: dict[str, Any],
) -> dict[str, Any]:
print(f"[RAW] {title}")
started = time.monotonic()
......@@ -120,6 +127,7 @@ def run_raw_chat(
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
**extra_body,
},
timeout=timeout,
)
......@@ -169,6 +177,7 @@ def run_langchain_probe(
max_tokens: int,
temperature: float,
timeout: int,
extra_kwargs: dict[str, Any],
) -> dict[str, Any]:
print("[LANGCHAIN] generation metadata")
llm = ChatOpenAI(
......@@ -178,6 +187,7 @@ def run_langchain_probe(
temperature=temperature,
max_tokens=max_tokens,
timeout=timeout,
**extra_kwargs,
)
prompt_value = StringPromptValue(text=prompt)
result = llm.generate_prompt([prompt_value])
......@@ -206,6 +216,7 @@ def run_ragas_prompt_probe(
max_tokens: int,
temperature: float,
timeout: int,
extra_kwargs: dict[str, Any],
) -> None:
print("[RAGAS] QueryAnswerGenerationPrompt")
llm = ChatOpenAI(
......@@ -215,6 +226,7 @@ def run_ragas_prompt_probe(
temperature=temperature,
max_tokens=max_tokens,
timeout=timeout,
**extra_kwargs,
)
ragas_llm = LangchainLLMWrapper(llm)
ragas_llm.set_run_config(RunConfig(timeout=timeout, max_workers=1))
......
from __future__ import annotations
from typing import Any
def chat_extra_body(config: dict[str, Any]) -> dict[str, Any]:
if not _as_bool(config.get("enable_thinking", False)):
return {"chat_template_kwargs": {"enable_thinking": False}}
return {}
def chat_openai_kwargs(config: dict[str, Any]) -> dict[str, Any]:
extra_body = chat_extra_body(config)
return {"extra_body": extra_body} if extra_body else {}
def _as_bool(value: Any) -> bool:
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.strip().lower() in {"1", "true", "yes", "on"}
return bool(value)
......@@ -8,6 +8,7 @@ import pandas as pd
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl
from weknora_eval.llm_options import chat_openai_kwargs
def run_ragas_eval(
......@@ -64,6 +65,7 @@ def run_ragas_eval(
base_url=llm_base_url or None,
temperature=temperature,
max_tokens=max_tokens,
**chat_openai_kwargs(ragas_config),
)
embeddings = OpenAIEmbeddings(
model=embedding_model,
......
......@@ -20,6 +20,7 @@ from ragas.testset.synthesizers.single_hop.base import (
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl, write_jsonl
from weknora_eval.llm_options import chat_openai_kwargs
from weknora_eval.ragas_runner import _wrap_langchain_models
from weknora_eval.schemas import TestsetRecord
......@@ -95,6 +96,7 @@ def generate_ragas_testset(
temperature=float(ragas_config.get("temperature", 0)),
max_tokens=generator_max_tokens,
timeout=int(ragas_config.get("timeout_seconds", 600)),
**chat_openai_kwargs(ragas_config),
)
run_config = RunConfig(
timeout=int(ragas_config.get("timeout_seconds", 600)),
......