Commit f0c4e2ec f0c4e2ec8072c4eeba5b2bb327e19a32b9867980 by 沈秋雨

设置ragas思考模式开关

1 parent 6c7e5043
......@@ -20,6 +20,7 @@ RAGAS_RERANKER_MODEL=replace-me
RAGAS_GENERATOR_MODEL=gpt-4o-mini
RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
RAGAS_ENABLE_THINKING=false
TESTSET_SIZE=50
TESTSET_RAGAS_MODE=direct
......
......@@ -39,6 +39,7 @@ cp .env.example .env
- `RAGAS_LLM_BASE_URL` 指向 vLLM 的 OpenAI-compatible `/v1`
- `RAGAS_EMBEDDING_BASE_URL` 指向 Infinity embedding 的 OpenAI-compatible `/v1`
- `RAGAS_*_MODEL` 是评估侧模型名称
- `RAGAS_ENABLE_THINKING=false` 只会在本评估项目的 RAGAS LLM 请求中发送 `chat_template_kwargs.enable_thinking=false`,不会改变 WeKnora 检索/问答服务的模型配置
## 首轮 Pilot
......
......@@ -296,10 +296,13 @@ max_tokens: 4096
TESTSET_RAGAS_MODE=direct
TESTSET_GENERATOR_MAX_TOKENS=4096
TESTSET_MAX_DOCUMENT_CHARS=2000
RAGAS_ENABLE_THINKING=false
```
`direct` 模式会跳过 Ragas 默认的 `HeadlinesExtractor`、`SummaryExtractor`、`NERExtractor` 文档预处理链路,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA。`prechunked` 与 `langchain_docs` 仅用于对比实验,遇到本地 vLLM 结构化输出不稳定时不建议使用。
如果使用 Qwen thinking 模型,`RAGAS_ENABLE_THINKING=false` 会只在 RAGAS 请求里附加 `chat_template_kwargs.enable_thinking=false`,避免 RAGAS 的 JSON/Pydantic 结构化输出被 `Thinking Process` 前缀破坏;WeKnora 本身的检索问答链路不经过这些脚本,不会受影响。
如果 vLLM 仍然报生成未完成,先把 `TESTSET_SIZE` 降到 3,再把 `TESTSET_MAX_DOCUMENT_CHARS` 调到 1000-1500 验证链路;`ragas.max_tokens` 主要用于后续评测阶段,不应该拿来无限放大测试集生成阶段的输出长度。
### WeKnora 问答没有 retrieved_contexts
......
......@@ -72,6 +72,7 @@ ragas:
generator_model: "${RAGAS_GENERATOR_MODEL}"
judge_model: "${RAGAS_JUDGE_MODEL}"
embedding_model: "${RAGAS_EMBEDDING_MODEL}"
enable_thinking: "${RAGAS_ENABLE_THINKING:-false}"
temperature: 0
max_tokens: 4096
timeout_seconds: 600
......
......@@ -10,6 +10,7 @@ import requests
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from weknora_eval.config import load_config
from weknora_eval.llm_options import chat_openai_kwargs
def main() -> int:
......@@ -27,6 +28,7 @@ def main() -> int:
model=require_value(ragas, "generator_model"),
temperature=float(ragas.get("temperature", 0)),
max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
extra_kwargs=chat_openai_kwargs(ragas),
)
)
failures.extend(
......@@ -37,6 +39,7 @@ def main() -> int:
model=require_value(ragas, "judge_model"),
temperature=float(ragas.get("temperature", 0)),
max_tokens=min(int(ragas.get("max_tokens", 1024)), 1024),
extra_kwargs=chat_openai_kwargs(ragas),
)
)
failures.extend(
......@@ -78,6 +81,7 @@ def check_chat_model(
model: str,
temperature: float,
max_tokens: int,
extra_kwargs: dict[str, Any],
) -> list[str]:
print(f"[CHECK] {title}: model={model} base_url={base_url}")
started = time.monotonic()
......@@ -89,6 +93,7 @@ def check_chat_model(
temperature=temperature,
max_tokens=max_tokens,
timeout=120,
**extra_kwargs,
)
response = llm.invoke("Reply with exactly: OK")
content = str(response.content or "").strip()
......
......@@ -21,6 +21,7 @@ from ragas.testset.synthesizers.single_hop.prompts import (
)
from weknora_eval.config import load_config
from weknora_eval.llm_options import chat_extra_body, chat_openai_kwargs
class SimpleQA(BaseModel):
......@@ -38,6 +39,7 @@ def main() -> int:
max_tokens = int(testset.get("generator_max_tokens", ragas.get("max_tokens", 4096)))
temperature = float(ragas.get("temperature", 0))
timeout = int(ragas.get("timeout_seconds", 600))
extra_body = chat_extra_body(ragas)
print("Diagnosing Ragas generator LLM compatibility\n")
print(f"model={model}")
......@@ -54,6 +56,7 @@ def main() -> int:
max_tokens=min(max_tokens, 256),
temperature=temperature,
timeout=timeout,
extra_body=extra_body,
)
json_prompt = (
......@@ -70,6 +73,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_body=extra_body,
)
validate_json_payload(structured.get("content") or "")
......@@ -81,6 +85,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_kwargs=chat_openai_kwargs(ragas),
)
run_ragas_prompt_probe(
......@@ -90,6 +95,7 @@ def main() -> int:
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
extra_kwargs=chat_openai_kwargs(ragas),
)
explain_result(plain, structured, langchain_result)
......@@ -106,6 +112,7 @@ def run_raw_chat(
max_tokens: int,
temperature: float,
timeout: int,
extra_body: dict[str, Any],
) -> dict[str, Any]:
print(f"[RAW] {title}")
started = time.monotonic()
......@@ -120,6 +127,7 @@ def run_raw_chat(
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
**extra_body,
},
timeout=timeout,
)
......@@ -169,6 +177,7 @@ def run_langchain_probe(
max_tokens: int,
temperature: float,
timeout: int,
extra_kwargs: dict[str, Any],
) -> dict[str, Any]:
print("[LANGCHAIN] generation metadata")
llm = ChatOpenAI(
......@@ -178,6 +187,7 @@ def run_langchain_probe(
temperature=temperature,
max_tokens=max_tokens,
timeout=timeout,
**extra_kwargs,
)
prompt_value = StringPromptValue(text=prompt)
result = llm.generate_prompt([prompt_value])
......@@ -206,6 +216,7 @@ def run_ragas_prompt_probe(
max_tokens: int,
temperature: float,
timeout: int,
extra_kwargs: dict[str, Any],
) -> None:
print("[RAGAS] QueryAnswerGenerationPrompt")
llm = ChatOpenAI(
......@@ -215,6 +226,7 @@ def run_ragas_prompt_probe(
temperature=temperature,
max_tokens=max_tokens,
timeout=timeout,
**extra_kwargs,
)
ragas_llm = LangchainLLMWrapper(llm)
ragas_llm.set_run_config(RunConfig(timeout=timeout, max_workers=1))
......
from __future__ import annotations
from typing import Any
def chat_extra_body(config: dict[str, Any]) -> dict[str, Any]:
if not _as_bool(config.get("enable_thinking", False)):
return {"chat_template_kwargs": {"enable_thinking": False}}
return {}
def chat_openai_kwargs(config: dict[str, Any]) -> dict[str, Any]:
extra_body = chat_extra_body(config)
return {"extra_body": extra_body} if extra_body else {}
def _as_bool(value: Any) -> bool:
if isinstance(value, bool):
return value
if isinstance(value, str):
return value.strip().lower() in {"1", "true", "yes", "on"}
return bool(value)
......@@ -8,6 +8,7 @@ import pandas as pd
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl
from weknora_eval.llm_options import chat_openai_kwargs
def run_ragas_eval(
......@@ -64,6 +65,7 @@ def run_ragas_eval(
base_url=llm_base_url or None,
temperature=temperature,
max_tokens=max_tokens,
**chat_openai_kwargs(ragas_config),
)
embeddings = OpenAIEmbeddings(
model=embedding_model,
......
......@@ -20,6 +20,7 @@ from ragas.testset.synthesizers.single_hop.base import (
from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl, write_jsonl
from weknora_eval.llm_options import chat_openai_kwargs
from weknora_eval.ragas_runner import _wrap_langchain_models
from weknora_eval.schemas import TestsetRecord
......@@ -95,6 +96,7 @@ def generate_ragas_testset(
temperature=float(ragas_config.get("temperature", 0)),
max_tokens=generator_max_tokens,
timeout=int(ragas_config.get("timeout_seconds", 600)),
**chat_openai_kwargs(ragas_config),
)
run_config = RunConfig(
timeout=int(ragas_config.get("timeout_seconds", 600)),
......