# testset.py
from __future__ import annotations

import asyncio
import inspect
import json
import logging
from typing import Any

from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.run_config import RunConfig
from ragas.testset import TestsetGenerator
from ragas.testset.graph import Node, NodeType
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.base import QueryLength, QueryStyle
from ragas.testset.synthesizers.single_hop.base import (
    SingleHopQuerySynthesizer,
    SingleHopScenario,
)
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)
from ragas.testset.transforms.extractors.llm_based import NERExtractor

from weknora_eval.config import require_config
from weknora_eval.loaders import read_jsonl, write_jsonl
from weknora_eval.llm_options import chat_openai_kwargs
from weknora_eval.ragas_runner import _wrap_langchain_models
from weknora_eval.schemas import TestsetRecord

logger = logging.getLogger(__name__)
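# Entry point: dispatches on the `testset.generator` config key. "ragas" uses the
# LLM-driven Ragas pipeline below, "rule_based" falls back to templated questions.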
def generate_testset(config: dict[str, Any]) -> list[dict[str, Any]]:
    testset = config.get("testset", {})
    generator = str(testset.get("generator", "ragas"))
    if generator == "ragas":
        return generate_ragas_testset(config)
    if generator == "rule_based":
        return generate_rule_based_testset(
            size=int(testset.get("size", 50)),
            min_context_chars=int(testset.get("min_context_chars", 80)),
        )
    raise ValueError(f"Unsupported testset.generator: {generator}")
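# Ragas-based generation. Reads parsed documents from JSONL, drops short ones
# (min_context_chars), truncates long ones (max_document_chars), then branches on
# `testset.ragas_mode`: "direct" drives the single-hop synthesizer per document,
# while "prechunked" and "langchain_docs" go through Ragas' TestsetGenerator.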
def generate_ragas_testset(
    config: dict[str, Any],
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
) -> list[dict[str, Any]]:
    testset_config = config.get("testset", {})
    ragas_config = config["ragas"]
    size = int(testset_config.get("size", 50))
    min_context_chars = int(testset_config.get("min_context_chars", 80))
    max_document_chars = int(testset_config.get("max_document_chars", 2000))
    source_multiplier = max(int(testset_config.get("source_multiplier", 3)), 1)
    generator_max_tokens = int(
        testset_config.get("generator_max_tokens", ragas_config.get("max_tokens", 4096))
    )
    ragas_mode = str(testset_config.get("ragas_mode", "direct"))
    source_rows = [
        row
        for row in read_jsonl(documents_path)
        if len(row.get("content") or "") >= min_context_chars
    ]
    if not source_rows:
        write_jsonl(output_path, [])
        return []
    source_limit = min(len(source_rows), max(size * source_multiplier, size, 1))
    selected_source_rows = source_rows[:source_limit]
    documents = [
        Document(
            page_content=_truncate_for_generation(row["content"], max_document_chars),
            metadata={
                "source_file": row.get("source_file"),
                "doc_id": row.get("doc_id"),
                "content_chars": len(row.get("content") or ""),
                **(row.get("metadata") or {}),
            },
        )
        for row in selected_source_rows
    ]
    logger.info(
        "Generating Ragas testset: target_size=%s source_documents=%s max_document_chars=%s generator_max_tokens=%s ragas_mode=%s",
        size,
        len(documents),
        max_document_chars,
        generator_max_tokens,
        ragas_mode,
    )
    llm = ChatOpenAI(
        model=str(require_config(config, "ragas.generator_model")),
        api_key=_required_ragas_value(ragas_config, "llm_api_key"),
        base_url=_required_ragas_value(ragas_config, "llm_base_url"),
        temperature=float(ragas_config.get("temperature", 0)),
        max_tokens=generator_max_tokens,
        timeout=int(ragas_config.get("timeout_seconds", 600)),
        **chat_openai_kwargs(ragas_config),
    )
    run_config = RunConfig(
        timeout=int(ragas_config.get("timeout_seconds", 600)),
        max_workers=int(ragas_config.get("max_workers", 1)),
    )
    if ragas_mode == "direct":
        rows = _generate_ragas_direct_rows(
            llm, documents, selected_source_rows, size, run_config
        )
        write_jsonl(output_path, rows)
        return rows
    elif ragas_mode == "prechunked":
        result = _generate_ragas_prechunked(
            config, ragas_config, llm, documents, size, run_config
        )
    elif ragas_mode == "langchain_docs":
        result = _generate_ragas_langchain_docs(
            config, ragas_config, llm, documents, size, run_config
        )
    else:
        raise ValueError(f"Unsupported testset.ragas_mode: {ragas_mode}")
    ragas_rows = result.to_list()
    rows = _normalize_ragas_rows(ragas_rows, selected_source_rows)
    write_jsonl(output_path, rows)
    return rows
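# Direct mode: wrap the LangChain chat model for Ragas and call the single-hop
# synthesizer once per source document, without building a knowledge graph.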
def _generate_ragas_direct_rows(
    llm: ChatOpenAI,
    documents: list[Document],
    source_rows: list[dict[str, Any]],
    size: int,
    run_config: RunConfig,
) -> list[dict[str, Any]]:
    ragas_llm = _wrap_langchain_llm(llm)
    if hasattr(ragas_llm, "set_run_config"):
        ragas_llm.set_run_config(run_config)
    personas = _default_personas()
    synthesizer = SingleHopSpecificQuerySynthesizer(llm=ragas_llm)
    rows = asyncio.run(
        _generate_direct_samples(synthesizer, documents, source_rows, personas, size)
    )
    logger.info("Generated %s Ragas direct QA samples", len(rows))
    return rows
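# Builds one CHUNK node and a SingleHopScenario per document, rotating personas,
# query styles, and lengths. Failed generations are logged and skipped; samples
# missing a question, reference, or context are filtered out at the end.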
async def _generate_direct_samples(
    synthesizer: SingleHopQuerySynthesizer,
    documents: list[Document],
    source_rows: list[dict[str, Any]],
    personas: list[Persona],
    size: int,
) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    styles = [QueryStyle.PERFECT_GRAMMAR, QueryStyle.WEB_SEARCH_LIKE]
    lengths = [QueryLength.MEDIUM, QueryLength.SHORT]
    for index, (document, source) in enumerate(zip(documents, source_rows), start=1):
        if len(rows) >= size:
            break
        term = _generation_terms(document)[0]
        node = Node(
            type=NodeType.CHUNK,
            properties={
                "page_content": document.page_content,
                "document_metadata": document.metadata,
            },
        )
        scenario = SingleHopScenario(
            nodes=[node],
            term=term,
            persona=personas[(index - 1) % len(personas)],
            style=styles[(index - 1) % len(styles)],
            length=lengths[(index - 1) % len(lengths)],
        )
        try:
            sample = await synthesizer.generate_sample(scenario)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "Ragas direct QA generation failed for source_file=%s doc_id=%s: %s",
                source.get("source_file"),
                source.get("doc_id"),
                exc,
            )
            continue
        chunk_id = (source.get("metadata") or {}).get("chunk_id") or source.get(
            "doc_id"
        )
        rows.append(
            TestsetRecord(
                sample_id=f"qa-{len(rows) + 1:04d}",
                user_input=str(sample.user_input or "").strip(),
                reference=str(sample.reference or "").strip(),
                reference_contexts=[document.page_content],
                source_file=source.get("source_file"),
                gold_chunk_ids=[str(chunk_id)] if chunk_id else [],
                question_type="ragas_single_hop_direct",
                review_status="pending",
            ).to_dict()
        )
    return [
        row
        for row in rows
        if row.get("user_input")
        and row.get("reference")
        and row.get("reference_contexts")
    ]
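# Prechunked mode: each input document is treated as an existing chunk and fed to
# Ragas' TestsetGenerator with a custom transform list and a single-hop-only
# query distribution.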
def _generate_ragas_prechunked(
    config: dict[str, Any],
    ragas_config: dict[str, Any],
    llm: ChatOpenAI,
    documents: list[Document],
    size: int,
    run_config: RunConfig,
) -> Any:
    embeddings = _build_embeddings(config, ragas_config)
    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
    generator = TestsetGenerator(
        llm=ragas_llm,
        embedding_model=ragas_embeddings,
        persona_list=_default_personas(),
    )
    return generator.generate_with_chunks(
        documents,
        testset_size=size,
        transforms=_prechunked_transforms(ragas_config, ragas_llm),
        query_distribution=[(SingleHopSpecificQuerySynthesizer(llm=ragas_llm), 1.0)],
        run_config=run_config,
        raise_exceptions=True,
    )
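# LangChain-docs mode: hand the documents to Ragas' generate_with_langchain_docs
# flow and let it chunk and transform them itself.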
def _generate_ragas_langchain_docs(
    config: dict[str, Any],
    ragas_config: dict[str, Any],
    llm: ChatOpenAI,
    documents: list[Document],
    size: int,
    run_config: RunConfig,
) -> Any:
    embeddings = _build_embeddings(config, ragas_config)
    ragas_llm, ragas_embeddings = _wrap_langchain_models(llm, embeddings)
    generator = TestsetGenerator(
        llm=ragas_llm,
        embedding_model=ragas_embeddings,
        persona_list=_default_personas(),
    )
    generate_kwargs: dict[str, Any] = {
        "testset_size": size,
        "query_distribution": [(SingleHopSpecificQuerySynthesizer(llm=ragas_llm), 1.0)],
        "run_config": run_config,
        "raise_exceptions": True,
    }
    return generator.generate_with_langchain_docs(documents, **generate_kwargs)
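# Transform selection for prechunked mode, driven by `ragas.testset_transforms`:
# "default" keeps Ragas' built-in transforms, "single_hop_entities" runs only an
# NER extractor over chunk nodes.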
def _prechunked_transforms(ragas_config: dict[str, Any], ragas_llm: Any) -> Any:
    mode = str(ragas_config.get("testset_transforms", "single_hop_entities"))
    if mode == "default":
        return None
    if mode == "single_hop_entities":
        return [NERExtractor(llm=ragas_llm, filter_nodes=_is_chunk_node)]
    raise ValueError(f"Unsupported ragas.testset_transforms: {mode}")


def _is_chunk_node(node: Any) -> bool:
    return getattr(getattr(node, "type", None), "name", "") == "CHUNK"
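# Default personas (Chinese): a contract reviewer, a business-operations user, and
# a legal/compliance reviewer, matching the contract/copyright corpus.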
def _default_personas() -> list[Persona]:
    return [
        Persona(
            name="合同审核人员",
            role_description="关注合同条款、权利归属、授权范围和履约义务。",
        ),
        Persona(
            name="业务运营人员",
            role_description="关注文档中可用于业务执行和信息核验的事实。",
        ),
        Persona(
            name="法务合规人员",
            role_description="关注协议、版权、授权、责任和风险表述。",
        ),
    ]
def _build_embeddings(
    config: dict[str, Any], ragas_config: dict[str, Any]
) -> OpenAIEmbeddings:
    return OpenAIEmbeddings(
        model=str(require_config(config, "ragas.embedding_model")),
        api_key=_required_ragas_value(ragas_config, "embedding_api_key"),
        base_url=_required_ragas_value(ragas_config, "embedding_base_url"),
        tiktoken_enabled=False,
        check_embedding_ctx_length=False,
        request_timeout=int(ragas_config.get("timeout_seconds", 600)),
    )


def _wrap_langchain_llm(llm: Any) -> Any:
    try:
        from ragas.llms import LangchainLLMWrapper
    except ImportError:
        return llm
    return LangchainLLMWrapper(llm)
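# Picks seed terms for the single-hop synthesizer by matching a list of Chinese
# contract/copyright vocabulary (contract clauses, rights ownership, copyright,
# neighboring rights, licensing scope, Party A/B, breach liability, term, ...)
# against the filename and content, falling back to a generic "document content" term.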
def _generation_terms(document: Document) -> list[str]:
    text = f"{document.metadata.get('source_file') or ''} {document.page_content}"
    candidates = [
        "合同条款",
        "权利归属",
        "著作权",
        "邻接权",
        "录音权利",
        "词权利",
        "曲权利",
        "授权范围",
        "作品信息",
        "甲方",
        "乙方",
        "协议",
        "付款",
        "违约责任",
        "期限",
    ]
    terms = [term for term in candidates if term in text]
    source_file = str(document.metadata.get("source_file") or "").strip()
    if source_file:
        terms.append(source_file.rsplit(".", 1)[0][:40])
    return terms[:6] or ["文档内容"]


def _truncate_for_generation(content: str, max_chars: int) -> str:
    text = " ".join((content or "").split())
    if max_chars <= 0 or len(text) <= max_chars:
        return text
    return text[:max_chars].rstrip()
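# Maps raw Ragas output rows onto TestsetRecord dicts, matching each row back to a
# source document to recover source_file and gold chunk ids, and dropping rows
# without a question, reference, or context.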
def _normalize_ragas_rows(
    ragas_rows: list[dict[str, Any]],
    source_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    normalized: list[dict[str, Any]] = []
    source_by_doc_id = {str(row.get("doc_id")): row for row in source_rows if row.get("doc_id")}
    for index, row in enumerate(ragas_rows, start=1):
        reference_contexts = _as_string_list(row.get("reference_contexts"))
        if not reference_contexts and row.get("reference_context"):
            reference_contexts = _as_string_list(row.get("reference_context"))
        source = _match_source_row(row, source_rows, source_by_doc_id, reference_contexts)
        gold_chunk_ids = []
        if source:
            chunk_id = (source.get("metadata") or {}).get("chunk_id") or source.get("doc_id")
            if chunk_id:
                gold_chunk_ids = [str(chunk_id)]
        normalized.append(
            TestsetRecord(
                sample_id=f"qa-{index:04d}",
                user_input=str(row.get("user_input") or row.get("query") or "").strip(),
                reference=str(row.get("reference") or row.get("answer") or "").strip(),
                reference_contexts=reference_contexts or ([source["content"]] if source else []),
                source_file=source.get("source_file") if source else None,
                gold_chunk_ids=gold_chunk_ids,
                question_type=str(row.get("synthesizer_name") or "ragas"),
                review_status="pending",
            ).to_dict()
        )
    return [
        row
        for row in normalized
        if row.get("user_input") and row.get("reference") and row.get("reference_contexts")
    ]
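# Source matching strategy, in order: explicit context-id fields, substring overlap
# between reference contexts and source content, then the first source row as a
# last resort.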
def _match_source_row(
    ragas_row: dict[str, Any],
    source_rows: list[dict[str, Any]],
    source_by_doc_id: dict[str, dict[str, Any]],
    reference_contexts: list[str],
) -> dict[str, Any] | None:
    for key in ("reference_context_ids", "retrieved_context_ids"):
        for doc_id in _as_string_list(ragas_row.get(key)):
            if doc_id in source_by_doc_id:
                return source_by_doc_id[doc_id]
    for context in reference_contexts:
        for source in source_rows:
            content = source.get("content") or ""
            if context and (context in content or content in context):
                return source
    return source_rows[0] if source_rows else None
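# Coerces Ragas payloads (JSON-encoded strings, lists, dicts carrying
# content/text/page_content keys) into a flat list of strings.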
def _as_string_list(value: Any) -> list[str]:
    if value is None:
        return []
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            if parsed != value:
                return _as_string_list(parsed)
        except json.JSONDecodeError:
            pass
        return [value.strip()] if value.strip() else []
    if isinstance(value, list):
        result: list[str] = []
        for item in value:
            result.extend(_as_string_list(item))
        return result
    if isinstance(value, dict):
        for key in ("content", "text", "page_content"):
            if key in value:
                return _as_string_list(value[key])
        return []
    return [str(value)]


def _required_ragas_value(config: dict[str, Any], key: str) -> str:
    value = config.get(key)
    if value in {None, ""}:
        raise ValueError(f"Missing required Ragas config value: ragas.{key}")
    return str(value)
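# Rule-based fallback generator: no LLM involved; emits one templated question and
# a truncated reference per document.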
def generate_rule_based_testset(
    *,
    documents_path: str = "data/parsed_docs/documents.jsonl",
    output_path: str = "data/testsets/testset.raw.jsonl",
    size: int = 50,
    min_context_chars: int = 80,
) -> list[dict[str, Any]]:
    documents = [
        row
        for row in read_jsonl(documents_path)
        if len(row.get("content") or "") >= min_context_chars
    ]
    rows: list[dict[str, Any]] = []
    for index, document in enumerate(documents[:size], start=1):
        context = document["content"]
        source_file = document.get("source_file")
        question = _default_question(document)
        reference = _reference_from_context(context)
        rows.append(
            TestsetRecord(
                sample_id=f"qa-{index:04d}",
                user_input=question,
                reference=reference,
                reference_contexts=[context],
                source_file=source_file,
                question_type="single_hop",
                review_status="pending",
            ).to_dict()
        )
    write_jsonl(output_path, rows)
    return rows
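# Review helpers: approve everything that is not explicitly rejected, and validate
# that reviewed rows carry the required fields.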
def approve_pending_testset(
    *,
    input_path: str = "data/testsets/testset.raw.jsonl",
    output_path: str = "data/testsets/testset.reviewed.jsonl",
) -> list[dict[str, Any]]:
    rows = read_jsonl(input_path)
    reviewed: list[dict[str, Any]] = []
    for row in rows:
        row = dict(row)
        if row.get("review_status") == "rejected":
            continue
        row["review_status"] = "approved"
        reviewed.append(row)
    write_jsonl(output_path, reviewed)
    return reviewed


def validate_reviewed_testset(path: str = "data/testsets/testset.reviewed.jsonl") -> list[str]:
    errors: list[str] = []
    for index, row in enumerate(read_jsonl(path), start=1):
        prefix = f"{path}:{index}"
        if row.get("review_status") != "approved":
            errors.append(f"{prefix} review_status must be approved")
        for key in ("sample_id", "user_input", "reference"):
            if not row.get(key):
                errors.append(f"{prefix} missing {key}")
        if not row.get("reference_contexts"):
            errors.append(f"{prefix} reference_contexts must be non-empty")
    return errors
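# Question templates below are Chinese; roughly: "Based on {source} (sheet/page ...),
# what is the main content of this record/passage?"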
def _default_question(document: dict[str, Any]) -> str:
    source = document.get("source_file") or "该文档"
    if document.get("file_type") == "xlsx" and document.get("sheet"):
        return f"请根据 {source} 的 {document['sheet']} 中对应记录回答:这条记录的主要内容是什么?"
    if document.get("page"):
        return f"请根据 {source} 第 {document['page']} 页回答:该片段的主要内容是什么?"
    return f"请根据 {source} 回答:该片段的主要内容是什么?"


def _reference_from_context(context: str, *, max_chars: int = 500) -> str:
    text = " ".join(context.split())
    if len(text) <= max_chars:
        return text
    return text[:max_chars].rstrip() + "..."
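# Illustrative usage sketch only: the keys below mirror what this module reads, but
# the model name, endpoint, and credential are placeholders, not project defaults,
# and it assumes require_config resolves dotted keys ("ragas.generator_model")
# against this dict. The embedding settings are only needed for the "prechunked"
# and "langchain_docs" modes, so the direct-mode example omits them.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    example_config = {
        "testset": {"generator": "ragas", "size": 10, "ragas_mode": "direct"},
        "ragas": {
            "generator_model": "gpt-4o-mini",  # placeholder model name
            "llm_api_key": "YOUR_API_KEY",  # placeholder credential
            "llm_base_url": "https://api.openai.com/v1",  # placeholder endpoint
            "temperature": 0,
            "timeout_seconds": 600,
            "max_workers": 1,
        },
    }
    generate_testset(example_config)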