Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
weknora_ragas
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
7ce899a9
...
7ce899a9b07c4ede9971a5608503808dc9ae1c24
authored
2026-04-22 11:53:50 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Constrain Ragas testset generation budget
1 parent
0813ef9c
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
48 additions
and
7 deletions
.env.example
README.md
TESTING_GUIDE.md
configs/eval.yaml
src/weknora_eval/testset.py
.env.example
View file @
7ce899a
...
...
@@ -22,4 +22,7 @@ RAGAS_JUDGE_MODEL=gpt-4o-mini
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
TESTSET_SIZE=50
TESTSET_MAX_DOCUMENT_CHARS=2000
TESTSET_SOURCE_MULTIPLIER=3
TESTSET_GENERATOR_MAX_TOKENS=4096
REQUEST_INTERVAL_SECONDS=0.2
...
...
README.md
View file @
7ce899a
...
...
@@ -68,7 +68,7 @@ python scripts/10_report.py
首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认
`retrieved_contexts`
、
`response`
、Ragas 输入字段都正常后再扩展样本量。
默认
`04_parse_docs.py`
从 WeKnora 导出的
`data/exported/chunks.jsonl`
构造测试集来源,不再重复调用外部 PDF 解析器。
`05_generate_testset.py`
默认使用 Ragas 结合评估侧 LLM 自动生成 QA;
`local`
、
`mineru`
和
`rule_based`
只作为可选实验/兜底配置保留。
默认
`04_parse_docs.py`
从 WeKnora 导出的
`data/exported/chunks.jsonl`
构造测试集来源,不再重复调用外部 PDF 解析器。
`05_generate_testset.py`
默认使用 Ragas 结合评估侧 LLM 自动生成 QA;
生成阶段会用
`TESTSET_MAX_DOCUMENT_CHARS`
限制单条来源上下文长度,并用
`TESTSET_GENERATOR_MAX_TOKENS`
控制生成输出预算,避免和后续评测用的
`ragas.max_tokens`
混在一起。
`local`
、
`mineru`
和
`rule_based`
只作为可选实验/兜底配置保留。
## 主要产物
...
...
TESTING_GUIDE.md
View file @
7ce899a
...
...
@@ -290,6 +290,15 @@ max_workers: 1
max_tokens
:
4096
```
如果
`05_generate_testset.py`
在生成 QA 时出现
`LLMDidNotFinishException`
,优先不要继续盲目调大
`ragas.max_tokens`
。
`05`
有独立的生成预算和输入长度:
```bash
TESTSET_GENERATOR_MAX_TOKENS
=
4096
TESTSET_MAX_DOCUMENT_CHARS
=
2000
```
如果 vLLM 仍然报生成未完成,先把
`TESTSET_SIZE`
降到 3,再把
`TESTSET_MAX_DOCUMENT_CHARS`
调到 1000-1500 验证链路;
`ragas.max_tokens`
主要用于后续评测阶段,不应该拿来无限放大测试集生成阶段的输出长度。
### WeKnora 问答没有 retrieved_contexts
检查:
...
...
configs/eval.yaml
View file @
7ce899a
...
...
@@ -13,6 +13,9 @@ testset:
include_pdf
:
true
include_xlsx
:
true
min_context_chars
:
80
max_document_chars
:
"
${TESTSET_MAX_DOCUMENT_CHARS:-2000}"
source_multiplier
:
"
${TESTSET_SOURCE_MULTIPLIER:-3}"
generator_max_tokens
:
"
${TESTSET_GENERATOR_MAX_TOKENS:-4096}"
require_manual_review
:
true
parsing
:
...
...
@@ -69,7 +72,7 @@ ragas:
judge_model
:
"
${RAGAS_JUDGE_MODEL}"
embedding_model
:
"
${RAGAS_EMBEDDING_MODEL}"
temperature
:
0
max_tokens
:
8192
max_tokens
:
4096
timeout_seconds
:
600
max_workers
:
1
metrics
:
...
...
src/weknora_eval/testset.py
View file @
7ce899a
from
__future__
import
annotations
import
json
import
logging
from
typing
import
Any
from
langchain_core.documents
import
Document
...
...
@@ -13,6 +14,8 @@ from weknora_eval.loaders import read_jsonl, write_jsonl
from
weknora_eval.ragas_runner
import
_wrap_langchain_models
from
weknora_eval.schemas
import
TestsetRecord
logger
=
logging
.
getLogger
(
__name__
)
def
generate_testset
(
config
:
dict
[
str
,
Any
])
->
list
[
dict
[
str
,
Any
]]:
testset
=
config
.
get
(
"testset"
,
{})
...
...
@@ -37,6 +40,11 @@ def generate_ragas_testset(
ragas_config
=
config
[
"ragas"
]
size
=
int
(
testset_config
.
get
(
"size"
,
50
))
min_context_chars
=
int
(
testset_config
.
get
(
"min_context_chars"
,
80
))
max_document_chars
=
int
(
testset_config
.
get
(
"max_document_chars"
,
2000
))
source_multiplier
=
max
(
int
(
testset_config
.
get
(
"source_multiplier"
,
3
)),
1
)
generator_max_tokens
=
int
(
testset_config
.
get
(
"generator_max_tokens"
,
ragas_config
.
get
(
"max_tokens"
,
4096
))
)
source_rows
=
[
row
...
...
@@ -47,24 +55,34 @@ def generate_ragas_testset(
write_jsonl
(
output_path
,
[])
return
[]
source_limit
=
min
(
len
(
source_rows
),
max
(
size
*
source_multiplier
,
size
,
1
))
selected_source_rows
=
source_rows
[:
source_limit
]
documents
=
[
Document
(
page_content
=
row
[
"content"
]
,
page_content
=
_truncate_for_generation
(
row
[
"content"
],
max_document_chars
)
,
metadata
=
{
"source_file"
:
row
.
get
(
"source_file"
),
"doc_id"
:
row
.
get
(
"doc_id"
),
"content_chars"
:
len
(
row
.
get
(
"content"
)
or
""
),
**
(
row
.
get
(
"metadata"
)
or
{}),
},
)
for
row
in
source_rows
for
row
in
selected_source_rows
]
logger
.
info
(
"Generating Ragas testset: target_size=
%
s source_documents=
%
s max_document_chars=
%
s generator_max_tokens=
%
s"
,
size
,
len
(
documents
),
max_document_chars
,
generator_max_tokens
,
)
llm
=
ChatOpenAI
(
model
=
str
(
require_config
(
config
,
"ragas.generator_model"
)),
api_key
=
_required_ragas_value
(
ragas_config
,
"llm_api_key"
),
base_url
=
_required_ragas_value
(
ragas_config
,
"llm_base_url"
),
temperature
=
float
(
ragas_config
.
get
(
"temperature"
,
0
)),
max_tokens
=
int
(
ragas_config
.
get
(
"max_tokens"
,
4096
))
,
max_tokens
=
generator_max_tokens
,
timeout
=
int
(
ragas_config
.
get
(
"timeout_seconds"
,
600
)),
)
embeddings
=
OpenAIEmbeddings
(
...
...
@@ -78,21 +96,29 @@ def generate_ragas_testset(
ragas_llm
,
ragas_embeddings
=
_wrap_langchain_models
(
llm
,
embeddings
)
generator
=
TestsetGenerator
(
llm
=
ragas_llm
,
embedding_model
=
ragas_embeddings
)
result
=
generator
.
generate_with_langchain_docs
(
documents
[:
max
(
size
,
1
)]
,
documents
,
testset_size
=
size
,
run_config
=
RunConfig
(
timeout
=
int
(
ragas_config
.
get
(
"timeout_seconds"
,
600
)),
max_workers
=
int
(
ragas_config
.
get
(
"max_workers"
,
1
)),
),
batch_size
=
1
,
raise_exceptions
=
False
,
)
ragas_rows
=
result
.
to_list
()
rows
=
_normalize_ragas_rows
(
ragas_rows
,
source_rows
)
rows
=
_normalize_ragas_rows
(
ragas_rows
,
selected_source_rows
)
write_jsonl
(
output_path
,
rows
)
return
rows
def
_truncate_for_generation
(
content
:
str
,
max_chars
:
int
)
->
str
:
text
=
" "
.
join
((
content
or
""
)
.
split
())
if
max_chars
<=
0
or
len
(
text
)
<=
max_chars
:
return
text
return
text
[:
max_chars
]
.
rstrip
()
def
_normalize_ragas_rows
(
ragas_rows
:
list
[
dict
[
str
,
Any
]],
source_rows
:
list
[
dict
[
str
,
Any
]],
...
...
Please
register
or
sign in
to post a comment