Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
沈秋雨
/
weknora_ragas
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
Commit
feeaba05
...
feeaba057f54e31449bdf37ded063d12cf44c1fd
authored
2026-04-22 11:15:41 +0800
by
沈秋雨
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
Build evaluation documents from WeKnora chunks by default
1 parent
8096ca31
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
5 deletions
README.md
TESTING_GUIDE.md
configs/eval.yaml
scripts/04_parse_docs.py
src/weknora_eval/parsers/chunks.py
README.md
View file @
feeaba0
...
...
@@ -68,6 +68,8 @@ python scripts/10_report.py
首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认
`retrieved_contexts`
、
`response`
、Ragas 输入字段都正常后再扩展样本量。
默认
`04_parse_docs.py`
从 WeKnora 导出的
`data/exported/chunks.jsonl`
构造测试集来源,不再重复调用外部 PDF 解析器。
`local`
和
`mineru`
解析只作为可选实验配置保留。
## 主要产物
-
`data/exported/knowledge.jsonl`
...
...
TESTING_GUIDE.md
View file @
feeaba0
...
...
@@ -164,10 +164,10 @@ python scripts/10_report.py
说明:
-
`01_upload_docs.py`
上传
`data/raw_docs/
pdf`
和
`data/raw_docs/xlsx`
。
-
`01_upload_docs.py`
上传
`data/raw_docs/
`
下的 PDF/XLSX,也兼容
`pdf/`
、
`xlsx/`
子目录
。
-
`02_wait_ingestion.py`
等待 WeKnora 解析完成。
-
`03_export_chunks.py`
导出 WeKnora chunks。
-
`04_parse_docs.py`
在评估侧解析原始文档,生成 Ragas 测试集来源
。
-
`04_parse_docs.py`
默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF
。
-
`05_generate_testset.py`
生成候选 QA。
-
`06_review_testset.py`
当前会把候选 QA 标为 approved,后续可替换为人工审核。
-
`07_run_weknora_qa.py`
逐条调用 WeKnora 问答并解析 SSE。
...
...
configs/eval.yaml
View file @
feeaba0
...
...
@@ -15,10 +15,16 @@ testset:
require_manual_review
:
true
parsing
:
provider
:
"
mineru"
# chunks evaluates WeKnora as deployed: documents.jsonl is built from
# data/exported/chunks.jsonl. local/mineru remain available for optional
# parser-specific experiments.
provider
:
"
chunks"
output_path
:
"
data/parsed_docs/documents.jsonl"
failed_path
:
"
data/parsed_docs/failed_parse.jsonl"
summary_path
:
"
data/parsed_docs/parse_summary.json"
chunks
:
input_path
:
"
data/exported/chunks.jsonl"
min_chars
:
80
local
:
pdf_backend
:
"
pymupdf"
xlsx_mode
:
"
row_text"
...
...
scripts/04_parse_docs.py
View file @
feeaba0
...
...
@@ -6,6 +6,7 @@ import _bootstrap # noqa: F401
from
weknora_eval.config
import
load_config
from
weknora_eval.loaders
import
setup_logging
from
weknora_eval.parsers.chunks
import
parse_chunks
from
weknora_eval.parsers.local
import
parse_raw_docs
from
weknora_eval.parsers.mineru
import
parse_with_mineru
...
...
@@ -13,8 +14,10 @@ from weknora_eval.parsers.mineru import parse_with_mineru
def
main
()
->
int
:
setup_logging
()
config
=
load_config
()
provider
=
config
.
get
(
"parsing"
,
{})
.
get
(
"provider"
,
"local"
)
if
provider
==
"local"
:
provider
=
config
.
get
(
"parsing"
,
{})
.
get
(
"provider"
,
"chunks"
)
if
provider
==
"chunks"
:
rows
,
summary
=
parse_chunks
(
config
)
elif
provider
==
"local"
:
rows
,
summary
=
parse_raw_docs
(
config
)
elif
provider
==
"mineru"
:
rows
,
summary
=
parse_with_mineru
(
config
)
...
...
src/weknora_eval/parsers/chunks.py
0 → 100644
View file @
feeaba0
from
__future__
import
annotations
from
typing
import
Any
from
weknora_eval.loaders
import
compact_text
,
read_jsonl
,
write_json
,
write_jsonl
from
weknora_eval.parsers.local
import
build_parse_summary
from
weknora_eval.schemas
import
ParsedDocument
def parse_chunks(config: dict[str, Any]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Build Ragas evaluation documents from WeKnora-exported chunks.

    Reads chunk records from the JSONL file configured at
    ``parsing.chunks.input_path``, skips chunks whose compacted content is
    missing or shorter than ``min_chars``, and converts the remainder into
    ``ParsedDocument`` rows. The accepted rows, the skipped-chunk records,
    and a parse summary are written to the paths configured under
    ``parsing``.

    Args:
        config: Loaded evaluation config. The ``parsing`` section (with its
            optional ``chunks`` / ``local`` sub-sections) supplies the input
            and output paths plus the minimum-content-length threshold.

    Returns:
        A ``(rows, summary)`` tuple: the parsed-document dicts that were
        written, and the summary produced by ``build_parse_summary``.
    """
    # Tolerate a missing "parsing" section: the CLI dispatcher in
    # scripts/04_parse_docs.py reads the provider via config.get("parsing", {})
    # and defaults to "chunks", so this function must not KeyError when the
    # section is absent — the path/threshold defaults below then apply.
    parsing = config.get("parsing", {})
    chunks_config = parsing.get("chunks", {})
    input_path = chunks_config.get("input_path", "data/exported/chunks.jsonl")
    # Fall back to the "local" parser's min_chars so both providers share one
    # threshold unless chunks overrides it; the hard default is 80 characters.
    min_chars = int(
        chunks_config.get("min_chars", parsing.get("local", {}).get("min_chars", 80))
    )

    rows: list[dict[str, Any]] = []
    failures: list[dict[str, Any]] = []
    for chunk in read_jsonl(input_path):
        content = compact_text(chunk.get("content"))
        chunk_id = chunk.get("chunk_id") or chunk.get("id")
        source_file = (
            chunk.get("source_file") or chunk.get("knowledge_filename") or "unknown"
        )
        if not content or len(content) < min_chars:
            # Record the skip so downstream reports can account for it.
            failures.append(
                {
                    "source_file": source_file,
                    "parser": "weknora:chunks",
                    "status": "skipped",
                    "error": f"chunk content shorter than min_chars={min_chars}",
                    "fallback_used": None,
                    "chunk_id": chunk_id,
                }
            )
            continue
        document = ParsedDocument(
            # Synthesize a positional id when the chunk carries none.
            doc_id=str(chunk_id or f"{source_file}::chunk-{len(rows) + 1}"),
            source_file=str(source_file),
            file_type=str(chunk.get("chunk_type") or "chunk"),
            content=content,
            metadata={
                "parser": "weknora:chunks",
                "chunk_id": chunk_id,
                "knowledge_id": chunk.get("knowledge_id"),
                "knowledge_base_id": chunk.get("knowledge_base_id"),
                "chunk_index": chunk.get("chunk_index"),
            },
        )
        rows.append(document.to_dict())

    write_jsonl(parsing.get("output_path", "data/parsed_docs/documents.jsonl"), rows)
    if failures:
        # Only materialize the failure file when there is something to report.
        write_jsonl(
            parsing.get("failed_path", "data/parsed_docs/failed_parse.jsonl"),
            failures,
        )
    summary = build_parse_summary(rows, failures, parser="weknora:chunks")
    write_json(
        parsing.get("summary_path", "data/parsed_docs/parse_summary.json"), summary
    )
    return rows, summary
Please
register
or
sign in
to post a comment