简化测试流程
Showing
7 changed files
with
174 additions
and
30 deletions
| ... | @@ -57,18 +57,21 @@ cp .env.example .env | ... | @@ -57,18 +57,21 @@ cp .env.example .env |
| 57 | ```bash | 57 | ```bash |
| 58 | python scripts/00_create_kb.py | 58 | python scripts/00_create_kb.py |
| 59 | python scripts/00_check_models.py | 59 | python scripts/00_check_models.py |
| 60 | python scripts/01_upload_docs.py | 60 | python workflows/01_ingest_export.py |
| 61 | python scripts/02_wait_ingestion.py | 61 | python workflows/02_generate_testset.py |
| 62 | python scripts/03_export_chunks.py | 62 | python workflows/03_review_testset.py |
| 63 | python scripts/04_parse_docs.py | 63 | python workflows/04_evaluate_report.py |
| 64 | python scripts/05_generate_testset.py | ||
| 65 | python scripts/06_review_testset.py | ||
| 66 | python scripts/07_run_weknora_qa.py | ||
| 67 | python scripts/08_build_ragas_input.py | ||
| 68 | python scripts/09_run_ragas_eval.py | ||
| 69 | python scripts/10_report.py | ||
| 70 | ``` | 64 | ``` |
| 71 | 65 | ||
| 66 | 合并后的 workflow 只调用原有脚本,不复制业务逻辑。新旧步骤对应关系: | ||
| 67 | |||
| 68 | - `workflows/01_ingest_export.py` = `scripts/01_upload_docs.py` + `scripts/02_wait_ingestion.py` + `scripts/03_export_chunks.py` | ||
| 69 | - `workflows/02_generate_testset.py` = `scripts/04_parse_docs.py` + `scripts/05_generate_testset.py` | ||
| 70 | - `workflows/03_review_testset.py` = `scripts/06_review_testset.py` | ||
| 71 | - `workflows/04_evaluate_report.py` = `scripts/07_run_weknora_qa.py` + `scripts/08_build_ragas_input.py` + `scripts/09_run_ragas_eval.py` + `scripts/10_report.py` | ||
| 72 | |||
| 73 | 每个 workflow 结束时(包括某个脚本返回非零退出码而提前停止时)都会打印本阶段产物清单中当前已存在的文件路径;其中可能包含此前运行遗留的文件。 | ||
| 74 | |||
| 72 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 | 75 | 首轮建议只使用 2 个 PDF、1 个 XLSX 和 10 条审核通过 QA,确认 `retrieved_contexts`、`response`、Ragas 输入字段都正常后再扩展样本量。 |
| 73 | 76 | ||
| 74 | 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 自动生成 QA;生成阶段使用 `TESTSET_RAGAS_MODE=direct`,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA,避免 Ragas 默认文档预处理链路重新抽标题、摘要和实体。生成阶段还会用 `TESTSET_MAX_DOCUMENT_CHARS` 限制单条来源上下文长度,用 `TESTSET_GENERATOR_MAX_TOKENS` 控制生成输出预算,并按来源文件轮询抽样,避免测试集集中在单个文件。`local`、`mineru` 和 `rule_based` 只作为可选实验/兜底配置保留。 | 77 | 默认 `04_parse_docs.py` 从 WeKnora 导出的 `data/exported/chunks.jsonl` 构造测试集来源,不再重复调用外部 PDF 解析器。`05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 自动生成 QA;生成阶段使用 `TESTSET_RAGAS_MODE=direct`,直接把 WeKnora chunks 组装成 Ragas KnowledgeGraph 并生成单跳 QA,避免 Ragas 默认文档预处理链路重新抽标题、摘要和实体。生成阶段还会用 `TESTSET_MAX_DOCUMENT_CHARS` 限制单条来源上下文长度,用 `TESTSET_GENERATOR_MAX_TOKENS` 控制生成输出预算,并按来源文件轮询抽样,避免测试集集中在单个文件。`local`、`mineru` 和 `rule_based` 只作为可选实验/兜底配置保留。 | ... | ... |
| ... | @@ -150,30 +150,19 @@ cp /path/to/*.xlsx data/raw_docs/xlsx/ | ... | @@ -150,30 +150,19 @@ cp /path/to/*.xlsx data/raw_docs/xlsx/ |
| 150 | 按顺序执行: | 150 | 按顺序执行: |
| 151 | 151 | ||
| 152 | ```bash | 152 | ```bash |
| 153 | python scripts/01_upload_docs.py | 153 | python workflows/01_ingest_export.py |
| 154 | python scripts/02_wait_ingestion.py | 154 | python workflows/02_generate_testset.py |
| 155 | python scripts/03_export_chunks.py | 155 | python workflows/03_review_testset.py |
| 156 | python scripts/04_parse_docs.py | 156 | python workflows/04_evaluate_report.py |
| 157 | python scripts/05_generate_testset.py | ||
| 158 | python scripts/06_review_testset.py | ||
| 159 | python scripts/07_run_weknora_qa.py | ||
| 160 | python scripts/08_build_ragas_input.py | ||
| 161 | python scripts/09_run_ragas_eval.py | ||
| 162 | python scripts/10_report.py | ||
| 163 | ``` | 157 | ``` |
| 164 | 158 | ||
| 165 | 说明: | 159 | 说明: |
| 166 | 160 | ||
| 167 | - `01_upload_docs.py` 上传 `data/raw_docs/` 下的 PDF/XLSX,也兼容 `pdf/`、`xlsx/` 子目录。 | 161 | - `workflows/01_ingest_export.py` 对齐原 `01_upload_docs.py`、`02_wait_ingestion.py`、`03_export_chunks.py`:上传原始文件、等待 WeKnora 解析、导出 chunks。 |
| 168 | - `02_wait_ingestion.py` 等待 WeKnora 解析完成。 | 162 | - `workflows/02_generate_testset.py` 对齐原 `04_parse_docs.py`、`05_generate_testset.py`:从 chunks 构造测试集来源并生成候选 QA。 |
| 169 | - `03_export_chunks.py` 导出 WeKnora chunks。 | 163 | - `workflows/03_review_testset.py` 对齐原 `06_review_testset.py`:把候选 QA 标为 approved,后续可替换为人工审核。 |
| 170 | - `04_parse_docs.py` 默认从 WeKnora 导出的 chunks 构造 Ragas 测试集来源,不再重复解析原始 PDF。 | 164 | - `workflows/04_evaluate_report.py` 对齐原 `07_run_weknora_qa.py`、`08_build_ragas_input.py`、`09_run_ragas_eval.py`、`10_report.py`:逐条调用 WeKnora 问答、构造 Ragas 输入、调用 Ragas 打分、生成 Markdown 报告。 |
| 171 | - `05_generate_testset.py` 默认使用 Ragas 结合评估侧 LLM 生成候选 QA。 | 165 | - 每个 workflow 结束时(包括某个脚本返回非零退出码而提前停止时)会打印本阶段产物清单中当前已存在的文件路径,其中可能包含此前运行遗留的文件。 |
| 172 | - `06_review_testset.py` 当前会把候选 QA 标为 approved,后续可替换为人工审核。 | ||
| 173 | - `07_run_weknora_qa.py` 逐条调用 WeKnora 问答并解析 SSE。 | ||
| 174 | - `08_build_ragas_input.py` 合并 QA 和 WeKnora 输出。 | ||
| 175 | - `09_run_ragas_eval.py` 调用 Ragas 打分。 | ||
| 176 | - `10_report.py` 生成 Markdown 报告。 | ||
| 177 | 166 | ||
| 178 | ## 6. 产物验收 | 167 | ## 6. 产物验收 |
| 179 | 168 | ... | ... |
workflows/01_ingest_export.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | from _runner import print_artifacts, run_scripts | ||
| 6 | |||
| 7 | |||
# Scripts executed by this stage, in the order listed.
SCRIPTS = [
    "scripts/01_upload_docs.py",
    "scripts/02_wait_ingestion.py",
    "scripts/03_export_chunks.py",
]

# Candidate output files handed to print_artifacts() after the run.
ARTIFACTS = [
    "data/exported/knowledge_uploads.jsonl",
    "data/exported/failed_uploads.jsonl",
    "data/exported/knowledge.jsonl",
    "data/exported/chunks.jsonl",
]


def main() -> int:
    """Run the ingest/export stage and return its exit code.

    The artifact listing is printed even when ``run_scripts`` returns a
    non-zero code; the code is passed through unchanged.
    """
    exit_code = run_scripts(SCRIPTS)
    print_artifacts(ARTIFACTS)
    return exit_code


if __name__ == "__main__":
    sys.exit(main())
workflows/02_generate_testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | from _runner import print_artifacts, run_scripts | ||
| 6 | |||
| 7 | |||
# Scripts executed by this stage, in the order listed.
SCRIPTS = [
    "scripts/04_parse_docs.py",
    "scripts/05_generate_testset.py",
]

# Candidate output files handed to print_artifacts() after the run.
ARTIFACTS = [
    "data/parsed_docs/documents.jsonl",
    "data/parsed_docs/failed_parse.jsonl",
    "data/parsed_docs/parse_summary.json",
    "data/testsets/testset.raw.jsonl",
]


def main() -> int:
    """Run the test-set generation stage and return its exit code.

    The artifact listing is printed even when ``run_scripts`` returns a
    non-zero code; the code is passed through unchanged.
    """
    exit_code = run_scripts(SCRIPTS)
    print_artifacts(ARTIFACTS)
    return exit_code


if __name__ == "__main__":
    sys.exit(main())
workflows/03_review_testset.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | from _runner import print_artifacts, run_scripts | ||
| 6 | |||
| 7 | |||
# The single script executed by this stage.
SCRIPTS = [
    "scripts/06_review_testset.py",
]

# Candidate output file handed to print_artifacts() after the run.
ARTIFACTS = [
    "data/testsets/testset.reviewed.jsonl",
]


def main() -> int:
    """Run the test-set review stage and return its exit code.

    The artifact listing is printed even when ``run_scripts`` returns a
    non-zero code; the code is passed through unchanged.
    """
    exit_code = run_scripts(SCRIPTS)
    print_artifacts(ARTIFACTS)
    return exit_code


if __name__ == "__main__":
    sys.exit(main())
workflows/04_evaluate_report.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import sys | ||
| 4 | |||
| 5 | from _runner import print_artifacts, run_scripts | ||
| 6 | |||
| 7 | |||
# Scripts executed by this stage, in the order listed.
SCRIPTS = [
    "scripts/07_run_weknora_qa.py",
    "scripts/08_build_ragas_input.py",
    "scripts/09_run_ragas_eval.py",
    "scripts/10_report.py",
]

# Candidate output files handed to print_artifacts() after the run.
ARTIFACTS = [
    "data/runs/weknora_answers.jsonl",
    "data/runs/failed_requests.jsonl",
    "data/runs/ragas_input.jsonl",
    "data/reports/ragas_scores.csv",
    "data/reports/summary.md",
]


def main() -> int:
    """Run the evaluation/report stage and return its exit code.

    The artifact listing is printed even when ``run_scripts`` returns a
    non-zero code; the code is passed through unchanged.
    """
    exit_code = run_scripts(SCRIPTS)
    print_artifacts(ARTIFACTS)
    return exit_code


if __name__ == "__main__":
    sys.exit(main())
workflows/_runner.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import importlib.util | ||
| 4 | import sys | ||
| 5 | from pathlib import Path | ||
| 6 | |||
| 7 | |||
# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = ROOT.joinpath("scripts")
# Put scripts/ at the front of the import path, but only once.
# NOTE(review): presumably so the loaded scripts can import helper modules
# that live next to them — confirm against the scripts themselves.
if not any(entry == str(SCRIPTS_DIR) for entry in sys.path):
    sys.path.insert(0, str(SCRIPTS_DIR))
| 12 | |||
| 13 | |||
def run_script(path: str) -> int:
    """Load the script at *path* and call its ``main()``.

    The file is imported under the private module name
    ``_workflow_<file stem>`` (not ``__main__``), so only its ``main()``
    function is invoked explicitly.

    Args:
        path: Script location, joined onto ``ROOT``.

    Returns:
        ``int(result or 0)`` for the value returned by ``main()``. If
        ``main()`` exits through ``SystemExit`` instead of returning, the
        equivalent process exit status is returned (``0`` for ``None``, the
        integer itself, or ``1`` after printing any other payload to stderr).

    Raises:
        RuntimeError: If no import spec/loader can be created for the file,
            or the loaded module does not define ``main``.
    """
    script_path = ROOT / path
    module_name = f"_workflow_{script_path.stem}"
    spec = importlib.util.spec_from_file_location(module_name, script_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Cannot load script: {script_path}")
    module = importlib.util.module_from_spec(spec)
    # Register the module before executing it, as the importlib recipe for
    # importing a source file does. Code in the script that resolves its own
    # module through sys.modules[__name__] would otherwise fail.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module registered.
        sys.modules.pop(module_name, None)
        raise
    if not hasattr(module, "main"):
        raise RuntimeError(f"Script has no main(): {script_path}")
    try:
        result = module.main()
    except SystemExit as exc:
        # A sys.exit() inside the script must not terminate the whole
        # workflow: translate it into a return code the caller can act on,
        # following the interpreter's own mapping of SystemExit payloads.
        if exc.code is None:
            return 0
        if isinstance(exc.code, int):
            return exc.code
        print(exc.code, file=sys.stderr)
        return 1
    return int(result or 0)
| 26 | |||
| 27 | |||
def run_scripts(paths: list[str]) -> int:
    """Run each script in *paths* in order, stopping at the first failure.

    Args:
        paths: Script paths, each passed unchanged to ``run_script``.

    Returns:
        The first non-zero code returned by ``run_script``, or ``0`` when
        every script returned ``0`` (including when *paths* is empty).
    """
    for script in paths:
        print(f"\n==> Running {script}")
        status = run_script(script)
        if status == 0:
            continue
        # First failure ends the sequence; later scripts are not run.
        print(f"Stopped at {script} with exit code {status}")
        return status
    return 0
| 36 | |||
| 37 | |||
def print_artifacts(paths: list[str]) -> None:
    """Print which of the given artifact paths currently exist under ``ROOT``.

    Only existence is checked, so files left over from an earlier run are
    listed as well.

    Args:
        paths: Artifact paths, each joined onto ``ROOT``.
    """
    found = []
    for relative in paths:
        candidate = ROOT / relative
        if candidate.exists():
            found.append(candidate)
    if found:
        print("\nGenerated files:")
        for artifact in found:
            print(f"- {artifact.relative_to(ROOT)}")
        return
    print("\nGenerated files: none")
-
Please register or sign in to post a comment