Finish the offline business-export chain by generating project manifests directly from normalized rows

Constraint: Keep this checkpoint offline-only and avoid touching real business data, datasets, or model artifacts
Rejected: Leave final manifest shaping as a manual next-session task | The handoff is stronger when catalog/train/test/val can already be produced automatically
Confidence: high
Scope-risk: narrow
Directive: Treat these generated manifests as integration-stage scaffolds and validate final field policy again before production data ingestion
Tested: Ran build_business_project_manifests.py on normalized sample data and verified catalog/train/test/val structure; rechecked 70 relative links
Not-tested: Did not run the generated manifests through full training/evaluation against live business audio
Showing 6 changed files with 189 additions and 0 deletions
| ... | @@ -60,6 +60,7 @@ | ... | @@ -60,6 +60,7 @@ |
| 60 | - 导出 cookbook:`docs/business-export-cookbook.md` | 60 | - 导出 cookbook:`docs/business-export-cookbook.md` |
| 61 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` | 61 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` |
| 62 | - 角色拆分脚本:`acr-engine/scripts/split_business_manifest_ready.py` | 62 | - 角色拆分脚本:`acr-engine/scripts/split_business_manifest_ready.py` |
| 63 | - 项目 manifest 适配:`acr-engine/scripts/build_business_project_manifests.py` | ||
| 63 | 2. 补 cap64 multi-seed aggregate。 | 64 | 2. 补 cap64 multi-seed aggregate。 |
| 64 | 3. 更新: | 65 | 3. 更新: |
| 65 | - `docs/open-dataset-workflow.md` | 66 | - `docs/open-dataset-workflow.md` | ... | ... |
| 1 | #!/usr/bin/env python3 | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | |||
| 8 | |||
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed value per non-blank line.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The decoded JSON values in file order. Blank or whitespace-only
        lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened or read.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale,
    # and iterate the file object instead of calling str.splitlines():
    # splitlines() also breaks on characters such as U+2028 and U+0085, which
    # may appear unescaped inside a JSON string and would cut a record in half.
    with path.open(encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]
| 11 | |||
| 12 | |||
def write_json(path: Path, rows: list[dict]) -> None:
    """Serialize *rows* as an indented JSON document at *path*.

    Missing parent directories are created first. An existing file at
    *path* is overwritten.

    Args:
        path: Destination file.
        rows: JSON-serializable rows to write.

    Raises:
        TypeError: If *rows* contains a value json cannot serialize.
        OSError: If the directory or file cannot be created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 rather than left to the platform/locale default.
    path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding='utf-8')
| 16 | |||
| 17 | |||
def build_reference(row: dict) -> dict:
    """Convert one manifest-ready row into a reference manifest entry.

    Args:
        row: Input row; must contain ``song_id`` and ``audio_path``.
            ``duration_sec`` and ``source_dataset`` are optional.

    Returns:
        A dict with the keys ``song_id``, ``audio_path``, ``duration``,
        ``type`` (always ``'reference'``) and ``source_dataset``.

    Raises:
        KeyError: If ``song_id`` or ``audio_path`` is missing.
    """
    return {
        'song_id': row['song_id'],
        'audio_path': row['audio_path'],
        # A missing, null or zero duration falls back to 0.0.
        'duration': row.get('duration_sec') or 0.0,
        'type': 'reference',
        # Use the same ``or`` fallback as duration so an explicit null or
        # empty source_dataset in the row does not leak into the manifest.
        'source_dataset': row.get('source_dataset') or 'business_music',
    }
| 26 | |||
| 27 | |||
def build_query(row: dict) -> dict:
    """Convert one manifest-ready row into a query manifest entry.

    Args:
        row: Input row; must contain ``song_id`` and ``audio_path``.
            ``duration_sec``, ``offset_sec`` and ``source_dataset`` are
            optional.

    Returns:
        A dict with the keys ``song_id``, ``audio_path``, ``duration``,
        ``type`` (always ``'clean'``), ``offset``, ``segment_type``
        (always ``'external_query'``) and ``source_dataset``.

    Raises:
        KeyError: If ``song_id`` or ``audio_path`` is missing.
    """
    return {
        'song_id': row['song_id'],
        'audio_path': row['audio_path'],
        # A missing, null or zero duration falls back to 8.0.
        'duration': row.get('duration_sec') or 8.0,
        'type': 'clean',
        # A missing or null offset falls back to 0.0.
        'offset': row.get('offset_sec') or 0.0,
        'segment_type': 'external_query',
        # Use the same ``or`` fallback as the numeric fields so an explicit
        # null or empty source_dataset does not leak into the manifest.
        'source_dataset': row.get('source_dataset') or 'business_music',
    }
| 38 | |||
| 39 | |||
def dedupe_refs(rows: list[dict]) -> list[dict]:
    """Drop rows that repeat an earlier ``(song_id, audio_path)`` pair.

    Args:
        rows: Rows that each contain ``song_id`` and ``audio_path``.

    Returns:
        A new list holding the first row seen for each pair, in the order
        those first occurrences appear in *rows*.
    """
    first_by_key: dict = {}
    for entry in rows:
        # setdefault keeps the earliest row for a pair; dicts preserve
        # insertion order, so first-seen ordering carries over to the result.
        first_by_key.setdefault((entry['song_id'], entry['audio_path']), entry)
    return list(first_by_key.values())
| 50 | |||
| 51 | |||
def main() -> None:
    """Command-line entry point: build project manifests from a JSONL file.

    Reads the manifest-ready JSONL given by ``--input``, separates rows by
    ``role`` and ``split``, and writes four files into ``--output-dir``:

    * ``catalog.json`` - de-duplicated reference entries
    * ``train.json``   - train queries followed by the reference entries
    * ``test.json``    - test queries followed by the reference entries
    * ``val.json``     - val queries only (plus holdout queries when
      ``--include-holdout-in-val`` is passed)

    A JSON summary with the entry counts and the output directory is
    printed to stdout.
    """
    cli = argparse.ArgumentParser(description='Build project manifests from business manifest-ready JSONL')
    cli.add_argument('--input', required=True, help='manifest-ready JSONL from normalize_business_export.py')
    cli.add_argument('--output-dir', required=True, help='output manifests dir')
    cli.add_argument('--include-holdout-in-val', action='store_true', help='map holdout queries into val.json')
    opts = cli.parse_args()

    records = load_jsonl(Path(opts.input).resolve())
    reference_rows = [rec for rec in records if rec.get('role') == 'reference']
    query_rows = [rec for rec in records if rec.get('role') == 'query']

    def queries_for(split_name: str) -> list[dict]:
        # Only rows of the requested split are converted; rows with any
        # other split value are never passed to build_query here.
        return [build_query(rec) for rec in query_rows if rec.get('split') == split_name]

    catalog = dedupe_refs([build_reference(rec) for rec in reference_rows])
    train_q = queries_for('train')
    test_q = queries_for('test')
    val_q = queries_for('val')
    if opts.include_holdout_in_val:
        val_q.extend(queries_for('holdout'))

    target_dir = Path(opts.output_dir).resolve()
    # Insertion order of this mapping is the order the files are written in.
    manifests = {
        'catalog.json': catalog,
        'train.json': train_q + catalog,
        'test.json': test_q + catalog,
        'val.json': val_q,
    }
    for filename, payload in manifests.items():
        write_json(target_dir / filename, payload)

    report = {
        'catalog_refs': len(catalog),
        'train_queries': len(train_q),
        'test_queries': len(test_q),
        'val_queries': len(val_q),
        'output_dir': str(target_dir),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-02 项目 manifest 适配脚本交付 checkpoint | ||
| 2 | |||
| 3 | 完成项: | ||
| 4 | - 新增 `acr-engine/scripts/build_business_project_manifests.py` | ||
| 5 | - 新增 `docs/business-project-manifest-adapter.md` | ||
| 6 | - 已把业务导出链推进到可直接生成项目 `catalog/train/test/val` 的阶段。 | ||
| 7 | |||
| 8 | 结论: | ||
| 9 | - 下个 session 已基本不需要再手工拼项目 manifest。 | ||
| 10 | - 从业务导出到项目 manifest 的离线适配链已经成型。 | ||
| 11 | |||
| 1 | ## 2026-06-02 manifest-ready 角色拆分脚本交付 checkpoint | 12 | ## 2026-06-02 manifest-ready 角色拆分脚本交付 checkpoint |
| 2 | 13 | ||
| 3 | 完成项: | 14 | 完成项: | ... | ... |
| ... | @@ -143,3 +143,10 @@ cd /workspace/acr-engine | ... | @@ -143,3 +143,10 @@ cd /workspace/acr-engine |
| 143 | - `excluded.json` | 143 | - `excluded.json` |
| 144 | 144 | ||
| 145 | 这样下个 session 可以更快把业务素材继续整形成训练/评测所需清单。 | 145 | 这样下个 session 可以更快把业务素材继续整形成训练/评测所需清单。 |
| 146 | |||
| 147 | |||
| 148 | ## 8. 生成项目 manifest | ||
| 149 | |||
| 150 | 如果你已经有 manifest-ready JSONL,可以直接继续生成项目当前需要的四个 manifest: | ||
| 151 | - [../acr-engine/scripts/build_business_project_manifests.py](../acr-engine/scripts/build_business_project_manifests.py) | ||
| 152 | - [business-project-manifest-adapter.md](./business-project-manifest-adapter.md) | ... | ... |
docs/business-project-manifest-adapter.md
0 → 100644
| 1 | # Business Project Manifest Adapter / 业务数据到项目 Manifest 适配说明 | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | > 关联文档:[业务导出 Cookbook](./business-export-cookbook.md) · [业务 Manifest 与 Type-Role 规范](./business-manifest-and-type-role-spec.md) | ||
| 5 | |||
| 6 | ## 一页结论 | ||
| 7 | |||
| 8 | 现在仓库里已经有一条接近项目训练/评测 manifest 的离线脚本链: | ||
| 9 | |||
| 10 | 1. 业务库表导出 CSV / JSONL | ||
| 11 | 2. [../acr-engine/scripts/normalize_business_export.py](../acr-engine/scripts/normalize_business_export.py) | ||
| 12 | 3. [../acr-engine/scripts/split_business_manifest_ready.py](../acr-engine/scripts/split_business_manifest_ready.py) | ||
| 13 | 4. [../acr-engine/scripts/build_business_project_manifests.py](../acr-engine/scripts/build_business_project_manifests.py) | ||
| 14 | |||
| 15 | 最后一步会直接生成: | ||
| 16 | - `catalog.json` | ||
| 17 | - `train.json` | ||
| 18 | - `test.json` | ||
| 19 | - `val.json` | ||
| 20 | |||
| 21 | 格式对齐当前项目已有 manifest 结构。 | ||
| 22 | |||
| 23 | --- | ||
| 24 | |||
| 25 | ## 1. 对齐后的项目格式 | ||
| 26 | |||
| 27 | ### `catalog.json` | ||
| 28 | - 只放 reference | ||
| 29 | - 字段:`song_id / audio_path / duration / type=reference / source_dataset` | ||
| 30 | |||
| 31 | ### `train.json` / `test.json` | ||
| 32 | - 前半部分是 query | ||
| 33 | - 后半部分拼接 reference | ||
| 34 | - query 字段: | ||
| 35 | - `song_id` | ||
| 36 | - `audio_path` | ||
| 37 | - `duration` | ||
| 38 | - `type=clean` | ||
| 39 | - `offset` | ||
| 40 | - `segment_type=external_query` | ||
| 41 | - `source_dataset` | ||
| 42 | |||
| 43 | ### `val.json` | ||
| 44 | - 当前默认只放 `split=val` 的 query,不像 `train.json` / `test.json` 那样在末尾拼接 reference | ||
| 45 | - 可选把 `holdout` 合并进 `val` | ||
| 46 | |||
| 47 | --- | ||
| 48 | |||
| 49 | ## 2. 示例命令 | ||
| 50 | |||
| 51 | ```bash | ||
| 52 | cd /workspace/acr-engine | ||
| 53 | /usr/local/miniconda3/bin/python scripts/normalize_business_export.py \ | ||
| 54 | --input configs/manifests/examples/business_asset_export_example.csv \ | ||
| 55 | --output /tmp/business_asset_manifest_ready.jsonl | ||
| 56 | |||
| 57 | /usr/local/miniconda3/bin/python scripts/build_business_project_manifests.py \ | ||
| 58 | --input /tmp/business_asset_manifest_ready.jsonl \ | ||
| 59 | --output-dir /tmp/business_project_manifests | ||
| 60 | ``` | ||
| 61 | |||
| 62 | 如果你希望把 `holdout` 先并进 `val.json`: | ||
| 63 | |||
| 64 | ```bash | ||
| 65 | /usr/local/miniconda3/bin/python scripts/build_business_project_manifests.py \ | ||
| 66 | --input /tmp/business_asset_manifest_ready.jsonl \ | ||
| 67 | --output-dir /tmp/business_project_manifests \ | ||
| 68 | --include-holdout-in-val | ||
| 69 | ``` | ||
| 70 | |||
| 71 | --- | ||
| 72 | |||
| 73 | ## 3. 适配边界 | ||
| 74 | |||
| 75 | 这一步还不是最终“真实业务生产接入”,但已经足够让下个 session: | ||
| 76 | - 用真实业务导出样本跑通 manifest 结构 | ||
| 77 | - 对接 `train.py / evaluate.py / run_demo.py` | ||
| 78 | - 再只针对最终字段细节做小修 | ||
| 79 | |||
| 80 | ## Sources | ||
| 81 | - See [business-export-cookbook.md](./business-export-cookbook.md) | ||
| 82 | - See [business-manifest-and-type-role-spec.md](./business-manifest-and-type-role-spec.md) |
| ... | @@ -261,6 +261,7 @@ | ... | @@ -261,6 +261,7 @@ |
| 261 | - SQL/CSV/JSONL 导出参考:[business-export-cookbook.md](./business-export-cookbook.md) | 261 | - SQL/CSV/JSONL 导出参考:[business-export-cookbook.md](./business-export-cookbook.md) |
| 262 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` | 262 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` |
| 263 | - 角色拆分脚本:`acr-engine/scripts/split_business_manifest_ready.py` | 263 | - 角色拆分脚本:`acr-engine/scripts/split_business_manifest_ready.py` |
| 264 | - 项目 manifest 适配:`acr-engine/scripts/build_business_project_manifests.py` | ||
| 264 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 | 265 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 |
| 265 | 3. 继续补 cap64 multi-seed,而不是只保留单 seed。 | 266 | 3. 继续补 cap64 multi-seed,而不是只保留单 seed。 |
| 266 | 4. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 | 267 | 4. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 | ... | ... |
-
Please register or sign in to post a comment