Turn business export guidance into a runnable normalization step for the next session
Constraint: Keep this checkpoint offline-only and avoid touching real databases, datasets, or model artifacts
Rejected: Stop at static CSV/JSONL examples only | The next session needs an executable normalization path, not just samples
Confidence: high
Scope-risk: narrow
Directive: Treat normalized JSONL as manifest-ready staging output and keep final manifest shaping explicit in the integration step
Tested: Ran normalize_business_export.py on the sample CSV and JSONL inputs; verified 3 output rows each; rechecked 71 relative links
Not-tested: Did not run against a live business export or connect to any database
Showing 6 changed files with 153 additions and 0 deletions
| ... | @@ -58,6 +58,7 @@ | ... | @@ -58,6 +58,7 @@ |
| 58 | - 业务说明:`docs/business-music-bucket-and-type-guide.md` | 58 | - 业务说明:`docs/business-music-bucket-and-type-guide.md` |
| 59 | - Manifest 规范:`docs/business-manifest-and-type-role-spec.md` | 59 | - Manifest 规范:`docs/business-manifest-and-type-role-spec.md` |
| 60 | - 导出 cookbook:`docs/business-export-cookbook.md` | 60 | - 导出 cookbook:`docs/business-export-cookbook.md` |
| 61 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` | ||
| 61 | 2. 补 cap64 multi-seed aggregate。 | 62 | 2. 补 cap64 multi-seed aggregate。 |
| 62 | 3. 更新: | 63 | 3. 更新: |
| 63 | - `docs/open-dataset-workflow.md` | 64 | - `docs/open-dataset-workflow.md` | ... | ... |
| 1 | #!/usr/bin/env python3 | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import csv | ||
| 6 | import json | ||
| 7 | from pathlib import Path | ||
| 8 | from typing import Iterable | ||
| 9 | |||
| 10 | |||
def load_rows(path: Path) -> list[dict]:
    """Load export rows from a CSV or JSONL file.

    The format is selected from the file suffix (case-insensitive). Input is
    decoded as UTF-8; a leading byte-order mark, if present, is dropped so it
    cannot end up glued to the first CSV header or the first JSONL record.

    Args:
        path: Input file whose suffix is ``.csv`` or ``.jsonl``.

    Returns:
        One dict per CSV data row (keyed by the header row), or one parsed
        JSON value per non-blank JSONL line.

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.jsonl``.
            ``json.JSONDecodeError`` (a ``ValueError`` subclass) propagates
            for a malformed JSONL line.
    """
    suffix = path.suffix.lower()
    if suffix == '.csv':
        # newline='' lets the csv module handle embedded newlines itself.
        with path.open(newline='', encoding='utf-8-sig') as f:
            return list(csv.DictReader(f))
    if suffix == '.jsonl':
        # Iterate the file instead of str.splitlines(): splitlines() also
        # breaks on U+2028/U+0085 and similar, which may appear unescaped
        # inside JSON strings and would cut a record in half.
        with path.open(encoding='utf-8-sig') as f:
            return [json.loads(line) for line in f if line.strip()]
    raise ValueError(f'unsupported input format: {path}')
| 19 | |||
| 20 | |||
def load_mapping(path: Path) -> dict[int, dict]:
    """Load the type mapping file and index its entries by integer type.

    The file is decoded as UTF-8 with an optional byte-order mark, because
    ``json.loads`` rejects text that still starts with a BOM.

    Args:
        path: JSON file holding a top-level ``mappings`` list whose items
            each carry a ``type`` value convertible with ``int()``.

    Returns:
        Dict from ``int(item['type'])`` to the full mapping item. If two
        items share a type, the later one wins.

    Raises:
        KeyError: If ``mappings`` or an item's ``type`` key is missing.
        ValueError: If the file is not valid JSON or a ``type`` is not an
            integer.
    """
    data = json.loads(path.read_text(encoding='utf-8-sig'))
    return {int(item['type']): item for item in data['mappings']}
| 24 | |||
| 25 | |||
def parse_bool(value):
    """Interpret a loosely typed flag as ``True``, ``False`` or ``None``.

    Real booleans and ``None`` pass through untouched. Anything else is
    stringified, trimmed and lower-cased, then matched against
    ``true/1/yes`` and ``false/0/no``. Unrecognised text yields ``None``.
    """
    if value is None or isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    known = {
        'true': True,
        '1': True,
        'yes': True,
        'false': False,
        '0': False,
        'no': False,
    }
    return known.get(token)
| 37 | |||
| 38 | |||
def parse_float(value):
    """Convert *value* to ``float``, returning ``None`` when it cannot be.

    ``None`` and the empty string count as "no value". Any other input that
    ``float()`` rejects also yields ``None``, including non-scalar JSON
    values such as lists or dicts and integers too large for a float, so one
    odd field never aborts a whole normalization run.
    """
    if value in (None, ''):
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: list/dict from JSONL; OverflowError: huge int literal.
        return None
| 46 | |||
| 47 | |||
def _parse_int(value):
    """Coerce *value* to ``int``; return ``None`` when blank or unparsable."""
    if value in (None, ''):
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    # CSV exports often render integers as '44100.0'; accept those, but only
    # when the text denotes a whole number.
    try:
        as_float = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    return int(as_float) if as_float.is_integer() else None


def normalize_row(row: dict, mapping: dict[int, dict], source_dataset: str, default_split: str) -> dict:
    """Build one manifest-ready record from a raw export row.

    Args:
        row: Raw export row. ``type``, ``song_id``, ``asset_id`` and
            ``audio_path`` are required; all other keys are optional.
        mapping: Rules keyed by integer asset type. A type with no rule is
            treated as role ``excluded``, bucket ``unknown``, not trainable.
        source_dataset: Used when the row has no truthy ``source_dataset``.
        default_split: Used when the row has no truthy ``split``.

    Returns:
        A new dict with the normalized fields. ``role`` and ``bucket`` prefer
        a truthy value from the row and otherwise come from the rule.
        ``trainable`` always comes from the rule. Numeric fields that are
        blank or unparsable become ``None``.

    Raises:
        KeyError: If a required key is missing from ``row``.
        ValueError: If ``row['type']`` is not an integer.
    """
    row = dict(row)
    asset_type = int(row['type'])
    rule = mapping.get(asset_type, {'role': 'excluded', 'default_bucket': 'unknown', 'trainable': False})
    normalized = {
        'song_id': row['song_id'],
        'asset_id': row['asset_id'],
        'type': asset_type,
        'role': row.get('role') or rule['role'],
        'split': row.get('split') or default_split,
        'audio_path': row['audio_path'],
        'source_dataset': row.get('source_dataset') or source_dataset,
        'title': row.get('title'),
        'artist': row.get('artist'),
        'album_id': row.get('album_id'),
        'bucket': row.get('bucket') or rule.get('default_bucket'),
        'offset_sec': parse_float(row.get('offset_sec')),
        'duration_sec': parse_float(row.get('duration_sec')),
        # Lenient like the float fields: a malformed optional number must not
        # abort the whole export.
        'sample_rate': _parse_int(row.get('sample_rate')),
        'bitrate': _parse_int(row.get('bitrate')),
        'license': row.get('license'),
        'is_lossless': parse_bool(row.get('is_lossless')),
        'trainable': bool(rule.get('trainable', False)),
    }
    return normalized
| 73 | |||
| 74 | |||
def emit_jsonl(rows: Iterable[dict], output: Path) -> None:
    """Write *rows* to *output* as JSON Lines, one object per line.

    Missing parent directories are created and an existing file is
    overwritten. The file is always written as UTF-8: records are serialised
    with ``ensure_ascii=False``, so non-ASCII text is emitted verbatim and a
    locale-default encoding could fail on it or produce a non-UTF-8 file.

    Args:
        rows: JSON-serialisable dicts, written in iteration order.
        output: Destination file path.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open('w', encoding='utf-8') as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + '\n')
| 80 | |||
| 81 | |||
def main() -> None:
    """Command-line entry point: normalize an export and print a summary.

    Relative ``--input``, ``--mapping`` and ``--output`` paths are resolved
    against the directory one level above this script's own directory, not
    against the current working directory. After the JSONL file is written,
    a JSON summary (row counts, output path, distinct roles and buckets) is
    printed to stdout.
    """
    cli = argparse.ArgumentParser(description='Normalize business CSV/JSONL export into manifest-ready JSONL rows')
    cli.add_argument('--input', required=True, help='Input CSV or JSONL export')
    cli.add_argument('--mapping', default='configs/manifests/business_type_role_mapping.json')
    cli.add_argument('--source-dataset', default='internal_catalog')
    cli.add_argument('--default-split', default='holdout')
    cli.add_argument('--output', required=True, help='Output JSONL path')
    args = cli.parse_args()

    base_dir = Path(__file__).resolve().parents[1]

    def anchored(raw: str) -> Path:
        # Absolute paths are used as given; relative ones hang off base_dir.
        candidate = Path(raw)
        if candidate.is_absolute():
            return candidate
        return (base_dir / candidate).resolve()

    input_path = anchored(args.input)
    mapping_path = anchored(args.mapping)
    output_path = anchored(args.output)

    rows = load_rows(input_path)
    mapping = load_mapping(mapping_path)

    normalized = []
    for raw_row in rows:
        normalized.append(normalize_row(raw_row, mapping, args.source_dataset, args.default_split))
    emit_jsonl(normalized, output_path)

    distinct_roles = {item['role'] for item in normalized}
    distinct_buckets = {item['bucket'] for item in normalized if item.get('bucket')}
    summary = {
        'input_rows': len(rows),
        'output_rows': len(normalized),
        'output': str(output_path),
        'roles': sorted(distinct_roles),
        'buckets': sorted(distinct_buckets),
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-02 业务导出规范化脚本交付 checkpoint | ||
| 2 | |||
| 3 | 完成项: | ||
| 4 | - 新增 `acr-engine/scripts/normalize_business_export.py` | ||
| 5 | - 已把业务导出 cookbook 从“样例说明”推进为“可运行转换脚本 + 样例输入”。 | ||
| 6 | |||
| 7 | 结论: | ||
| 8 | - 下个 session 可以直接把业务 CSV/JSONL 导出转成 manifest-ready JSONL。 | ||
| 9 | - `type -> role -> bucket` 默认规则现在不只是文档约定,也有可执行脚本承接。 | ||
| 10 | |||
| 1 | ## 2026-06-02 业务导出 cookbook 与样例交付 checkpoint | 11 | ## 2026-06-02 业务导出 cookbook 与样例交付 checkpoint |
| 2 | 12 | ||
| 3 | 完成项: | 13 | 完成项: | ... | ... |
| ... | @@ -11,6 +11,7 @@ | ... | @@ -11,6 +11,7 @@ |
| 11 | 2. 用 `type-role mapping` 补 `role` / `bucket` | 11 | 2. 用 `type-role mapping` 补 `role` / `bucket` |
| 12 | 3. 落成 CSV 或 JSONL 中间文件 | 12 | 3. 落成 CSV 或 JSONL 中间文件 |
| 13 | 4. 再转成项目 manifest | 13 | 4. 再转成项目 manifest |
| 14 | 5. 或直接先用仓库脚本转成 manifest-ready JSONL | ||
| 14 | 15 | ||
| 15 | 仓库里已经补好以下参考物: | 16 | 仓库里已经补好以下参考物: |
| 16 | - [../acr-engine/configs/manifests/business_asset_manifest_template.json](../acr-engine/configs/manifests/business_asset_manifest_template.json) | 17 | - [../acr-engine/configs/manifests/business_asset_manifest_template.json](../acr-engine/configs/manifests/business_asset_manifest_template.json) |
| ... | @@ -99,3 +100,24 @@ WHERE a.type IN (1,7,8,9,10,11,16,18,2,12); | ... | @@ -99,3 +100,24 @@ WHERE a.type IN (1,7,8,9,10,11,16,18,2,12); |
| 99 | ## Sources | 100 | ## Sources |
| 100 | - See [business-manifest-and-type-role-spec.md](./business-manifest-and-type-role-spec.md) | 101 | - See [business-manifest-and-type-role-spec.md](./business-manifest-and-type-role-spec.md) |
| 101 | - See [business-music-bucket-and-type-guide.md](./business-music-bucket-and-type-guide.md) | 102 | - See [business-music-bucket-and-type-guide.md](./business-music-bucket-and-type-guide.md) |
| 103 | |||
| 104 | |||
| 105 | ## 6. 轻量规范化脚本 | ||
| 106 | |||
| 107 | 仓库里已经补了一层可直接运行的转换脚本: | ||
| 108 | - [../acr-engine/scripts/normalize_business_export.py](../acr-engine/scripts/normalize_business_export.py) | ||
| 109 | |||
| 110 | 示例: | ||
| 111 | |||
| 112 | ```bash | ||
| 113 | cd /workspace/acr-engine | ||
| 114 | /usr/local/miniconda3/bin/python scripts/normalize_business_export.py \ | ||
| 115 | --input configs/manifests/examples/business_asset_export_example.csv \ | ||
| 116 | --output /tmp/business_asset_manifest_ready.jsonl | ||
| 117 | ``` | ||
| 118 | |||
| 119 | 这个脚本会: | ||
| 120 | 1. 读取 CSV 或 JSONL 导出 | ||
| 121 | 2. 应用 `business_type_role_mapping.json` | ||
| 122 | 3. 自动补 `role / bucket / source_dataset / split` 默认值 | ||
| 123 | 4. 输出 manifest-ready JSONL | ... | ... |
| ... | @@ -77,6 +77,8 @@ flowchart LR | ... | @@ -77,6 +77,8 @@ flowchart LR |
| 77 | - [../acr-engine/configs/manifests/business_type_role_mapping.json](../acr-engine/configs/manifests/business_type_role_mapping.json) | 77 | - [../acr-engine/configs/manifests/business_type_role_mapping.json](../acr-engine/configs/manifests/business_type_role_mapping.json) |
| 78 | - 打印脚本: | 78 | - 打印脚本: |
| 79 | - [../acr-engine/scripts/print_business_type_mapping.py](../acr-engine/scripts/print_business_type_mapping.py) | 79 | - [../acr-engine/scripts/print_business_type_mapping.py](../acr-engine/scripts/print_business_type_mapping.py) |
| 80 | - 规范化脚本: | ||
| 81 | - [../acr-engine/scripts/normalize_business_export.py](../acr-engine/scripts/normalize_business_export.py) | ||
| 80 | 82 | ||
| 81 | 示例命令: | 83 | 示例命令: |
| 82 | 84 | ... | ... |
| ... | @@ -259,6 +259,7 @@ | ... | @@ -259,6 +259,7 @@ |
| 259 | - 业务型素材优先看:[business-music-bucket-and-type-guide.md](./business-music-bucket-and-type-guide.md) | 259 | - 业务型素材优先看:[business-music-bucket-and-type-guide.md](./business-music-bucket-and-type-guide.md) |
| 260 | - Manifest/角色映射看:[business-manifest-and-type-role-spec.md](./business-manifest-and-type-role-spec.md) | 260 | - Manifest/角色映射看:[business-manifest-and-type-role-spec.md](./business-manifest-and-type-role-spec.md) |
| 261 | - SQL/CSV/JSONL 导出参考:[business-export-cookbook.md](./business-export-cookbook.md) | 261 | - SQL/CSV/JSONL 导出参考:[business-export-cookbook.md](./business-export-cookbook.md) |
| 262 | - 规范化脚本:`acr-engine/scripts/normalize_business_export.py` | ||
| 262 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 | 263 | 2. 对比 cap48 与 cap64 的不一致现象,补充分规模结论。 |
| 263 | 3. 继续补 cap64 multi-seed,而不是只保留单 seed。 | 264 | 3. 继续补 cap64 multi-seed,而不是只保留单 seed。 |
| 264 | 4. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 | 265 | 4. 继续优化 `hybrid`,重点降低波动并提升 hard case 稳定性。 | ... | ... |
-
Please register or sign in to post a comment