Make internal asset policies executable before DB-scale import
Constraint: Internal type enums need a repeatable mapping path into manifest-ready buckets before bulk database exports begin.
Rejected: Leave type handling as documentation only | Would force repeated manual filtering and inconsistent ingestion decisions.
Confidence: high
Scope-risk: narrow
Directive: Keep internal asset mapping defaults conservative; conditional instrumental variants should stay opt-in until version-aware training is ready.
Tested: internal_asset_type_mapper.py on a 6-row sample CSV produced references=2, queries=2, metadata_only=1, excluded=1 with the expected type routing.
Not-tested: Direct SQL export integration against the live source database.
Showing
3 changed files
with
215 additions
and
0 deletions
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Map internal asset type codes into manifest-ready ACR roles. | ||
| 3 | |||
| 4 | Input: CSV exported from an internal asset table. | ||
| 5 | Output: JSON bundles for references, queries, metadata-only assets, and excluded assets. | ||
| 6 | """ | ||
| 7 | |||
| 8 | from __future__ import annotations | ||
| 9 | |||
| 10 | import argparse | ||
| 11 | import csv | ||
| 12 | import json | ||
| 13 | from pathlib import Path | ||
| 14 | from typing import Dict, List, Tuple | ||
| 15 | |||
# Routing buckets a record can be assigned to.  CONDITIONAL is only an
# intermediate marker: route_records() resolves it to another bucket based on
# its include_conditionals_as argument.
REFERENCE = "reference"
QUERY = "query"
METADATA = "metadata_only"
EXCLUDED = "excluded"
CONDITIONAL = "conditional"

# Key names of every policy entry, in the column order used by _POLICY_ROWS.
_POLICY_FIELDS = ("bucket", "audio_role", "train_type", "priority")

# One row per internal asset type code:
# (type code, bucket, audio_role, train_type, priority).
# Codes without a row (15, for instance) are handled by the fallback policy in
# normalize_row().
_POLICY_ROWS = (
    (1, REFERENCE, "original_lossy", "reference", "secondary_reference"),
    (2, CONDITIONAL, "inst_with_harmony_lossy", "hard_negative", "conditional"),
    (3, METADATA, "lyrics_txt", "none", "metadata"),
    (4, METADATA, "cover_image", "none", "metadata"),
    (5, METADATA, "license_doc", "none", "metadata"),
    (6, METADATA, "album_info", "none", "metadata"),
    (7, QUERY, "short_video_clip", "clean", "high_value_query"),
    (8, QUERY, "chorus_clip", "clean", "high_value_query"),
    (9, CONDITIONAL, "inst_no_harmony_lossy", "hard_negative", "conditional"),
    (10, CONDITIONAL, "inst_no_harmony_lossless", "hard_negative", "conditional"),
    (11, REFERENCE, "original_lossless", "reference", "primary_reference"),
    (12, CONDITIONAL, "inst_with_harmony_lossless", "hard_negative", "conditional"),
    (13, METADATA, "lyrics_lrc", "none", "metadata"),
    (14, METADATA, "cover_source", "none", "metadata"),
    (16, QUERY, "short_video_clip", "clean", "high_value_query"),
    (17, METADATA, "archive_package", "none", "metadata"),
    (18, QUERY, "demo_audio", "clean", "screen_before_use"),
    (19, METADATA, "sheet_image", "none", "metadata"),
    (20, METADATA, "lyrics_translation", "none", "metadata"),
)

# Lookup table: internal asset type code -> policy dict keyed by _POLICY_FIELDS.
TYPE_POLICY: Dict[int, Dict[str, str]] = {
    code: dict(zip(_POLICY_FIELDS, values)) for code, *values in _POLICY_ROWS
}
| 43 | |||
| 44 | |||
def normalize_row(row: Dict[str, str], args) -> Dict:
    """Turn one CSV row into a normalized asset record with its routing policy.

    Args:
        row: One input row, keyed by CSV column name.
        args: Parsed CLI namespace; its ``*_field`` attributes name the CSV
            columns to read.

    Returns:
        A dict holding the asset identifiers, the policy fields looked up in
        ``TYPE_POLICY`` (``audio_role``, ``recommended_train_type``,
        ``priority``, ``bucket``) and the pass-through columns.  For a type
        cell that cannot be parsed as an integer, ``asset_type_code`` keeps
        the raw cell value and the record gets the ``EXCLUDED`` bucket.

    Raises:
        KeyError: If the type column is absent from ``row`` altogether, which
            points at a wrong column name rather than a dirty cell.
    """
    raw_type = row[args.type_field]
    try:
        type_code = int(raw_type)
    except (TypeError, ValueError):
        # Blank / non-numeric cells (ValueError) and None cells (TypeError)
        # must not abort a bulk run; they take the unknown-type policy below.
        type_code = None

    unknown_policy = {
        "bucket": EXCLUDED,
        "audio_role": "unknown",
        "train_type": "none",
        "priority": "unknown",
    }
    # Type codes without a TYPE_POLICY entry are excluded, not rejected.
    policy = TYPE_POLICY.get(type_code, unknown_policy)

    # `or` chains treat empty cells the same as missing columns.
    canonical_song_id = (
        row.get(args.song_field)
        or row.get(args.canonical_song_field)
        or row.get(args.asset_id_field)
        or "unknown_song"
    )
    type_label = "unknown" if type_code is None else type_code
    version_id = row.get(args.version_field) or f"{canonical_song_id}_type_{type_label}"

    return {
        "asset_id": row.get(args.asset_id_field),
        "canonical_song_id": canonical_song_id,
        "version_id": version_id,
        # Keep the raw cell for unparseable types so the excluded bundle shows
        # what the export actually contained.
        "asset_type_code": raw_type if type_code is None else type_code,
        "audio_role": policy["audio_role"],
        "recommended_train_type": policy["train_type"],
        "priority": policy["priority"],
        "bucket": policy["bucket"],
        "audio_path": row.get(args.path_field),
        "title": row.get(args.title_field),
        "artist": row.get(args.artist_field),
        "source_platform": row.get(args.platform_field) or "internal",
    }
| 65 | |||
| 66 | |||
def to_manifest_record(record: Dict, bucket: str) -> Dict:
    """Project a normalized record onto the manifest layout for ``bucket``.

    Args:
        record: A normalized asset record.
        bucket: The bucket the record is being exported to.  ``REFERENCE``
            yields a reference entry; any other value yields a query-style
            entry.

    Returns:
        A new manifest dict.  Reference entries have ``type`` fixed to
        ``"reference"``; query-style entries take ``type`` from the record's
        ``recommended_train_type`` and additionally carry ``offset`` and
        ``segment_type``.  ``duration`` is always written as ``0.0``.
    """
    is_reference = bucket == REFERENCE

    # Fields shared by both manifest flavours.
    manifest = {
        "song_id": record["canonical_song_id"],
        "version_id": record["version_id"],
        "asset_type_code": record["asset_type_code"],
        "audio_role": record["audio_role"],
        "audio_path": record["audio_path"],
        "source_dataset": "internal_assets",
        "source_platform": record["source_platform"],
    }
    manifest["type"] = "reference" if is_reference else record["recommended_train_type"]
    manifest["duration"] = 0.0

    if not is_reference:
        manifest["offset"] = None
        manifest["segment_type"] = "external_query"
    return manifest
| 90 | |||
| 91 | |||
def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]:
    """Split normalized records into the four output bundles.

    Args:
        rows: Normalized asset records, each carrying a ``bucket`` key.
        include_conditionals_as: Where ``CONDITIONAL`` records go.  ``"skip"``
            sends them to the excluded bundle; any other value is used as the
            bucket name directly.

    Returns:
        ``(references, queries, metadata_only, excluded)``.  References and
        queries are converted with ``to_manifest_record``; metadata-only and
        excluded entries are the normalized records themselves.
    """
    references: List[Dict] = []
    queries: List[Dict] = []
    metadata_only: List[Dict] = []
    excluded: List[Dict] = []

    # The conditional target is the same for every row, so resolve it once.
    conditional_target = EXCLUDED if include_conditionals_as == "skip" else include_conditionals_as

    manifest_sinks = {REFERENCE: references, QUERY: queries}
    passthrough_sinks = {METADATA: metadata_only}

    for item in rows:
        destination = item["bucket"]
        if destination == CONDITIONAL:
            destination = conditional_target

        if destination in manifest_sinks:
            manifest_sinks[destination].append(to_manifest_record(item, destination))
        else:
            # Anything that is not metadata lands in the excluded bundle.
            passthrough_sinks.get(destination, excluded).append(item)

    return references, queries, metadata_only, excluded
| 108 | |||
| 109 | |||
def main(argv=None):
    """Command-line entry point: map an internal asset CSV into JSON bundles.

    Reads the CSV, normalizes every row, routes the records and writes
    ``references.json``, ``queries.json``, ``metadata_only.json``,
    ``excluded.json`` and ``summary.json`` into the output directory.  The
    summary is also printed to stdout.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read the process command line.

    Raises:
        SystemExit: On invalid arguments, or when the CSV has a header row
            that does not contain the configured type column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_path")
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--asset-id-field", default="id")
    parser.add_argument("--song-field", default="song_id")
    parser.add_argument("--canonical-song-field", default="canonical_song_id")
    parser.add_argument("--version-field", default="version_id")
    parser.add_argument("--type-field", default="type")
    parser.add_argument("--path-field", default="audio_path")
    parser.add_argument("--title-field", default="title")
    parser.add_argument("--artist-field", default="artist")
    parser.add_argument("--platform-field", default="source_platform")
    parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise become part of the first header name.
    parser.add_argument("--input-encoding", default="utf-8-sig")
    args = parser.parse_args(argv)

    # Explicit encoding: titles/artists may be non-ASCII, and the locale
    # default is not guaranteed to be UTF-8.
    with open(args.csv_path, newline="", encoding=args.input_encoding) as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        # Fail fast on a wrong --type-field instead of crashing on the first
        # row.  A completely empty file (no header) still yields zero rows.
        if header is not None and args.type_field not in header:
            parser.error(f"type column {args.type_field!r} not found in {args.csv_path}; columns: {header}")
        rows = [normalize_row(row, args) for row in reader]

    references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as)

    summary = {
        "input_rows": len(rows),
        "references": len(references),
        "queries": len(queries),
        "metadata_only": len(metadata_only),
        "excluded": len(excluded),
        "include_conditionals_as": args.include_conditionals_as,
    }
    outputs = {
        "references.json": references,
        "queries.json": queries,
        "metadata_only.json": metadata_only,
        "excluded.json": excluded,
        "summary.json": summary,
    }

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for name, payload in outputs.items():
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 regardless of the platform default.
        (out_dir / name).write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(json.dumps(summary, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
| ... | @@ -2,6 +2,34 @@ | ... | @@ -2,6 +2,34 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 将内部素材 type 策略落成可执行映射脚本 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 新增 [acr-engine/scripts/internal_asset_type_mapper.py](../acr-engine/scripts/internal_asset_type_mapper.py) | ||
| 9 | - 支持从内部素材 CSV 自动分流到: | ||
| 10 | - `references.json` | ||
| 11 | - `queries.json` | ||
| 12 | - `metadata_only.json` | ||
| 13 | - `excluded.json` | ||
| 14 | - 支持 `--include-conditionals-as`,可选把伴奏类临时导出成 `query` 或 `reference` | ||
| 15 | - 在 [training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 增加脚本入口说明 | ||
| 16 | |||
| 17 | 验证结果: | ||
| 18 | - 使用 6 行样例 CSV 验证: | ||
| 19 | - `11/1 -> references` | ||
| 20 | - `7/18 -> queries` | ||
| 21 | - `3 -> metadata_only` | ||
| 22 | - `12 -> excluded` | ||
| 23 | - 摘要输出: | ||
| 24 | - `references = 2` | ||
| 25 | - `queries = 2` | ||
| 26 | - `metadata_only = 1` | ||
| 27 | - `excluded = 1` | ||
| 28 | |||
| 29 | 结论: | ||
| 30 | - 现在内部素材 type 策略已经不只在文档里,而是可以直接作为批量清洗入口使用 | ||
| 31 | - 后续如果要从数据库导出 CSV 再转 manifest,已经有第一版可执行桥接脚本 | ||
| 32 | |||
| 5 | ### Stage: 为内部素材 type 枚举补齐训练参与策略文档 | 33 | ### Stage: 为内部素材 type 枚举补齐训练参与策略文档 |
| 6 | 34 | ||
| 7 | 完成项: | 35 | 完成项: | ... | ... |
| ... | @@ -478,5 +478,35 @@ query: | ... | @@ -478,5 +478,35 @@ query: |
| 478 | | 18 | `demo_audio` | `clean` / `augmented` | | 478 | | 18 | `demo_audio` | `clean` / `augmented` | |
| 479 | | 2/9/10/12 | `instrumental_variant` | 先不进主训练,或做 hard negative | | 479 | | 2/9/10/12 | `instrumental_variant` | 先不进主训练,或做 hard negative | |
| 480 | 480 | ||
| 481 | ## 12.6 现在仓库里已经有可执行映射脚本 | ||
| 482 | |||
| 483 | 脚本: | ||
| 484 | - [acr-engine/scripts/internal_asset_type_mapper.py](../acr-engine/scripts/internal_asset_type_mapper.py) | ||
| 485 | |||
| 486 | 作用: | ||
| 487 | - 读取内部素材 CSV | ||
| 488 | - 按 `type` 枚举自动分流成: | ||
| 489 | - `references.json` | ||
| 490 | - `queries.json` | ||
| 491 | - `metadata_only.json` | ||
| 492 | - `excluded.json` | ||
| 493 | |||
| 494 | 最短示例: | ||
| 495 | |||
| 496 | ```bash | ||
| 497 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map | ||
| 498 | ``` | ||
| 499 | |||
| 500 | 如果你想临时把伴奏类(type 2/9/10/12)也纳入导出,可通过 `--include-conditionals-as` 指定 `query`(或 `reference`),例如: | ||
| 501 | |||
| 502 | ```bash | ||
| 503 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --output-dir out/internal_asset_map --include-conditionals-as query | ||
| 504 | ``` | ||
| 505 | |||
| 506 | 但默认仍建议: | ||
| 507 | - `--include-conditionals-as skip` | ||
| 508 | |||
| 509 | 这样更符合当前主任务“先把原曲识别打稳,再逐步纳入伴奏版本”的策略。 | ||
| 510 | |||
| 481 | ## Sources | 511 | ## Sources |
| 482 | - 当前代码事实来自 [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py), [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py), [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py), [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py), [acr-engine/train.py](../acr-engine/train.py) | 512 | - 当前代码事实来自 [acr-engine/src/data/dataset.py](../acr-engine/src/data/dataset.py), [acr-engine/src/data/manifest_tools.py](../acr-engine/src/data/manifest_tools.py), [acr-engine/src/data/external_adapters.py](../acr-engine/src/data/external_adapters.py), [acr-engine/src/utils/audio.py](../acr-engine/src/utils/audio.py), [acr-engine/src/engines/ecapa_embedder.py](../acr-engine/src/engines/ecapa_embedder.py), [acr-engine/train.py](../acr-engine/train.py) | ... | ... |
-
Please register or sign in to post a comment