Fill internal query timing semantics before training on imported clips
Constraint: Internal short-video and demo assets need explicit duration/offset semantics before they can behave like real training or pgvector segment records
Rejected: Leave query offsets empty by default | Produces weaker provenance and less useful downstream segment metadata
Confidence: high
Scope-risk: narrow
Directive: Prefer source CSV timing when available, then fall back to inspected audio duration and conservative default offsets
Tested: Sample CSV run confirmed one query used CSV duration/offset (5.0/12.5) and another fell back to inspected duration/default offset (6.5/0.0), with pgvector segments matching
Not-tested: Complex multi-segment offset generation from long-form internal masters
Showing
3 changed files
with
73 additions
and
7 deletions
| ... | @@ -59,6 +59,18 @@ def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool | ... | @@ -59,6 +59,18 @@ def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool |
| 59 | return True, None | 59 | return True, None |
| 60 | 60 | ||
| 61 | 61 | ||
| 62 | def parse_optional_float(value: str | None) -> float | None: | ||
| 63 | if value is None: | ||
| 64 | return None | ||
| 65 | text = str(value).strip() | ||
| 66 | if text == "": | ||
| 67 | return None | ||
| 68 | try: | ||
| 69 | return float(text) | ||
| 70 | except ValueError: | ||
| 71 | return None | ||
| 72 | |||
| 73 | |||
| 62 | def normalize_row(row: Dict[str, str], args) -> Dict: | 74 | def normalize_row(row: Dict[str, str], args) -> Dict: |
| 63 | type_code = int(row[args.type_field]) | 75 | type_code = int(row[args.type_field]) |
| 64 | policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) | 76 | policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) |
| ... | @@ -67,6 +79,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: | ... | @@ -67,6 +79,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: |
| 67 | audio_path = row.get(args.path_field) | 79 | audio_path = row.get(args.path_field) |
| 68 | audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None) | 80 | audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None) |
| 69 | validation_status = "ok" if audio_exists else "missing_audio" | 81 | validation_status = "ok" if audio_exists else "missing_audio" |
| 82 | csv_duration = parse_optional_float(row.get(args.duration_field)) | ||
| 83 | csv_offset = parse_optional_float(row.get(args.offset_field)) | ||
| 70 | record = { | 84 | record = { |
| 71 | "asset_id": row.get(args.asset_id_field), | 85 | "asset_id": row.get(args.asset_id_field), |
| 72 | "canonical_song_id": canonical_song_id, | 86 | "canonical_song_id": canonical_song_id, |
| ... | @@ -79,6 +93,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: | ... | @@ -79,6 +93,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: |
| 79 | "audio_path": audio_path, | 93 | "audio_path": audio_path, |
| 80 | "audio_exists": audio_exists, | 94 | "audio_exists": audio_exists, |
| 81 | "duration_sec": duration_sec, | 95 | "duration_sec": duration_sec, |
| 96 | "csv_duration_sec": csv_duration, | ||
| 97 | "csv_offset_sec": csv_offset, | ||
| 82 | "validation_status": validation_status, | 98 | "validation_status": validation_status, |
| 83 | "title": row.get(args.title_field), | 99 | "title": row.get(args.title_field), |
| 84 | "artist": row.get(args.artist_field), | 100 | "artist": row.get(args.artist_field), |
| ... | @@ -87,7 +103,14 @@ def normalize_row(row: Dict[str, str], args) -> Dict: | ... | @@ -87,7 +103,14 @@ def normalize_row(row: Dict[str, str], args) -> Dict: |
| 87 | return record | 103 | return record |
| 88 | 104 | ||
| 89 | 105 | ||
| 90 | def to_manifest_record(record: Dict, bucket: str) -> Dict: | 106 | def to_manifest_record(record: Dict, bucket: str, args) -> Dict: |
| 107 | inferred_query_duration = record["csv_duration_sec"] | ||
| 108 | if inferred_query_duration is None: | ||
| 109 | inferred_query_duration = record["duration_sec"] if record["duration_sec"] is not None else args.default_query_duration | ||
| 110 | inferred_query_offset = record["csv_offset_sec"] | ||
| 111 | if inferred_query_offset is None: | ||
| 112 | inferred_query_offset = args.default_query_offset | ||
| 113 | |||
| 91 | base = { | 114 | base = { |
| 92 | "song_id": record["canonical_song_id"], | 115 | "song_id": record["canonical_song_id"], |
| 93 | "version_id": record["version_id"], | 116 | "version_id": record["version_id"], |
| ... | @@ -108,13 +131,13 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: | ... | @@ -108,13 +131,13 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: |
| 108 | return { | 131 | return { |
| 109 | **base, | 132 | **base, |
| 110 | "type": record["recommended_train_type"], | 133 | "type": record["recommended_train_type"], |
| 111 | "duration": record["duration_sec"] or 0.0, | 134 | "duration": inferred_query_duration or 0.0, |
| 112 | "offset": None, | 135 | "offset": inferred_query_offset, |
| 113 | "segment_type": "external_query", | 136 | "segment_type": "external_query", |
| 114 | } | 137 | } |
| 115 | 138 | ||
| 116 | 139 | ||
| 117 | def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]: | 140 | def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]: |
| 118 | references, queries, metadata_only, excluded = [], [], [], [] | 141 | references, queries, metadata_only, excluded = [], [], [], [] |
| 119 | for record in rows: | 142 | for record in rows: |
| 120 | bucket = record["bucket"] | 143 | bucket = record["bucket"] |
| ... | @@ -122,9 +145,9 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ | ... | @@ -122,9 +145,9 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ |
| 122 | bucket = include_conditionals_as if include_conditionals_as != "skip" else EXCLUDED | 145 | bucket = include_conditionals_as if include_conditionals_as != "skip" else EXCLUDED |
| 123 | 146 | ||
| 124 | if bucket == REFERENCE: | 147 | if bucket == REFERENCE: |
| 125 | references.append(to_manifest_record(record, REFERENCE)) | 148 | references.append(to_manifest_record(record, REFERENCE, args)) |
| 126 | elif bucket == QUERY: | 149 | elif bucket == QUERY: |
| 127 | queries.append(to_manifest_record(record, QUERY)) | 150 | queries.append(to_manifest_record(record, QUERY, args)) |
| 128 | elif bucket == METADATA: | 151 | elif bucket == METADATA: |
| 129 | metadata_only.append(record) | 152 | metadata_only.append(record) |
| 130 | else: | 153 | else: |
| ... | @@ -248,10 +271,14 @@ def main(): | ... | @@ -248,10 +271,14 @@ def main(): |
| 248 | parser.add_argument("--version-field", default="version_id") | 271 | parser.add_argument("--version-field", default="version_id") |
| 249 | parser.add_argument("--type-field", default="type") | 272 | parser.add_argument("--type-field", default="type") |
| 250 | parser.add_argument("--path-field", default="audio_path") | 273 | parser.add_argument("--path-field", default="audio_path") |
| 274 | parser.add_argument("--duration-field", default="duration_sec") | ||
| 275 | parser.add_argument("--offset-field", default="offset_sec") | ||
| 251 | parser.add_argument("--title-field", default="title") | 276 | parser.add_argument("--title-field", default="title") |
| 252 | parser.add_argument("--artist-field", default="artist") | 277 | parser.add_argument("--artist-field", default="artist") |
| 253 | parser.add_argument("--platform-field", default="source_platform") | 278 | parser.add_argument("--platform-field", default="source_platform") |
| 254 | parser.add_argument("--audio-root", default=None) | 279 | parser.add_argument("--audio-root", default=None) |
| 280 | parser.add_argument("--default-query-duration", type=float, default=8.0) | ||
| 281 | parser.add_argument("--default-query-offset", type=float, default=0.0) | ||
| 255 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") | 282 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") |
| 256 | parser.add_argument("--emit-manifests", action="store_true") | 283 | parser.add_argument("--emit-manifests", action="store_true") |
| 257 | parser.add_argument("--emit-pgvector-json", action="store_true") | 284 | parser.add_argument("--emit-pgvector-json", action="store_true") |
| ... | @@ -266,7 +293,7 @@ def main(): | ... | @@ -266,7 +293,7 @@ def main(): |
| 266 | for row in reader: | 293 | for row in reader: |
| 267 | rows.append(normalize_row(row, args)) | 294 | rows.append(normalize_row(row, args)) |
| 268 | 295 | ||
| 269 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as) | 296 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args) |
| 270 | missing_audio = sum(1 for row in rows if not row["audio_exists"]) | 297 | missing_audio = sum(1 for row in rows if not row["audio_exists"]) |
| 271 | trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) | 298 | trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) |
| 272 | 299 | ... | ... |
| ... | @@ -2,6 +2,34 @@ | ... | @@ -2,6 +2,34 @@ |
| 2 | 2 | ||
| 3 | ## 2026-06-02 | 3 | ## 2026-06-02 |
| 4 | 4 | ||
| 5 | ### Stage: 为内部素材 query 自动补 duration / offset 规则 | ||
| 6 | |||
| 7 | 完成项: | ||
| 8 | - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py` | ||
| 9 | - 新增 `--duration-field` | ||
| 10 | - 新增 `--offset-field` | ||
| 11 | - 新增 `--default-query-duration` | ||
| 12 | - 新增 `--default-query-offset` | ||
| 13 | - 规则更新: | ||
| 14 | - query 优先使用 CSV 提供的 `duration/offset` | ||
| 15 | - 无 CSV duration 时,优先使用音频探测时长;探测不到时使用 `--default-query-duration` | ||
| 16 | - 无 CSV offset 时,使用 `--default-query-offset` 指定的默认 offset | ||
| 17 | - pgvector payload 同步使用生成后的 `duration/offset` | ||
| 18 | |||
| 19 | 验证结果: | ||
| 20 | - 用 3 行样例 CSV 验证: | ||
| 21 | - `song_a` 短视频 query 使用 CSV 值: | ||
| 22 | - `duration = 5.0` | ||
| 23 | - `offset = 12.5` | ||
| 24 | - `song_c` demo query 使用自动回填: | ||
| 25 | - `duration = 6.5` | ||
| 26 | - `offset = 0.0` | ||
| 27 | - `pgvector_payload.json` 中的 `segments` 也已同步带上正确 `offset_sec/duration_sec` | ||
| 28 | |||
| 29 | 结论: | ||
| 30 | - 现在内部素材 query 不再只输出“空 offset”,而是带有明确的 `duration/offset` | ||
| 31 | - 对短视频片段、demo、后续回流片段的训练和入库更接近真实可用状态 | ||
| 32 | |||
| 5 | ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出 | 33 | ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出 |
| 6 | 34 | ||
| 7 | 完成项: | 35 | 完成项: | ... | ... |
| ... | @@ -525,6 +525,17 @@ query: | ... | @@ -525,6 +525,17 @@ query: |
| 525 | - `duration` | 525 | - `duration` |
| 526 | - `missing_audio` 汇总 | 526 | - `missing_audio` 汇总 |
| 527 | 527 | ||
| 528 | 同时脚本现在还支持: | ||
| 529 | - `--duration-field` | ||
| 530 | - `--offset-field` | ||
| 531 | - `--default-query-duration` | ||
| 532 | - `--default-query-offset` | ||
| 533 | |||
| 534 | 规则是: | ||
| 535 | - query 优先使用 CSV 自带的 `duration/offset` | ||
| 536 | - CSV 没有 duration 时,优先使用音频探测时长;探测不到时回落到 `--default-query-duration`(默认 `8.0`) | ||
| 537 | - CSV 没有 offset 时,回落到 `--default-query-offset`(默认 `0.0`) | ||
| 538 | |||
| 528 | 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: | 539 | 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: |
| 529 | 540 | ||
| 530 | ```bash | 541 | ```bash | ... | ... |
-
Please register or sign in to post a comment