Preserve internal query window semantics for trainable asset exports
Constraint: Internal assets must support both manually labeled clips and whole-track auto-window generation without breaking pgvector export
Rejected: Treat missing query duration as full audio duration | prevents multi-window query expansion for long source audio
Confidence: high
Scope-risk: narrow
Directive: Keep explicit CSV offset authoritative; only auto-expand when offset is absent and query_stride is set
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/internal_asset_type_mapper.py; local 30s/40s WAV fixture export with manifest + pgvector verification
Not-tested: End-to-end retraining with newly expanded internal manifests
Showing 3 changed files with 112 additions and 3 deletions
| ... | @@ -10,6 +10,7 @@ from __future__ import annotations | ... | @@ -10,6 +10,7 @@ from __future__ import annotations |
| 10 | import argparse | 10 | import argparse |
| 11 | import csv | 11 | import csv |
| 12 | import json | 12 | import json |
| 13 | import math | ||
| 13 | import random | 14 | import random |
| 14 | from pathlib import Path | 15 | from pathlib import Path |
| 15 | from typing import Dict, List, Tuple | 16 | from typing import Dict, List, Tuple |
| ... | @@ -106,7 +107,7 @@ def normalize_row(row: Dict[str, str], args) -> Dict: | ... | @@ -106,7 +107,7 @@ def normalize_row(row: Dict[str, str], args) -> Dict: |
| 106 | def to_manifest_record(record: Dict, bucket: str, args) -> Dict: | 107 | def to_manifest_record(record: Dict, bucket: str, args) -> Dict: |
| 107 | inferred_query_duration = record["csv_duration_sec"] | 108 | inferred_query_duration = record["csv_duration_sec"] |
| 108 | if inferred_query_duration is None: | 109 | if inferred_query_duration is None: |
| 109 | inferred_query_duration = record["duration_sec"] if record["duration_sec"] is not None else args.default_query_duration | 110 | inferred_query_duration = args.default_query_duration |
| 110 | inferred_query_offset = record["csv_offset_sec"] | 111 | inferred_query_offset = record["csv_offset_sec"] |
| 111 | if inferred_query_offset is None: | 112 | if inferred_query_offset is None: |
| 112 | inferred_query_offset = args.default_query_offset | 113 | inferred_query_offset = args.default_query_offset |
| ... | @@ -133,7 +134,9 @@ def to_manifest_record(record: Dict, bucket: str, args) -> Dict: | ... | @@ -133,7 +134,9 @@ def to_manifest_record(record: Dict, bucket: str, args) -> Dict: |
| 133 | "type": record["recommended_train_type"], | 134 | "type": record["recommended_train_type"], |
| 134 | "duration": inferred_query_duration or 0.0, | 135 | "duration": inferred_query_duration or 0.0, |
| 135 | "offset": inferred_query_offset, | 136 | "offset": inferred_query_offset, |
| 137 | "offset_is_explicit": record["csv_offset_sec"] is not None, | ||
| 136 | "segment_type": "external_query", | 138 | "segment_type": "external_query", |
| 139 | "source_audio_duration": record["duration_sec"], | ||
| 137 | } | 140 | } |
| 138 | 141 | ||
| 139 | 142 | ||
| ... | @@ -155,6 +158,52 @@ def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple | ... | @@ -155,6 +158,52 @@ def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple |
| 155 | return references, queries, metadata_only, excluded | 158 | return references, queries, metadata_only, excluded |
| 156 | 159 | ||
| 157 | 160 | ||
| 161 | def expand_query_records(queries: List[Dict], query_stride: float | None) -> List[Dict]: | ||
| 162 | if not query_stride or query_stride <= 0: | ||
| 163 | return queries | ||
| 164 | |||
| 165 | expanded: List[Dict] = [] | ||
| 166 | for row in queries: | ||
| 167 | duration = float(row.get("duration", 0.0) or 0.0) | ||
| 168 | audio_duration = float(duration or 0.0) | ||
| 169 | source_duration = row.get("source_audio_duration") | ||
| 170 | if source_duration is not None: | ||
| 171 | try: | ||
| 172 | audio_duration = float(source_duration) | ||
| 173 | except (TypeError, ValueError): | ||
| 174 | pass | ||
| 175 | |||
| 176 | explicit_offset = row.get("offset") | ||
| 177 | offset_is_explicit = bool(row.get("offset_is_explicit")) | ||
| 178 | if offset_is_explicit and explicit_offset not in (None, ""): | ||
| 179 | clone = dict(row) | ||
| 180 | clone["query_index"] = 0 | ||
| 181 | expanded.append(clone) | ||
| 182 | continue | ||
| 183 | |||
| 184 | if audio_duration <= 0 or duration <= 0 or audio_duration <= duration: | ||
| 185 | clone = dict(row) | ||
| 186 | clone["offset"] = 0.0 | ||
| 187 | clone["query_index"] = 0 | ||
| 188 | expanded.append(clone) | ||
| 189 | continue | ||
| 190 | |||
| 191 | max_offset = max(0.0, audio_duration - duration) | ||
| 192 | n_steps = int(math.floor(max_offset / query_stride)) | ||
| 193 | offsets = [round(i * query_stride, 3) for i in range(n_steps + 1)] | ||
| 194 | if not offsets: | ||
| 195 | offsets = [0.0] | ||
| 196 | if round(max_offset, 3) > offsets[-1]: | ||
| 197 | offsets.append(round(max_offset, 3)) | ||
| 198 | |||
| 199 | for idx, offset in enumerate(offsets): | ||
| 200 | clone = dict(row) | ||
| 201 | clone["offset"] = offset | ||
| 202 | clone["query_index"] = idx | ||
| 203 | expanded.append(clone) | ||
| 204 | return expanded | ||
| 205 | |||
| 206 | |||
| 158 | def build_manifest_bundle( | 207 | def build_manifest_bundle( |
| 159 | references: List[Dict], | 208 | references: List[Dict], |
| 160 | queries: List[Dict], | 209 | queries: List[Dict], |
| ... | @@ -227,6 +276,7 @@ def build_pgvector_payload( | ... | @@ -227,6 +276,7 @@ def build_pgvector_payload( |
| 227 | "asset_type_code": row.get("asset_type_code"), | 276 | "asset_type_code": row.get("asset_type_code"), |
| 228 | "audio_exists": row.get("audio_exists"), | 277 | "audio_exists": row.get("audio_exists"), |
| 229 | "validation_status": row.get("validation_status"), | 278 | "validation_status": row.get("validation_status"), |
| 279 | "query_index": row.get("query_index"), | ||
| 230 | }) | 280 | }) |
| 231 | 281 | ||
| 232 | for row in queries: | 282 | for row in queries: |
| ... | @@ -252,6 +302,7 @@ def build_pgvector_payload( | ... | @@ -252,6 +302,7 @@ def build_pgvector_payload( |
| 252 | "asset_type_code": row.get("asset_type_code"), | 302 | "asset_type_code": row.get("asset_type_code"), |
| 253 | "audio_exists": row.get("audio_exists"), | 303 | "audio_exists": row.get("audio_exists"), |
| 254 | "validation_status": row.get("validation_status"), | 304 | "validation_status": row.get("validation_status"), |
| 305 | "query_index": row.get("query_index"), | ||
| 255 | }) | 306 | }) |
| 256 | 307 | ||
| 257 | return { | 308 | return { |
| ... | @@ -279,6 +330,7 @@ def main(): | ... | @@ -279,6 +330,7 @@ def main(): |
| 279 | parser.add_argument("--audio-root", default=None) | 330 | parser.add_argument("--audio-root", default=None) |
| 280 | parser.add_argument("--default-query-duration", type=float, default=8.0) | 331 | parser.add_argument("--default-query-duration", type=float, default=8.0) |
| 281 | parser.add_argument("--default-query-offset", type=float, default=0.0) | 332 | parser.add_argument("--default-query-offset", type=float, default=0.0) |
| 333 | parser.add_argument("--query-stride", type=float, default=None) | ||
| 282 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") | 334 | parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") |
| 283 | parser.add_argument("--emit-manifests", action="store_true") | 335 | parser.add_argument("--emit-manifests", action="store_true") |
| 284 | parser.add_argument("--emit-pgvector-json", action="store_true") | 336 | parser.add_argument("--emit-pgvector-json", action="store_true") |
| ... | @@ -294,6 +346,7 @@ def main(): | ... | @@ -294,6 +346,7 @@ def main(): |
| 294 | rows.append(normalize_row(row, args)) | 346 | rows.append(normalize_row(row, args)) |
| 295 | 347 | ||
| 296 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args) | 348 | references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args) |
| 349 | queries = expand_query_records(queries, args.query_stride) | ||
| 297 | missing_audio = sum(1 for row in rows if not row["audio_exists"]) | 350 | missing_audio = sum(1 for row in rows if not row["audio_exists"]) |
| 298 | trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) | 351 | trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) |
| 299 | 352 | ||
| ... | @@ -308,6 +361,7 @@ def main(): | ... | @@ -308,6 +361,7 @@ def main(): |
| 308 | "missing_audio": missing_audio, | 361 | "missing_audio": missing_audio, |
| 309 | "trainable_audio_rows": trainable_audio_rows, | 362 | "trainable_audio_rows": trainable_audio_rows, |
| 310 | "include_conditionals_as": args.include_conditionals_as, | 363 | "include_conditionals_as": args.include_conditionals_as, |
| 364 | "query_stride": args.query_stride, | ||
| 311 | } | 365 | } |
| 312 | outputs = { | 366 | outputs = { |
| 313 | "references.json": references, | 367 | "references.json": references, | ... | ... |
| ... | @@ -5372,3 +5372,29 @@ | ... | @@ -5372,3 +5372,29 @@ |
| 5372 | 结论: | 5372 | 结论: |
| 5373 | - type-aware weighting 比 naive oversampling 更有效 | 5373 | - type-aware weighting 比 naive oversampling 更有效 |
| 5374 | - 下一轮应专门针对 confused 类设计更强的 negative mining / confusion-aware 信号 | 5374 | - 下一轮应专门针对 confused 类设计更强的 negative mining / confusion-aware 信号 |
| 5375 | |||
| 5376 | ### Stage: internal asset query stride fix + type policy hardening | ||
| 5377 | |||
| 5378 | 完成项: | ||
| 5379 | - 修复 `acr-engine/scripts/internal_asset_type_mapper.py` 中内部素材 query 的自动扩窗逻辑 | ||
| 5380 | - 新增 `source_audio_duration` 透传,使长音频可基于真实总时长按 `--query-stride` 展开 | ||
| 5381 | - 修复 “默认 offset=0 被误判为显式 offset” 的问题,确保只有 CSV 明确给了 offset 才禁用扩窗 | ||
| 5382 | - 为 `pgvector_payload.json` 的 `segments` 补充 `query_index` | ||
| 5383 | - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 补充内部素材滑窗规则、推荐参数表与自动扩窗示例 | ||
| 5384 | |||
| 5385 | 验证结果: | ||
| 5386 | - 使用本地 30s `songA.wav` 验证: | ||
| 5387 | - `--default-query-duration 8 --query-stride 4` | ||
| 5388 | - `queries.json` 成功导出 `7` 条 query | ||
| 5389 | - offset 为 `0, 4, 8, 12, 16, 20, 22` | ||
| 5390 | - `query_index` 为 `0..6` | ||
| 5391 | - 使用本地 40s `songB.wav` + CSV 显式 `offset=12` 验证: | ||
| 5392 | - 仍只导出 `1` 条 query | ||
| 5393 | - 未被自动扩窗覆盖 | ||
| 5394 | - `manifest_bundle/*.json` 与 `pgvector_payload.json` 均已同步反映扩窗结果 | ||
| 5395 | |||
| 5396 | 结论: | ||
| 5397 | - 现在内部素材可以稳定支持两种模式: | ||
| 5398 | - **人工标 offset 的短视频片段**:保持单条 query | ||
| 5399 | - **只有整首音频、没有 query 起点的素材**:自动生成多窗口 query | ||
| 5400 | - 这让 `7/8/16/18` 这类 query 型素材可以更直接进入训练与评测流水线,同时保留对 `pgvector` 入库的可追踪性 | ... | ... |
| ... | @@ -530,11 +530,23 @@ query: | ... | @@ -530,11 +530,23 @@ query: |
| 530 | - `--offset-field` | 530 | - `--offset-field` |
| 531 | - `--default-query-duration` | 531 | - `--default-query-duration` |
| 532 | - `--default-query-offset` | 532 | - `--default-query-offset` |
| 533 | - `--query-stride` | ||
| 533 | 534 | ||
| 534 | 规则是: | 535 | 规则是: |
| 535 | - query 优先使用 CSV 自带的 `duration/offset` | 536 | - query 优先使用 CSV 自带的 `duration/offset` |
| 536 | - 没有时,优先使用音频探测时长 | 537 | - duration 没有时,回落到默认 query duration(例如 `8.0s`),而不是整首音频时长 |
| 537 | - offset 没有时,回落到默认值(通常 `0.0`) | 538 | - 音频总时长会单独保留为 `source_audio_duration`,供 query 滑窗展开使用 |
| 539 | - offset 有 CSV 显式值时,保持单条 query,不做自动扩窗 | ||
| 540 | - offset 没有显式值且设置了 `--query-stride` 时,会按滑窗方式自动展开成多条 query | ||
| 541 | - 若未设置 `--query-stride`,offset 没有显式值时回落到默认值(通常 `0.0`) | ||
| 542 | |||
| 543 | 推荐参数: | ||
| 544 | |||
| 545 | | 场景 | 推荐参数 | 说明 | | ||
| 546 | |---|---|---| | ||
| 547 | | 内部短视频片段已人工标好起点 | `--offset-field offset_sec` | 保留人工时间戳,避免自动扩窗覆盖人工标注 | | ||
| 548 | | 只有整首原始音频,没有 query 起点 | `--default-query-duration 8 --query-stride 4` | 自动产出 50% overlap 的多窗口 query | | ||
| 549 | | 只想先做最小可用集 | `--default-query-duration 8` | 每条 query 只导出 1 个片段,默认 offset=0 | | ||
| 538 | 550 | ||
| 539 | 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: | 551 | 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: |
| 540 | 552 | ||
| ... | @@ -542,6 +554,23 @@ query: | ... | @@ -542,6 +554,23 @@ query: |
| 542 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train | 554 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train |
| 543 | ``` | 555 | ``` |
| 544 | 556 | ||
| 557 | 自动扩窗示例: | ||
| 558 | |||
| 559 | ```bash | ||
| 560 | /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv \ | ||
| 561 | --audio-root data/internal_audio \ | ||
| 562 | --output-dir out/internal_asset_map \ | ||
| 563 | --default-query-duration 8 \ | ||
| 564 | --query-stride 4 \ | ||
| 565 | --emit-manifests \ | ||
| 566 | --emit-pgvector-json | ||
| 567 | ``` | ||
| 568 | |||
| 569 | 例如 30s 音频在 `8s` query、`4s` stride 下会导出 offset: | ||
| 570 | - `0, 4, 8, 12, 16, 20, 22` | ||
| 571 | |||
| 572 | 导出的 `queries.json` 与 `pgvector_payload.json` 中都会保留 `query_index`,方便后续追踪窗口来源。 | ||
| 573 | |||
| 545 | 输出会包含: | 574 | 输出会包含: |
| 546 | - `songs` | 575 | - `songs` |
| 547 | - `references` | 576 | - `references` | ... | ... |
-
Please register or sign in to post a comment