Commit d61ee980 d61ee9806973e69d3510cd953ef130deeb51bb06 by cnb.bofCdSsphPA

Preserve internal query window semantics for trainable asset exports

Constraint: Internal assets must support both manually labeled clips and whole-track auto-window generation without breaking pgvector export
Rejected: Treat missing query duration as full audio duration | prevents multi-window query expansion for long source audio
Confidence: high
Scope-risk: narrow
Directive: Keep explicit CSV offset authoritative; only auto-expand when offset is absent and query_stride is set
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/internal_asset_type_mapper.py; local 30s/40s WAV fixture export with manifest + pgvector verification
Not-tested: End-to-end retraining with newly expanded internal manifests
1 parent 3e13c578
...@@ -10,6 +10,7 @@ from __future__ import annotations ...@@ -10,6 +10,7 @@ from __future__ import annotations
10 import argparse 10 import argparse
11 import csv 11 import csv
12 import json 12 import json
13 import math
13 import random 14 import random
14 from pathlib import Path 15 from pathlib import Path
15 from typing import Dict, List, Tuple 16 from typing import Dict, List, Tuple
...@@ -106,7 +107,7 @@ def normalize_row(row: Dict[str, str], args) -> Dict: ...@@ -106,7 +107,7 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
106 def to_manifest_record(record: Dict, bucket: str, args) -> Dict: 107 def to_manifest_record(record: Dict, bucket: str, args) -> Dict:
107 inferred_query_duration = record["csv_duration_sec"] 108 inferred_query_duration = record["csv_duration_sec"]
108 if inferred_query_duration is None: 109 if inferred_query_duration is None:
109 inferred_query_duration = record["duration_sec"] if record["duration_sec"] is not None else args.default_query_duration 110 inferred_query_duration = args.default_query_duration
110 inferred_query_offset = record["csv_offset_sec"] 111 inferred_query_offset = record["csv_offset_sec"]
111 if inferred_query_offset is None: 112 if inferred_query_offset is None:
112 inferred_query_offset = args.default_query_offset 113 inferred_query_offset = args.default_query_offset
...@@ -133,7 +134,9 @@ def to_manifest_record(record: Dict, bucket: str, args) -> Dict: ...@@ -133,7 +134,9 @@ def to_manifest_record(record: Dict, bucket: str, args) -> Dict:
133 "type": record["recommended_train_type"], 134 "type": record["recommended_train_type"],
134 "duration": inferred_query_duration or 0.0, 135 "duration": inferred_query_duration or 0.0,
135 "offset": inferred_query_offset, 136 "offset": inferred_query_offset,
137 "offset_is_explicit": record["csv_offset_sec"] is not None,
136 "segment_type": "external_query", 138 "segment_type": "external_query",
139 "source_audio_duration": record["duration_sec"],
137 } 140 }
138 141
139 142
...@@ -155,6 +158,52 @@ def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple ...@@ -155,6 +158,52 @@ def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple
155 return references, queries, metadata_only, excluded 158 return references, queries, metadata_only, excluded
156 159
157 160
def _query_window_offsets(row: Dict, query_stride: float) -> List:
    """Return the start offsets, one per output window, for a single query row.

    Args:
        row: Query manifest record. Reads ``duration``, ``offset``,
            ``offset_is_explicit`` and ``source_audio_duration``.
        query_stride: Positive step between consecutive window starts.

    Returns:
        ``[row["offset"]]`` when the offset is flagged explicit, ``[0.0]`` when
        there is no room to slide a window, otherwise multiples of
        ``query_stride`` plus a final tail offset so the end of the audio is
        covered. All generated offsets are rounded to 3 decimals.
    """
    duration = float(row.get("duration", 0.0) or 0.0)

    # Without a usable source length, assume the audio is exactly one window
    # long, which disables expansion below.
    audio_duration = duration
    source_duration = row.get("source_audio_duration")
    if source_duration is not None:
        try:
            audio_duration = float(source_duration)
        except (TypeError, ValueError):
            pass  # unparseable value: keep the single-window assumption

    # An explicit offset is authoritative and must never be overwritten.
    offset = row.get("offset")
    if row.get("offset_is_explicit") and offset not in (None, ""):
        return [offset]

    # NaN/inf spans cannot be windowed (math.floor would raise on inf), so
    # they are treated like "no room to slide".
    max_offset = audio_duration - duration
    if duration <= 0 or audio_duration <= duration or not math.isfinite(max_offset):
        return [0.0]

    n_steps = int(math.floor(max_offset / query_stride))
    offsets = [round(i * query_stride, 3) for i in range(n_steps + 1)]

    # Add a tail window flush with the end of the audio when the stride grid
    # does not land on it exactly.
    tail_offset = round(max_offset, 3)
    if tail_offset > offsets[-1]:
        offsets.append(tail_offset)
    return offsets


def expand_query_records(queries: List[Dict], query_stride: float | None) -> List[Dict]:
    """Expand query rows into sliding-window queries.

    Args:
        queries: Query manifest records. They are never mutated; every output
            row is a shallow copy.
        query_stride: Step between window starts. ``None``, zero or a negative
            value disables expansion.

    Returns:
        ``queries`` itself (same list object, no ``query_index`` added) when
        expansion is disabled. Otherwise a new list in which each input row is
        replaced by one copy per window, each carrying its window ``offset``
        and a per-row ``query_index`` starting at 0. Rows with an explicit
        offset yield a single copy with their offset untouched.
    """
    if not query_stride or query_stride <= 0:
        return queries

    expanded: List[Dict] = []
    for row in queries:
        for idx, offset in enumerate(_query_window_offsets(row, query_stride)):
            clone = dict(row)
            clone["offset"] = offset
            clone["query_index"] = idx
            expanded.append(clone)
    return expanded
205
206
158 def build_manifest_bundle( 207 def build_manifest_bundle(
159 references: List[Dict], 208 references: List[Dict],
160 queries: List[Dict], 209 queries: List[Dict],
...@@ -227,6 +276,7 @@ def build_pgvector_payload( ...@@ -227,6 +276,7 @@ def build_pgvector_payload(
227 "asset_type_code": row.get("asset_type_code"), 276 "asset_type_code": row.get("asset_type_code"),
228 "audio_exists": row.get("audio_exists"), 277 "audio_exists": row.get("audio_exists"),
229 "validation_status": row.get("validation_status"), 278 "validation_status": row.get("validation_status"),
279 "query_index": row.get("query_index"),
230 }) 280 })
231 281
232 for row in queries: 282 for row in queries:
...@@ -252,6 +302,7 @@ def build_pgvector_payload( ...@@ -252,6 +302,7 @@ def build_pgvector_payload(
252 "asset_type_code": row.get("asset_type_code"), 302 "asset_type_code": row.get("asset_type_code"),
253 "audio_exists": row.get("audio_exists"), 303 "audio_exists": row.get("audio_exists"),
254 "validation_status": row.get("validation_status"), 304 "validation_status": row.get("validation_status"),
305 "query_index": row.get("query_index"),
255 }) 306 })
256 307
257 return { 308 return {
...@@ -279,6 +330,7 @@ def main(): ...@@ -279,6 +330,7 @@ def main():
279 parser.add_argument("--audio-root", default=None) 330 parser.add_argument("--audio-root", default=None)
280 parser.add_argument("--default-query-duration", type=float, default=8.0) 331 parser.add_argument("--default-query-duration", type=float, default=8.0)
281 parser.add_argument("--default-query-offset", type=float, default=0.0) 332 parser.add_argument("--default-query-offset", type=float, default=0.0)
333 parser.add_argument("--query-stride", type=float, default=None)
282 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") 334 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
283 parser.add_argument("--emit-manifests", action="store_true") 335 parser.add_argument("--emit-manifests", action="store_true")
284 parser.add_argument("--emit-pgvector-json", action="store_true") 336 parser.add_argument("--emit-pgvector-json", action="store_true")
...@@ -294,6 +346,7 @@ def main(): ...@@ -294,6 +346,7 @@ def main():
294 rows.append(normalize_row(row, args)) 346 rows.append(normalize_row(row, args))
295 347
296 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args) 348 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args)
349 queries = expand_query_records(queries, args.query_stride)
297 missing_audio = sum(1 for row in rows if not row["audio_exists"]) 350 missing_audio = sum(1 for row in rows if not row["audio_exists"])
298 trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) 351 trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL})
299 352
...@@ -308,6 +361,7 @@ def main(): ...@@ -308,6 +361,7 @@ def main():
308 "missing_audio": missing_audio, 361 "missing_audio": missing_audio,
309 "trainable_audio_rows": trainable_audio_rows, 362 "trainable_audio_rows": trainable_audio_rows,
310 "include_conditionals_as": args.include_conditionals_as, 363 "include_conditionals_as": args.include_conditionals_as,
364 "query_stride": args.query_stride,
311 } 365 }
312 outputs = { 366 outputs = {
313 "references.json": references, 367 "references.json": references,
......
...@@ -5372,3 +5372,29 @@ ...@@ -5372,3 +5372,29 @@
5372 结论: 5372 结论:
5373 - type-aware weighting 比 naive oversampling 更有效 5373 - type-aware weighting 比 naive oversampling 更有效
5374 - 下一轮应专门针对 confused 类设计更强的 negative mining / confusion-aware 信号 5374 - 下一轮应专门针对 confused 类设计更强的 negative mining / confusion-aware 信号
5375
5376 ### Stage: internal asset query stride fix + type policy hardening
5377
5378 完成项:
5379 - 修复 `acr-engine/scripts/internal_asset_type_mapper.py` 中内部素材 query 的自动扩窗逻辑
5380 - 新增 `source_audio_duration` 透传,使长音频可基于真实总时长按 `--query-stride` 展开
5381 - 修复 “默认 offset=0 被误判为显式 offset” 的问题,确保只有 CSV 明确给了 offset 才禁用扩窗
5382 - 为 `pgvector_payload.json` 的 `segments` 补充 `query_index`
5383 - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充内部素材滑窗规则、推荐参数表与自动扩窗示例
5384
5385 验证结果:
5386 - 使用本地 30s `songA.wav` 验证:
5387 - `--default-query-duration 8 --query-stride 4`
5388 - `queries.json` 成功导出 `7` 条 query
5389 - offset 为 `0, 4, 8, 12, 16, 20, 22`
5390 - `query_index` 为 `0..6`
5391 - 使用本地 40s `songB.wav` + CSV 显式 `offset=12` 验证:
5392 - 仍只导出 `1` 条 query
5393 - 未被自动扩窗覆盖
5394 - `manifest_bundle/*.json` 与 `pgvector_payload.json` 均已同步反映扩窗结果
5395
5396 结论:
5397 - 现在内部素材可以稳定支持两种模式:
5398 - **人工标 offset 的短视频片段**:保持单条 query
5399 - **只有整首音频、没有 query 起点的素材**:自动生成多窗口 query
5400 - 这让 `7/8/16/18` 这类 query 型素材可以更直接进入训练与评测流水线,同时保留对 `pgvector` 入库的可追踪性
......
...@@ -530,11 +530,23 @@ query: ...@@ -530,11 +530,23 @@ query:
530 - `--offset-field` 530 - `--offset-field`
531 - `--default-query-duration` 531 - `--default-query-duration`
532 - `--default-query-offset` 532 - `--default-query-offset`
533 - `--query-stride`
533 534
534 规则是: 535 规则是:
535 - query 优先使用 CSV 自带的 `duration/offset` 536 - query 优先使用 CSV 自带的 `duration/offset`
536 - 没有时,优先使用音频探测时长 537 - duration 没有时,回落到默认 query duration(例如 `8.0s`),而不是整首音频时长
537 - offset 没有时,回落到默认值(通常 `0.0`) 538 - 音频总时长会单独保留为 `source_audio_duration`,供 query 滑窗展开使用
539 - offset 有 CSV 显式值时,保持单条 query,不做自动扩窗
540 - offset 没有显式值且设置了 `--query-stride` 时,会按滑窗方式自动展开成多条 query
541 - 若未设置 `--query-stride`,offset 没有显式值时回落到默认值(通常 `0.0`)
542
543 推荐参数:
544
545 | 场景 | 推荐参数 | 说明 |
546 |---|---|---|
547 | 内部短视频片段已人工标好起点 | `--offset-field offset_sec` | 保留人工时间戳,避免自动扩窗覆盖人工标注 |
548 | 只有整首原始音频,没有 query 起点 | `--default-query-duration 8 --query-stride 4` | 自动产出 50% overlap 的多窗口 query |
549 | 只想先做最小可用集 | `--default-query-duration 8` | 每条 query 只导出 1 个片段,默认 offset=0 |
538 550
539 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: 551 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出:
540 552
...@@ -542,6 +554,23 @@ query: ...@@ -542,6 +554,23 @@ query:
542 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train 554 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv --audio-root data/internal_audio --output-dir out/internal_asset_map --emit-pgvector-json --pgvector-split train
543 ``` 555 ```
544 556
557 自动扩窗示例:
558
559 ```bash
560 /usr/local/miniconda3/bin/python acr-engine/scripts/internal_asset_type_mapper.py assets.csv \
561 --audio-root data/internal_audio \
562 --output-dir out/internal_asset_map \
563 --default-query-duration 8 \
564 --query-stride 4 \
565 --emit-manifests \
566 --emit-pgvector-json
567 ```
568
569 例如 30s 音频在 `8s` query、`4s` stride 下会导出 offset:
570 - `0, 4, 8, 12, 16, 20, 22`
571
572 导出的 `queries.json` 和 `pgvector_payload.json` 中都会保留 `query_index`,方便后续追踪窗口来源。
573
545 输出会包含: 574 输出会包含:
546 - `songs` 575 - `songs`
547 - `references` 576 - `references`
......