Fill internal query timing semantics before training on imported clips

Constraint: Internal short-video and demo assets need explicit duration/offset semantics before they can behave like real training or pgvector segment records
Rejected: Leave query offsets empty by default | Produces weaker provenance and less useful downstream segment metadata
Confidence: high
Scope-risk: narrow
Directive: Prefer source CSV timing when available, then fall back to inspected audio duration and conservative default offsets
Tested: Sample CSV run confirmed one query used CSV duration/offset (5.0/12.5) and another fell back to inspected duration/default offset (6.5/0.0), with pgvector segments matching
Not-tested: Complex multi-segment offset generation from long-form internal masters

Fill internal query timing semantics before training on imported clips
Constraint: Internal short-video and demo assets need explicit duration/offset semantics before they can behave like real training or pgvector segment records
Rejected: Leave query offsets empty by default | Produces weaker provenance and less useful downstream segment metadata
Confidence: high
Scope-risk: narrow
Directive: Prefer source CSV timing when available, then fall back to inspected audio duration and conservative default offsets
Tested: Sample CSV run confirmed one query used CSV duration/offset (5.0/12.5) and another fell back to inspected duration/default offset (6.5/0.0), with pgvector segments matching
Not-tested: Complex multi-segment offset generation from long-form internal masters
cnb.bofCdSsphPA
Commit 3e13c578 ... 3e13c578f5ae4c5f361925e7223776f665445ac7 authored 2026-06-02 15:45:28 +0800 by cnb.bofCdSsphPA
Showing 3 changed files with 73 additions and 7 deletions
acr-engine/scripts/internal_asset_type_mapper.py
docs/CHANGELOG.md
docs/training-data-and-pgvector-guide.md
--- a/acr-engine/scripts/internal_asset_type_mapper.py
View file @3e13c57
+++ b/acr-engine/scripts/internal_asset_type_mapper.py
View file @3e13c57
@@ -59,6 +59,18 @@ def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool
        return True, None


+def parse_optional_float(value: str | None) -> float | None:
+    if value is None:
+        return None
+    text = str(value).strip()
+    if text == "":
+        return None
+    try:
+        return float(text)
+    except ValueError:
+        return None
+
+
 def normalize_row(row: Dict[str, str], args) -> Dict:
    type_code = int(row[args.type_field])
    policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"})
@@ -67,6 +79,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
    audio_path = row.get(args.path_field)
    audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None)
    validation_status = "ok" if audio_exists else "missing_audio"
+    csv_duration = parse_optional_float(row.get(args.duration_field))
+    csv_offset = parse_optional_float(row.get(args.offset_field))
    record = {
        "asset_id": row.get(args.asset_id_field),
        "canonical_song_id": canonical_song_id,
@@ -79,6 +93,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
        "audio_path": audio_path,
        "audio_exists": audio_exists,
        "duration_sec": duration_sec,
+        "csv_duration_sec": csv_duration,
+        "csv_offset_sec": csv_offset,
        "validation_status": validation_status,
        "title": row.get(args.title_field),
        "artist": row.get(args.artist_field),
@@ -87,7 +103,14 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
    return record


-def to_manifest_record(record: Dict, bucket: str) -> Dict:
+def to_manifest_record(record: Dict, bucket: str, args) -> Dict:
+    inferred_query_duration = record["csv_duration_sec"]
+    if inferred_query_duration is None:
+        inferred_query_duration = record["duration_sec"] if record["duration_sec"] is not None else args.default_query_duration
+    inferred_query_offset = record["csv_offset_sec"]
+    if inferred_query_offset is None:
+        inferred_query_offset = args.default_query_offset
+
    base = {
        "song_id": record["canonical_song_id"],
        "version_id": record["version_id"],
@@ -108,13 +131,13 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict:
    return {
        **base,
        "type": record["recommended_train_type"],
-        "duration": record["duration_sec"] or 0.0,
-        "offset": None,
+        "duration": inferred_query_duration or 0.0,
+        "offset": inferred_query_offset,
        "segment_type": "external_query",
    }


-def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]:
+def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]:
    references, queries, metadata_only, excluded = [], [], [], []
    for record in rows:
        bucket = record["bucket"]
@@ -122,9 +145,9 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[
            bucket = include_conditionals_as if include_conditionals_as != "skip" else EXCLUDED

        if bucket == REFERENCE:
-            references.append(to_manifest_record(record, REFERENCE))
+            references.append(to_manifest_record(record, REFERENCE, args))
        elif bucket == QUERY:
-            queries.append(to_manifest_record(record, QUERY))
+            queries.append(to_manifest_record(record, QUERY, args))
        elif bucket == METADATA:
            metadata_only.append(record)
        else:
@@ -248,10 +271,14 @@ def main():
    parser.add_argument("--version-field", default="version_id")
    parser.add_argument("--type-field", default="type")
    parser.add_argument("--path-field", default="audio_path")
+    parser.add_argument("--duration-field", default="duration_sec")
+    parser.add_argument("--offset-field", default="offset_sec")
    parser.add_argument("--title-field", default="title")
    parser.add_argument("--artist-field", default="artist")
    parser.add_argument("--platform-field", default="source_platform")
    parser.add_argument("--audio-root", default=None)
+    parser.add_argument("--default-query-duration", type=float, default=8.0)
+    parser.add_argument("--default-query-offset", type=float, default=0.0)
    parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
    parser.add_argument("--emit-manifests", action="store_true")
    parser.add_argument("--emit-pgvector-json", action="store_true")
@@ -266,7 +293,7 @@ def main():
        for row in reader:
            rows.append(normalize_row(row, args))

-    references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as)
+    references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args)
    missing_audio = sum(1 for row in rows if not row["audio_exists"])
    trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL})

--- a/docs/CHANGELOG.md
View file @3e13c57
+++ b/docs/CHANGELOG.md
View file @3e13c57
@@ -2,6 +2,34 @@

 ## 2026-06-02

+### Stage: 为内部素材 query 自动补 duration / offset 规则
+
+完成项：
+- 扩展 `acr-engine/scripts/internal_asset_type_mapper.py`
+  - 新增 `--duration-field`
+  - 新增 `--offset-field`
+  - 新增 `--default-query-duration`
+  - 新增 `--default-query-offset`
+- 规则更新：
+  - query 优先使用 CSV 提供的 `duration/offset`
+  - 无 CSV duration 时，优先使用音频探测时长；探测不到时回落到 `--default-query-duration`
+  - 无 CSV offset 时，使用默认 offset
+- pgvector payload 同步使用生成后的 `duration/offset`
+
+验证结果：
+- 用 3 行样例 CSV 验证：
+  - `song_a` 短视频 query 使用 CSV 值：
+    - `duration = 5.0`
+    - `offset = 12.5`
+  - `song_c` demo query 使用自动回填：
+    - `duration = 6.5`
+    - `offset = 0.0`
+- `pgvector_payload.json` 中的 `segments` 也已同步带上正确 `offset_sec/duration_sec`
+
+结论：
+- 现在内部素材 query 已经不再只能输出“空 offset”
+- 对短视频片段、demo、后续回流片段的训练和入库更接近真实可用状态
+
 ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出

 完成项：
--- a/docs/training-data-and-pgvector-guide.md
View file @3e13c57
+++ b/docs/training-data-and-pgvector-guide.md
View file @3e13c57
@@ -525,6 +525,17 @@ query:
 - `duration`
 - `missing_audio` 汇总

+同时脚本现在还支持：
+- `--duration-field`
+- `--offset-field`
+- `--default-query-duration`
+- `--default-query-offset`
+
+规则是：
+- query 优先使用 CSV 自带的 `duration/offset`
+- 没有 CSV duration 时，优先使用音频探测时长；探测不到时回落到 `--default-query-duration`（默认 `8.0`）
+- 没有 CSV offset 时，回落到 `--default-query-offset`（默认 `0.0`）
+
 如果你们下一步就是要进 PostgreSQL / pgvector，可直接导出：

 ```bash