Commit 3e13c578 3e13c578f5ae4c5f361925e7223776f665445ac7 by cnb.bofCdSsphPA

Fill internal query timing semantics before training on imported clips

Constraint: Internal short-video and demo assets need explicit duration/offset semantics before they can behave like real training or pgvector segment records
Rejected: Leave query offsets empty by default | Produces weaker provenance and less useful downstream segment metadata
Confidence: high
Scope-risk: narrow
Directive: Prefer source CSV timing when available, then fall back to inspected audio duration and conservative default offsets
Tested: Sample CSV run confirmed one query used CSV duration/offset (5.0/12.5) and another fell back to inspected duration/default offset (6.5/0.0), with pgvector segments matching
Not-tested: Complex multi-segment offset generation from long-form internal masters
1 parent 58041e10
...@@ -59,6 +59,18 @@ def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool ...@@ -59,6 +59,18 @@ def inspect_audio(asset_path: str | None, audio_root: Path | None) -> Tuple[bool
59 return True, None 59 return True, None
60 60
61 61
62 def parse_optional_float(value: str | None) -> float | None:
63 if value is None:
64 return None
65 text = str(value).strip()
66 if text == "":
67 return None
68 try:
69 return float(text)
70 except ValueError:
71 return None
72
73
62 def normalize_row(row: Dict[str, str], args) -> Dict: 74 def normalize_row(row: Dict[str, str], args) -> Dict:
63 type_code = int(row[args.type_field]) 75 type_code = int(row[args.type_field])
64 policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"}) 76 policy = TYPE_POLICY.get(type_code, {"bucket": EXCLUDED, "audio_role": "unknown", "train_type": "none", "priority": "unknown"})
...@@ -67,6 +79,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: ...@@ -67,6 +79,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
67 audio_path = row.get(args.path_field) 79 audio_path = row.get(args.path_field)
68 audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None) 80 audio_exists, duration_sec = inspect_audio(audio_path, Path(args.audio_root) if args.audio_root else None)
69 validation_status = "ok" if audio_exists else "missing_audio" 81 validation_status = "ok" if audio_exists else "missing_audio"
82 csv_duration = parse_optional_float(row.get(args.duration_field))
83 csv_offset = parse_optional_float(row.get(args.offset_field))
70 record = { 84 record = {
71 "asset_id": row.get(args.asset_id_field), 85 "asset_id": row.get(args.asset_id_field),
72 "canonical_song_id": canonical_song_id, 86 "canonical_song_id": canonical_song_id,
...@@ -79,6 +93,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict: ...@@ -79,6 +93,8 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
79 "audio_path": audio_path, 93 "audio_path": audio_path,
80 "audio_exists": audio_exists, 94 "audio_exists": audio_exists,
81 "duration_sec": duration_sec, 95 "duration_sec": duration_sec,
96 "csv_duration_sec": csv_duration,
97 "csv_offset_sec": csv_offset,
82 "validation_status": validation_status, 98 "validation_status": validation_status,
83 "title": row.get(args.title_field), 99 "title": row.get(args.title_field),
84 "artist": row.get(args.artist_field), 100 "artist": row.get(args.artist_field),
...@@ -87,7 +103,14 @@ def normalize_row(row: Dict[str, str], args) -> Dict: ...@@ -87,7 +103,14 @@ def normalize_row(row: Dict[str, str], args) -> Dict:
87 return record 103 return record
88 104
89 105
90 def to_manifest_record(record: Dict, bucket: str) -> Dict: 106 def to_manifest_record(record: Dict, bucket: str, args) -> Dict:
107 inferred_query_duration = record["csv_duration_sec"]
108 if inferred_query_duration is None:
109 inferred_query_duration = record["duration_sec"] if record["duration_sec"] is not None else args.default_query_duration
110 inferred_query_offset = record["csv_offset_sec"]
111 if inferred_query_offset is None:
112 inferred_query_offset = args.default_query_offset
113
91 base = { 114 base = {
92 "song_id": record["canonical_song_id"], 115 "song_id": record["canonical_song_id"],
93 "version_id": record["version_id"], 116 "version_id": record["version_id"],
...@@ -108,13 +131,13 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict: ...@@ -108,13 +131,13 @@ def to_manifest_record(record: Dict, bucket: str) -> Dict:
108 return { 131 return {
109 **base, 132 **base,
110 "type": record["recommended_train_type"], 133 "type": record["recommended_train_type"],
111 "duration": record["duration_sec"] or 0.0, 134 "duration": inferred_query_duration or 0.0,
112 "offset": None, 135 "offset": inferred_query_offset,
113 "segment_type": "external_query", 136 "segment_type": "external_query",
114 } 137 }
115 138
116 139
117 def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]: 140 def route_records(rows: List[Dict], include_conditionals_as: str, args) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict]]:
118 references, queries, metadata_only, excluded = [], [], [], [] 141 references, queries, metadata_only, excluded = [], [], [], []
119 for record in rows: 142 for record in rows:
120 bucket = record["bucket"] 143 bucket = record["bucket"]
...@@ -122,9 +145,9 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[ ...@@ -122,9 +145,9 @@ def route_records(rows: List[Dict], include_conditionals_as: str) -> Tuple[List[
122 bucket = include_conditionals_as if include_conditionals_as != "skip" else EXCLUDED 145 bucket = include_conditionals_as if include_conditionals_as != "skip" else EXCLUDED
123 146
124 if bucket == REFERENCE: 147 if bucket == REFERENCE:
125 references.append(to_manifest_record(record, REFERENCE)) 148 references.append(to_manifest_record(record, REFERENCE, args))
126 elif bucket == QUERY: 149 elif bucket == QUERY:
127 queries.append(to_manifest_record(record, QUERY)) 150 queries.append(to_manifest_record(record, QUERY, args))
128 elif bucket == METADATA: 151 elif bucket == METADATA:
129 metadata_only.append(record) 152 metadata_only.append(record)
130 else: 153 else:
...@@ -248,10 +271,14 @@ def main(): ...@@ -248,10 +271,14 @@ def main():
248 parser.add_argument("--version-field", default="version_id") 271 parser.add_argument("--version-field", default="version_id")
249 parser.add_argument("--type-field", default="type") 272 parser.add_argument("--type-field", default="type")
250 parser.add_argument("--path-field", default="audio_path") 273 parser.add_argument("--path-field", default="audio_path")
274 parser.add_argument("--duration-field", default="duration_sec")
275 parser.add_argument("--offset-field", default="offset_sec")
251 parser.add_argument("--title-field", default="title") 276 parser.add_argument("--title-field", default="title")
252 parser.add_argument("--artist-field", default="artist") 277 parser.add_argument("--artist-field", default="artist")
253 parser.add_argument("--platform-field", default="source_platform") 278 parser.add_argument("--platform-field", default="source_platform")
254 parser.add_argument("--audio-root", default=None) 279 parser.add_argument("--audio-root", default=None)
280 parser.add_argument("--default-query-duration", type=float, default=8.0)
281 parser.add_argument("--default-query-offset", type=float, default=0.0)
255 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip") 282 parser.add_argument("--include-conditionals-as", choices=["skip", "query", "reference"], default="skip")
256 parser.add_argument("--emit-manifests", action="store_true") 283 parser.add_argument("--emit-manifests", action="store_true")
257 parser.add_argument("--emit-pgvector-json", action="store_true") 284 parser.add_argument("--emit-pgvector-json", action="store_true")
...@@ -266,7 +293,7 @@ def main(): ...@@ -266,7 +293,7 @@ def main():
266 for row in reader: 293 for row in reader:
267 rows.append(normalize_row(row, args)) 294 rows.append(normalize_row(row, args))
268 295
269 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as) 296 references, queries, metadata_only, excluded = route_records(rows, args.include_conditionals_as, args)
270 missing_audio = sum(1 for row in rows if not row["audio_exists"]) 297 missing_audio = sum(1 for row in rows if not row["audio_exists"])
271 trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL}) 298 trainable_audio_rows = sum(1 for row in rows if row["audio_exists"] and row["bucket"] in {REFERENCE, QUERY, CONDITIONAL})
272 299
......
...@@ -2,6 +2,34 @@ ...@@ -2,6 +2,34 @@
2 2
3 ## 2026-06-02 3 ## 2026-06-02
4 4
5 ### Stage: 为内部素材 query 自动补 duration / offset 规则
6
7 完成项:
8 - 扩展 `acr-engine/scripts/internal_asset_type_mapper.py`
9 - 新增 `--duration-field`
10 - 新增 `--offset-field`
11 - 新增 `--default-query-duration`
12 - 新增 `--default-query-offset`
13 - 规则更新:
14 - query 优先使用 CSV 提供的 `duration/offset`
15 - 无 CSV duration 时,优先使用音频探测时长;探测不到时回落到 `--default-query-duration`
16 - 无 CSV offset 时,使用默认 offset
17 - pgvector payload 同步使用生成后的 `duration/offset`
18
19 验证结果:
20 - 用 3 行样例 CSV 验证:
21 - `song_a` 短视频 query 使用 CSV 值:
22 - `duration = 5.0`
23 - `offset = 12.5`
24 - `song_c` demo query 使用自动回填:
25 - `duration = 6.5`
26 - `offset = 0.0`
27 - `pgvector_payload.json` 中的 `segments` 也已同步带上正确 `offset_sec/duration_sec`
28
29 结论:
30 - 现在内部素材 query 已经不再只能输出“空 offset”
31 - 对短视频片段、demo、后续回流片段的训练和入库更接近真实可用状态
32
5 ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出 33 ### Stage: 为内部素材映射脚本增加 pgvector-ready JSON 导出
6 34
7 完成项: 35 完成项:
......
...@@ -525,6 +525,17 @@ query: ...@@ -525,6 +525,17 @@ query:
525 - `duration` 525 - `duration`
526 - `missing_audio` 汇总 526 - `missing_audio` 汇总
527 527
528 同时脚本现在还支持:
529 - `--duration-field`
530 - `--offset-field`
531 - `--default-query-duration`
532 - `--default-query-offset`
533
534 规则是:
535 - query 优先使用 CSV 自带的 `duration/offset`
536 - 没有 CSV duration 时,优先使用音频探测时长;探测不到时回落到 `--default-query-duration`
537 - offset 没有时,回落到默认值(通常 `0.0`)
538
528 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出: 539 如果你们下一步就是要进 PostgreSQL / pgvector,可直接导出:
529 540
530 ```bash 541 ```bash
......