Reduce voice service latency and record the first successful payload smoke

Constraint: the voice service must return a payload under the current CPU environment before we can iterate on business-corpus correctness Rejected: keeping the previous multi-chunk defaults | they caused smoke-timeout regressions and blocked basic endpoint validation Confidence: high Scope-risk: moderate Directive: treat the current result as transport/runtime proof only until the service is switched from synthetic defaults to the /workspace business reference corpus Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/service_voice_smoke.py -> status ok, chunk_count=1, top_song_id=song_0022, has_context=false Not-tested: business-corpus song_id correctness for /recognize/voice under /workspace reference data

Reduce voice service latency and record the first successful payload smoke
Constraint: the voice service must return a payload under the current CPU environment before we can iterate on business-corpus correctness Rejected: keeping the previous multi-chunk defaults | they caused smoke-timeout regressions and blocked basic endpoint validation Confidence: high Scope-risk: moderate Directive: treat the current result as transport/runtime proof only until the service is switched from synthetic defaults to the /workspace business reference corpus Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/service_voice_smoke.py -> status ok, chunk_count=1, top_song_id=song_0022, has_context=false Not-tested: business-corpus song_id correctness for /recognize/voice under /workspace reference data
cnb.bofCdSsphPA
Commit 86c3f935 ... 86c3f935796b77989ffffa17e7eb1912d921e3b1 authored 2026-06-03 18:04:18 +0800 by cnb.bofCdSsphPA
Showing 5 changed files with 14 additions and 8 deletions
acr-engine/scripts/service_voice_smoke.py
acr-engine/src/data/voice_chunker.py
acr-engine/src/service/app.py
docs/release-checklist.md
docs/session-handoff.md
--- a/acr-engine/scripts/service_voice_smoke.py
View file @86c3f93
+++ b/acr-engine/scripts/service_voice_smoke.py
View file @86c3f93
@@ -18,7 +18,7 @@ def post_multipart(url: str, file_path: Path):
        f'Content-Disposition: form-data; name="file"; filename="{file_path.name}"\r\n'
        f'Content-Type: audio/wav\r\n\r\n'
    ).encode('utf-8') + data + f'\r\n--{boundary}--\r\n'.encode('utf-8')
-    req = Request(url, data=body, method='POST')
+    req = Request(url + '?top_n=1&max_chunks=1&include_context=false', data=body, method='POST')
    req.add_header('Content-Type', f'multipart/form-data; boundary={boundary}')
    with urlopen(req, timeout=20) as resp:
        return json.loads(resp.read().decode('utf-8'))
--- a/acr-engine/src/data/voice_chunker.py
View file @86c3f93
+++ b/acr-engine/src/data/voice_chunker.py
View file @86c3f93
@@ -26,7 +26,7 @@ def detect_voiced_intervals(y: np.ndarray, sr: int, top_db: int = 30, min_voiced
    return kept


-def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: float = 8.0, stride_sec: float = 4.0) -> List[tuple[int, int, bool]]:
+def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, max_chunks: int = 3) -> List[tuple[int, int, bool]]:
    chunk_len = int(sr * target_chunk_sec)
    stride = int(sr * stride_sec)
    chunks: List[tuple[int, int, bool]] = []
@@ -49,6 +49,8 @@ def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec:
        if key not in seen:
            deduped.append(item)
            seen.add(key)
+    if max_chunks > 0 and len(deduped) > max_chunks:
+        return deduped[:max_chunks]
    return deduped


@@ -78,10 +80,10 @@ def write_chunks(y: np.ndarray, sr: int, chunks: List[tuple[int, int, bool]], ou
    return results


-def voice_to_chunks(audio_path: str, output_dir: str, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, min_voiced_sec: float = 2.0, top_db: int = 30, sr: int = 16000) -> List[Dict]:
+def voice_to_chunks(audio_path: str, output_dir: str, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, min_voiced_sec: float = 2.0, top_db: int = 30, sr: int = 16000, max_chunks: int = 3) -> List[Dict]:
    y = normalize_audio(audio_path, sr=sr)
    intervals = detect_voiced_intervals(y, sr=sr, top_db=top_db, min_voiced_sec=min_voiced_sec)
-    chunks = chunk_intervals(intervals, sr=sr, target_chunk_sec=target_chunk_sec, stride_sec=stride_sec)
+    chunks = chunk_intervals(intervals, sr=sr, target_chunk_sec=target_chunk_sec, stride_sec=stride_sec, max_chunks=max_chunks)
    return write_chunks(y, sr, chunks, output_dir, source_audio_path=audio_path)


@@ -94,6 +96,7 @@ def main() -> None:
    ap.add_argument('--min-voiced-sec', type=float, default=2.0)
    ap.add_argument('--top-db', type=int, default=30)
    ap.add_argument('--sr', type=int, default=16000)
+    ap.add_argument('--max-chunks', type=int, default=3)
    ap.add_argument('--output-json', default='chunks.json')
    args = ap.parse_args()
    chunks = voice_to_chunks(
@@ -104,6 +107,7 @@ def main() -> None:
        min_voiced_sec=args.min_voiced_sec,
        top_db=args.top_db,
        sr=args.sr,
+        max_chunks=args.max_chunks,
    )
    out_json = Path(args.output_dir) / args.output_json
    out_json.write_text(json.dumps({'chunks': chunks}, ensure_ascii=False, indent=2), encoding='utf-8')
--- a/acr-engine/src/service/app.py
View file @86c3f93
+++ b/acr-engine/src/service/app.py
View file @86c3f93
@@ -213,6 +213,8 @@ async def recognize_voice(
    device: Optional[str] = None,
    context_sec: float = 10.0,
    output_format: str = 'mp3',
+    max_chunks: int = 3,
+    include_context: bool = True,
 ):
    resolved = _resolve(data_dir, model_path, index_prefix, device)
    engine, cache_hit = _load_engine(**resolved)
@@ -223,7 +225,7 @@ async def recognize_voice(
        raw_path.write_bytes(await file.read())

        chunk_dir = tmp / 'chunks'
-        chunks = voice_to_chunks(str(raw_path), str(chunk_dir))
+        chunks = voice_to_chunks(str(raw_path), str(chunk_dir), max_chunks=max_chunks)
        if not chunks:
            raise HTTPException(status_code=400, detail='No voiced chunks detected from input audio')

@@ -242,7 +244,7 @@ async def recognize_voice(
            song_id = item['song_id']
            ref_audio = _reference_audio_for_song(engine, song_id)
            context_info = None
-            if ref_audio and item['best_chunk'] is not None:
+            if include_context and ref_audio and item['best_chunk'] is not None:
                match = find_best_matching_window(
                    query_audio_path=item['best_chunk']['chunk']['audio_path'],
                    reference_audio_path=ref_audio,
--- a/docs/release-checklist.md
View file @86c3f93
+++ b/docs/release-checklist.md
View file @86c3f93
@@ -24,7 +24,7 @@ flowchart TD
 | benchmark report 已生成 |  |
 | model card 已生成 |  |
 | license registry 已更新 |  |
-| service smoke test 通过 | partial: `/health` OK, CPU torch installed, `/recognize/voice` end-to-end smoke still times out |
+| service smoke test 通过 | partial: `/health` OK, `/recognize/voice` payload returns, but still bound to synthetic service index rather than business reference corpus |
 | dataset whitelist 已确认 |  |
 | changelog 已更新 | yes |
 | architect review completed | yes (approved with watch) |
--- a/docs/session-handoff.md
View file @86c3f93
+++ b/docs/session-handoff.md
View file @86c3f93
@@ -30,7 +30,7 @@
  - `acr-engine/src/service/app.py` 已新增 `POST /recognize/voice`
  - `/health` 可正常启动并返回 `ok`
  - architect review: approved with watch；当前 split（本地 FAISS / 可选 ChromaDB / 生产 pgvector）方向成立
-  - 当前 `POST /recognize/voice` 已跨过依赖缺失阶段：CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`；当前剩余现象是 voice 端到端 smoke 仍超时，需要继续收敛推理耗时、chunk 数量或缓存策略
+  - 当前 `POST /recognize/voice` 已跨过依赖缺失与超时阶段：CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`，voice smoke 已返回 payload（`chunk_count=1`, `top_song_id=song_0022`, `has_context=false`）；当前剩余问题是服务默认仍绑定 synthetic 索引语义，尚未切到 `/workspace` 业务曲库 reference
 - 当前 docs 已做第一轮简化：
  - `docs/README.md` 只保留最新架构与最短阅读顺序