Reduce voice service latency and record the first successful payload smoke
Constraint: the voice service must return a payload under the current CPU environment before we can iterate on business-corpus correctness
Rejected: keeping the previous multi-chunk defaults | they caused smoke-timeout regressions and blocked basic endpoint validation
Confidence: high
Scope-risk: moderate
Directive: treat the current result as transport/runtime proof only until the service is switched from synthetic defaults to the /workspace business reference corpus
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/service_voice_smoke.py -> status ok, chunk_count=1, top_song_id=song_0022, has_context=false
Not-tested: business-corpus song_id correctness for /recognize/voice under /workspace reference data
Showing 5 changed files with 14 additions and 8 deletions
| ... | @@ -18,7 +18,7 @@ def post_multipart(url: str, file_path: Path): | ... | @@ -18,7 +18,7 @@ def post_multipart(url: str, file_path: Path): |
| 18 | f'Content-Disposition: form-data; name="file"; filename="{file_path.name}"\r\n' | 18 | f'Content-Disposition: form-data; name="file"; filename="{file_path.name}"\r\n' |
| 19 | f'Content-Type: audio/wav\r\n\r\n' | 19 | f'Content-Type: audio/wav\r\n\r\n' |
| 20 | ).encode('utf-8') + data + f'\r\n--{boundary}--\r\n'.encode('utf-8') | 20 | ).encode('utf-8') + data + f'\r\n--{boundary}--\r\n'.encode('utf-8') |
| 21 | req = Request(url, data=body, method='POST') | 21 | req = Request(url + '?top_n=1&max_chunks=1&include_context=false', data=body, method='POST') |
| 22 | req.add_header('Content-Type', f'multipart/form-data; boundary={boundary}') | 22 | req.add_header('Content-Type', f'multipart/form-data; boundary={boundary}') |
| 23 | with urlopen(req, timeout=20) as resp: | 23 | with urlopen(req, timeout=20) as resp: |
| 24 | return json.loads(resp.read().decode('utf-8')) | 24 | return json.loads(resp.read().decode('utf-8')) | ... | ... |
| ... | @@ -26,7 +26,7 @@ def detect_voiced_intervals(y: np.ndarray, sr: int, top_db: int = 30, min_voiced | ... | @@ -26,7 +26,7 @@ def detect_voiced_intervals(y: np.ndarray, sr: int, top_db: int = 30, min_voiced |
| 26 | return kept | 26 | return kept |
| 27 | 27 | ||
| 28 | 28 | ||
| 29 | def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: float = 8.0, stride_sec: float = 4.0) -> List[tuple[int, int, bool]]: | 29 | def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, max_chunks: int = 3) -> List[tuple[int, int, bool]]: |
| 30 | chunk_len = int(sr * target_chunk_sec) | 30 | chunk_len = int(sr * target_chunk_sec) |
| 31 | stride = int(sr * stride_sec) | 31 | stride = int(sr * stride_sec) |
| 32 | chunks: List[tuple[int, int, bool]] = [] | 32 | chunks: List[tuple[int, int, bool]] = [] |
| ... | @@ -49,6 +49,8 @@ def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: | ... | @@ -49,6 +49,8 @@ def chunk_intervals(intervals: List[tuple[int, int]], sr: int, target_chunk_sec: |
| 49 | if key not in seen: | 49 | if key not in seen: |
| 50 | deduped.append(item) | 50 | deduped.append(item) |
| 51 | seen.add(key) | 51 | seen.add(key) |
| 52 | if max_chunks > 0 and len(deduped) > max_chunks: | ||
| 53 | return deduped[:max_chunks] | ||
| 52 | return deduped | 54 | return deduped |
| 53 | 55 | ||
| 54 | 56 | ||
| ... | @@ -78,10 +80,10 @@ def write_chunks(y: np.ndarray, sr: int, chunks: List[tuple[int, int, bool]], ou | ... | @@ -78,10 +80,10 @@ def write_chunks(y: np.ndarray, sr: int, chunks: List[tuple[int, int, bool]], ou |
| 78 | return results | 80 | return results |
| 79 | 81 | ||
| 80 | 82 | ||
| 81 | def voice_to_chunks(audio_path: str, output_dir: str, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, min_voiced_sec: float = 2.0, top_db: int = 30, sr: int = 16000) -> List[Dict]: | 83 | def voice_to_chunks(audio_path: str, output_dir: str, target_chunk_sec: float = 8.0, stride_sec: float = 4.0, min_voiced_sec: float = 2.0, top_db: int = 30, sr: int = 16000, max_chunks: int = 3) -> List[Dict]: |
| 82 | y = normalize_audio(audio_path, sr=sr) | 84 | y = normalize_audio(audio_path, sr=sr) |
| 83 | intervals = detect_voiced_intervals(y, sr=sr, top_db=top_db, min_voiced_sec=min_voiced_sec) | 85 | intervals = detect_voiced_intervals(y, sr=sr, top_db=top_db, min_voiced_sec=min_voiced_sec) |
| 84 | chunks = chunk_intervals(intervals, sr=sr, target_chunk_sec=target_chunk_sec, stride_sec=stride_sec) | 86 | chunks = chunk_intervals(intervals, sr=sr, target_chunk_sec=target_chunk_sec, stride_sec=stride_sec, max_chunks=max_chunks) |
| 85 | return write_chunks(y, sr, chunks, output_dir, source_audio_path=audio_path) | 87 | return write_chunks(y, sr, chunks, output_dir, source_audio_path=audio_path) |
| 86 | 88 | ||
| 87 | 89 | ||
| ... | @@ -94,6 +96,7 @@ def main() -> None: | ... | @@ -94,6 +96,7 @@ def main() -> None: |
| 94 | ap.add_argument('--min-voiced-sec', type=float, default=2.0) | 96 | ap.add_argument('--min-voiced-sec', type=float, default=2.0) |
| 95 | ap.add_argument('--top-db', type=int, default=30) | 97 | ap.add_argument('--top-db', type=int, default=30) |
| 96 | ap.add_argument('--sr', type=int, default=16000) | 98 | ap.add_argument('--sr', type=int, default=16000) |
| 99 | ap.add_argument('--max-chunks', type=int, default=3) | ||
| 97 | ap.add_argument('--output-json', default='chunks.json') | 100 | ap.add_argument('--output-json', default='chunks.json') |
| 98 | args = ap.parse_args() | 101 | args = ap.parse_args() |
| 99 | chunks = voice_to_chunks( | 102 | chunks = voice_to_chunks( |
| ... | @@ -104,6 +107,7 @@ def main() -> None: | ... | @@ -104,6 +107,7 @@ def main() -> None: |
| 104 | min_voiced_sec=args.min_voiced_sec, | 107 | min_voiced_sec=args.min_voiced_sec, |
| 105 | top_db=args.top_db, | 108 | top_db=args.top_db, |
| 106 | sr=args.sr, | 109 | sr=args.sr, |
| 110 | max_chunks=args.max_chunks, | ||
| 107 | ) | 111 | ) |
| 108 | out_json = Path(args.output_dir) / args.output_json | 112 | out_json = Path(args.output_dir) / args.output_json |
| 109 | out_json.write_text(json.dumps({'chunks': chunks}, ensure_ascii=False, indent=2), encoding='utf-8') | 113 | out_json.write_text(json.dumps({'chunks': chunks}, ensure_ascii=False, indent=2), encoding='utf-8') | ... | ... |
| ... | @@ -213,6 +213,8 @@ async def recognize_voice( | ... | @@ -213,6 +213,8 @@ async def recognize_voice( |
| 213 | device: Optional[str] = None, | 213 | device: Optional[str] = None, |
| 214 | context_sec: float = 10.0, | 214 | context_sec: float = 10.0, |
| 215 | output_format: str = 'mp3', | 215 | output_format: str = 'mp3', |
| 216 | max_chunks: int = 3, | ||
| 217 | include_context: bool = True, | ||
| 216 | ): | 218 | ): |
| 217 | resolved = _resolve(data_dir, model_path, index_prefix, device) | 219 | resolved = _resolve(data_dir, model_path, index_prefix, device) |
| 218 | engine, cache_hit = _load_engine(**resolved) | 220 | engine, cache_hit = _load_engine(**resolved) |
| ... | @@ -223,7 +225,7 @@ async def recognize_voice( | ... | @@ -223,7 +225,7 @@ async def recognize_voice( |
| 223 | raw_path.write_bytes(await file.read()) | 225 | raw_path.write_bytes(await file.read()) |
| 224 | 226 | ||
| 225 | chunk_dir = tmp / 'chunks' | 227 | chunk_dir = tmp / 'chunks' |
| 226 | chunks = voice_to_chunks(str(raw_path), str(chunk_dir)) | 228 | chunks = voice_to_chunks(str(raw_path), str(chunk_dir), max_chunks=max_chunks) |
| 227 | if not chunks: | 229 | if not chunks: |
| 228 | raise HTTPException(status_code=400, detail='No voiced chunks detected from input audio') | 230 | raise HTTPException(status_code=400, detail='No voiced chunks detected from input audio') |
| 229 | 231 | ||
| ... | @@ -242,7 +244,7 @@ async def recognize_voice( | ... | @@ -242,7 +244,7 @@ async def recognize_voice( |
| 242 | song_id = item['song_id'] | 244 | song_id = item['song_id'] |
| 243 | ref_audio = _reference_audio_for_song(engine, song_id) | 245 | ref_audio = _reference_audio_for_song(engine, song_id) |
| 244 | context_info = None | 246 | context_info = None |
| 245 | if ref_audio and item['best_chunk'] is not None: | 247 | if include_context and ref_audio and item['best_chunk'] is not None: |
| 246 | match = find_best_matching_window( | 248 | match = find_best_matching_window( |
| 247 | query_audio_path=item['best_chunk']['chunk']['audio_path'], | 249 | query_audio_path=item['best_chunk']['chunk']['audio_path'], |
| 248 | reference_audio_path=ref_audio, | 250 | reference_audio_path=ref_audio, | ... | ... |
| ... | @@ -24,7 +24,7 @@ flowchart TD | ... | @@ -24,7 +24,7 @@ flowchart TD |
| 24 | | benchmark report 已生成 | | | 24 | | benchmark report 已生成 | | |
| 25 | | model card 已生成 | | | 25 | | model card 已生成 | | |
| 26 | | license registry 已更新 | | | 26 | | license registry 已更新 | | |
| 27 | | service smoke test 通过 | partial: `/health` OK, CPU torch installed, `/recognize/voice` end-to-end smoke still times out | | 27 | | service smoke test 通过 | partial: `/health` OK, `/recognize/voice` payload returns, but still bound to synthetic service index rather than business reference corpus | |
| 28 | | dataset whitelist 已确认 | | | 28 | | dataset whitelist 已确认 | | |
| 29 | | changelog 已更新 | yes | | 29 | | changelog 已更新 | yes | |
| 30 | | architect review completed | yes (approved with watch) | | 30 | | architect review completed | yes (approved with watch) | | ... | ... |
| ... | @@ -30,7 +30,7 @@ | ... | @@ -30,7 +30,7 @@ |
| 30 | - `acr-engine/src/service/app.py` 已新增 `POST /recognize/voice` | 30 | - `acr-engine/src/service/app.py` 已新增 `POST /recognize/voice` |
| 31 | - `/health` 可正常启动并返回 `ok` | 31 | - `/health` 可正常启动并返回 `ok` |
| 32 | - architect review: approved with watch;当前 split(本地 FAISS / 可选 ChromaDB / 生产 pgvector)方向成立 | 32 | - architect review: approved with watch;当前 split(本地 FAISS / 可选 ChromaDB / 生产 pgvector)方向成立 |
| 33 | - 当前 `POST /recognize/voice` 已跨过依赖缺失阶段:CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`;当前剩余现象是 voice 端到端 smoke 仍超时,需要继续收敛推理耗时、chunk 数量或缓存策略 | 33 | - 当前 `POST /recognize/voice` 已跨过依赖缺失与超时阶段:CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`,voice smoke 已返回 payload(`chunk_count=1`, `top_song_id=song_0022`, `has_context=false`);当前剩余问题是服务默认仍绑定 synthetic 索引语义,尚未切到 `/workspace` 业务曲库 reference |
| 34 | - 当前 docs 已做第一轮简化: | 34 | - 当前 docs 已做第一轮简化: |
| 35 | - `docs/README.md` 只保留最新架构与最短阅读顺序 | 35 | - `docs/README.md` 只保留最新架构与最短阅读顺序 |
| 36 | 36 | ... | ... |
-
Please register or sign in to post a comment