Commit 356053b7 356053b724a8ac7522a9fe46509121ab00632715 by cnb.bofCdSsphPA

Route voice recognition through the workspace music20 corpus

Constraint: external voice uploads now need a business-sample-backed path before any pgvector production cutover, while still staying lightweight enough for CPU smoke tests
Rejected: waiting for full pgvector service integration before proving a business-corpus path | would leave the external voice interface unvalidated against real sample references
Confidence: medium
Scope-risk: moderate
Directive: treat workspace_music20 as a proving lane only; validate business top1 correctness before promoting its defaults or claiming production readiness
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/service_voice_smoke.py -> status ok, corpus=workspace_music20, chunk_count=1, top_song_id=109, has_context=true
Not-tested: pgvector-backed /recognize/voice production retrieval path
1 parent 86c3f935
...@@ -16,18 +16,16 @@ def post_multipart(url: str, file_path: Path): ...@@ -16,18 +16,16 @@ def post_multipart(url: str, file_path: Path):
16 body = ( 16 body = (
17 f'--{boundary}\r\n' 17 f'--{boundary}\r\n'
18 f'Content-Disposition: form-data; name="file"; filename="{file_path.name}"\r\n' 18 f'Content-Disposition: form-data; name="file"; filename="{file_path.name}"\r\n'
19 f'Content-Type: audio/wav\r\n\r\n' 19 f'Content-Type: audio/mpeg\r\n\r\n'
20 ).encode('utf-8') + data + f'\r\n--{boundary}--\r\n'.encode('utf-8') 20 ).encode('utf-8') + data + f'\r\n--{boundary}--\r\n'.encode('utf-8')
21 req = Request(url + '?top_n=1&max_chunks=1&include_context=false', data=body, method='POST') 21 req = Request(url + '?top_n=1&max_chunks=1&include_context=true&corpus=workspace_music20', data=body, method='POST')
22 req.add_header('Content-Type', f'multipart/form-data; boundary={boundary}') 22 req.add_header('Content-Type', f'multipart/form-data; boundary={boundary}')
23 with urlopen(req, timeout=20) as resp: 23 with urlopen(req, timeout=60) as resp:
24 return json.loads(resp.read().decode('utf-8')) 24 return json.loads(resp.read().decode('utf-8'))
25 25
26 26
27 def main(): 27 def main():
28 cmd = [ 28 cmd = ['/usr/local/miniconda3/bin/python', '-m', 'uvicorn', 'src.service.app:app', '--host', '127.0.0.1', '--port', '8000']
29 '/usr/local/miniconda3/bin/python', '-m', 'uvicorn', 'src.service.app:app', '--host', '127.0.0.1', '--port', '8000'
30 ]
31 proc = subprocess.Popen(cmd, cwd='/root/vprecog/acr-engine', stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) 29 proc = subprocess.Popen(cmd, cwd='/root/vprecog/acr-engine', stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
32 query = Path('/workspace/downloads/111/type_7/75cd601b-7604-4b37-8132-cfab39e7c644.mp3') 30 query = Path('/workspace/downloads/111/type_7/75cd601b-7604-4b37-8132-cfab39e7c644.mp3')
33 try: 31 try:
...@@ -35,11 +33,14 @@ def main(): ...@@ -35,11 +33,14 @@ def main():
35 time.sleep(0.5) 33 time.sleep(0.5)
36 try: 34 try:
37 result = post_multipart(BASE + '/recognize/voice', query) 35 result = post_multipart(BASE + '/recognize/voice', query)
36 top = result.get('candidates', [{}])[0] if result.get('candidates') else {}
38 print(json.dumps({ 37 print(json.dumps({
39 'status': 'ok', 38 'status': 'ok',
39 'corpus': result.get('corpus'),
40 'chunk_count': result.get('chunk_count'), 40 'chunk_count': result.get('chunk_count'),
41 'top_song_id': result.get('candidates', [{}])[0].get('song_id') if result.get('candidates') else None, 41 'top_song_id': top.get('song_id'),
42 'has_context': bool(result.get('candidates', [{}])[0].get('context_clip')) if result.get('candidates') else False, 42 'has_context': bool(top.get('context_clip')),
43 'reference_audio_path': top.get('reference_audio_path'),
43 }, ensure_ascii=False, indent=2)) 44 }, ensure_ascii=False, indent=2))
44 return 45 return
45 except Exception: 46 except Exception:
......
...@@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory ...@@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory
5 from threading import Lock 5 from threading import Lock
6 from typing import Optional 6 from typing import Optional
7 7
8 import faiss
8 import numpy as np 9 import numpy as np
9 from fastapi import FastAPI, File, HTTPException, UploadFile 10 from fastapi import FastAPI, File, HTTPException, UploadFile
10 from pydantic import BaseModel 11 from pydantic import BaseModel
...@@ -12,6 +13,7 @@ from pydantic import BaseModel ...@@ -12,6 +13,7 @@ from pydantic import BaseModel
12 from src.data.voice_chunker import voice_to_chunks 13 from src.data.voice_chunker import voice_to_chunks
13 from src.service.settings import ServiceSettings 14 from src.service.settings import ServiceSettings
14 from src.utils.context_exporter import export_match_context, find_best_matching_window 15 from src.utils.context_exporter import export_match_context, find_best_matching_window
16 from scripts.local_music20_acr import REFERENCE_TYPE, SUPPORTED_QUERY_TYPES, embed_chroma, first_file
15 17
16 18
17 class RecognizeRequest(BaseModel): 19 class RecognizeRequest(BaseModel):
...@@ -30,7 +32,7 @@ class BuildIndexRequest(BaseModel): ...@@ -30,7 +32,7 @@ class BuildIndexRequest(BaseModel):
30 device: Optional[str] = None 32 device: Optional[str] = None
31 33
32 34
33 app = FastAPI(title='ACR Service', version='0.4.0') 35 app = FastAPI(title='ACR Service', version='0.5.0')
34 settings = ServiceSettings() 36 settings = ServiceSettings()
35 _engine_cache: dict[tuple[str, str, str, str], object] = {} 37 _engine_cache: dict[tuple[str, str, str, str], object] = {}
36 _cache_lock = Lock() 38 _cache_lock = Lock()
...@@ -70,23 +72,20 @@ def _load_engine_uncached(data_dir: str, model_path: str, index_prefix: str, dev ...@@ -70,23 +72,20 @@ def _load_engine_uncached(data_dir: str, model_path: str, index_prefix: str, dev
70 from src.engines.ecapa_embedder import ECAPAEmbedder 72 from src.engines.ecapa_embedder import ECAPAEmbedder
71 from src.engines.hybrid_engine import HybridEngine 73 from src.engines.hybrid_engine import HybridEngine
72 except Exception as exc: 74 except Exception as exc:
73 raise HTTPException(status_code=500, detail=f"Engine dependencies unavailable: {exc}") 75 raise HTTPException(status_code=500, detail=f'Engine dependencies unavailable: {exc}')
74 76
75 matcher = ChromaprintMatcher() 77 matcher = ChromaprintMatcher()
76 chroma_path = str(Path(index_prefix).parent / 'chromaprint.pkl') 78 chroma_path = str(Path(index_prefix).parent / 'chromaprint.pkl')
77 if not Path(chroma_path).exists(): 79 if not Path(chroma_path).exists():
78 raise HTTPException(status_code=400, detail=f'Missing chromaprint index: {chroma_path}') 80 raise HTTPException(status_code=400, detail=f'Missing chromaprint index: {chroma_path}')
79 matcher.load(chroma_path) 81 matcher.load(chroma_path)
80
81 if not Path(model_path).exists(): 82 if not Path(model_path).exists():
82 raise HTTPException(status_code=400, detail=f'Missing model: {model_path}') 83 raise HTTPException(status_code=400, detail=f'Missing model: {model_path}')
83 embedder = ECAPAEmbedder(model_path=model_path, device=device) 84 embedder = ECAPAEmbedder(model_path=model_path, device=device)
84
85 embs_path = f'{index_prefix}_embs.npy' 85 embs_path = f'{index_prefix}_embs.npy'
86 ids_path = f'{index_prefix}_ids.npy' 86 ids_path = f'{index_prefix}_ids.npy'
87 if not Path(embs_path).exists() or not Path(ids_path).exists(): 87 if not Path(embs_path).exists() or not Path(ids_path).exists():
88 raise HTTPException(status_code=400, detail='Missing embedding index files') 88 raise HTTPException(status_code=400, detail='Missing embedding index files')
89
90 ref_embs = np.load(embs_path) 89 ref_embs = np.load(embs_path)
91 ref_ids = np.load(ids_path, allow_pickle=True).tolist() 90 ref_ids = np.load(ids_path, allow_pickle=True).tolist()
92 engine = HybridEngine(matcher, embedder, ref_embs, ref_ids) 91 engine = HybridEngine(matcher, embedder, ref_embs, ref_ids)
...@@ -147,22 +146,54 @@ def _aggregate_chunk_results(chunk_results: list[dict], top_n: int) -> list[dict ...@@ -147,22 +146,54 @@ def _aggregate_chunk_results(chunk_results: list[dict], top_n: int) -> list[dict
147 return ranked[:top_n] 146 return ranked[:top_n]
148 147
149 148
150 def _reference_audio_for_song(engine: HybridEngine, song_id: str) -> str | None: 149 def _reference_audio_for_song(engine, song_id: str) -> str | None:
151 return engine.song_audio_paths.get(song_id) 150 return getattr(engine, 'song_audio_paths', {}).get(song_id)
151
152
153 def _workspace_reference_map(downloads_dir: Path, song_limit: int = 20) -> list[dict]:
154 refs = []
155 for song_dir in sorted(p for p in downloads_dir.iterdir() if p.is_dir()):
156 ref = first_file(song_dir / f'type_{REFERENCE_TYPE}')
157 if ref:
158 refs.append({'song_id': song_dir.name, 'reference_path': str(ref)})
159 if len(refs) >= song_limit:
160 break
161 return refs
162
163
164 def _workspace_faiss_candidates(query_audio_path: str, downloads_dir: Path, song_limit: int, sr: int, duration: float, top_n: int) -> list[dict]:
165 refs = _workspace_reference_map(downloads_dir, song_limit)
166 if not refs:
167 return []
168 ref_vecs = [embed_chroma(item['reference_path'], sr, duration) for item in refs]
169 qry_vec = embed_chroma(query_audio_path, sr, duration).reshape(1, -1).astype(np.float32)
170 ref_matrix = np.vstack(ref_vecs).astype(np.float32)
171 index = faiss.IndexFlatIP(ref_matrix.shape[1])
172 index.add(ref_matrix)
173 sims, idxs = index.search(qry_vec, top_n)
174 results = []
175 for j in range(top_n):
176 ref_idx = int(idxs[0, j])
177 results.append({
178 'song_id': refs[ref_idx]['song_id'],
179 'confidence': float(sims[0, j]),
180 'reference_path': refs[ref_idx]['reference_path'],
181 })
182 return results
152 183
153 184
154 @app.get('/health') 185 @app.get('/health')
155 def health(): 186 def health():
156 resolved = _resolve() 187 resolved = _resolve()
157 readiness = _readiness_snapshot(resolved['data_dir'], resolved['model_path'], resolved['index_prefix']) 188 readiness = _readiness_snapshot(resolved['data_dir'], resolved['model_path'], resolved['index_prefix'])
158 return {'status': 'ok', 'service': 'acr', 'version': '0.4.0', 'ready': readiness['ready']} 189 return {'status': 'ok', 'service': 'acr', 'version': '0.5.0', 'ready': readiness['ready']}
159 190
160 191
161 @app.get('/ready') 192 @app.get('/ready')
162 def ready(): 193 def ready():
163 resolved = _resolve() 194 resolved = _resolve()
164 readiness = _readiness_snapshot(resolved['data_dir'], resolved['model_path'], resolved['index_prefix']) 195 readiness = _readiness_snapshot(resolved['data_dir'], resolved['model_path'], resolved['index_prefix'])
165 return {'service': 'acr', 'version': '0.4.0', **readiness, **_cache_stats()} 196 return {'service': 'acr', 'version': '0.5.0', **readiness, **_cache_stats()}
166 197
167 198
168 @app.get('/config') 199 @app.get('/config')
...@@ -188,19 +219,13 @@ def recognize(req: RecognizeRequest): ...@@ -188,19 +219,13 @@ def recognize(req: RecognizeRequest):
188 @app.post('/index/build') 219 @app.post('/index/build')
189 def build_index(req: BuildIndexRequest): 220 def build_index(req: BuildIndexRequest):
190 from run_demo import build_chroma_index, build_embedding_index 221 from run_demo import build_chroma_index, build_embedding_index
191
192 resolved = _resolve(req.data_dir, req.model_path, None, req.device) 222 resolved = _resolve(req.data_dir, req.model_path, None, req.device)
193 data_dir = Path(resolved['data_dir']) 223 data_dir = Path(resolved['data_dir'])
194 out_dir = Path(req.output_dir) 224 out_dir = Path(req.output_dir)
195 out_dir.mkdir(parents=True, exist_ok=True) 225 out_dir.mkdir(parents=True, exist_ok=True)
196 build_chroma_index(data_dir, out_dir) 226 build_chroma_index(data_dir, out_dir)
197 _, ref_embs, ref_ids = build_embedding_index(data_dir, Path(resolved['model_path']), out_dir / 'reference', resolved['device']) 227 _, ref_embs, ref_ids = build_embedding_index(data_dir, Path(resolved['model_path']), out_dir / 'reference', resolved['device'])
198 return { 228 return {'status': 'ok', 'num_reference_windows': len(ref_ids), 'embedding_dim': int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0, 'output_dir': str(out_dir.resolve())}
199 'status': 'ok',
200 'num_reference_windows': len(ref_ids),
201 'embedding_dim': int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0,
202 'output_dir': str(out_dir.resolve()),
203 }
204 229
205 230
206 @app.post('/recognize/voice') 231 @app.post('/recognize/voice')
...@@ -215,29 +240,61 @@ async def recognize_voice( ...@@ -215,29 +240,61 @@ async def recognize_voice(
215 output_format: str = 'mp3', 240 output_format: str = 'mp3',
216 max_chunks: int = 3, 241 max_chunks: int = 3,
217 include_context: bool = True, 242 include_context: bool = True,
243 corpus: str = 'synthetic',
244 downloads_dir: str = '/workspace/downloads',
245 song_limit: int = 20,
246 local_duration_sec: float = 8.0,
247 local_sr: int = 22050,
218 ): 248 ):
219 resolved = _resolve(data_dir, model_path, index_prefix, device)
220 engine, cache_hit = _load_engine(**resolved)
221 with TemporaryDirectory(prefix='acr_voice_') as tmpdir: 249 with TemporaryDirectory(prefix='acr_voice_') as tmpdir:
222 tmp = Path(tmpdir) 250 tmp = Path(tmpdir)
223 suffix = Path(file.filename or 'upload.wav').suffix or '.wav' 251 suffix = Path(file.filename or 'upload.wav').suffix or '.wav'
224 raw_path = tmp / f'input{suffix}' 252 raw_path = tmp / f'input{suffix}'
225 raw_path.write_bytes(await file.read()) 253 raw_path.write_bytes(await file.read())
226
227 chunk_dir = tmp / 'chunks' 254 chunk_dir = tmp / 'chunks'
228 chunks = voice_to_chunks(str(raw_path), str(chunk_dir), max_chunks=max_chunks) 255 chunks = voice_to_chunks(str(raw_path), str(chunk_dir), max_chunks=max_chunks)
229 if not chunks: 256 if not chunks:
230 raise HTTPException(status_code=400, detail='No voiced chunks detected from input audio') 257 raise HTTPException(status_code=400, detail='No voiced chunks detected from input audio')
231 258
232 chunk_results = [] 259 chunk_results = []
260 if corpus == 'workspace_music20':
261 for chunk in chunks:
262 candidates = _workspace_faiss_candidates(chunk['audio_path'], Path(downloads_dir), song_limit, local_sr, local_duration_sec, top_n)
263 chunk_results.append({'chunk': chunk, 'candidates': candidates, 'processing_time_ms': None})
264 ranked = _aggregate_chunk_results(chunk_results, top_n=top_n)
265 response_candidates = []
266 for item in ranked:
267 ref_audio = item['best_candidate']['reference_path'] if item.get('best_candidate') else None
268 context_info = None
269 if include_context and ref_audio and item['best_chunk'] is not None:
270 match = find_best_matching_window(item['best_chunk']['chunk']['audio_path'], ref_audio)
271 out_path = tmp / 'contexts' / f"{item['song_id']}.{output_format}"
272 context_info = export_match_context(ref_audio, match['window_start_sec'], match['window_end_sec'], str(out_path), context_sec=context_sec, output_format=output_format)
273 context_info['match'] = match
274 response_candidates.append({
275 'song_id': item['song_id'],
276 'combined_confidence': item['combined_confidence'],
277 'best_confidence': item['best_confidence'],
278 'match_count': item['match_count'],
279 'reference_audio_path': ref_audio,
280 'best_candidate': item['best_candidate'],
281 'best_chunk': item['best_chunk']['chunk'] if item['best_chunk'] else None,
282 'context_clip': context_info,
283 })
284 return {
285 'cache_hit': False,
286 'corpus': corpus,
287 'query_audio_filename': file.filename,
288 'chunk_count': len(chunks),
289 'chunk_results': chunk_results,
290 'candidates': response_candidates,
291 }
292
293 resolved = _resolve(data_dir, model_path, index_prefix, device)
294 engine, cache_hit = _load_engine(**resolved)
233 for chunk in chunks: 295 for chunk in chunks:
234 result = engine.recognize(chunk['audio_path'], top_n=top_n) 296 result = engine.recognize(chunk['audio_path'], top_n=top_n)
235 chunk_results.append({ 297 chunk_results.append({'chunk': chunk, 'candidates': result['candidates'], 'processing_time_ms': result['processing_time_ms']})
236 'chunk': chunk,
237 'candidates': result['candidates'],
238 'processing_time_ms': result['processing_time_ms'],
239 })
240
241 ranked = _aggregate_chunk_results(chunk_results, top_n=top_n) 298 ranked = _aggregate_chunk_results(chunk_results, top_n=top_n)
242 response_candidates = [] 299 response_candidates = []
243 for item in ranked: 300 for item in ranked:
...@@ -245,37 +302,9 @@ async def recognize_voice( ...@@ -245,37 +302,9 @@ async def recognize_voice(
245 ref_audio = _reference_audio_for_song(engine, song_id) 302 ref_audio = _reference_audio_for_song(engine, song_id)
246 context_info = None 303 context_info = None
247 if include_context and ref_audio and item['best_chunk'] is not None: 304 if include_context and ref_audio and item['best_chunk'] is not None:
248 match = find_best_matching_window( 305 match = find_best_matching_window(query_audio_path=item['best_chunk']['chunk']['audio_path'], reference_audio_path=ref_audio)
249 query_audio_path=item['best_chunk']['chunk']['audio_path'],
250 reference_audio_path=ref_audio,
251 )
252 out_path = tmp / 'contexts' / f'{song_id}.{output_format}' 306 out_path = tmp / 'contexts' / f'{song_id}.{output_format}'
253 context_info = export_match_context( 307 context_info = export_match_context(audio_path=ref_audio, window_start_sec=match['window_start_sec'], window_end_sec=match['window_end_sec'], output_path=str(out_path), context_sec=context_sec, output_format=output_format)
254 audio_path=ref_audio,
255 window_start_sec=match['window_start_sec'],
256 window_end_sec=match['window_end_sec'],
257 output_path=str(out_path),
258 context_sec=context_sec,
259 output_format=output_format,
260 )
261 context_info['match'] = match 308 context_info['match'] = match
262 309 response_candidates.append({'song_id': song_id, 'combined_confidence': item['combined_confidence'], 'best_confidence': item['best_confidence'], 'match_count': item['match_count'], 'reference_audio_path': ref_audio, 'best_candidate': item['best_candidate'], 'best_chunk': item['best_chunk']['chunk'] if item['best_chunk'] else None, 'context_clip': context_info})
263 response_candidates.append({ 310 return {'cache_hit': cache_hit, 'resolved': resolved, 'corpus': corpus, 'query_audio_filename': file.filename, 'chunk_count': len(chunks), 'chunk_results': chunk_results, 'candidates': response_candidates}
264 'song_id': song_id,
265 'combined_confidence': item['combined_confidence'],
266 'best_confidence': item['best_confidence'],
267 'match_count': item['match_count'],
268 'reference_audio_path': ref_audio,
269 'best_candidate': item['best_candidate'],
270 'best_chunk': item['best_chunk']['chunk'] if item['best_chunk'] else None,
271 'context_clip': context_info,
272 })
273
274 return {
275 'cache_hit': cache_hit,
276 'resolved': resolved,
277 'query_audio_filename': file.filename,
278 'chunk_count': len(chunks),
279 'chunk_results': chunk_results,
280 'candidates': response_candidates,
281 }
......
...@@ -24,7 +24,7 @@ flowchart TD ...@@ -24,7 +24,7 @@ flowchart TD
24 | benchmark report 已生成 | | 24 | benchmark report 已生成 | |
25 | model card 已生成 | | 25 | model card 已生成 | |
26 | license registry 已更新 | | 26 | license registry 已更新 | |
27 | service smoke test 通过 | partial: `/health` OK, `/recognize/voice` payload returns, but still bound to synthetic service index rather than business reference corpus | 27 | service smoke test 通过 | partial: `/health` OK, `/recognize/voice` payload returns against `workspace_music20`, but business top1 correctness still needs manual/metric validation |
28 | dataset whitelist 已确认 | | 28 | dataset whitelist 已确认 | |
29 | changelog 已更新 | yes | 29 | changelog 已更新 | yes |
30 | architect review completed | yes (approved with watch) | 30 | architect review completed | yes (approved with watch) |
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
30 - `acr-engine/src/service/app.py` 已新增 `POST /recognize/voice` 30 - `acr-engine/src/service/app.py` 已新增 `POST /recognize/voice`
31 - `/health` 可正常启动并返回 `ok` 31 - `/health` 可正常启动并返回 `ok`
32 - architect review: approved with watch;当前 split(本地 FAISS / 可选 ChromaDB / 生产 pgvector)方向成立 32 - architect review: approved with watch;当前 split(本地 FAISS / 可选 ChromaDB / 生产 pgvector)方向成立
33 - 当前 `POST /recognize/voice` 已跨过依赖缺失与超时阶段:CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`,voice smoke 已返回 payload(`chunk_count=1`, `top_song_id=song_0022`, `has_context=false`);当前剩余问题是服务默认仍绑定 synthetic 索引语义,尚未切到 `/workspace` 业务曲库 reference 33 - 当前 `POST /recognize/voice` 已跨过依赖缺失与超时阶段:CPU 版 `torch` 已安装、`uvicorn` / `fastapi` / `python-multipart` 已安装、`/health` 可返回 `ok`;同时 voice smoke 已切到 `corpus=workspace_music20`,返回 `chunk_count=1`, `top_song_id=109`, `has_context=true`,并附带真实 `/workspace` reference 路径。当前剩余问题是继续校验该 top1 是否与业务预期一致,而不是链路未通。
34 - 当前 docs 已做第一轮简化: 34 - 当前 docs 已做第一轮简化:
35 - `docs/README.md` 只保留最新架构与最短阅读顺序 35 - `docs/README.md` 只保留最新架构与最短阅读顺序
36 36
......