Add a FAISS-first local ACR workflow for music20 samples
Constraint: local validation should stay lightweight and use /workspace sample files while production retrieval remains pgvector-backed
Rejected: making ChromaDB the default local backend | chromadb is not installed in the current environment and FAISS is already available
Confidence: high
Scope-risk: narrow
Directive: keep local dev workflows explicitly separated from production pgvector flows in docs and scripts
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/local_music20_acr.py --downloads-dir /workspace/downloads --song-limit 20 --backend faiss --output acr-engine/data/local_eval/music20_summary.json
Not-tested: chromadb backend execution without installation; live pgvector database execution path
Showing
5 changed files
with
263 additions
and
0 deletions
| ... | @@ -94,6 +94,29 @@ cd acr-engine | ... | @@ -94,6 +94,29 @@ cd acr-engine |
| 94 | python evaluate.py --data data/synthetic --model data/models/best_model.pt --index-prefix data/index/reference --split test --device cpu | 94 | python evaluate.py --data data/synthetic --model data/models/best_model.pt --index-prefix data/index/reference --split test --device cpu |
| 95 | ``` | 95 | ``` |
| 96 | 96 | ||
| 97 | ## 本地 20 首歌流程与生产向量库约定 | ||
| 98 | |||
| 99 | - 本地小样本(如 `/workspace/downloads` 的 20 首歌流程)优先使用 **FAISS**。 | ||
| 100 | - 若后续本机安装了 `chromadb`,可作为可选对照后端,但不是默认依赖。 | ||
| 101 | - 生产环境统一保留 **pgvector** 作为正式向量存储与检索底座。 | ||
| 102 | |||
| 103 | 本地 20 首歌流程: | ||
| 104 | |||
| 105 | ```bash | ||
| 106 | cd acr-engine | ||
| 107 | /usr/local/miniconda3/bin/python scripts/local_music20_acr.py \ | ||
| 108 | --downloads-dir /workspace/downloads \ | ||
| 109 | --song-limit 20 \ | ||
| 110 | --backend faiss \ | ||
| 111 | --output data/local_eval/music20_summary.json | ||
| 112 | ``` | ||
| 113 | |||
| 114 | 说明: | ||
| 115 | - `query_type=1/7/8/16` 会在一个汇总 JSON 里统一输出。 | ||
| 116 | - 默认本地后端为 `faiss`。 | ||
| 117 | - 若使用 `--backend chromadb` 且环境缺少 `chromadb`,脚本会明确报错提示。 | ||
| 118 | - 生产向量化链路仍走 `scripts/export_manifest_to_pgvector_json.py` 与后续 pgvector 入库。 | ||
| 119 | |||
| 97 | ## 当前提升方向 | 120 | ## 当前提升方向 |
| 98 | 121 | ||
| 99 | - 更强合成混淆样本(confused / humming_like) | 122 | - 更强合成混淆样本(confused / humming_like) | ... | ... |
This diff is collapsed.
Click to expand it.
acr-engine/scripts/local_music20_acr.py
0 → 100755
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | """Run a FAISS-first local ACR eval on up to 20 songs from /workspace/downloads. | ||
| 3 | |||
| 4 | Purpose: | ||
| 5 | - keep small-sample validation inside acr-engine | ||
| 6 | - default to FAISS for local dev | ||
| 7 | - optionally allow ChromaDB when installed | ||
| 8 | - preserve pgvector as the production path (not used here) | ||
| 9 | """ | ||
| 10 | from __future__ import annotations | ||
| 11 | |||
| 12 | import argparse | ||
| 13 | import json | ||
| 14 | from pathlib import Path | ||
| 15 | from typing import Dict, List | ||
| 16 | |||
| 17 | import faiss | ||
| 18 | import librosa | ||
| 19 | import numpy as np | ||
| 20 | |||
| 21 | |||
# Root directory scanned for per-song sub-directories containing type_<N>/ folders.
DEFAULT_DOWNLOADS = Path('/workspace/downloads')
# Default summary location, resolved relative to this script (which lives in
# acr-engine/scripts/) instead of a machine-specific absolute checkout path,
# so the default works wherever the repository is cloned.
DEFAULT_OUTPUT = Path(__file__).resolve().parent.parent / 'data' / 'local_eval' / 'music20_summary.json'
# Query folder types evaluated against the reference type, in report order.
SUPPORTED_QUERY_TYPES = (1, 7, 8, 16)
# Folder type whose first file is used as the reference recording of a song.
REFERENCE_TYPE = 11
| 26 | |||
| 27 | |||
def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the local evaluation script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses;
            passing an explicit list makes the parser usable from tests.

    Returns:
        Namespace with ``downloads_dir``, ``song_limit``, ``duration``,
        ``sr``, ``topk``, ``backend`` and ``output``.
    """

    def positive_int(text: str) -> int:
        # A zero or negative song limit / top-k has no meaning for the
        # evaluation, so reject it at the CLI boundary with a usage error.
        value = int(text)
        if value < 1:
            raise argparse.ArgumentTypeError(f'expected a positive integer, got {text!r}')
        return value

    ap = argparse.ArgumentParser(
        description='Run a local small-sample ACR evaluation (FAISS by default).',
    )
    ap.add_argument('--downloads-dir', default=str(DEFAULT_DOWNLOADS))
    ap.add_argument('--song-limit', type=positive_int, default=20)
    ap.add_argument('--duration', type=float, default=8.0)
    ap.add_argument('--sr', type=int, default=22050)
    ap.add_argument('--topk', type=positive_int, default=3)
    ap.add_argument('--backend', choices=['faiss', 'chromadb'], default='faiss')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    return ap.parse_args(argv)
| 38 | |||
| 39 | |||
def first_file(path: Path) -> Path | None:
    """Return the first regular file (in sorted order) directly inside *path*.

    Returns ``None`` when *path* does not exist or contains no regular files.
    Sub-directories are ignored.
    """
    if not path.exists():
        return None
    regular_files = (entry for entry in path.iterdir() if entry.is_file())
    # The smallest path is exactly the head of the sorted listing.
    return min(regular_files, default=None)
| 43 | |||
| 44 | |||
def collect_pairs(downloads_dir: Path, song_limit: int, query_type: int) -> List[Dict[str, str]]:
    """Collect (reference, query) file pairs, one per song directory.

    Every sub-directory of *downloads_dir* is treated as one song. A song
    contributes a pair only when both its ``type_<REFERENCE_TYPE>`` folder and
    its ``type_<query_type>`` folder contain at least one file; the first file
    (in sorted order) of each folder is used.

    Args:
        downloads_dir: Directory holding one sub-directory per song.
        song_limit: Maximum number of pairs to return. Values <= 0 yield an
            empty list.
        query_type: Numeric suffix of the query folder (``type_<query_type>``).

    Returns:
        A list of dicts with the keys ``song_id``, ``reference_path`` and
        ``query_path``, ordered by song directory name.

    Raises:
        OSError: If *downloads_dir* cannot be listed (for example when it
            does not exist) and *song_limit* is positive.
    """
    pairs: List[Dict[str, str]] = []
    # Guard up front: checking the limit only after appending would still
    # return one pair for a limit of zero.
    if song_limit <= 0:
        return pairs

    for song_dir in sorted(p for p in downloads_dir.iterdir() if p.is_dir()):
        ref = first_file(song_dir / f'type_{REFERENCE_TYPE}')
        qry = first_file(song_dir / f'type_{query_type}')
        if ref is None or qry is None:
            continue
        pairs.append({
            'song_id': song_dir.name,
            'reference_path': str(ref),
            'query_path': str(qry),
        })
        if len(pairs) >= song_limit:
            break
    return pairs
| 59 | |||
| 60 | |||
def load_audio(path: str, sr: int, duration: float) -> np.ndarray:
    """Load a mono clip and force it to exactly ``int(sr * duration)`` samples.

    At most *duration* seconds are read from *path* at sample rate *sr*.
    Clips that come back shorter are zero-padded at the end; longer ones are
    truncated. The result is returned as float32.
    """
    samples, _ = librosa.load(path, sr=sr, mono=True, duration=duration)
    wanted = int(sr * duration)
    shortfall = wanted - len(samples)
    if shortfall > 0:
        # Pad only on the right so the audio keeps its original start.
        samples = np.pad(samples, (0, shortfall))
    else:
        samples = samples[:wanted]
    return samples.astype(np.float32)
| 69 | |||
| 70 | |||
def embed_chroma(path: str, sr: int, duration: float) -> np.ndarray:
    """Build a unit-length chroma descriptor for one audio file.

    The clip is loaded with ``load_audio``, a 12-bin chroma STFT is computed,
    and the per-bin mean and standard deviation over axis 1 are concatenated
    into one float32 vector. The vector is L2-normalised unless its norm is
    zero, in which case it is returned unchanged.
    """
    signal = load_audio(path, sr=sr, duration=duration)
    chroma = librosa.feature.chroma_stft(y=signal, sr=sr, n_chroma=12)
    stats = (chroma.mean(axis=1), chroma.std(axis=1))
    vector = np.concatenate(stats, axis=0).astype(np.float32)
    magnitude = np.linalg.norm(vector)
    # Skip the division for an all-zero vector to avoid NaNs.
    return vector / magnitude if magnitude > 0 else vector
| 79 | |||
| 80 | |||
def run_faiss(ref_matrix: np.ndarray, qry_matrix: np.ndarray, topk: int):
    """Search *qry_matrix* against *ref_matrix* with a flat inner-product index.

    Returns whatever ``IndexFlatIP.search`` yields for the *topk* nearest
    references of every query row: the scores followed by the row indices
    into *ref_matrix*.
    """
    dimension = ref_matrix.shape[1]
    flat_index = faiss.IndexFlatIP(dimension)
    flat_index.add(ref_matrix)
    scores, neighbours = flat_index.search(qry_matrix, topk)
    return scores, neighbours
| 85 | |||
| 86 | |||
def run_chromadb(ref_matrix: np.ndarray, qry_matrix: np.ndarray, topk: int):
    """Search *qry_matrix* against *ref_matrix* using an in-memory ChromaDB collection.

    Args:
        ref_matrix: Reference embeddings, one row per reference.
        qry_matrix: Query embeddings, one row per query.
        topk: Number of result columns to return per query.

    Returns:
        ``(sims, idxs)`` with shape ``(len(qry_matrix), topk)``. ``idxs`` holds
        row indices into *ref_matrix* and ``sims`` the matching cosine
        similarities. When fewer than *topk* references exist, the unused
        slots carry index ``-1`` and the lowest float32 value as score, the
        same padding convention as the FAISS backend.

    Raises:
        SystemExit: If ``chromadb`` cannot be imported in this environment.
    """
    # The broad catch is deliberate: importing chromadb can fail for
    # environment reasons other than the package simply being absent.
    try:
        import chromadb  # type: ignore
    except Exception as exc:  # pragma: no cover - env-dependent
        raise SystemExit(f'ChromaDB backend requested but unavailable: {exc}') from exc

    import uuid

    client = chromadb.EphemeralClient()
    # This function runs once per query type within one process; a fixed
    # collection name can collide with the collection left by an earlier call,
    # so use a unique name and drop the collection afterwards.
    name = f'music20_local_eval_{uuid.uuid4().hex}'
    # Request cosine space explicitly so that ``1 - distance`` below really is
    # a cosine similarity instead of depending on the library's default metric.
    collection = client.create_collection(name, metadata={'hnsw:space': 'cosine'})
    try:
        ref_ids = [str(i) for i in range(len(ref_matrix))]
        collection.add(ids=ref_ids, embeddings=ref_matrix.tolist())
        # Never ask for more neighbours than there are references.
        n_results = min(topk, len(ref_ids))
        result = collection.query(query_embeddings=qry_matrix.tolist(), n_results=n_results)
    finally:
        client.delete_collection(name)

    sims = np.full((len(qry_matrix), topk), np.finfo(np.float32).min, dtype=np.float32)
    idxs = np.full((len(qry_matrix), topk), -1, dtype=np.int32)
    for row, (ids, dists) in enumerate(zip(result['ids'], result['distances'])):
        found = len(ids)
        idxs[row, :found] = [int(x) for x in ids]
        sims[row, :found] = 1.0 - np.asarray(dists, dtype=np.float32)
    return sims, idxs
| 102 | |||
| 103 | |||
def evaluate_query_type(downloads_dir: Path, song_limit: int, query_type: int, sr: int, duration: float, topk: int, backend: str):
    """Evaluate retrieval of one query type against the reference recordings.

    For every song that has both a reference file and a query file of
    *query_type*, both files are embedded with ``embed_chroma``. The
    references are then searched with the selected backend and the rank of
    the correct song is recorded per query.

    Args:
        downloads_dir: Directory holding one sub-directory per song.
        song_limit: Maximum number of songs to evaluate.
        query_type: Numeric suffix of the query folder to evaluate.
        sr: Sample rate used when loading audio.
        duration: Clip length in seconds used when loading audio.
        topk: Number of candidates requested per query (must be >= 1).
        backend: ``'faiss'`` selects FAISS; any other value selects ChromaDB.

    Returns:
        A dict with ``query_type``, ``reference_type``, ``song_count``,
        ``file_count``, ``topk``, ``metrics`` (``top1`` / ``top3`` hit rates)
        and per-query ``results``. A query whose song is not among the
        returned candidates gets rank ``topk + 1``. When no pairs are found
        the metrics are zero and a ``note`` explains why.

    Raises:
        ValueError: If *topk* is smaller than 1.
    """
    if topk < 1:
        raise ValueError(f'topk must be >= 1, got {topk}')

    pairs = collect_pairs(downloads_dir, song_limit, query_type=query_type)
    if not pairs:
        return {
            'query_type': query_type,
            'reference_type': REFERENCE_TYPE,
            'song_count': 0,
            'file_count': 0,
            'topk': topk,
            'metrics': {'top1': 0.0, 'top3': 0.0},
            'results': [],
            'note': 'No matching query/reference pairs found.',
        }

    ref_vecs = [embed_chroma(item['reference_path'], sr, duration) for item in pairs]
    qry_vecs = [embed_chroma(item['query_path'], sr, duration) for item in pairs]
    ref_ids = [item['song_id'] for item in pairs]

    ref_matrix = np.vstack(ref_vecs).astype(np.float32)
    qry_matrix = np.vstack(qry_vecs).astype(np.float32)

    # Never request more neighbours than there are references: FAISS pads the
    # missing slots with index -1, which would otherwise be read as
    # ``ref_ids[-1]`` and report the last song as a bogus candidate.
    search_k = min(topk, len(pairs))
    if backend == 'faiss':
        sims, idxs = run_faiss(ref_matrix, qry_matrix, search_k)
    else:
        sims, idxs = run_chromadb(ref_matrix, qry_matrix, search_k)

    ranks = []
    results = []
    for i, item in enumerate(pairs):
        candidates = []
        rank = None
        for j in range(idxs.shape[1]):
            ref_idx = int(idxs[i, j])
            if ref_idx < 0:
                # Padding slot: the backend had no further neighbour to return.
                continue
            cand_song_id = ref_ids[ref_idx]
            candidates.append({'rank': j + 1, 'song_id': cand_song_id, 'score': float(sims[i, j])})
            if rank is None and cand_song_id == item['song_id']:
                rank = j + 1
        if rank is None:
            # Miss: one past the requested list length.
            rank = topk + 1
        ranks.append(rank)
        results.append({
            'song_id': item['song_id'],
            'query_path': item['query_path'],
            'reference_path': item['reference_path'],
            'rank': rank,
            'candidates': candidates,
        })

    top1 = sum(1 for r in ranks if r == 1) / len(ranks)
    # With topk < 3 a miss has rank topk + 1 <= 3, so cap the threshold at topk.
    top3 = sum(1 for r in ranks if r <= min(3, topk)) / len(ranks)
    return {
        'query_type': query_type,
        'reference_type': REFERENCE_TYPE,
        'song_count': len(pairs),
        'file_count': len(pairs) * 2,
        'topk': topk,
        'metrics': {'top1': top1, 'top3': top3},
        'results': results,
    }
| 164 | |||
| 165 | |||
def main() -> None:
    """Run the evaluation for every supported query type and write the summary.

    The summary is written as UTF-8 JSON to ``--output`` and echoed to stdout.

    Raises:
        SystemExit: If the downloads directory does not exist.
    """
    args = parse_args()
    downloads_dir = Path(args.downloads_dir)
    # Fail fast with a readable message instead of a traceback from the
    # directory listing deep inside the evaluation.
    if not downloads_dir.is_dir():
        raise SystemExit(f'Downloads directory not found: {downloads_dir}')
    out = Path(args.output)

    summary = {
        'backend': args.backend,
        'purpose': 'Local 20-song ACR sanity flow for development; production remains pgvector.',
        'downloads_dir': str(downloads_dir),
        'song_limit': args.song_limit,
        'duration_sec': args.duration,
        'sr': args.sr,
        'evaluations': [
            evaluate_query_type(downloads_dir, args.song_limit, query_type, args.sr, args.duration, args.topk, args.backend)
            for query_type in SUPPORTED_QUERY_TYPES
        ],
    }

    # Serialise once and reuse the text for both the file and the console.
    payload = json.dumps(summary, ensure_ascii=False, indent=2)
    # Create the output directory only after the evaluation succeeded, so a
    # failed run leaves nothing behind.
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(payload, encoding='utf-8')
    print(payload)


if __name__ == '__main__':
    main()
acr-engine/tests/test_local_music20_acr.py
0 → 100644
| 1 | import tempfile | ||
| 2 | import unittest | ||
| 3 | from pathlib import Path | ||
| 4 | |||
| 5 | from scripts.local_music20_acr import collect_pairs, first_file | ||
| 6 | |||
| 7 | |||
class LocalMusic20AcrTests(unittest.TestCase):
    """Filesystem-level tests for the pairing helpers of the local ACR script."""

    def setUp(self):
        # Every test gets its own scratch directory, removed on teardown.
        scratch = tempfile.TemporaryDirectory()
        self.addCleanup(scratch.cleanup)
        self.root = Path(scratch.name)

    def test_first_file_returns_none_for_missing_dir(self):
        self.assertIsNone(first_file(self.root / 'missing'))

    def test_collect_pairs_uses_reference_and_query_types(self):
        song_dir = self.root / '1'
        layout = (
            ('type_11', 'ref.wav', b'r'),
            ('type_7', 'query.mp3', b'q'),
        )
        for folder, filename, payload in layout:
            target = song_dir / folder
            target.mkdir(parents=True)
            (target / filename).write_bytes(payload)

        pairs = collect_pairs(self.root, song_limit=20, query_type=7)

        self.assertEqual(len(pairs), 1)
        only_pair = pairs[0]
        self.assertEqual(only_pair['song_id'], '1')
        self.assertTrue(only_pair['reference_path'].endswith('ref.wav'))
        self.assertTrue(only_pair['query_path'].endswith('query.mp3'))


if __name__ == '__main__':
    unittest.main()
| 1 | |||
| 2 | ## 2026-06-03 20-song local ACR workflow in acr-engine | ||
| 3 | |||
| 4 | - 新增 `acr-engine/scripts/local_music20_acr.py`,在 `acr-engine` 内提供基于 `/workspace/downloads` 的本地 20 首歌 ACR 小样本流程。 | ||
| 5 | - 本地流程默认优先使用 **FAISS**,`chromadb` 作为可选对照后端;生产环境继续保留 **pgvector**。 | ||
| 6 | - 新增 `acr-engine/tests/test_local_music20_acr.py`,覆盖本地配对逻辑。 | ||
| 7 | - 新增 `acr-engine/data/local_eval/music20_summary.json`,记录 `type_1/7/8/16 -> type_11` 的本地 20-song 汇总结果。 | ||
| 8 | - 更新 `acr-engine/README.md`,明确“本地 FAISS / 可选 ChromaDB / 生产 pgvector”的分层约定。 | ||
| 9 | |||
| 10 | Fresh evidence: | ||
| 11 | - `/usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v` => `Ran 2 tests, OK` | ||
| 12 | - `/usr/local/miniconda3/bin/python acr-engine/scripts/local_music20_acr.py --downloads-dir /workspace/downloads --song-limit 20 --backend faiss --output acr-engine/data/local_eval/music20_summary.json` 跑通 | ||
| 13 | - 汇总结果: | ||
| 14 | - `type_1 -> type_11`: `top1=1.0`, `top3=1.0` | ||
| 15 | - `type_7 -> type_11`: `top1=0.45`, `top3=0.65` | ||
| 16 | - `type_8 -> type_11`: `top1=0.4667`, `top3=0.7333` | ||
| 17 | - `type_16 -> type_11`: `top1=0.4167`, `top3=0.4167` | ||
| 18 | |||
| 1 | ### Stage: production encoder freeze FAQ and rollout guidance | 19 | ### Stage: production encoder freeze FAQ and rollout guidance |
| 2 | 20 | ||
| 3 | 完成项: | 21 | 完成项: | ... | ... |
-
Please register or sign in to post a comment