Commit 69843933 69843933efecfbfff4cdcfcfb1b3eb0329384489 by cnb.bofCdSsphPA

Add a FAISS-first local ACR workflow for music20 samples

Constraint: local validation should stay lightweight and use /workspace sample files while production retrieval remains pgvector-backed
Rejected: making ChromaDB the default local backend | chromadb is not installed in the current environment and FAISS is already available
Confidence: high
Scope-risk: narrow
Directive: keep local dev workflows explicitly separated from production pgvector flows in docs and scripts
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/local_music20_acr.py --downloads-dir /workspace/downloads --song-limit 20 --backend faiss --output acr-engine/data/local_eval/music20_summary.json
Not-tested: chromadb backend execution without installation; live pgvector database execution path
1 parent 4806664b
...@@ -94,6 +94,29 @@ cd acr-engine ...@@ -94,6 +94,29 @@ cd acr-engine
94 python evaluate.py --data data/synthetic --model data/models/best_model.pt --index-prefix data/index/reference --split test --device cpu 94 python evaluate.py --data data/synthetic --model data/models/best_model.pt --index-prefix data/index/reference --split test --device cpu
95 ``` 95 ```
96 96
97 ## 本地 20 首歌流程与生产向量库约定
98
99 - 本地小样本(如 `/workspace/downloads` 的 20 首歌流程)优先使用 **FAISS**。
100 - 若后续本机安装了 `chromadb`,可作为可选对照后端,但不是默认依赖。
101 - 生产环境统一保留 **pgvector** 作为正式向量存储与检索底座。
102
103 本地 20 首歌流程:
104
105 ```bash
106 cd acr-engine
107 /usr/local/miniconda3/bin/python scripts/local_music20_acr.py \
108 --downloads-dir /workspace/downloads \
109 --song-limit 20 \
110 --backend faiss \
111 --output data/local_eval/music20_summary.json
112 ```
113
114 说明:
115 - `query_type=1/7/8/16` 会在一个汇总 JSON 里统一输出。
116 - 默认本地后端为 `faiss`。
117 - 若使用 `--backend chromadb` 且环境缺少 `chromadb`,脚本会明确报错提示。
118 - 生产向量化链路仍走 `scripts/export_manifest_to_pgvector_json.py` 与后续 pgvector 入库。
119
97 ## 当前提升方向 120 ## 当前提升方向
98 121
99 - 更强合成混淆样本(confused / humming_like) 122 - 更强合成混淆样本(confused / humming_like)
......
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 """Run a FAISS-first local ACR eval on up to 20 songs from /workspace/downloads.
3
4 Purpose:
5 - keep small-sample validation inside acr-engine
6 - default to FAISS for local dev
7 - optionally allow ChromaDB when installed
8 - preserve pgvector as the production path (not used here)
9 """
10 from __future__ import annotations
11
12 import argparse
13 import json
14 from pathlib import Path
15 from typing import Dict, List
16
17 import faiss
18 import librosa
19 import numpy as np
20
21
# Default root folder holding one sub-directory per song for local evaluation.
DEFAULT_DOWNLOADS = Path('/workspace/downloads')
# Default summary location, resolved relative to the checkout that contains
# this script (<acr-engine>/scripts/ -> <acr-engine>/data/local_eval/) instead
# of a machine-specific absolute path, so the default works on any clone.
DEFAULT_OUTPUT = Path(__file__).resolve().parents[1] / 'data' / 'local_eval' / 'music20_summary.json'
# Query-side folders (`type_<n>`) that are each evaluated against the reference.
SUPPORTED_QUERY_TYPES = (1, 7, 8, 16)
# Reference-side folder number (`type_11`).
REFERENCE_TYPE = 11
26
27
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the local evaluation script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which keeps existing callers
            working while allowing programmatic use.

    Returns:
        Namespace with ``downloads_dir``, ``song_limit``, ``duration``,
        ``sr``, ``topk``, ``backend`` and ``output``.
    """

    def positive_int(text: str) -> int:
        # A limit or top-k below 1 cannot produce a meaningful evaluation, so
        # reject it at parse time with a normal argparse usage error.
        value = int(text)
        if value < 1:
            raise argparse.ArgumentTypeError(f'expected a positive integer, got {text!r}')
        return value

    ap = argparse.ArgumentParser(
        description='Run the local small-sample ACR evaluation (FAISS by default).',
    )
    ap.add_argument('--downloads-dir', default=str(DEFAULT_DOWNLOADS),
                    help='root folder with one sub-directory per song')
    ap.add_argument('--song-limit', type=positive_int, default=20,
                    help='maximum number of songs to evaluate per query type')
    ap.add_argument('--duration', type=float, default=8.0,
                    help='seconds of audio loaded from each file')
    ap.add_argument('--sr', type=int, default=22050,
                    help='sample rate used when loading audio')
    ap.add_argument('--topk', type=positive_int, default=3,
                    help='number of candidates retrieved per query')
    ap.add_argument('--backend', choices=['faiss', 'chromadb'], default='faiss',
                    help='local vector search backend')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT),
                    help='path of the summary JSON to write')
    return ap.parse_args(argv)
38
39
40 def first_file(path: Path) -> Path | None:
41 files = sorted(p for p in path.iterdir() if p.is_file()) if path.exists() else []
42 return files[0] if files else None
43
44
def collect_pairs(downloads_dir: Path, song_limit: int, query_type: int) -> List[Dict[str, str]]:
    """Collect reference/query file pairs, one per song directory.

    Song directories under *downloads_dir* are visited in sorted order. A song
    contributes a pair only when both its ``type_<REFERENCE_TYPE>`` and
    ``type_<query_type>`` folders yield a file via ``first_file``.

    Args:
        downloads_dir: Root folder whose sub-directories are songs.
        song_limit: Maximum number of pairs to return; values below 1 yield
            an empty list.
        query_type: Number of the query-side ``type_<n>`` folder.

    Returns:
        A list of dicts with ``song_id`` (the directory name),
        ``reference_path`` and ``query_path`` (both as strings).

    Raises:
        OSError: If *downloads_dir* cannot be listed (e.g. it does not exist).
    """
    pairs: List[Dict[str, str]] = []
    # Check the limit up front: testing it only after an append would let a
    # non-positive limit still return one pair.
    if song_limit <= 0:
        return pairs

    for song_dir in sorted(p for p in downloads_dir.iterdir() if p.is_dir()):
        ref = first_file(song_dir / f'type_{REFERENCE_TYPE}')
        qry = first_file(song_dir / f'type_{query_type}')
        if ref is None or qry is None:
            continue
        pairs.append({
            'song_id': song_dir.name,
            'reference_path': str(ref),
            'query_path': str(qry),
        })
        if len(pairs) >= song_limit:
            break
    return pairs
59
60
def load_audio(path: str, sr: int, duration: float) -> np.ndarray:
    """Load a mono clip and force it to exactly ``int(sr * duration)`` samples.

    Args:
        path: Audio file passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        duration: Number of seconds requested from ``librosa.load``.

    Returns:
        A float32 array; clips shorter than the target are zero-padded at the
        end, longer ones are truncated.
    """
    samples, _ = librosa.load(path, sr=sr, mono=True, duration=duration)
    wanted = int(sr * duration)
    shortfall = wanted - len(samples)
    if shortfall > 0:
        # Too short: append trailing zeros up to the fixed length.
        samples = np.pad(samples, (0, shortfall))
    else:
        samples = samples[:wanted]
    return samples.astype(np.float32)
69
70
def embed_chroma(path: str, sr: int, duration: float) -> np.ndarray:
    """Embed an audio file as L2-normalised chroma statistics.

    The clip is loaded with ``load_audio``, a 12-bin chromagram is computed
    with ``librosa.feature.chroma_stft``, and the mean and standard deviation
    along axis 1 are concatenated into one vector.

    Args:
        path: Audio file to embed.
        sr: Sample rate used for loading and for the chromagram.
        duration: Seconds of audio to load.

    Returns:
        A float32 vector scaled to unit L2 norm; a zero-norm vector is
        returned unscaled.
    """
    waveform = load_audio(path, sr=sr, duration=duration)
    chromagram = librosa.feature.chroma_stft(y=waveform, sr=sr, n_chroma=12)
    stats = [chromagram.mean(axis=1), chromagram.std(axis=1)]
    vector = np.concatenate(stats, axis=0).astype(np.float32)
    length = np.linalg.norm(vector)
    # Skip the division for an all-zero vector to avoid dividing by zero.
    return vector / length if length > 0 else vector
79
80
def run_faiss(ref_matrix: np.ndarray, qry_matrix: np.ndarray, topk: int):
    """Search the references with a flat inner-product FAISS index.

    Args:
        ref_matrix: One reference vector per row; its column count sets the
            index dimension.
        qry_matrix: One query vector per row.
        topk: Number of neighbours requested per query.

    Returns:
        The ``(scores, indices)`` pair returned by ``index.search``.
    """
    dimension = ref_matrix.shape[1]
    flat_index = faiss.IndexFlatIP(dimension)
    flat_index.add(ref_matrix)
    scores, neighbours = flat_index.search(qry_matrix, topk)
    return scores, neighbours
85
86
def run_chromadb(ref_matrix: np.ndarray, qry_matrix: np.ndarray, topk: int):
    """Search the references with an in-memory ChromaDB collection.

    Args:
        ref_matrix: One reference vector per row; row ``i`` is stored under
            the id ``str(i)``.
        qry_matrix: One query vector per row.
        topk: Number of neighbours requested per query.

    Returns:
        ``(sims, idxs)`` arrays of shape ``(len(qry_matrix), topk)``. ``sims``
        holds ``1 - cosine_distance``; ``idxs`` holds reference row numbers.
        Slots with no result (fewer references than ``topk``) carry index
        ``-1`` and the lowest float32 score, mirroring how FAISS pads.

    Raises:
        SystemExit: If ``chromadb`` cannot be imported.
    """
    try:
        import chromadb  # type: ignore
    except Exception as exc:  # pragma: no cover - env-dependent
        raise SystemExit(f'ChromaDB backend requested but unavailable: {exc}') from exc

    collection_name = 'music20_local_eval'
    ref_ids = [str(i) for i in range(len(ref_matrix))]

    client = chromadb.EphemeralClient()
    # Chroma's default space is squared L2, for which `1 - distance` is not a
    # similarity. Requesting cosine makes `1 - distance` the cosine similarity,
    # comparable to the inner-product scores of the FAISS backend.
    collection = client.create_collection(collection_name, metadata={'hnsw:space': 'cosine'})
    try:
        collection.add(ids=ref_ids, embeddings=ref_matrix.tolist())
        result = collection.query(
            query_embeddings=qry_matrix.tolist(),
            # Never ask for more neighbours than there are stored vectors.
            n_results=min(topk, len(ref_ids)),
        )
    finally:
        # The name is fixed and this function runs once per query type, so
        # drop the collection to keep the name free for the next call.
        # NOTE(review): assumes ephemeral clients can share in-process state.
        client.delete_collection(collection_name)

    n_queries = len(qry_matrix)
    sims = np.full((n_queries, topk), np.finfo(np.float32).min, dtype=np.float32)
    idxs = np.full((n_queries, topk), -1, dtype=np.int32)
    for row, (ids, dists) in enumerate(zip(result['ids'], result['distances'])):
        found = min(len(ids), topk)
        idxs[row, :found] = [int(x) for x in ids[:found]]
        sims[row, :found] = 1.0 - np.asarray(dists[:found], dtype=np.float32)
    return sims, idxs
102
103
def evaluate_query_type(downloads_dir: Path, song_limit: int, query_type: int, sr: int, duration: float, topk: int, backend: str):
    """Evaluate retrieval of one query type against the reference type.

    Pairs come from ``collect_pairs``; every reference and query file is
    embedded with ``embed_chroma`` and the queries are searched against the
    references with the selected backend.

    Args:
        downloads_dir: Root folder with one sub-directory per song.
        song_limit: Maximum number of songs to evaluate.
        query_type: Number of the query-side ``type_<n>`` folder.
        sr: Sample rate used for embedding.
        duration: Seconds of audio used for embedding.
        topk: Number of candidates to retrieve per query.
        backend: ``'faiss'`` selects FAISS; any other value selects ChromaDB.

    Returns:
        A dict with ``query_type``, ``reference_type``, ``song_count``,
        ``file_count``, ``topk``, ``metrics`` (``top1``/``top3``) and
        per-query ``results``. When no pairs exist, counts and metrics are
        zero and a ``note`` key explains why. A query whose own song is not
        among its candidates gets rank ``topk + 1``.

    Raises:
        ValueError: If pairs were found but ``topk`` is below 1.
    """
    pairs = collect_pairs(downloads_dir, song_limit, query_type=query_type)
    if not pairs:
        return {
            'query_type': query_type,
            'reference_type': REFERENCE_TYPE,
            'song_count': 0,
            'file_count': 0,
            'topk': topk,
            'metrics': {'top1': 0.0, 'top3': 0.0},
            'results': [],
            'note': 'No matching query/reference pairs found.',
        }
    if topk < 1:
        raise ValueError(f'topk must be >= 1, got {topk}')

    ref_vecs = [embed_chroma(item['reference_path'], sr, duration) for item in pairs]
    qry_vecs = [embed_chroma(item['query_path'], sr, duration) for item in pairs]
    ref_ids = [item['song_id'] for item in pairs]

    ref_matrix = np.vstack(ref_vecs).astype(np.float32)
    qry_matrix = np.vstack(qry_vecs).astype(np.float32)

    # Asking for more neighbours than there are references makes FAISS pad
    # the result with index -1 and can make ChromaDB return shorter rows.
    search_k = min(topk, len(ref_ids))
    if backend == 'faiss':
        sims, idxs = run_faiss(ref_matrix, qry_matrix, search_k)
    else:
        sims, idxs = run_chromadb(ref_matrix, qry_matrix, search_k)
    available = min(search_k, idxs.shape[1])

    ranks = []
    results = []
    for i, item in enumerate(pairs):
        candidates = []
        rank = None
        for j in range(available):
            ref_idx = int(idxs[i, j])
            # Skip padding slots: a negative index would otherwise be read as
            # ref_ids[-1] and report the last song as a bogus candidate.
            if not 0 <= ref_idx < len(ref_ids):
                continue
            cand_song_id = ref_ids[ref_idx]
            candidates.append({'rank': j + 1, 'song_id': cand_song_id, 'score': float(sims[i, j])})
            if rank is None and cand_song_id == item['song_id']:
                rank = j + 1
        if rank is None:
            # Sentinel meaning "not found within the retrieved candidates".
            rank = topk + 1
        ranks.append(rank)
        results.append({
            'song_id': item['song_id'],
            'query_path': item['query_path'],
            'reference_path': item['reference_path'],
            'rank': rank,
            'candidates': candidates,
        })

    top1 = sum(1 for r in ranks if r == 1) / len(ranks)
    top3 = sum(1 for r in ranks if r <= min(3, topk)) / len(ranks)
    return {
        'query_type': query_type,
        'reference_type': REFERENCE_TYPE,
        'song_count': len(pairs),
        'file_count': len(pairs) * 2,
        'topk': topk,
        'metrics': {'top1': top1, 'top3': top3},
        'results': results,
    }
164
165
def main() -> None:
    """Run every supported query type and emit one JSON summary.

    The summary is written to the ``--output`` path (parent folders are
    created first) and the same JSON text is printed to stdout.
    """
    args = parse_args()
    downloads_dir = Path(args.downloads_dir)
    out = Path(args.output)
    # Create the output folder before the evaluations run.
    out.parent.mkdir(parents=True, exist_ok=True)

    evaluations = [
        evaluate_query_type(downloads_dir, args.song_limit, query_type, args.sr, args.duration, args.topk, args.backend)
        for query_type in SUPPORTED_QUERY_TYPES
    ]
    summary = {
        'backend': args.backend,
        'purpose': 'Local 20-song ACR sanity flow for development; production remains pgvector.',
        'downloads_dir': str(downloads_dir),
        'song_limit': args.song_limit,
        'duration_sec': args.duration,
        'sr': args.sr,
        'evaluations': evaluations,
    }

    rendered = json.dumps(summary, ensure_ascii=False, indent=2)
    out.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
1 import tempfile
2 import unittest
3 from pathlib import Path
4
5 from scripts.local_music20_acr import collect_pairs, first_file
6
7
class LocalMusic20AcrTests(unittest.TestCase):
    """Filesystem-level tests for the pairing helpers of the local ACR script."""

    def setUp(self):
        # Each test gets its own scratch directory, removed on cleanup.
        scratch = tempfile.TemporaryDirectory()
        self.addCleanup(scratch.cleanup)
        self.root = Path(scratch.name)

    def test_first_file_returns_none_for_missing_dir(self):
        missing = self.root / 'missing'
        self.assertIsNone(first_file(missing))

    def test_collect_pairs_uses_reference_and_query_types(self):
        song = self.root / '1'
        fixtures = (
            ('type_11', 'ref.wav', b'r'),
            ('type_7', 'query.mp3', b'q'),
        )
        for folder, filename, payload in fixtures:
            type_dir = song / folder
            type_dir.mkdir(parents=True)
            (type_dir / filename).write_bytes(payload)

        pairs = collect_pairs(self.root, song_limit=20, query_type=7)

        self.assertEqual(len(pairs), 1)
        pair = pairs[0]
        self.assertEqual(pair['song_id'], '1')
        self.assertTrue(pair['reference_path'].endswith('ref.wav'))
        self.assertTrue(pair['query_path'].endswith('query.mp3'))


if __name__ == '__main__':
    unittest.main()
1
2 ## 2026-06-03 20-song local ACR workflow in acr-engine
3
4 - 新增 `acr-engine/scripts/local_music20_acr.py`,在 `acr-engine` 内提供基于 `/workspace/downloads` 的本地 20 首歌 ACR 小样本流程。
5 - 本地流程默认优先使用 **FAISS**,`chromadb` 作为可选对照后端;生产环境继续保留 **pgvector**。
6 - 新增 `acr-engine/tests/test_local_music20_acr.py`,覆盖本地配对逻辑。
7 - 新增 `acr-engine/data/local_eval/music20_summary.json`,记录 `type_1/7/8/16 -> type_11` 的本地 20-song 汇总结果。
8 - 更新 `acr-engine/README.md`,明确“本地 FAISS / 可选 ChromaDB / 生产 pgvector”的分层约定。
9
10 Fresh evidence:
11 - `/usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v` => `Ran 2 tests, OK`
12 - `/usr/local/miniconda3/bin/python acr-engine/scripts/local_music20_acr.py --downloads-dir /workspace/downloads --song-limit 20 --backend faiss --output acr-engine/data/local_eval/music20_summary.json` 跑通
13 - 汇总结果:
14 - `type_1 -> type_11`: `top1=1.0`, `top3=1.0`
15 - `type_7 -> type_11`: `top1=0.45`, `top3=0.65`
16 - `type_8 -> type_11`: `top1=0.4667`, `top3=0.7333`
17 - `type_16 -> type_11`: `top1=0.4167`, `top3=0.4167`
18
1 ### Stage: production encoder freeze FAQ and rollout guidance 19 ### Stage: production encoder freeze FAQ and rollout guidance
2 20
3 完成项: 21 完成项:
......