Commit 2898ef26 2898ef26ef8c15a9eacad47c63db03ff172ed1b3 by cnb.bofCdSsphPA

Add the song_id pgvector evaluation scaffolding

Constraint: we need a song-level evaluation path that matches the future pgvector production shape before moving off the local FAISS proving lane
Rejected: jumping straight to a live pgvector-only implementation | we still need a reproducible repo-local evaluation harness and artifact trail first
Confidence: high
Scope-risk: moderate
Directive: keep future pgvector work song_id-first and measure each query type separately before aggregating product claims
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py --downloads-dir /workspace/downloads --song-limit 20 --out-dir acr-engine/data/pgvector_eval/music20; /usr/local/miniconda3/bin/python acr-engine/scripts/evaluate_songid_pgvector_path.py --reference-embeddings-jsonl acr-engine/data/pgvector_eval/music20/reference_embeddings.jsonl --query-embeddings-jsonl acr-engine/data/pgvector_eval/music20/query_embeddings.jsonl --output acr-engine/data/pgvector_eval/music20/songid_eval_report.json
Not-tested: live PostgreSQL/pgvector online retrieval path
1 parent a0ceb991
1 {
2 "backend": "faiss-as-pgvector-standin",
3 "note": "Uses song-level aggregation compatible with a future pgvector online path.",
4 "overall": {
5 "count": 22,
6 "top1": 0.909091,
7 "top3": 0.954545,
8 "top10": 0.954545,
9 "mrr": 0.934343,
10 "mean_rank": 1.8182,
11 "median_rank": 1.0
12 },
13 "by_query_type": {
14 "1": {
15 "count": 20,
16 "top1": 1.0,
17 "top3": 1.0,
18 "top10": 1.0,
19 "mrr": 1.0,
20 "mean_rank": 1.0,
21 "median_rank": 1.0
22 },
23 "7": {
24 "count": 2,
25 "top1": 0.0,
26 "top3": 0.5,
27 "top10": 0.5,
28 "mrr": 0.277778,
29 "mean_rank": 10.0,
30 "median_rank": 10.0
31 }
32 },
33 "examples": {
34 "1": [
35 {
36 "song_id": "100",
37 "rank": 1,
38 "top3": [
39 [
40 "100",
41 0.9099869644641876,
42 0.9999855160713196,
43 0.9999855160713196,
44 1
45 ],
46 [
47 "116",
48 0.8674689626693726,
49 0.9527432918548584,
50 0.9527432918548584,
51 1
52 ],
53 [
54 "103",
55 0.8665370559692382,
56 0.9517078399658203,
57 0.9517078399658203,
58 1
59 ]
60 ]
61 },
62 {
63 "song_id": "101",
64 "rank": 1,
65 "top3": [
66 [
67 "101",
68 0.9099996781349182,
69 0.9999996423721313,
70 0.9999996423721313,
71 1
72 ],
73 [
74 "118",
75 0.8930539643764497,
76 0.9811710715293884,
77 0.9811710715293884,
78 1
79 ],
80 [
81 "116",
82 0.8920178270339967,
83 0.9800198078155518,
84 0.9800198078155518,
85 1
86 ]
87 ]
88 },
89 {
90 "song_id": "102",
91 "rank": 1,
92 "top3": [
93 [
94 "102",
95 0.9099974250793457,
96 0.9999971389770508,
97 0.9999971389770508,
98 1
99 ],
100 [
101 "113",
102 0.878619978427887,
103 0.9651333093643188,
104 0.9651333093643188,
105 1
106 ],
107 [
108 "118",
109 0.8727551674842834,
110 0.9586168527603149,
111 0.9586168527603149,
112 1
113 ]
114 ]
115 },
116 {
117 "song_id": "103",
118 "rank": 1,
119 "top3": [
120 [
121 "103",
122 0.9078967189788818,
123 0.9976630210876465,
124 0.9976630210876465,
125 1
126 ],
127 [
128 "116",
129 0.8892688846588135,
130 0.9769654273986816,
131 0.9769654273986816,
132 1
133 ],
134 [
135 "109",
136 0.8786498045921325,
137 0.965166449546814,
138 0.965166449546814,
139 1
140 ]
141 ]
142 },
143 {
144 "song_id": "104",
145 "rank": 1,
146 "top3": [
147 [
148 "104",
149 0.9099890029430389,
150 0.999987781047821,
151 0.999987781047821,
152 1
153 ],
154 [
155 "109",
156 0.8646899795532226,
157 0.9496555328369141,
158 0.9496555328369141,
159 1
160 ],
161 [
162 "116",
163 0.8414634442329406,
164 0.9238482713699341,
165 0.9238482713699341,
166 1
167 ]
168 ]
169 }
170 ],
171 "7": [
172 {
173 "song_id": "111",
174 "rank": 18,
175 "top3": [
176 [
177 "109",
178 0.8765411591529846,
179 0.9628235101699829,
180 0.9628235101699829,
181 1
182 ],
183 [
184 "116",
185 0.8749382710456848,
186 0.9610425233840942,
187 0.9610425233840942,
188 1
189 ],
190 [
191 "118",
192 0.8641276276111602,
193 0.9490306973457336,
194 0.9490306973457336,
195 1
196 ]
197 ]
198 },
199 {
200 "song_id": "116",
201 "rank": 2,
202 "top3": [
203 [
204 "109",
205 0.8701787447929383,
206 0.9557541608810425,
207 0.9557541608810425,
208 1
209 ],
210 [
211 "116",
212 0.8674952483177185,
213 0.9527724981307983,
214 0.9527724981307983,
215 1
216 ],
217 [
218 "103",
219 0.8659579670429229,
220 0.95106440782547,
221 0.95106440782547,
222 1
223 ]
224 ]
225 }
226 ]
227 }
228 }
...\ No newline at end of file ...\ No newline at end of file
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 import sys
7 from collections import defaultdict
8 from pathlib import Path
9 from statistics import median
10
11 ROOT = Path(__file__).resolve().parents[1]
12 if str(ROOT) not in sys.path:
13 sys.path.insert(0, str(ROOT))
14
15 import faiss
16 import numpy as np
17
18
def load_jsonl(path: Path):
    """Load a UTF-8 JSON Lines file into a list of decoded records.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file handle instead of calling ``str.splitlines()``: the
    # latter also breaks on U+2028, U+2029, U+0085 and similar characters,
    # which may appear unescaped inside a JSON string (``ensure_ascii=False``
    # writes them raw) and would cut a single record in two.
    with path.open(encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]
21
22
def aggregate_song_scores(song_ids, sims, idxs):
    """Collapse per-vector search hits into a ranked list of per-song scores.

    Args:
        song_ids: Song id of every indexed reference vector, addressed by the
            positions found in ``idxs``.
        sims: Similarity score of each hit.
        idxs: Reference position of each hit, parallel to ``sims``. Negative
            positions are treated as "no result" and skipped.

    Returns:
        A list of ``(song_id, combined, max_sim, top3_avg, vote)`` tuples
        sorted by ``combined`` in descending order, where ``vote`` is the
        number of hits that landed on the song.
    """
    per_song = defaultdict(list)
    for score, idx in zip(sims, idxs):
        position = int(idx)
        # FAISS pads a short result list with label -1. Indexing with it would
        # wrap around to the last reference and credit that song with a bogus
        # hit (and an extra vote), so such slots are dropped.
        if position < 0:
            continue
        per_song[song_ids[position]].append(float(score))

    ranked = []
    for song_id, vals in per_song.items():
        vals.sort(reverse=True)
        max_sim = vals[0]
        # Average over the best three hits, or over all of them if fewer.
        top3_avg = sum(vals[:3]) / min(3, len(vals))
        vote = len(vals)
        # Weighted blend; the vote bonus saturates once a song has 10 hits.
        combined = 0.6 * max_sim + 0.3 * top3_avg + 0.1 * min(vote / 10.0, 1.0)
        ranked.append((song_id, combined, max_sim, top3_avg, vote))
    ranked.sort(key=lambda x: x[1], reverse=True)
    return ranked
37
38
def compute_metrics(ranks, topk):
    """Summarise a list of 1-based ranks into retrieval metrics.

    Args:
        ranks: Rank of the expected item for each query.
        topk: Cutoff for the additional ``top{topk}`` hit-rate entry.

    Returns:
        ``{'count': 0}`` when ``ranks`` is empty; otherwise a dict holding
        ``count``, ``top1``, ``top3``, ``top{topk}``, ``mrr``, ``mean_rank``
        and ``median_rank``. Rates and MRR are rounded to 6 decimals and the
        mean rank to 4.
    """
    if not ranks:
        return {'count': 0}

    total = len(ranks)

    def share_within(cutoff):
        # Fraction of queries whose rank is at or above the cutoff position.
        hits = len([r for r in ranks if r <= cutoff])
        return round(hits / total, 6)

    exact_first = len([r for r in ranks if r == 1])

    metrics = {'count': total}
    metrics['top1'] = round(exact_first / total, 6)
    metrics['top3'] = share_within(3)
    # When topk is 1 or 3 this overwrites the entry above in place.
    metrics[f'top{topk}'] = share_within(topk)
    metrics['mrr'] = round(sum(1.0 / r for r in ranks) / total, 6)
    metrics['mean_rank'] = round(sum(ranks) / total, 4)
    metrics['median_rank'] = median(ranks)
    return metrics
51
52
def main():
    """Evaluate song-level retrieval over exported embeddings and write a report.

    Reads reference and query embeddings from JSONL files, indexes the
    references in a FAISS inner-product index, aggregates the hits of every
    query per song, and records the rank of the query's own ``song_id``.
    A query whose song is not among the aggregated candidates is given rank
    ``len(candidates) + 1``.

    The JSON report (overall metrics, metrics per ``query_type`` and up to
    five examples per type) is written to ``--output`` and printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--reference-embeddings-jsonl', required=True)
    ap.add_argument('--query-embeddings-jsonl', required=True)
    ap.add_argument('--topn', type=int, default=20)
    ap.add_argument('--topk', type=int, default=10)
    ap.add_argument('--output', required=True)
    args = ap.parse_args()
    if args.topn < 1:
        ap.error('--topn must be a positive integer')

    refs = load_jsonl(Path(args.reference_embeddings_jsonl))
    queries = load_jsonl(Path(args.query_embeddings_jsonl))
    if not refs:
        # Without this the matrix is 1-D and ``shape[1]`` fails with a bare
        # IndexError that hides the real cause.
        ap.error(f'no reference embeddings found in {args.reference_embeddings_jsonl}')
    ref_matrix = np.asarray([r['embedding'] for r in refs], dtype=np.float32)
    song_ids = [r['song_id'] for r in refs]
    index = faiss.IndexFlatIP(ref_matrix.shape[1])
    index.add(ref_matrix)

    # Never ask for more neighbours than there are references: FAISS fills the
    # missing slots with label -1, which would index ``song_ids`` from the end.
    search_k = min(args.topn, len(song_ids))

    by_type = defaultdict(list)
    examples = defaultdict(list)
    for q in queries:
        qvec = np.asarray(q['embedding'], dtype=np.float32).reshape(1, -1)
        sims, idxs = index.search(qvec, search_k)
        ranked = aggregate_song_scores(song_ids, sims[0], idxs[0])
        gold = q['song_id']
        rank = next((i + 1 for i, item in enumerate(ranked) if item[0] == gold), len(ranked) + 1)
        qtype = str(q['query_type'])
        by_type[qtype].append(rank)
        # Keep only the first five queries of each type as illustrative examples.
        if len(examples[qtype]) < 5:
            examples[qtype].append({'song_id': gold, 'rank': rank, 'top3': ranked[:3]})

    report = {
        'backend': 'faiss-as-pgvector-standin',
        'note': 'Uses song-level aggregation compatible with a future pgvector online path.',
        'overall': compute_metrics([r for ranks in by_type.values() for r in ranks], args.topk),
        'by_query_type': {qtype: compute_metrics(ranks, args.topk) for qtype, ranks in by_type.items()},
        'examples': examples,
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding='utf-8')
    print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
1 #!/usr/bin/env /usr/local/miniconda3/bin/python
2 from __future__ import annotations
3
4 import argparse
5 import json
6 import sys
7 from pathlib import Path
8
9 ROOT = Path(__file__).resolve().parents[1]
10 if str(ROOT) not in sys.path:
11 sys.path.insert(0, str(ROOT))
12
13 from scripts.local_music20_acr import REFERENCE_TYPE, SUPPORTED_QUERY_TYPES, embed_chroma, first_file
14
15
def parse_args():
    """Parse the export script's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``downloads_dir``, ``song_limit``,
        ``duration``, ``sr`` and ``out_dir``.
    """
    option_specs = (
        ('--downloads-dir', {'default': '/workspace/downloads'}),
        ('--song-limit', {'type': int, 'default': 20}),
        ('--duration', {'type': float, 'default': 8.0}),
        ('--sr', {'type': int, 'default': 22050}),
        ('--out-dir', {'default': 'data/pgvector_eval/music20'}),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
24
25
def main():
    """Export reference and query embeddings for the first N songs as JSONL.

    Walks the sub-directories of ``--downloads-dir`` in sorted order and, for
    up to ``--song-limit`` of them that contain a ``type_{REFERENCE_TYPE}``
    file, writes one row to ``reference_embeddings.jsonl``. Then, for every
    entry in ``SUPPORTED_QUERY_TYPES``, writes one row per selected song that
    has a matching ``type_{query_type}`` file to ``query_embeddings.jsonl``.

    Prints the number of rows kept per query type and a final JSON summary.
    """
    args = parse_args()
    downloads_dir = Path(args.downloads_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    ref_path = out_dir / 'reference_embeddings.jsonl'
    qry_path = out_dir / 'query_embeddings.jsonl'
    ref_count = qry_count = 0
    refs_seen = set()
    # Context managers guarantee both files are flushed and closed even when
    # embedding a file raises part-way through the export.
    with ref_path.open('w', encoding='utf-8') as ref_f, qry_path.open('w', encoding='utf-8') as qry_f:
        for song_dir in sorted(p for p in downloads_dir.iterdir() if p.is_dir()):
            # Stop scanning once enough songs are selected; later directories
            # could never be added anyway.
            if len(refs_seen) >= args.song_limit:
                break
            ref = first_file(song_dir / f'type_{REFERENCE_TYPE}')
            if not ref:
                continue
            row = {
                'song_id': song_dir.name,
                'audio_path': str(ref),
                'type': 'reference',
                'embedding': embed_chroma(str(ref), args.sr, args.duration).tolist(),
            }
            ref_f.write(json.dumps(row, ensure_ascii=False) + '\n')
            ref_count += 1
            refs_seen.add(song_dir.name)
        for query_type in SUPPORTED_QUERY_TYPES:
            kept = 0
            # Only songs that made it into the reference set can be queries.
            for song_id in sorted(refs_seen):
                song_dir = downloads_dir / song_id
                qry = first_file(song_dir / f'type_{query_type}')
                if not qry:
                    continue
                row = {
                    'song_id': song_id,
                    'audio_path': str(qry),
                    'query_type': query_type,
                    'embedding': embed_chroma(str(qry), args.sr, args.duration).tolist(),
                }
                qry_f.write(json.dumps(row, ensure_ascii=False) + '\n')
                qry_count += 1
                kept += 1
            print(f'query_type={query_type} rows={kept}')
    print(json.dumps({'reference_rows': ref_count, 'query_rows': qry_count, 'out_dir': str(out_dir.resolve())}, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
1 import unittest
2
3 from scripts.evaluate_songid_pgvector_path import aggregate_song_scores, compute_metrics
4
5
class SongIdPgvectorPathTests(unittest.TestCase):
    """Unit tests for the song-level aggregation and metric helpers."""

    def test_aggregate_song_scores_ranks_by_combined_score(self):
        """The song with the single strongest hit leads, the two-hit song follows."""
        ranked = aggregate_song_scores(
            ['a', 'a', 'b', 'c'],
            [0.9, 0.85, 0.95, 0.2],
            [0, 1, 2, 3],
        )
        winner, runner_up = ranked[0], ranked[1]
        self.assertEqual(winner[0], 'b')
        self.assertEqual(runner_up[0], 'a')

    def test_compute_metrics(self):
        """Count and hit rates for ranks 1, 2 and 4 with a top-5 cutoff."""
        metrics = compute_metrics([1, 2, 4], 5)
        expected = {
            'count': 3,
            'top1': 0.333333,
            'top3': 0.666667,
            'top5': 1.0,
        }
        for key, value in expected.items():
            self.assertEqual(metrics[key], value)


if __name__ == '__main__':
    unittest.main()
1 - 新增 `acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py`、`acr-engine/scripts/evaluate_songid_pgvector_path.py`,补齐 song_id 级 pgvector 评测脚手架。
2 - 新增 `acr-engine/data/pgvector_eval/music20/` 评测产物,当前 `faiss-as-pgvector-standin` 结果:整体 `top1=0.9091`、`top3=0.9545`;其中 `query_type=1` 很强(`top1=1.0`),`query_type=7` 仍明显偏弱(`top1=0.0`、`top3=0.5`)。
1 - 新增 `acr-engine/data/local_eval/voice_workspace20_type7_eval.json`,对当前 `workspace_music20` 语义做了 20 条 `type_7` 批量验证:`top1=0.0`、`top3=0.05`,说明业务 song_id 正确性仍明显不足。 3 - 新增 `acr-engine/data/local_eval/voice_workspace20_type7_eval.json`,对当前 `workspace_music20` 语义做了 20 条 `type_7` 批量验证:`top1=0.0`、`top3=0.05`,说明业务 song_id 正确性仍明显不足。
2 - 新增 `acr-engine/data/local_eval/voice_workspace20_type8_eval.json`、`voice_workspace20_type16_eval.json`,补充 business-corpus voice correctness 基线:`type_8 top1=0.0/top3=0.0`、`type_16 top1=0.0/top3=0.0` 4 - 新增 `acr-engine/data/local_eval/voice_workspace20_type8_eval.json`、`voice_workspace20_type16_eval.json`,补充 business-corpus voice correctness 基线:`type_8 top1=0.0/top3=0.0`、`type_16 top1=0.0/top3=0.0`
3 - architect review 当前结论:`APPROVED (WATCH)`,允许继续沿当前架构推进,但不能把当前 business-corpus 结果视作完成。 5 - architect review 当前结论:`APPROVED (WATCH)`,允许继续沿当前架构推进,但不能把当前 business-corpus 结果视作完成。
......
...@@ -51,3 +51,5 @@ flowchart TD ...@@ -51,3 +51,5 @@ flowchart TD
51 - handoff / changelog / docs README 已同步 51 - handoff / changelog / docs README 已同步
52 52
53 - handoff 已刷新:yes(已指向 voice service runtime 当前状态与下一步排查路径) 53 - handoff 已刷新:yes(已指向 voice service runtime 当前状态与下一步排查路径)
54
55 - business-corpus song_id baseline 已生成:yes(`data/pgvector_eval/music20/songid_eval_report.json`)
......
...@@ -46,6 +46,15 @@ ...@@ -46,6 +46,15 @@
46 3. 把哼唱评测集接入 `evaluate.py` 或独立评测脚本 46 3. 把哼唱评测集接入 `evaluate.py` 或独立评测脚本
47 4. 继续做 docs 第二轮收敛,只保留当前有效主文档 47 4. 继续做 docs 第二轮收敛,只保留当前有效主文档
48 48
49 - 已新增 song_id 级 pgvector 评测脚手架:
50 - `acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py`
51 - `acr-engine/scripts/evaluate_songid_pgvector_path.py`
52 - 当前 `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` 结果:
53 - overall: `top1=0.9091`, `top3=0.9545`
54 - `query_type=1`: `top1=1.0`, `top3=1.0`
55 - `query_type=7`: `top1=0.0`, `top3=0.5`
56 - 注意:当前 20-song 导出里 `query_type=8/16` 行数仍不足(0 行),说明下一步需要专门扩 business reference / query 选择,而不是只沿用当前前 20 首 reference。
57
49 - 当前 `workspace_music20` 业务正确性初测(`acr-engine/data/local_eval/voice_workspace20_type7_eval.json`): 58 - 当前 `workspace_music20` 业务正确性初测(`acr-engine/data/local_eval/voice_workspace20_type7_eval.json`):
50 - `num_queries=20` 59 - `num_queries=20`
51 - `top1=0.0` 60 - `top1=0.0`
......