Commit 2898ef26 2898ef26ef8c15a9eacad47c63db03ff172ed1b3 by cnb.bofCdSsphPA

Add the song_id pgvector evaluation scaffolding

Constraint: we need a song-level evaluation path that matches the future pgvector production shape before moving off the local FAISS proving lane
Rejected: jumping straight to a live pgvector-only implementation | we still need a reproducible repo-local evaluation harness and artifact trail first
Confidence: high
Scope-risk: moderate
Directive: keep future pgvector work song_id-first and measure each query type separately before aggregating product claims
Tested: /usr/local/miniconda3/bin/python -m unittest discover -s acr-engine/tests -v; /usr/local/miniconda3/bin/python acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py --downloads-dir /workspace/downloads --song-limit 20 --out-dir acr-engine/data/pgvector_eval/music20; /usr/local/miniconda3/bin/python acr-engine/scripts/evaluate_songid_pgvector_path.py --reference-embeddings-jsonl acr-engine/data/pgvector_eval/music20/reference_embeddings.jsonl --query-embeddings-jsonl acr-engine/data/pgvector_eval/music20/query_embeddings.jsonl --output acr-engine/data/pgvector_eval/music20/songid_eval_report.json
Not-tested: live PostgreSQL/pgvector online retrieval path
1 parent a0ceb991
{
"backend": "faiss-as-pgvector-standin",
"note": "Uses song-level aggregation compatible with a future pgvector online path.",
"overall": {
"count": 22,
"top1": 0.909091,
"top3": 0.954545,
"top10": 0.954545,
"mrr": 0.934343,
"mean_rank": 1.8182,
"median_rank": 1.0
},
"by_query_type": {
"1": {
"count": 20,
"top1": 1.0,
"top3": 1.0,
"top10": 1.0,
"mrr": 1.0,
"mean_rank": 1.0,
"median_rank": 1.0
},
"7": {
"count": 2,
"top1": 0.0,
"top3": 0.5,
"top10": 0.5,
"mrr": 0.277778,
"mean_rank": 10.0,
"median_rank": 10.0
}
},
"examples": {
"1": [
{
"song_id": "100",
"rank": 1,
"top3": [
[
"100",
0.9099869644641876,
0.9999855160713196,
0.9999855160713196,
1
],
[
"116",
0.8674689626693726,
0.9527432918548584,
0.9527432918548584,
1
],
[
"103",
0.8665370559692382,
0.9517078399658203,
0.9517078399658203,
1
]
]
},
{
"song_id": "101",
"rank": 1,
"top3": [
[
"101",
0.9099996781349182,
0.9999996423721313,
0.9999996423721313,
1
],
[
"118",
0.8930539643764497,
0.9811710715293884,
0.9811710715293884,
1
],
[
"116",
0.8920178270339967,
0.9800198078155518,
0.9800198078155518,
1
]
]
},
{
"song_id": "102",
"rank": 1,
"top3": [
[
"102",
0.9099974250793457,
0.9999971389770508,
0.9999971389770508,
1
],
[
"113",
0.878619978427887,
0.9651333093643188,
0.9651333093643188,
1
],
[
"118",
0.8727551674842834,
0.9586168527603149,
0.9586168527603149,
1
]
]
},
{
"song_id": "103",
"rank": 1,
"top3": [
[
"103",
0.9078967189788818,
0.9976630210876465,
0.9976630210876465,
1
],
[
"116",
0.8892688846588135,
0.9769654273986816,
0.9769654273986816,
1
],
[
"109",
0.8786498045921325,
0.965166449546814,
0.965166449546814,
1
]
]
},
{
"song_id": "104",
"rank": 1,
"top3": [
[
"104",
0.9099890029430389,
0.999987781047821,
0.999987781047821,
1
],
[
"109",
0.8646899795532226,
0.9496555328369141,
0.9496555328369141,
1
],
[
"116",
0.8414634442329406,
0.9238482713699341,
0.9238482713699341,
1
]
]
}
],
"7": [
{
"song_id": "111",
"rank": 18,
"top3": [
[
"109",
0.8765411591529846,
0.9628235101699829,
0.9628235101699829,
1
],
[
"116",
0.8749382710456848,
0.9610425233840942,
0.9610425233840942,
1
],
[
"118",
0.8641276276111602,
0.9490306973457336,
0.9490306973457336,
1
]
]
},
{
"song_id": "116",
"rank": 2,
"top3": [
[
"109",
0.8701787447929383,
0.9557541608810425,
0.9557541608810425,
1
],
[
"116",
0.8674952483177185,
0.9527724981307983,
0.9527724981307983,
1
],
[
"103",
0.8659579670429229,
0.95106440782547,
0.95106440782547,
1
]
]
}
]
}
}
\ No newline at end of file
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from pathlib import Path
from statistics import median
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
import faiss
import numpy as np
def load_jsonl(path: Path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list holding one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file object rather than calling ``str.splitlines()``: the
    # latter also breaks on U+2028 / U+2029 / U+0085, which JSON allows
    # unescaped inside strings (and which ``ensure_ascii=False`` writers emit
    # raw), so a single record could otherwise be cut in half.
    with path.open(encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]
def aggregate_song_scores(song_ids, sims, idxs):
    """Collapse per-vector retrieval hits into a ranked list of songs.

    Args:
        song_ids: Sequence mapping a reference-vector position to its song id.
        sims: Similarity scores of the retrieved vectors.
        idxs: Reference-vector positions, paired element-wise with ``sims``.
            Negative positions are treated as "no result" and ignored.

    Returns:
        A list of ``(song_id, combined, max_sim, top3_avg, vote)`` tuples
        sorted by ``combined`` in descending order, where ``max_sim`` is the
        best score of the song, ``top3_avg`` the mean of its (up to) three best
        scores and ``vote`` the number of hits attributed to the song.
    """
    per_song = defaultdict(list)
    for score, idx in zip(sims, idxs):
        position = int(idx)
        if position < 0:
            # FAISS pads the result list with label -1 when fewer than k
            # neighbours exist; indexing song_ids[-1] would wrongly credit the
            # sentinel score to the last reference song.
            continue
        per_song[song_ids[position]].append(float(score))

    ranked = []
    for song_id, vals in per_song.items():
        vals.sort(reverse=True)
        max_sim = vals[0]
        top3_avg = sum(vals[:3]) / min(3, len(vals))
        vote = len(vals)
        # Weighted blend: best hit dominates, the top-3 mean stabilises it and
        # the hit count (capped at 10 votes) contributes at most 0.1.
        combined = 0.6 * max_sim + 0.3 * top3_avg + 0.1 * min(vote / 10.0, 1.0)
        ranked.append((song_id, combined, max_sim, top3_avg, vote))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def compute_metrics(ranks, topk):
    """Summarise a list of 1-based ranks into retrieval metrics.

    Args:
        ranks: Rank of the correct song for each query.
        topk: Extra cutoff reported under the key ``top{topk}``.

    Returns:
        ``{'count': 0}`` when ``ranks`` is empty; otherwise a dict with
        ``count``, ``top1``, ``top3``, ``top{topk}``, ``mrr`` (each rounded to
        six decimals), ``mean_rank`` (four decimals) and ``median_rank``.
    """
    if not ranks:
        return {'count': 0}

    total = len(ranks)

    def share_within(cutoff):
        # Fraction of queries whose correct song is ranked at or above cutoff.
        hits = len([r for r in ranks if r <= cutoff])
        return round(hits / total, 6)

    exact_first = len([r for r in ranks if r == 1])
    reciprocal_sum = sum(1.0 / r for r in ranks)

    # Keys are inserted in the same order the report has always used.
    metrics = {'count': total}
    metrics['top1'] = round(exact_first / total, 6)
    metrics['top3'] = share_within(3)
    metrics[f'top{topk}'] = share_within(topk)
    metrics['mrr'] = round(reciprocal_sum / total, 6)
    metrics['mean_rank'] = round(sum(ranks) / total, 4)
    metrics['median_rank'] = median(ranks)
    return metrics
def main():
    """Score query embeddings against reference embeddings and emit a report.

    Reads the two JSONL files named on the command line, indexes the reference
    vectors in a ``faiss.IndexFlatIP`` index, searches ``--topn`` neighbours per
    query, aggregates them per song and records the rank of each query's own
    ``song_id``. The resulting report (overall metrics, per-query-type metrics
    and up to five examples per query type) is written to ``--output`` and
    also printed to stdout.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--reference-embeddings-jsonl', required=True)
    cli.add_argument('--query-embeddings-jsonl', required=True)
    cli.add_argument('--topn', type=int, default=20)
    cli.add_argument('--topk', type=int, default=10)
    cli.add_argument('--output', required=True)
    opts = cli.parse_args()

    reference_rows = load_jsonl(Path(opts.reference_embeddings_jsonl))
    query_rows = load_jsonl(Path(opts.query_embeddings_jsonl))

    reference_matrix = np.asarray(
        [row['embedding'] for row in reference_rows],
        dtype=np.float32,
    )
    song_ids = [row['song_id'] for row in reference_rows]
    flat_index = faiss.IndexFlatIP(reference_matrix.shape[1])
    flat_index.add(reference_matrix)

    ranks_by_type = {}
    examples_by_type = {}
    for query in query_rows:
        vector = np.asarray(query['embedding'], dtype=np.float32).reshape(1, -1)
        sims, idxs = flat_index.search(vector, opts.topn)
        ranked = aggregate_song_scores(song_ids, sims[0], idxs[0])
        gold = query['song_id']

        # A song missing from the aggregated list is ranked one past its end.
        rank = len(ranked) + 1
        for position, entry in enumerate(ranked, start=1):
            if entry[0] == gold:
                rank = position
                break

        qtype = str(query['query_type'])
        ranks_by_type.setdefault(qtype, []).append(rank)
        kept_examples = examples_by_type.setdefault(qtype, [])
        if len(kept_examples) < 5:
            kept_examples.append({'song_id': gold, 'rank': rank, 'top3': ranked[:3]})

    # Flatten in query-type order so the overall figures cover every query.
    all_ranks = []
    for ranks in ranks_by_type.values():
        all_ranks.extend(ranks)

    report = {
        'backend': 'faiss-as-pgvector-standin',
        'note': 'Uses song-level aggregation compatible with a future pgvector online path.',
        'overall': compute_metrics(all_ranks, opts.topk),
        'by_query_type': {
            qtype: compute_metrics(ranks, opts.topk)
            for qtype, ranks in ranks_by_type.items()
        },
        'examples': examples_by_type,
    }

    destination = Path(opts.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(report, ensure_ascii=False, indent=2)
    destination.write_text(rendered, encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from scripts.local_music20_acr import REFERENCE_TYPE, SUPPORTED_QUERY_TYPES, embed_chroma, first_file
def parse_args():
    """Parse the command-line options of the embedding export script.

    Returns:
        An ``argparse.Namespace`` with ``downloads_dir``, ``song_limit``,
        ``duration``, ``sr`` and ``out_dir``.
    """
    option_specs = (
        ('--downloads-dir', {'default': '/workspace/downloads'}),
        ('--song-limit', {'type': int, 'default': 20}),
        ('--duration', {'type': float, 'default': 8.0}),
        ('--sr', {'type': int, 'default': 22050}),
        ('--out-dir', {'default': 'data/pgvector_eval/music20'}),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
def main():
    """Export reference and query embeddings for a limited set of songs.

    Walks the sub-directories of ``--downloads-dir`` in sorted order and, for
    up to ``--song-limit`` songs that have a ``type_{REFERENCE_TYPE}`` file,
    writes one row per song to ``reference_embeddings.jsonl``. For every type
    in ``SUPPORTED_QUERY_TYPES`` it then writes one row per selected song that
    has a matching ``type_{query_type}`` file to ``query_embeddings.jsonl``.
    Per-type row counts and a final JSON summary are printed to stdout.
    """
    args = parse_args()
    downloads_dir = Path(args.downloads_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    ref_path = out_dir / 'reference_embeddings.jsonl'
    qry_path = out_dir / 'query_embeddings.jsonl'
    ref_count = qry_count = 0
    refs_seen = set()

    # Context managers guarantee both files are flushed and closed even when
    # first_file / embed_chroma raises part-way through the export.
    with ref_path.open('w', encoding='utf-8') as ref_f, \
            qry_path.open('w', encoding='utf-8') as qry_f:
        for song_dir in sorted(p for p in downloads_dir.iterdir() if p.is_dir()):
            if len(refs_seen) >= args.song_limit:
                # Limit reached: no later directory can be selected any more.
                break
            # NOTE(review): first_file presumably returns a path or a falsy
            # value when the directory has no usable file — confirm in
            # scripts.local_music20_acr.
            ref = first_file(song_dir / f'type_{REFERENCE_TYPE}')
            if not ref or song_dir.name in refs_seen:
                continue
            row = {
                'song_id': song_dir.name,
                'audio_path': str(ref),
                'type': 'reference',
                'embedding': embed_chroma(str(ref), args.sr, args.duration).tolist(),
            }
            ref_f.write(json.dumps(row, ensure_ascii=False) + '\n')
            ref_count += 1
            refs_seen.add(song_dir.name)

        for query_type in SUPPORTED_QUERY_TYPES:
            kept = 0
            # Only songs that received a reference row are queried.
            for song_id in sorted(refs_seen):
                song_dir = downloads_dir / song_id
                qry = first_file(song_dir / f'type_{query_type}')
                if not qry:
                    continue
                row = {
                    'song_id': song_id,
                    'audio_path': str(qry),
                    'query_type': query_type,
                    'embedding': embed_chroma(str(qry), args.sr, args.duration).tolist(),
                }
                qry_f.write(json.dumps(row, ensure_ascii=False) + '\n')
                qry_count += 1
                kept += 1
            print(f'query_type={query_type} rows={kept}')

    print(json.dumps({'reference_rows': ref_count, 'query_rows': qry_count, 'out_dir': str(out_dir.resolve())}, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
import unittest
from scripts.evaluate_songid_pgvector_path import aggregate_song_scores, compute_metrics
class SongIdPgvectorPathTests(unittest.TestCase):
    """Unit tests for the song-level aggregation and metric helpers."""

    def test_aggregate_song_scores_ranks_by_combined_score(self):
        """A single strong hit for 'b' outranks two weaker hits for 'a'."""
        reference_song_ids = ['a', 'a', 'b', 'c']
        similarities = [0.9, 0.85, 0.95, 0.2]
        positions = [0, 1, 2, 3]

        ranking = aggregate_song_scores(reference_song_ids, similarities, positions)

        self.assertEqual(ranking[0][0], 'b')
        self.assertEqual(ranking[1][0], 'a')

    def test_compute_metrics(self):
        """Ranks 1, 2 and 4 give the expected count and top-k hit rates."""
        metrics = compute_metrics([1, 2, 4], 5)

        expectations = (
            ('count', 3),
            ('top1', 0.333333),
            ('top3', 0.666667),
            ('top5', 1.0),
        )
        for key, expected in expectations:
            self.assertEqual(metrics[key], expected)


if __name__ == '__main__':
    unittest.main()
- 新增 `acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py`、`acr-engine/scripts/evaluate_songid_pgvector_path.py`,补齐 song_id 级 pgvector 评测脚手架。
- 新增 `acr-engine/data/pgvector_eval/music20/` 评测产物,当前 `faiss-as-pgvector-standin` 结果:整体 `top1=0.9091`、`top3=0.9545`;其中 `query_type=1` 很强(`top1=1.0`),`query_type=7` 仍明显偏弱(`top1=0.0`、`top3=0.5`)。
- 新增 `acr-engine/data/local_eval/voice_workspace20_type7_eval.json`,对当前 `workspace_music20` 语义做了 20 条 `type_7` 批量验证:`top1=0.0`、`top3=0.05`,说明业务 song_id 正确性仍明显不足。
- 新增 `acr-engine/data/local_eval/voice_workspace20_type8_eval.json`、`voice_workspace20_type16_eval.json`,补充 business-corpus voice correctness 基线:`type_8 top1=0.0/top3=0.0`、`type_16 top1=0.0/top3=0.0`。
- architect review 当前结论:`APPROVED (WATCH)`,允许继续沿当前架构推进,但不能把当前 business-corpus 结果视作完成。
......
......@@ -51,3 +51,5 @@ flowchart TD
- handoff / changelog / docs README 已同步
- handoff 已刷新:yes(已指向 voice service runtime 当前状态与下一步排查路径)
- business-corpus song_id baseline 已生成:yes(`data/pgvector_eval/music20/songid_eval_report.json`)
......
......@@ -46,6 +46,15 @@
3. 把哼唱评测集接入 `evaluate.py` 或独立评测脚本
4. 继续做 docs 第二轮收敛,只保留当前有效主文档
- 已新增 song_id 级 pgvector 评测脚手架:
- `acr-engine/scripts/export_workspace_music20_embeddings_jsonl.py`
- `acr-engine/scripts/evaluate_songid_pgvector_path.py`
- 当前 `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` 结果:
- overall: `top1=0.9091`, `top3=0.9545`
- `query_type=1`: `top1=1.0`, `top3=1.0`
- `query_type=7`: `top1=0.0`, `top3=0.5`
- 注意:当前 20-song 导出里 `query_type=8/16` 行数仍不足(0 行),说明下一步需要专门扩 business reference / query 选择,而不是只沿用当前前 20 首 reference。
- 当前 `workspace_music20` 业务正确性初测(`acr-engine/data/local_eval/voice_workspace20_type7_eval.json`):
- `num_queries=20`
- `top1=0.0`
......