Commit 6a97ca13 6a97ca13f76962ceaccff8137f9496bd0a9821a3 by cnb.bofCdSsphPA

Make the exact lane fail honestly before real audio is mounted

Constraint: the Phase-1 exact lane must not pretend success when reference audio is unreadable, and repeated writes must be idempotent at the database boundary.
Rejected: keep partial-success writes in completed state | rejected because it would blur asset-readability failures and weaken auditability.
Confidence: high
Scope-risk: moderate
Directive: preserve the repo-local chromaprint-style wording and the all-or-nothing failure semantics until production audio mounts and real extractor validation are in place.
Tested: py_compile for chromaprint matcher and chromaprint worker; live PostgreSQL unique index creation on acr_test; non-dry-run chromaprint worker attempt with job_status=failed and failure_reason=unreadable_audio_assets; bootstrap reset back to pending; architect review APPROVED.
Not-tested: successful audio_fingerprint writes against mounted production audio, semantic worker real writes, large-scale concurrent exact-lane execution.
1 parent b4f304c1
{
"worker": "run_chromaprint_job",
"schema": "acr_test",
"job": {
"extraction_job_id": 1,
"feature_set_id": 2,
"target_scope": "reference_set:phase1_hot_reference_v1",
"job_status": "pending",
"shard_key": "phase1/reference/chromaprint/v1",
"job_metadata": {
"lane": "exact",
"phase": "phase1",
"priority": "p0"
},
"feature_name": "fingerprint_asset",
"feature_level": "asset",
"extraction_granularity": "full_asset",
"window_sec": 5.0,
"hop_sec": 2.5,
"embedding_dim": null,
"distance_metric": "hamming",
"feature_config": {
"lane": "exact",
"index_target": "audio_fingerprint"
},
"model_id": 2,
"model_name": "chromaprint",
"model_version": "v1",
"model_family": "fingerprint",
"input_sample_rate": 16000,
"output_embedding_dim": null,
"model_metadata": {
"lane": "exact",
"note": "exact fingerprint lane baseline",
"phase": "phase1"
}
},
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"reference_set_id": 2,
"reference_set_name": "phase1_hot_reference_v1",
"recording_count": 20,
"ready_asset_count": 20,
"active_window_count": 20
},
"scope_asset_count": 20,
"processed_assets": [],
"missing_assets": [
{
"asset_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav",
"reason": "missing_audio"
},
{
"asset_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav",
"reason": "missing_audio"
},
{
"asset_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav",
"reason": "missing_audio"
},
{
"asset_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav",
"reason": "missing_audio"
},
{
"asset_id": 6,
"storage_uri": "/workspace/downloads/105/type_11/57e61cde-4410-4751-93e9-d7a4ecece5791650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 7,
"storage_uri": "/workspace/downloads/106/type_11/bf61426c-67b7-4cf1-a9e7-f78cf519a0021650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 8,
"storage_uri": "/workspace/downloads/107/type_11/296bbc25-617c-4368-9a69-357aeec394381650256910.wav",
"reason": "missing_audio"
},
{
"asset_id": 9,
"storage_uri": "/workspace/downloads/108/type_11/d7e28fe6-4ad6-4243-b66b-d90ff5ca1e491650256909.wav",
"reason": "missing_audio"
},
{
"asset_id": 10,
"storage_uri": "/workspace/downloads/109/type_11/84acef9b-2a74-44bc-9eff-5ca7969ac9b61650256909.wav",
"reason": "missing_audio"
},
{
"asset_id": 11,
"storage_uri": "/workspace/downloads/110/type_11/2197b39e-23e2-4a66-b07e-dd672eab214a1650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 12,
"storage_uri": "/workspace/downloads/111/type_11/7f5256e8-de5f-41c5-bf76-419e05df72d81650256908.wav",
"reason": "missing_audio"
},
{
"asset_id": 13,
"storage_uri": "/workspace/downloads/112/type_11/34acd523-3c01-443d-ac3d-4ad7b9e2246f1650256907.wav",
"reason": "missing_audio"
},
{
"asset_id": 14,
"storage_uri": "/workspace/downloads/113/type_11/6d9438af-5d83-434b-bb20-76e28d0bbc4e1650256907.wav",
"reason": "missing_audio"
},
{
"asset_id": 15,
"storage_uri": "/workspace/downloads/114/type_11/0238ecbf-b234-470e-82e4-f3b80a267d771650256906.wav",
"reason": "missing_audio"
},
{
"asset_id": 16,
"storage_uri": "/workspace/downloads/115/type_11/aabad0ff-13de-4786-aa9c-40e1f957ed9f1650256906.wav",
"reason": "missing_audio"
},
{
"asset_id": 17,
"storage_uri": "/workspace/downloads/116/type_11/da34f6ff-39e7-4dde-8265-e1bb01b6263e1650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 18,
"storage_uri": "/workspace/downloads/117/type_11/1e1599e6-ebbd-4ceb-a81d-a320331ef6e31650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 19,
"storage_uri": "/workspace/downloads/118/type_11/db64461e-d752-4cf3-ab1d-56ff9232823d1650256901.wav",
"reason": "missing_audio"
},
{
"asset_id": 20,
"storage_uri": "/workspace/downloads/119/type_11/180dfa7d-836a-449c-990f-a3bf39c11da11650256898.wav",
"reason": "missing_audio"
}
],
"status_after_start": {
"extraction_job_id": 1,
"job_status": "running",
"input_count": 20,
"output_count": null,
"started_at": "2026-06-04T13:35:22.194865+08:00",
"finished_at": null,
"log_uri": null,
"metadata_json": {
"lane": "exact",
"phase": "phase1",
"worker": "run_chromaprint_job",
"dry_run": false,
"priority": "p0",
"output_target": "audio_fingerprint",
"execution_mode": "write_attempt",
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
}
}
},
"status_after_complete": null,
"status_after_failed": {
"extraction_job_id": 1,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"started_at": "2026-06-04T13:35:22.194865+08:00",
"finished_at": "2026-06-04T13:35:22.195659+08:00",
"log_uri": null,
"metadata_json": {
"lane": "exact",
"phase": "phase1",
"worker": "run_chromaprint_job",
"dry_run": false,
"priority": "p0",
"artifact_dir": "/workspace/acr-engine/data/pgvector_eval/music20/phase1_fingerprints",
"output_target": "audio_fingerprint",
"execution_mode": "write_attempt",
"failure_reason": "unreadable_audio_assets",
"write_target_table": "audio_fingerprint",
"missing_asset_count": 20,
"target_scope_summary": {
"scope_type": "reference_set",
"scope_value": "phase1_hot_reference_v1",
"recording_count": 20,
"reference_set_id": 2,
"ready_asset_count": 20,
"reference_set_name": "phase1_hot_reference_v1",
"active_window_count": 20
},
"missing_asset_samples": [
{
"reason": "missing_audio",
"asset_id": 1,
"storage_uri": "/workspace/downloads/100/type_11/93dfdeb0-7da5-42a8-9c71-cf12af57dd191650256918.wav"
},
{
"reason": "missing_audio",
"asset_id": 2,
"storage_uri": "/workspace/downloads/101/type_11/83c0c07f-4f96-4ff4-998c-58db910f3cfa1650256915.wav"
},
{
"reason": "missing_audio",
"asset_id": 3,
"storage_uri": "/workspace/downloads/102/type_11/43440ec5-70b4-4d50-8683-d3e41cad29411650256908.wav"
},
{
"reason": "missing_audio",
"asset_id": 4,
"storage_uri": "/workspace/downloads/103/type_11/19876dbb-fffc-40f8-9530-9322c9ed77681650256912.wav"
},
{
"reason": "missing_audio",
"asset_id": 5,
"storage_uri": "/workspace/downloads/104/type_11/4c1d3e22-045f-445b-ab87-ba1ae3ee09b31650256912.wav"
}
]
}
},
"next_write_target": "audio_fingerprint",
"notes": [
"dry-run preserves the verified planner -> job -> PostgreSQL state flow",
"non-dry-run now writes repo-local chromaprint-style hash artifacts plus audio_fingerprint rows when source audio is readable"
]
}
\ No newline at end of file
{
"job_row": {
"extraction_job_id": 1,
"job_status": "failed",
"input_count": 20,
"output_count": 0,
"failure_reason": "unreadable_audio_assets"
},
"audio_fingerprint_count": 0
}
\ No newline at end of file
......@@ -222,6 +222,9 @@ CREATE TABLE IF NOT EXISTS audio_fingerprint (
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX IF NOT EXISTS uq_audio_fingerprint_feature_asset
ON audio_fingerprint(feature_set_id, asset_id);
CREATE TABLE IF NOT EXISTS reference_set_registry (
reference_set_id BIGSERIAL PRIMARY KEY,
set_name TEXT NOT NULL UNIQUE,
......
......@@ -8,7 +8,6 @@ Implements landmark-based audio fingerprinting:
"""
import numpy as np
import librosa
from numpy.lib.stride_tricks import sliding_window_view
from collections import defaultdict
from typing import Dict, List, Tuple, Optional
......@@ -16,6 +15,50 @@ import pickle
import json
from pathlib import Path
import time
import wave
try:
import librosa # type: ignore
except ImportError: # pragma: no cover - optional dependency
librosa = None
def _resample_linear(y: np.ndarray, src_sr: int, target_sr: int) -> np.ndarray:
if src_sr == target_sr or y.size == 0:
return y.astype(np.float32, copy=False)
duration = y.shape[0] / float(src_sr)
target_len = max(int(round(duration * target_sr)), 1)
src_x = np.linspace(0.0, duration, num=y.shape[0], endpoint=False)
dst_x = np.linspace(0.0, duration, num=target_len, endpoint=False)
return np.interp(dst_x, src_x, y).astype(np.float32, copy=False)
def load_audio_mono(path: str, sr: int) -> Tuple[np.ndarray, int]:
    """Load an audio file as mono ``float32`` samples at sample rate ``sr``.

    Uses ``librosa.load`` when librosa is importable. Otherwise falls back to
    the standard-library ``wave`` reader plus numpy, which handles integer PCM
    WAV files with 8-, 16-, 24- or 32-bit samples.

    Args:
        path: Path of the audio file to read.
        sr: Target sample rate in Hz.

    Returns:
        ``(samples, sr)`` where ``samples`` is a 1-D ``float32`` array.

    Raises:
        ValueError: If the fallback reader meets an unsupported sample width.
        Any exception raised by ``librosa.load`` or ``wave.open`` (for example
        a missing or undecodable file) propagates to the caller.
    """
    if librosa is not None:
        y, _ = librosa.load(path, sr=sr, mono=True)
        return y.astype(np.float32, copy=False), sr

    with wave.open(path, 'rb') as wav_file:
        src_sr = wav_file.getframerate()
        channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_count = wav_file.getnframes()
        raw = wav_file.readframes(frame_count)

    # WAV PCM is little-endian; spell the byte order out so decoding does not
    # depend on the host's native endianness.
    if sample_width == 1:
        # 8-bit WAV samples are unsigned with a midpoint of 128.
        y = np.frombuffer(raw, dtype=np.uint8).astype(np.float32)
        y = (y - 128.0) / 128.0
    elif sample_width == 2:
        y = np.frombuffer(raw, dtype='<i2').astype(np.float32) / 32768.0
    elif sample_width == 3:
        # numpy has no 24-bit dtype: assemble each sample from its 3 bytes,
        # then sign-extend from bit 23.
        packed = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
        ints = packed[:, 0] | (packed[:, 1] << 8) | (packed[:, 2] << 16)
        ints = (ints ^ 0x800000) - 0x800000
        y = ints.astype(np.float32) / 8388608.0
    elif sample_width == 4:
        y = np.frombuffer(raw, dtype='<i4').astype(np.float32) / 2147483648.0
    else:
        raise ValueError(f'unsupported wav sample width: {sample_width}')

    if channels > 1:
        # Samples are interleaved per frame; average the channels to mono.
        y = y.reshape(-1, channels).mean(axis=1)
    y = _resample_linear(y, src_sr, sr)
    return y, sr
class Fingerprint:
......@@ -51,8 +94,19 @@ class ChromaprintMatcher:
return candidate
def _spectrogram(self, y: np.ndarray) -> np.ndarray:
    """Return the magnitude spectrogram of ``y``.

    Uses ``librosa.stft`` when librosa is importable; otherwise computes a
    Hann-windowed short-time FFT with numpy only.

    Args:
        y: 1-D audio samples.

    Returns:
        A 2-D array of magnitudes; in the numpy fallback the shape is
        ``(n_fft // 2 + 1, frame_count)``.

    NOTE(review): ``librosa.stft`` centers/pads frames by default and uses a
    periodic Hann window, whereas this fallback does neither, so the two paths
    can produce different frame counts and values. Confirm that hashes
    extracted with and without librosa are never compared against each other.
    """
    if librosa is not None:
        return np.abs(librosa.stft(y, n_fft=self.n_fft, hop_length=self.hop_length))

    # Zero-pad short inputs so at least one full frame exists.
    if y.shape[0] < self.n_fft:
        y = np.pad(y, (0, self.n_fft - y.shape[0]))
    frame_count = 1 + max((y.shape[0] - self.n_fft) // self.hop_length, 0)
    # One column per frame: shape (n_fft, frame_count).
    frames = np.stack(
        [y[i * self.hop_length:i * self.hop_length + self.n_fft] for i in range(frame_count)],
        axis=1,
    )
    window = np.hanning(self.n_fft).astype(np.float32)
    frames = frames * window[:, None]
    return np.abs(np.fft.rfft(frames, axis=0))
def _find_peaks(self, S: np.ndarray) -> List[Tuple[int, int, float]]:
if S.shape[0] <= self.peak_neighborhood or S.shape[1] <= self.peak_neighborhood:
......@@ -82,12 +136,15 @@ class ChromaprintMatcher:
return hashes
def index_song(self, song_id: str, y: np.ndarray):
    """Fingerprint one song and add its hashes to the in-memory index.

    Args:
        song_id: Identifier stored with every fingerprint of this song.
        y: Audio samples of the song.

    Each ``(hash, offset)`` pair from ``extract_hashes`` is appended to
    ``self.hash_db[hash]`` as a ``Fingerprint(song_id, offset, hash)``.
    """
    # Hash extraction happens once, inside extract_hashes; no separate
    # spectrogram/peak pass is needed here.
    for h, offset in self.extract_hashes(y):
        self.hash_db[h].append(Fingerprint(song_id, offset, h))
def extract_hashes(self, y: np.ndarray) -> List[Tuple[int, int]]:
    """Run the fingerprint pipeline on ``y`` without touching the index.

    The samples go through ``_spectrogram``, then ``_find_peaks``, then
    ``_hash_peaks``; the result of the last stage is returned unchanged.
    """
    return self._hash_peaks(self._find_peaks(self._spectrogram(y)))
def index_songs_from_dir(
self,
songs_dir: str,
......@@ -137,7 +194,7 @@ class ChromaprintMatcher:
continue
song_id = item["song_id"]
try:
y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True)
y, _ = load_audio_mono(str(audio_path), sr=self.sr)
except Exception as exc:
skipped_refs += 1
print(
......
## 2026-06-04
- 更新 `run_chromaprint_job.py` 与 `src/engines/chromaprint_matcher.py`,把 exact lane 从“只有 dry-run”推进到“具备真实 `audio_fingerprint` 写入路径”;同时增加无 `librosa` 环境下的 `wave + numpy` 回退实现,避免 worker 被运行时依赖直接卡死。
- 为 `audio_fingerprint` 补上 `(feature_set_id, asset_id)` 唯一索引,并把 exact lane 写入改成 `INSERT ... ON CONFLICT DO UPDATE`;同时把失败语义收紧为“全量成功 / 否则失败”,避免部分不可读资产被误标成 completed。
- 新增 `phase1_worker_chromaprint_write_attempt.json` 与 `phase1_worker_chromaprint_write_guard_report.json`,在 live PostgreSQL `acr_test` 上验证 exact lane 的非 dry-run 行为:当前因 `/workspace/downloads/...` 缺失导致 `scope_asset_count=20`、`processed_assets=0`,job 被明确标记为 `failed`、`failure_reason=unreadable_audio_assets`,证明写入路径已接上但受环境挂载阻塞。
- 新增 `bootstrap_phase1_reference_members_live.py` 与 `phase1_reference_member_bootstrap_report.json`,把 `acr_test` 中 `recording.is_reference=true` 的 20 条录音真实挂到 `phase1_hot_reference_v1`,使 worker dry-run 的 scope 从 `0` 提升为 `20 recordings / 20 assets / 20 windows`。
- 根据 architect 复核修正 worker contract:`mark_job_status.py` 现支持真正的“CLI 覆盖 env”并限制状态白名单;`_job_common.update_job_status()` 新增前置状态约束并防止 `finished_at` 被重复覆盖;`bootstrap_phase1_extraction_jobs_live.py` 在恢复 pending 时会清空旧时间戳与计数;`run_embedding_job.py` 对 embedding job 契约做了更严格校验。
- 修正 `plan_phase1_extraction_jobs_live.py`:新增 schema 校验,命令模板显式锚定 `cd /workspace/acr-engine &&`,并把 `--complete-dry-run`、`--expected-status pending` 带入生成的命令,避免 planner 产物“看起来能跑但实际上缺关键上下文/步骤”。
......
......@@ -227,10 +227,62 @@ flowchart TD
后续把下面逻辑塞进 `run_chromaprint_job.py`
1. 读取 `recording_asset`
2. 调 chromaprint CLI / library
3. 写 `audio_fingerprint`
4. 更新 `output_count`
5. 标记 `completed`
2. 读取可用音频并提取 exact-lane hash
3. 写 artifact JSON
4. 写 `audio_fingerprint`
5. 更新 `output_count`
6. 标记 `completed`
### 当前 exact lane 的真实状态
这轮已经把 `run_chromaprint_job.py` 从“只有 dry-run”推进到:
- 如果 source audio 可读:
- 生成 repo-local chromaprint-style hash artifact
- 写入 `audio_fingerprint`
- 如果 source audio 不可读:
- 明确把 job 标记为 `failed`
- 把 `failure_reason`、`missing_asset_count`、`missing_asset_samples` 写回 PostgreSQL
### 当前失败语义
当前 exact lane 采用的是 **全量成功 / 否则失败**
- 只要 scope 内任意 asset:
- 缺文件
- 解码失败
- hash 提取失败
就整体标记:
- `job_status = failed`
- `failure_reason = unreadable_audio_assets`
这样不会把“部分成功”伪装成 `completed`
### 当前依赖策略
当前 exact lane 不再强依赖 `librosa`
- 优先使用 `librosa`(如果环境里存在)
- 否则回退到:
- Python `wave`
- `numpy` 线性重采样
- `numpy` FFT spectrogram
这使得 worker contract 能在更瘦的运行环境里继续工作。
### 当前幂等保护
`audio_fingerprint` 现在补了:
- `UNIQUE(feature_set_id, asset_id)`
对应 worker 写入改成:
- `INSERT ... ON CONFLICT DO UPDATE`
因此 exact lane 对同一 `(feature_set_id, asset_id)` 的重复写入不再依赖应用层先查再写。
### 7.2 Embedding worker
......
......@@ -378,6 +378,66 @@ flowchart TD
- 基础 claim guard
- 基础重复执行保护
---
## exact lane 非 dry-run 写入尝试(新增)
这轮又继续向前推进了一步:
> `run_chromaprint_job.py` 已经不再只是 dry-run。
当前行为:
1. 如果 reference asset 对应音频文件可读:
- 提取 repo-local chromaprint-style hash
- 写 artifact JSON
- 写 `audio_fingerprint`
- job 标记为 `completed`
2. 如果 reference asset 对应音频文件不可读:
- job 标记为 `failed`
- 在 `metadata_json` 里写入:
- `failure_reason`
- `missing_asset_count`
- `missing_asset_samples`
### 本轮 live 结果
报告:
- `acr-engine/data/pgvector_eval/music20/phase1_worker_chromaprint_write_attempt.json`
- `acr-engine/data/pgvector_eval/music20/phase1_worker_chromaprint_write_guard_report.json`
关键结果:
- `scope_asset_count = 20`
- `processed_assets = 0`
- `missing_assets = 20`
- `job_status = failed`
- `failure_reason = unreadable_audio_assets`
- `audio_fingerprint_count = 0`
### 这说明什么
说明当前 exact lane 的 PostgreSQL worker contract 已经具备:
- 非 dry-run 的真实写入路径
- 明确的失败落盘
- 环境缺失时的可审计错误证据
- “全量成功 / 否则失败”的批次语义
- `audio_fingerprint(feature_set_id, asset_id)` 的原子 upsert 约束基础
但当前容器仍然缺:
- `/workspace/downloads/...` 实际音频文件
因此这轮证明的是:
- **worker 写入路径已经接上**
- **当前被环境数据挂载阻塞**
而不是 exact lane 逻辑本身还没落地。
- `type_7`
因此:
......
......@@ -191,10 +191,11 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql
- 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行
- `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows`
- worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json`
- exact lane 的 `run_chromaprint_job.py` 已具备非 dry-run 写入路径;当前在 `acr_test` 的 live 结果是因为 `/workspace/downloads/...` 缺失而明确 `failed`,不是继续假装 `completed`
### 未验证 / 仍是缺口
- **未实际跑 MERT / MuQ encoder-only 特征抽取**
- **worker 目前仍以 dry-run 为主,尚未写真实 `audio_fingerprint / audio_embedding`**
- **semantic / cover 等后续 lane 仍主要停留在 dry-run;exact lane 已接上真实 `audio_fingerprint` 写入路径,但当前容器缺 reference 音频挂载,live 结果仍停在可审计失败**
- **还未落更大规模的生产 reference set 真实业务数据(当前仅验证了 `acr_test` 下的 20-song live members)**
- **未定义最终线上分数融合细则**
- **type_8 / type_16 还没有进入当前 live JSONL 的 PostgreSQL 实测链**
......