Commit 80df0d30 80df0d301f60778aac95e3d4fd528af8c7afb47d by cnb.bofCdSsphPA

Why the song-centric semantic lane must move from placeholder to a real MERT baseline

Constraint: The current host now has torch/torchaudio/transformers, so the default song-centric pipeline should produce a real semantic baseline instead of a runtime-ready placeholder
Rejected: Keep the placeholder branch after runtime became available | would leave the main pipeline in a misleading half-ready state
Confidence: medium
Scope-risk: narrow
Directive: Preserve the local_wavehash_embed fallback, but treat mert-v1-95m as the default semantic baseline until MuQ is added as a challenger
Tested: installed torch-2.12.0+cpu, torchaudio-2.11.0+cpu, transformers-5.10.1; py_compile for enrich_songcentric_manifest_with_local_features.py; reran song-centric pipeline; verified latest embedding rows are mert-v1-95m; markdown link check on /workspace/docs
Not-tested: MuQ adapter implementation and production vector-table persistence are still pending
1 parent b0c52b54
...@@ -8,6 +8,8 @@ import json ...@@ -8,6 +8,8 @@ import json
8 import wave 8 import wave
9 from pathlib import Path 9 from pathlib import Path
10 10
11 import numpy as np
12
11 ROOT = Path(__file__).resolve().parents[1] 13 ROOT = Path(__file__).resolve().parents[1]
12 import sys 14 import sys
13 if str(ROOT) not in sys.path: 15 if str(ROOT) not in sys.path:
...@@ -15,6 +17,9 @@ if str(ROOT) not in sys.path: ...@@ -15,6 +17,9 @@ if str(ROOT) not in sys.path:
15 17
16 from src.engines.chromaprint_matcher import ChromaprintMatcher, load_audio_mono 18 from src.engines.chromaprint_matcher import ChromaprintMatcher, load_audio_mono
17 19
20 MERT_MODEL_ID = 'm-a-p/MERT-v1-95M'
21 _MERT_RUNTIME = None
22
18 23
19 def load_jsonl(path: Path): 24 def load_jsonl(path: Path):
20 for line in path.read_text().splitlines(): 25 for line in path.read_text().splitlines():
...@@ -72,8 +77,77 @@ def extract_matcher_fingerprint(path: Path, start_ms: int, end_ms: int) -> dict ...@@ -72,8 +77,77 @@ def extract_matcher_fingerprint(path: Path, start_ms: int, end_ms: int) -> dict
72 return None 77 return None
73 78
74 79
75 def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok: bool, missing: list[str]) -> dict: 80 def load_mert_runtime():
81 global _MERT_RUNTIME
82 if _MERT_RUNTIME is not None:
83 return _MERT_RUNTIME
84 import torch
85 import torchaudio
86 from transformers import Wav2Vec2FeatureExtractor, AutoModel
87 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MERT_MODEL_ID, trust_remote_code=True)
88 model = AutoModel.from_pretrained(MERT_MODEL_ID, trust_remote_code=True)
89 model.eval()
90 _MERT_RUNTIME = {
91 'torch': torch,
92 'torchaudio': torchaudio,
93 'feature_extractor': feature_extractor,
94 'model': model,
95 'sample_rate': int(feature_extractor.sampling_rate),
96 'hidden_size': int(getattr(model.config, 'hidden_size', 768)),
97 }
98 return _MERT_RUNTIME
99
100
101 def extract_mert_embedding(asset_path: Path, start_ms: int, end_ms: int) -> dict:
102 rt = load_mert_runtime()
103 torch = rt['torch']
104 samples, sr = load_audio_mono(str(asset_path), sr=rt['sample_rate'])
105 samples = np.asarray(samples, dtype=np.float32)
106 start_frame = int(start_ms * sr / 1000)
107 end_frame = int(end_ms * sr / 1000)
108 segment = samples[start_frame:end_frame]
109 if segment.size == 0:
110 raise ValueError('empty segment for MERT extraction')
111 inputs = rt['feature_extractor'](
112 segment,
113 sampling_rate=sr,
114 return_tensors='pt',
115 )
116 with torch.no_grad():
117 outputs = rt['model'](**inputs)
118 emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy().astype(np.float32)
119 digest = hashlib.sha256(emb.tobytes()).hexdigest()
120 return {
121 'embedding_dim': int(emb.shape[0]),
122 'embedding_uri': f"inline-mert://{digest[:16]}:{start_ms}:{end_ms}",
123 'vector_table_name': f"audio_embedding_vector_{int(emb.shape[0])}_placeholder",
124 'checksum': f"emb:{digest[:16]}",
125 'metadata_json': {
126 'semantic_backend': 'mert_runtime',
127 'embedding_preview': [float(x) for x in emb[:8]],
128 'model_id': MERT_MODEL_ID,
129 'sample_rate': sr,
130 },
131 }
132
133
134 def build_semantic_feature(asset_path: Path, stats: dict, start_ms: int, end_ms: int, runtime_ok: bool, missing: list[str]) -> dict:
76 if runtime_ok: 135 if runtime_ok:
136 try:
137 mert = extract_mert_embedding(asset_path, start_ms, end_ms)
138 return {
139 'feature_type': 'embedding',
140 'model_name': 'mert-v1-95m',
141 'model_version': 'hf-main',
142 'feature_set_name': 'mert_5s_hop2.5_v1',
143 'feature_schema_ver': 'v1',
144 'embedding_dim': mert['embedding_dim'],
145 'embedding_uri': mert['embedding_uri'],
146 'vector_table_name': mert['vector_table_name'],
147 'checksum': mert['checksum'],
148 'metadata_json': mert['metadata_json'],
149 }
150 except Exception as exc:
77 return { 151 return {
78 'feature_type': 'embedding', 152 'feature_type': 'embedding',
79 'model_name': 'semantic_runtime_ready_placeholder', 153 'model_name': 'semantic_runtime_ready_placeholder',
...@@ -84,7 +158,7 @@ def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok: ...@@ -84,7 +158,7 @@ def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok:
84 'embedding_uri': f"runtime-ready://{stats['digest'][:16]}:{start_ms}:{end_ms}", 158 'embedding_uri': f"runtime-ready://{stats['digest'][:16]}:{start_ms}:{end_ms}",
85 'vector_table_name': 'audio_embedding_vector_8_placeholder', 159 'vector_table_name': 'audio_embedding_vector_8_placeholder',
86 'checksum': f"emb:{stats['digest'][:16]}", 160 'checksum': f"emb:{stats['digest'][:16]}",
87 'metadata_json': {'semantic_backend': 'runtime_ready_placeholder'}, 161 'metadata_json': {'semantic_backend': 'runtime_ready_placeholder', 'runtime_error': str(exc)},
88 } 162 }
89 return { 163 return {
90 'feature_type': 'embedding', 164 'feature_type': 'embedding',
...@@ -162,7 +236,7 @@ def main() -> int: ...@@ -162,7 +236,7 @@ def main() -> int:
162 } 236 }
163 fallback_fp_count += 1 237 fallback_fp_count += 1
164 238
165 emb = build_semantic_feature(stats, window['start_ms'], window['end_ms'], runtime_ok, missing_runtime) 239 emb = build_semantic_feature(asset_path, stats, window['start_ms'], window['end_ms'], runtime_ok, missing_runtime)
166 if runtime_ok: 240 if runtime_ok:
167 semantic_runtime_ready_count += 1 241 semantic_runtime_ready_count += 1
168 else: 242 else:
......
1 # Changelog 1 # Changelog
2 2
3 ## 2026-06-04 3 ## 2026-06-04
4 - fresh runtime 进展:已在当前 host 成功安装 `torch-2.12.0+cpu`、`torchaudio-2.11.0+cpu`、`transformers-5.10.1`,重跑 song-centric 主链后确认 `semantic_runtime_available = true`、`semantic_runtime_ready_count = 5`、`semantic_fallback_count = 0`;当前 semantic 已从 fallback 推进到 `semantic_runtime_ready_placeholder`,下一步只差接真实 `MERT / MuQ` adapter。 4 - fresh runtime 进展:已在当前 host 成功安装 `torch-2.12.0+cpu`、`torchaudio-2.11.0+cpu`、`transformers-5.10.1`,重跑 song-centric 主链后确认 `semantic_runtime_available = true`、`semantic_runtime_ready_count = 5`、`semantic_fallback_count = 0`;当前 semantic 已从 fallback 推进到 `mert-v1-95m`,下一步可在不破坏当前 MERT 基线的前提下继续接 `MuQ` adapter。
5 - 收敛 `docs/` 到当前 song-centric 主线,只保留 `README / start-here / session-handoff / postgresql-data-model / postgres_db_schema_samples / CHANGELOG` 六份核心文档,删除旧的 v2 / planner-worker / registry 扩展文档,避免新同学误入已退居次线的设计。 5 - 收敛 `docs/` 到当前 song-centric 主线,只保留 `README / start-here / session-handoff / postgresql-data-model / postgres_db_schema_samples / CHANGELOG` 六份核心文档,删除旧的 v2 / planner-worker / registry 扩展文档,避免新同学误入已退居次线的设计。
6 - 重写 `docs/postgresql-data-model.md`,明确 `保存切片的数据 + 模型 + feature` 的落表方案:`window` 落 `audio_object`,模型身份落 `feature_fact.model_name/model_version/feature_set_name`,具体 `fingerprint/embedding` 也统一落 `feature_fact` 6 - 重写 `docs/postgresql-data-model.md`,明确 `保存切片的数据 + 模型 + feature` 的落表方案:`window` 落 `audio_object`,模型身份落 `feature_fact.model_name/model_version/feature_set_name`,具体 `fingerprint/embedding` 也统一落 `feature_fact`
7 - 重写 `docs/postgres_db_schema_samples.md` 与入口文档,补充当前 4 表主链的流程图、典型 SQL 样例、查询回溯路径与写入顺序,统一文档口径到 `media_entity -> audio_object -> feature_fact -> set_membership` 7 - 重写 `docs/postgres_db_schema_samples.md` 与入口文档,补充当前 4 表主链的流程图、典型 SQL 样例、查询回溯路径与写入顺序,统一文档口径到 `media_entity -> audio_object -> feature_fact -> set_membership`
......
...@@ -33,7 +33,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. ...@@ -33,7 +33,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127.
33 - `semantic_runtime_missing = []` 33 - `semantic_runtime_missing = []`
34 - `semantic_runtime_ready_count = 5` 34 - `semantic_runtime_ready_count = 5`
35 - `semantic_fallback_count = 0` 35 - `semantic_fallback_count = 0`
36 - `import_counts = media_entity:9 / audio_object:22 / feature_fact:29 / set_membership:9` 36 - `import_counts = media_entity:9 / audio_object:22 / feature_fact:34 / set_membership:9`
37 37
38 --- 38 ---
39 39
...@@ -122,10 +122,10 @@ flowchart TD ...@@ -122,10 +122,10 @@ flowchart TD
122 122
123 - `torch / torchaudio / transformers` 已可导入 123 - `torch / torchaudio / transformers` 已可导入
124 - 当前 `semantic_runtime_available = true` 124 - 当前 `semantic_runtime_available = true`
125 - 当前 semantic 仍不是 `MERT / MuQ`,而是 `semantic_runtime_ready_placeholder` 125 - 当前 semantic 已接上真实 `mert-v1-95m` baseline
126 126
127 这说明当前主要 blocker 已从“依赖缺失”推进为: 127 这说明当前主要 blocker 已从“依赖缺失”推进为:
128 > **runtime 已就绪,但真实 semantic adapter 还没接入。** 128 > **runtime 已就绪,真实 `MERT` baseline 已接入,下一步可继续接 `MuQ`。**
129 129
130 --- 130 ---
131 131
...@@ -174,7 +174,7 @@ flowchart TD ...@@ -174,7 +174,7 @@ flowchart TD
174 - exact lane 已优先复用 `ChromaprintMatcher` 174 - exact lane 已优先复用 `ChromaprintMatcher`
175 - semantic lane 还没有真实接入 `MERT / MuQ` 175 - semantic lane 还没有真实接入 `MERT / MuQ`
176 - runtime 就绪时,当前会产出: 176 - runtime 就绪时,当前会产出:
177 - `model_name = semantic_runtime_ready_placeholder` 177 - `model_name = mert-v1-95m`
178 - fallback 分支仍保留: 178 - fallback 分支仍保留:
179 - `model_name = local_wavehash_embed` 179 - `model_name = local_wavehash_embed`
180 180
......
...@@ -31,7 +31,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. ...@@ -31,7 +31,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127.
31 - `semantic_runtime_missing = []` 31 - `semantic_runtime_missing = []`
32 - `semantic_runtime_ready_count = 5` 32 - `semantic_runtime_ready_count = 5`
33 - `semantic_fallback_count = 0` 33 - `semantic_fallback_count = 0`
34 - `import_counts = media_entity:9 / audio_object:22 / feature_fact:29 / set_membership:9` 34 - `import_counts = media_entity:9 / audio_object:22 / feature_fact:34 / set_membership:9`
35 35
36 --- 36 ---
37 37
...@@ -100,7 +100,7 @@ flowchart TD ...@@ -100,7 +100,7 @@ flowchart TD
100 - 真实目录 -> manifest -> import 已打通 100 - 真实目录 -> manifest -> import 已打通
101 - 真实目录 -> fingerprint enrichment -> import 已打通 101 - 真实目录 -> fingerprint enrichment -> import 已打通
102 - semantic lane 已做成 runtime-ready 102 - semantic lane 已做成 runtime-ready
103 - 当前 host 已能进入 runtime-ready placeholder 分支,下一步只差接真实 `MERT / MuQ` 103 - 当前 host 已能进入 runtime-ready placeholder 分支,下一步可在不破坏当前 MERT 基线的前提下继续接 `MuQ`
104 - 当前 exact lane 已优先复用仓库内 `ChromaprintMatcher` 104 - 当前 exact lane 已优先复用仓库内 `ChromaprintMatcher`
105 105
106 --- 106 ---
...@@ -108,14 +108,14 @@ flowchart TD ...@@ -108,14 +108,14 @@ flowchart TD
108 ## 7. 当前最该继续什么 108 ## 7. 当前最该继续什么
109 109
110 ### 第一优先级 110 ### 第一优先级
111 把 semantic lane 从 `semantic_runtime_ready_placeholder` 升级成真实 encoder adapter,且不破坏现有宿主链。 111 把 semantic lane 从 `mert-v1-95m` baseline 扩展到 `MuQ` challenger,且不破坏现有宿主链。
112 112
113 ### 当前 host 事实 113 ### 当前 host 事实
114 - `torch` 已可导入 114 - `torch` 已可导入
115 - `torchaudio` 已可导入 115 - `torchaudio` 已可导入
116 - `transformers` 已可导入 116 - `transformers` 已可导入
117 - 当前 `semantic_runtime_available = true` 117 - 当前 `semantic_runtime_available = true`
118 - 当前最新主链产出仍是 `semantic_runtime_ready_placeholder`,不是真实 `MERT / MuQ` 118 - 当前最新主链产出已经是 `mert-v1-95m`;下一步可继续补 `MuQ` challenger
119 119
120 --- 120 ---
121 121
......