Why the song-centric semantic lane must move from placeholder to a real MERT baseline
Constraint: The current host now has torch/torchaudio/transformers, so the default song-centric pipeline should produce a real semantic baseline instead of a runtime-ready placeholder
Rejected: Keep the placeholder branch after runtime became available | would leave the main pipeline in a misleading half-ready state
Confidence: medium
Scope-risk: narrow
Directive: Preserve the local_wavehash_embed fallback, but treat mert-v1-95m as the default semantic baseline until MuQ is added as a challenger
Tested: installed torch-2.12.0+cpu, torchaudio-2.11.0+cpu, transformers-5.10.1; py_compile for enrich_songcentric_manifest_with_local_features.py; reran song-centric pipeline; verified latest embedding rows are mert-v1-95m; markdown link check on /workspace/docs
Not-tested: MuQ adapter implementation and production vector-table persistence are still pending
Showing 4 changed files with 86 additions and 12 deletions
| ... | @@ -8,6 +8,8 @@ import json | ... | @@ -8,6 +8,8 @@ import json |
| 8 | import wave | 8 | import wave |
| 9 | from pathlib import Path | 9 | from pathlib import Path |
| 10 | 10 | ||
| 11 | import numpy as np | ||
| 12 | |||
| 11 | ROOT = Path(__file__).resolve().parents[1] | 13 | ROOT = Path(__file__).resolve().parents[1] |
| 12 | import sys | 14 | import sys |
| 13 | if str(ROOT) not in sys.path: | 15 | if str(ROOT) not in sys.path: |
| ... | @@ -15,6 +17,9 @@ if str(ROOT) not in sys.path: | ... | @@ -15,6 +17,9 @@ if str(ROOT) not in sys.path: |
| 15 | 17 | ||
| 16 | from src.engines.chromaprint_matcher import ChromaprintMatcher, load_audio_mono | 18 | from src.engines.chromaprint_matcher import ChromaprintMatcher, load_audio_mono |
| 17 | 19 | ||
| 20 | MERT_MODEL_ID = 'm-a-p/MERT-v1-95M' | ||
| 21 | _MERT_RUNTIME = None | ||
| 22 | |||
| 18 | 23 | ||
| 19 | def load_jsonl(path: Path): | 24 | def load_jsonl(path: Path): |
| 20 | for line in path.read_text().splitlines(): | 25 | for line in path.read_text().splitlines(): |
| ... | @@ -72,8 +77,77 @@ def extract_matcher_fingerprint(path: Path, start_ms: int, end_ms: int) -> dict | ... | @@ -72,8 +77,77 @@ def extract_matcher_fingerprint(path: Path, start_ms: int, end_ms: int) -> dict |
| 72 | return None | 77 | return None |
| 73 | 78 | ||
| 74 | 79 | ||
| 75 | def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok: bool, missing: list[str]) -> dict: | 80 | def load_mert_runtime(): |
| 81 | global _MERT_RUNTIME | ||
| 82 | if _MERT_RUNTIME is not None: | ||
| 83 | return _MERT_RUNTIME | ||
| 84 | import torch | ||
| 85 | import torchaudio | ||
| 86 | from transformers import Wav2Vec2FeatureExtractor, AutoModel | ||
| 87 | feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MERT_MODEL_ID, trust_remote_code=True) | ||
| 88 | model = AutoModel.from_pretrained(MERT_MODEL_ID, trust_remote_code=True) | ||
| 89 | model.eval() | ||
| 90 | _MERT_RUNTIME = { | ||
| 91 | 'torch': torch, | ||
| 92 | 'torchaudio': torchaudio, | ||
| 93 | 'feature_extractor': feature_extractor, | ||
| 94 | 'model': model, | ||
| 95 | 'sample_rate': int(feature_extractor.sampling_rate), | ||
| 96 | 'hidden_size': int(getattr(model.config, 'hidden_size', 768)), | ||
| 97 | } | ||
| 98 | return _MERT_RUNTIME | ||
| 99 | |||
| 100 | |||
| 101 | def extract_mert_embedding(asset_path: Path, start_ms: int, end_ms: int) -> dict: | ||
| 102 | rt = load_mert_runtime() | ||
| 103 | torch = rt['torch'] | ||
| 104 | samples, sr = load_audio_mono(str(asset_path), sr=rt['sample_rate']) | ||
| 105 | samples = np.asarray(samples, dtype=np.float32) | ||
| 106 | start_frame = int(start_ms * sr / 1000) | ||
| 107 | end_frame = int(end_ms * sr / 1000) | ||
| 108 | segment = samples[start_frame:end_frame] | ||
| 109 | if segment.size == 0: | ||
| 110 | raise ValueError('empty segment for MERT extraction') | ||
| 111 | inputs = rt['feature_extractor']( | ||
| 112 | segment, | ||
| 113 | sampling_rate=sr, | ||
| 114 | return_tensors='pt', | ||
| 115 | ) | ||
| 116 | with torch.no_grad(): | ||
| 117 | outputs = rt['model'](**inputs) | ||
| 118 | emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy().astype(np.float32) | ||
| 119 | digest = hashlib.sha256(emb.tobytes()).hexdigest() | ||
| 120 | return { | ||
| 121 | 'embedding_dim': int(emb.shape[0]), | ||
| 122 | 'embedding_uri': f"inline-mert://{digest[:16]}:{start_ms}:{end_ms}", | ||
| 123 | 'vector_table_name': f"audio_embedding_vector_{int(emb.shape[0])}_placeholder", | ||
| 124 | 'checksum': f"emb:{digest[:16]}", | ||
| 125 | 'metadata_json': { | ||
| 126 | 'semantic_backend': 'mert_runtime', | ||
| 127 | 'embedding_preview': [float(x) for x in emb[:8]], | ||
| 128 | 'model_id': MERT_MODEL_ID, | ||
| 129 | 'sample_rate': sr, | ||
| 130 | }, | ||
| 131 | } | ||
| 132 | |||
| 133 | |||
| 134 | def build_semantic_feature(asset_path: Path, stats: dict, start_ms: int, end_ms: int, runtime_ok: bool, missing: list[str]) -> dict: | ||
| 76 | if runtime_ok: | 135 | if runtime_ok: |
| 136 | try: | ||
| 137 | mert = extract_mert_embedding(asset_path, start_ms, end_ms) | ||
| 138 | return { | ||
| 139 | 'feature_type': 'embedding', | ||
| 140 | 'model_name': 'mert-v1-95m', | ||
| 141 | 'model_version': 'hf-main', | ||
| 142 | 'feature_set_name': 'mert_5s_hop2.5_v1', | ||
| 143 | 'feature_schema_ver': 'v1', | ||
| 144 | 'embedding_dim': mert['embedding_dim'], | ||
| 145 | 'embedding_uri': mert['embedding_uri'], | ||
| 146 | 'vector_table_name': mert['vector_table_name'], | ||
| 147 | 'checksum': mert['checksum'], | ||
| 148 | 'metadata_json': mert['metadata_json'], | ||
| 149 | } | ||
| 150 | except Exception as exc: | ||
| 77 | return { | 151 | return { |
| 78 | 'feature_type': 'embedding', | 152 | 'feature_type': 'embedding', |
| 79 | 'model_name': 'semantic_runtime_ready_placeholder', | 153 | 'model_name': 'semantic_runtime_ready_placeholder', |
| ... | @@ -84,7 +158,7 @@ def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok: | ... | @@ -84,7 +158,7 @@ def build_semantic_feature(stats: dict, start_ms: int, end_ms: int, runtime_ok: |
| 84 | 'embedding_uri': f"runtime-ready://{stats['digest'][:16]}:{start_ms}:{end_ms}", | 158 | 'embedding_uri': f"runtime-ready://{stats['digest'][:16]}:{start_ms}:{end_ms}", |
| 85 | 'vector_table_name': 'audio_embedding_vector_8_placeholder', | 159 | 'vector_table_name': 'audio_embedding_vector_8_placeholder', |
| 86 | 'checksum': f"emb:{stats['digest'][:16]}", | 160 | 'checksum': f"emb:{stats['digest'][:16]}", |
| 87 | 'metadata_json': {'semantic_backend': 'runtime_ready_placeholder'}, | 161 | 'metadata_json': {'semantic_backend': 'runtime_ready_placeholder', 'runtime_error': str(exc)}, |
| 88 | } | 162 | } |
| 89 | return { | 163 | return { |
| 90 | 'feature_type': 'embedding', | 164 | 'feature_type': 'embedding', |
| ... | @@ -162,7 +236,7 @@ def main() -> int: | ... | @@ -162,7 +236,7 @@ def main() -> int: |
| 162 | } | 236 | } |
| 163 | fallback_fp_count += 1 | 237 | fallback_fp_count += 1 |
| 164 | 238 | ||
| 165 | emb = build_semantic_feature(stats, window['start_ms'], window['end_ms'], runtime_ok, missing_runtime) | 239 | emb = build_semantic_feature(asset_path, stats, window['start_ms'], window['end_ms'], runtime_ok, missing_runtime) |
| 166 | if runtime_ok: | 240 | if runtime_ok: |
| 167 | semantic_runtime_ready_count += 1 | 241 | semantic_runtime_ready_count += 1 |
| 168 | else: | 242 | else: | ... | ... |
| 1 | # Changelog | 1 | # Changelog |
| 2 | 2 | ||
| 3 | ## 2026-06-04 | 3 | ## 2026-06-04 |
| 4 | - fresh runtime 进展:已在当前 host 成功安装 `torch-2.12.0+cpu`、`torchaudio-2.11.0+cpu` 与 `transformers-5.10.1`,重跑 song-centric 主链后确认 `semantic_runtime_available = true`、`semantic_runtime_ready_count = 5`、`semantic_fallback_count = 0`;当前 semantic 已从 fallback 推进到 `semantic_runtime_ready_placeholder`,下一步只差接真实 `MERT / MuQ` adapter。 | 4 | - fresh runtime 进展:已在当前 host 成功安装 `torch-2.12.0+cpu`、`torchaudio-2.11.0+cpu` 与 `transformers-5.10.1`,重跑 song-centric 主链后确认 `semantic_runtime_available = true`、`semantic_runtime_ready_count = 5`、`semantic_fallback_count = 0`;当前 semantic 已从 fallback 推进到 `mert-v1-95m`,下一步可在不破坏当前 MERT 基线的前提下继续接 `MuQ` adapter。 |
| 5 | - 收敛 `docs/` 到当前 song-centric 主线,只保留 `README / start-here / session-handoff / postgresql-data-model / postgres_db_schema_samples / CHANGELOG` 六份核心文档,删除旧的 v2 / planner-worker / registry 扩展文档,避免新同学误入已退居次线的设计。 | 5 | - 收敛 `docs/` 到当前 song-centric 主线,只保留 `README / start-here / session-handoff / postgresql-data-model / postgres_db_schema_samples / CHANGELOG` 六份核心文档,删除旧的 v2 / planner-worker / registry 扩展文档,避免新同学误入已退居次线的设计。 |
| 6 | - 重写 `docs/postgresql-data-model.md`,明确 `保存切片的数据 + 模型 + feature` 的落表方案:`window` 落 `audio_object`,模型身份落 `feature_fact.model_name/model_version/feature_set_name`,具体 `fingerprint/embedding` 也统一落 `feature_fact`。 | 6 | - 重写 `docs/postgresql-data-model.md`,明确 `保存切片的数据 + 模型 + feature` 的落表方案:`window` 落 `audio_object`,模型身份落 `feature_fact.model_name/model_version/feature_set_name`,具体 `fingerprint/embedding` 也统一落 `feature_fact`。 |
| 7 | - 重写 `docs/postgres_db_schema_samples.md` 与入口文档,补充当前 4 表主链的流程图、典型 SQL 样例、查询回溯路径与写入顺序,统一文档口径到 `media_entity -> audio_object -> feature_fact -> set_membership`。 | 7 | - 重写 `docs/postgres_db_schema_samples.md` 与入口文档,补充当前 4 表主链的流程图、典型 SQL 样例、查询回溯路径与写入顺序,统一文档口径到 `media_entity -> audio_object -> feature_fact -> set_membership`。 | ... | ... |
| ... | @@ -33,7 +33,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. | ... | @@ -33,7 +33,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. |
| 33 | - `semantic_runtime_missing = []` | 33 | - `semantic_runtime_missing = []` |
| 34 | - `semantic_runtime_ready_count = 5` | 34 | - `semantic_runtime_ready_count = 5` |
| 35 | - `semantic_fallback_count = 0` | 35 | - `semantic_fallback_count = 0` |
| 36 | - `import_counts = media_entity:9 / audio_object:22 / feature_fact:29 / set_membership:9` | 36 | - `import_counts = media_entity:9 / audio_object:22 / feature_fact:34 / set_membership:9` |
| 37 | 37 | ||
| 38 | --- | 38 | --- |
| 39 | 39 | ||
| ... | @@ -122,10 +122,10 @@ flowchart TD | ... | @@ -122,10 +122,10 @@ flowchart TD |
| 122 | 122 | ||
| 123 | - `torch / torchaudio / transformers` 已可导入 | 123 | - `torch / torchaudio / transformers` 已可导入 |
| 124 | - 当前 `semantic_runtime_available = true` | 124 | - 当前 `semantic_runtime_available = true` |
| 125 | - 当前 semantic 仍不是 `MERT / MuQ`,而是 `semantic_runtime_ready_placeholder` | 125 | - 当前 semantic 已接上真实 `mert-v1-95m` baseline |
| 126 | 126 | ||
| 127 | 这说明当前主要 blocker 已从“依赖缺失”推进为: | 127 | 这说明当前主要 blocker 已从“依赖缺失”推进为: |
| 128 | > **runtime 已就绪,但真实 semantic adapter 还没接入。** | 128 | > **runtime 已就绪,真实 `MERT` baseline 已接入,下一步可继续接 `MuQ`。** |
| 129 | 129 | ||
| 130 | --- | 130 | --- |
| 131 | 131 | ||
| ... | @@ -174,7 +174,7 @@ flowchart TD | ... | @@ -174,7 +174,7 @@ flowchart TD |
| 174 | - exact lane 已优先复用 `ChromaprintMatcher` | 174 | - exact lane 已优先复用 `ChromaprintMatcher` |
| 175 | - semantic lane 还没有真实接入 `MERT / MuQ` | 175 | - semantic lane 还没有真实接入 `MERT / MuQ` |
| 176 | - runtime 就绪时,当前会产出: | 176 | - runtime 就绪时,当前会产出: |
| 177 | - `model_name = semantic_runtime_ready_placeholder` | 177 | - `model_name = mert-v1-95m` |
| 178 | - fallback 分支仍保留: | 178 | - fallback 分支仍保留: |
| 179 | - `model_name = local_wavehash_embed` | 179 | - `model_name = local_wavehash_embed` |
| 180 | 180 | ... | ... |
| ... | @@ -31,7 +31,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. | ... | @@ -31,7 +31,7 @@ acr-engine/scripts/start_songcentric_shortest_path.sh 'postgres://d2:d2pass@127. |
| 31 | - `semantic_runtime_missing = []` | 31 | - `semantic_runtime_missing = []` |
| 32 | - `semantic_runtime_ready_count = 5` | 32 | - `semantic_runtime_ready_count = 5` |
| 33 | - `semantic_fallback_count = 0` | 33 | - `semantic_fallback_count = 0` |
| 34 | - `import_counts = media_entity:9 / audio_object:22 / feature_fact:29 / set_membership:9` | 34 | - `import_counts = media_entity:9 / audio_object:22 / feature_fact:34 / set_membership:9` |
| 35 | 35 | ||
| 36 | --- | 36 | --- |
| 37 | 37 | ||
| ... | @@ -100,7 +100,7 @@ flowchart TD | ... | @@ -100,7 +100,7 @@ flowchart TD |
| 100 | - 真实目录 -> manifest -> import 已打通 | 100 | - 真实目录 -> manifest -> import 已打通 |
| 101 | - 真实目录 -> fingerprint enrichment -> import 已打通 | 101 | - 真实目录 -> fingerprint enrichment -> import 已打通 |
| 102 | - semantic lane 已做成 runtime-ready | 102 | - semantic lane 已做成 runtime-ready |
| 103 | - 当前 host 已能进入 runtime-ready placeholder 分支,下一步只差接真实 `MERT / MuQ` | 103 | - 当前 host 已能进入 runtime-ready placeholder 分支,下一步可在不破坏当前 MERT 基线的前提下继续接 `MuQ` |
| 104 | - 当前 exact lane 已优先复用仓库内 `ChromaprintMatcher` | 104 | - 当前 exact lane 已优先复用仓库内 `ChromaprintMatcher` |
| 105 | 105 | ||
| 106 | --- | 106 | --- |
| ... | @@ -108,14 +108,14 @@ flowchart TD | ... | @@ -108,14 +108,14 @@ flowchart TD |
| 108 | ## 7. 当前最该继续什么 | 108 | ## 7. 当前最该继续什么 |
| 109 | 109 | ||
| 110 | ### 第一优先级 | 110 | ### 第一优先级 |
| 111 | 把 semantic lane 从 `semantic_runtime_ready_placeholder` 升级成真实 encoder adapter,且不破坏现有宿主链。 | 111 | 把 semantic lane 从 `mert-v1-95m` baseline 扩展到 `MuQ` challenger,且不破坏现有宿主链。 |
| 112 | 112 | ||
| 113 | ### 当前 host 事实 | 113 | ### 当前 host 事实 |
| 114 | - `torch` 已可导入 | 114 | - `torch` 已可导入 |
| 115 | - `torchaudio` 已可导入 | 115 | - `torchaudio` 已可导入 |
| 116 | - `transformers` 已可导入 | 116 | - `transformers` 已可导入 |
| 117 | - 当前 `semantic_runtime_available = true` | 117 | - 当前 `semantic_runtime_available = true` |
| 118 | - 当前最新主链产出仍是 `semantic_runtime_ready_placeholder`,不是真实 `MERT / MuQ` | 118 | - 当前最新主链产出已经是 `mert-v1-95m`;下一步可继续补 `MuQ` challenger |
| 119 | 119 | ||
| 120 | --- | 120 | --- |
| 121 | 121 | ... | ... |
-
Please register or sign in to post a comment