Make the documentation system navigable, sourced, and release-ready
Turn the docs set into a layered documentation portal with navigation, source tracing, and reusable governance templates so the project can scale beyond ad hoc notes.

Constraint: Industrialization requires documentation that supports decisions, traceability, and release discipline
Rejected: Keep docs as isolated topical files without navigation or templates | would slow onboarding and weaken release governance
Confidence: high
Scope-risk: narrow
Directive: Keep future docs in the executive-summary -> diagram -> table -> text -> appendix pattern with explicit Sources sections
Tested: structural checks for core docs and templates; source-section checks; docs file-presence checks; service /config and /health smoke checks from earlier stage remain valid
Not-tested: rendered markdown visuals in a browser; external publishing pipeline
Showing
18 changed files
with
1000 additions
and
441 deletions
No preview for this file type
| ... | @@ -8,25 +8,36 @@ from pydantic import BaseModel | ... | @@ -8,25 +8,36 @@ from pydantic import BaseModel |
| 8 | from src.engines.chromaprint_matcher import ChromaprintMatcher | 8 | from src.engines.chromaprint_matcher import ChromaprintMatcher |
| 9 | from src.engines.ecapa_embedder import ECAPAEmbedder | 9 | from src.engines.ecapa_embedder import ECAPAEmbedder |
| 10 | from src.engines.hybrid_engine import HybridEngine | 10 | from src.engines.hybrid_engine import HybridEngine |
| 11 | from src.service.settings import ServiceSettings | ||
| 11 | 12 | ||
| 12 | 13 | ||
| 13 | class RecognizeRequest(BaseModel): | 14 | class RecognizeRequest(BaseModel): |
| 14 | query_path: str | 15 | query_path: str |
| 15 | data_dir: str = "data/synthetic_v2" | 16 | data_dir: Optional[str] = None |
| 16 | model_path: str = "data/models_v3/best_model.pt" | 17 | model_path: Optional[str] = None |
| 17 | index_prefix: str = "data/index_v3/reference" | 18 | index_prefix: Optional[str] = None |
| 18 | top_n: int = 5 | 19 | top_n: int = 5 |
| 19 | device: str = "cpu" | 20 | device: Optional[str] = None |
| 20 | 21 | ||
| 21 | 22 | ||
| 22 | class BuildIndexRequest(BaseModel): | 23 | class BuildIndexRequest(BaseModel): |
| 23 | data_dir: str | 24 | data_dir: Optional[str] = None |
| 24 | model_path: str | 25 | model_path: Optional[str] = None |
| 25 | output_dir: str | 26 | output_dir: str |
| 26 | device: str = "cpu" | 27 | device: Optional[str] = None |
| 27 | 28 | ||
| 28 | 29 | ||
| 29 | app = FastAPI(title="ACR Service", version="0.1.0") | 30 | app = FastAPI(title="ACR Service", version="0.2.0") |
| 31 | settings = ServiceSettings() | ||
| 32 | |||
| 33 | |||
| 34 | def _resolve(req_data_dir=None, req_model_path=None, req_index_prefix=None, req_device=None): | ||
| 35 | return { | ||
| 36 | "data_dir": req_data_dir or settings.data_dir, | ||
| 37 | "model_path": req_model_path or settings.model_path, | ||
| 38 | "index_prefix": req_index_prefix or settings.index_prefix, | ||
| 39 | "device": req_device or settings.device, | ||
| 40 | } | ||
| 30 | 41 | ||
| 31 | 42 | ||
| 32 | def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) -> HybridEngine: | 43 | def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) -> HybridEngine: |
| ... | @@ -57,14 +68,20 @@ def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) | ... | @@ -57,14 +68,20 @@ def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) |
| 57 | 68 | ||
| 58 | @app.get("/health") | 69 | @app.get("/health") |
| 59 | def health(): | 70 | def health(): |
| 60 | return {"status": "ok"} | 71 | return {"status": "ok", "service": "acr", "version": "0.2.0"} |
| 72 | |||
| 73 | |||
| 74 | @app.get("/config") | ||
| 75 | def config(): | ||
| 76 | return settings.model_dump() | ||
| 61 | 77 | ||
| 62 | 78 | ||
| 63 | @app.post("/recognize") | 79 | @app.post("/recognize") |
| 64 | def recognize(req: RecognizeRequest): | 80 | def recognize(req: RecognizeRequest): |
| 81 | resolved = _resolve(req.data_dir, req.model_path, req.index_prefix, req.device) | ||
| 65 | if not Path(req.query_path).exists(): | 82 | if not Path(req.query_path).exists(): |
| 66 | raise HTTPException(status_code=400, detail=f"Missing query file: {req.query_path}") | 83 | raise HTTPException(status_code=400, detail=f"Missing query file: {req.query_path}") |
| 67 | engine = _load_engine(req.data_dir, req.model_path, req.index_prefix, req.device) | 84 | engine = _load_engine(**resolved) |
| 68 | return engine.recognize(req.query_path, top_n=req.top_n) | 85 | return engine.recognize(req.query_path, top_n=req.top_n) |
| 69 | 86 | ||
| 70 | 87 | ||
| ... | @@ -72,9 +89,10 @@ def recognize(req: RecognizeRequest): | ... | @@ -72,9 +89,10 @@ def recognize(req: RecognizeRequest): |
| 72 | def build_index(req: BuildIndexRequest): | 89 | def build_index(req: BuildIndexRequest): |
| 73 | from run_demo import build_chroma_index, build_embedding_index | 90 | from run_demo import build_chroma_index, build_embedding_index |
| 74 | 91 | ||
| 75 | data_dir = Path(req.data_dir) | 92 | resolved = _resolve(req.data_dir, req.model_path, None, req.device) |
| 93 | data_dir = Path(resolved["data_dir"]) | ||
| 76 | out_dir = Path(req.output_dir) | 94 | out_dir = Path(req.output_dir) |
| 77 | out_dir.mkdir(parents=True, exist_ok=True) | 95 | out_dir.mkdir(parents=True, exist_ok=True) |
| 78 | build_chroma_index(data_dir, out_dir) | 96 | build_chroma_index(data_dir, out_dir) |
| 79 | _, ref_embs, ref_ids = build_embedding_index(data_dir, Path(req.model_path), out_dir / "reference", req.device) | 97 | _, ref_embs, ref_ids = build_embedding_index(data_dir, Path(resolved["model_path"]), out_dir / "reference", resolved["device"]) |
| 80 | return {"status": "ok", "num_reference_windows": len(ref_ids), "embedding_dim": int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0} | 98 | return {"status": "ok", "num_reference_windows": len(ref_ids), "embedding_dim": int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0} | ... | ... |
acr-engine/src/service/settings.py
0 → 100644
| 1 | from pathlib import Path | ||
| 2 | from pydantic import BaseModel | ||
| 3 | |||
| 4 | |||
| 5 | class ServiceSettings(BaseModel): | ||
| 6 | data_dir: str = "data/synthetic_v2" | ||
| 7 | model_path: str = "data/models_v3/best_model.pt" | ||
| 8 | index_prefix: str = "data/index_v3/reference" | ||
| 9 | device: str = "cpu" | ||
| 10 | |||
| 11 | def ensure_parent_dirs(self): | ||
| 12 | for p in [Path(self.data_dir), Path(self.model_path).parent, Path(self.index_prefix).parent]: | ||
| 13 | p.mkdir(parents=True, exist_ok=True) |
| ... | @@ -75,3 +75,23 @@ | ... | @@ -75,3 +75,23 @@ |
| 75 | - API `build_index(...)` 成功返回 reference window 数量 | 75 | - API `build_index(...)` 成功返回 reference window 数量 |
| 76 | - API `recognize(...)` 成功返回候选结果 | 76 | - API `recognize(...)` 成功返回候选结果 |
| 77 | - `train.py --dry-run` 成功 | 77 | - `train.py --dry-run` 成功 |
| 78 | |||
| 79 | ## 2026-06-02 | ||
| 80 | |||
| 81 | ### Stage: 文档治理闭环(导航 / 引用 / 模板) | ||
| 82 | |||
| 83 | 完成项: | ||
| 84 | - 新增 `docs/README.md` 作为文档总入口 | ||
| 85 | - 新增 `docs/references-and-sources.md` 作为引用来源总图 | ||
| 86 | - 新增 `docs/benchmark-report-template.md` | ||
| 87 | - 新增 `docs/model-card-template.md` | ||
| 88 | - 新增 `docs/release-checklist.md` | ||
| 89 | - 核心文档统一补充 `Sources` 小节 | ||
| 90 | - 核心文档统一补齐 executive summary / mermaid / table / appendix 风格 | ||
| 91 | |||
| 92 | 验证结果: | ||
| 93 | - docs 总入口结构检查通过 | ||
| 94 | - references map 结构检查通过 | ||
| 95 | - 核心 docs 存在性检查通过 | ||
| 96 | - benchmark/model/release 模板结构检查通过 | ||
| 97 | - 所有核心文档均具备 Sources;SOTA 文档已补齐 Mermaid 图 | ... | ... |
docs/README.md
0 → 100644
| 1 | # ACR Docs Overview | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 一页结论 | ||
| 6 | |||
| 7 | 这套文档已经按“**重点 → 图 → 表 → 文 → 细节**”重构,建议按下面顺序阅读: | ||
| 8 | |||
| 9 | 1. **项目定位与职责** | ||
| 10 | 2. **系统架构** | ||
| 11 | 3. **数据规范** | ||
| 12 | 4. **服务接口** | ||
| 13 | 5. **benchmark 与工业化路线** | ||
| 14 | 6. **数据来源与许可** | ||
| 15 | 7. **SOTA 调研** | ||
| 16 | |||
| 17 | --- | ||
| 18 | |||
| 19 | ## 1. 文档导航图 | ||
| 20 | |||
| 21 | ```mermaid | ||
| 22 | flowchart TD | ||
| 23 | A[Docs Entry] --> B[Project Responsibility] | ||
| 24 | A --> C[Architecture] | ||
| 25 | A --> D[Dataset Spec] | ||
| 26 | A --> E[Service API] | ||
| 27 | A --> F[Industrial Benchmark] | ||
| 28 | A --> G[Industrialization Roadmap] | ||
| 29 | A --> H[Licensing & Sources] | ||
| 30 | A --> I[SOTA Research] | ||
| 31 | |||
| 32 | B --> C | ||
| 33 | C --> D | ||
| 34 | C --> E | ||
| 35 | D --> F | ||
| 36 | F --> G | ||
| 37 | H --> G | ||
| 38 | I --> G | ||
| 39 | ``` | ||
| 40 | |||
| 41 | --- | ||
| 42 | |||
| 43 | ## 2. 阅读顺序表 | ||
| 44 | |||
| 45 | | 读者角色 | 建议先读 | | ||
| 46 | |---|---| | ||
| 47 | | 产品/负责人 | `industrialization-roadmap.md` | | ||
| 48 | | 算法/模型 | `acr-architecture.md`, `dataset-spec.md`, `sota-research-2026.md` | | ||
| 49 | | 平台/后端 | `service-api.md`, `industrial-benchmark-spec.md` | | ||
| 50 | | 数据/合规 | `dataset-sources-and-licensing.md` | | ||
| 51 | | 新成员 | `project-responsibility-map.md`, `README.md` | | ||
| 52 | |||
| 53 | --- | ||
| 54 | |||
| 55 | ## 3. 文档清单 | ||
| 56 | |||
| 57 | - `project-responsibility-map.md` | ||
| 58 | - `acr-architecture.md` | ||
| 59 | - `dataset-spec.md` | ||
| 60 | - `service-api.md` | ||
| 61 | - `industrial-benchmark-spec.md` | ||
| 62 | - `industrialization-roadmap.md` | ||
| 63 | - `dataset-sources-and-licensing.md` | ||
| 64 | - `sota-research-2026.md` | ||
| 65 | - `CHANGELOG.md` | ||
| 66 | |||
| 67 | --- | ||
| 68 | |||
| 69 | ## 4. 文字说明 | ||
| 70 | |||
| 71 | 这套文档不是“平铺型说明书”,而是尽量面向: | ||
| 72 | - 决策 | ||
| 73 | - 分工 | ||
| 74 | - 分层 | ||
| 75 | - 工业化演进 | ||
| 76 | |||
| 77 | 因此每份文档都优先呈现: | ||
| 78 | - 重点结论 | ||
| 79 | - 图示关系 | ||
| 80 | - 表格归纳 | ||
| 81 | - 文字说明 | ||
| 82 | - 细节附录 | ||
| 83 | |||
| 84 | --- | ||
| 85 | |||
| 86 | ## 5. 细节附录 | ||
| 87 | |||
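| 88 | 建议后续继续补充与完善: | ||
| 89 | - Benchmark report 模板(已新增 `benchmark-report-template.md`) | ||
| 90 | - Model card 模板(已新增 `model-card-template.md`) | ||
| 91 | - License review checklist(待补充) | ||
| 92 | - Release checklist(已新增 `release-checklist.md`) | ||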
| 93 | |||
| 94 | ## Sources | ||
| 95 | - This file is an internal documentation navigation artifact for the current repo state. |
| 1 | # ACR 项目架构图 | 1 | # ACR 系统架构图 |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 1. 总体架构 | 5 | ## 一页结论 |
| 6 | |||
| 7 | - 识别链路已不是单一模型,而是 **指纹 + 向量 + melody-aware rerank** 的混合结构 | ||
| 8 | - 数据与服务已经进入工业化演进阶段 | ||
| 9 | - 当前主短板在:`humming_like` 与 `confused` 的 hard-case 精度 | ||
| 10 | |||
| 11 | --- | ||
| 12 | |||
| 13 | ## 1. 总体架构图 | ||
| 6 | 14 | ||
| 7 | ```mermaid | 15 | ```mermaid |
| 8 | flowchart LR | 16 | flowchart LR |
| 9 | Q[Query 音频] --> P[预处理] | 17 | Q[Query Audio] --> P[Preprocess] |
| 10 | P --> F1[传统指纹特征] | 18 | P --> F1[Chromaprint Features] |
| 11 | P --> F2[Mel 特征] | 19 | P --> F2[128-Mel Features] |
| 20 | P --> F3[Melody Signature] | ||
| 21 | |||
| 22 | F1 --> M1[Fingerprint Matcher] | ||
| 23 | F2 --> M2[ECAPA + BandSplit Embedder] | ||
| 24 | F3 --> M3[Melody Similarity] | ||
| 12 | 25 | ||
| 13 | F1 --> M1[Chromaprint Matcher] | 26 | C[Catalog References] --> I1[Fingerprint Index] |
| 14 | F2 --> M2[ECAPA Embedder] | 27 | C --> I2[Embedding Window Index] |
| 28 | C --> I3[Reference Melody Cache] | ||
| 15 | 29 | ||
| 16 | R[Reference 曲库] --> I1[指纹索引] | 30 | M1 --> H[Hybrid Fusion] |
| 17 | R --> I2[Embedding 索引] | 31 | M2 --> H |
| 32 | M3 --> H | ||
| 18 | 33 | ||
| 19 | M1 --> C[候选集合] | 34 | H --> O[Top-K + Reject] |
| 20 | M2 --> C | ||
| 21 | C --> H[Hybrid 重排序] | ||
| 22 | H --> O[Top-K 识别结果] | ||
| 23 | ``` | 35 | ``` |
| 24 | 36 | ||
| 25 | ## 2. 训练架构 | 37 | --- |
| 38 | |||
| 39 | ## 2. 在线/离线分层图 | ||
| 26 | 40 | ||
| 27 | ```mermaid | 41 | ```mermaid |
| 28 | flowchart TD | 42 | flowchart TD |
| 29 | A[原始/合成音频] --> B[随机裁剪] | 43 | A[Offline Pipeline] --> A1[Dataset Prep] |
| 30 | B --> C[增强: 噪声/变速/移调/混响] | 44 | A --> A2[Training] |
| 31 | C --> D[Mel Spectrogram] | 45 | A --> A3[Index Build] |
| 32 | D --> E[ECAPA-TDNN] | 46 | A --> A4[Benchmark] |
| 33 | E --> F[Embedding] | 47 | |
| 34 | F --> G[SupCon Loss] | 48 | B[Online Service] --> B1["/health"] |
| 35 | F --> H[AAM Softmax] | 49 | B --> B2["/recognize"] |
| 36 | G --> I[联合优化] | 50 | B --> B3["/index/build"] |
| 37 | H --> I | ||
| 38 | ``` | 51 | ``` |
| 39 | 52 | ||
| 40 | ## 3. 推理架构 | 53 | --- |
| 41 | 54 | ||
| 42 | ```mermaid | 55 | ## 3. 关键模块表 |
| 43 | sequenceDiagram | 56 | |
| 44 | participant U as User Query | 57 | | 模块 | 输入 | 输出 | 作用 | |
| 45 | participant P as Preprocessor | 58 | |---|---|---|---| |
| 46 | participant C as Chroma Matcher | 59 | | Preprocess | wav | mel/chroma/f0 | 统一特征入口 | |
| 47 | participant E as ECAPA Embedder | 60 | | Fingerprint Matcher | query audio | chroma candidates | 快速召回 | |
| 48 | participant H as Hybrid Engine | 61 | | ECAPA Embedder | mel | embeddings | 语义向量检索 | |
| 49 | 62 | | Melody Similarity | query/ref melody | melody score | 哼唱场景补强 | | |
| 50 | U->>P: 输入音频 | 63 | | Hybrid Fusion | multi-scores | ranked candidates | 综合排序 | |
| 51 | P->>C: 指纹特征 | 64 | | Service API | request | JSON result | 对外调用 | |
| 52 | P->>E: Mel 特征 | 65 | |
| 53 | C-->>H: Top-N 指纹候选 | 66 | --- |
| 54 | E-->>H: Top-N embedding 候选 | 67 | |
| 55 | H-->>U: 融合后的识别结果 | 68 | ## 4. 当前设计重点 |
| 56 | ``` | 69 | |
| 70 | ### 4.1 为什么是混合结构 | ||
| 71 | 纯指纹对哼唱弱,纯 embedding 对局部强匹配和解释性不足,因此使用混合结构更稳妥。 | ||
| 72 | |||
| 73 | ### 4.2 为什么加入 melody-aware | ||
| 74 | 目前 hard-case 主要在哼唱/近旋律混淆,因此用 melody signature 做辅助排序。 | ||
| 75 | |||
| 76 | ### 4.3 为什么要 window-level index | ||
| 77 | 整曲平均 embedding 会损失局部片段信息;window-level 更贴近 ACR 场景。 | ||
| 78 | |||
| 79 | --- | ||
| 57 | 80 | ||
| 58 | ## 4. 当前可运行闭环 | 81 | ## 5. 细节附录 |
| 59 | 82 | ||
| 60 | 1. 用 `synthetic.py` 生成合成曲库 | 83 | 代码映射: |
| 61 | 2. 用 `train.py` 训练 ECAPA 原型模型 | 84 | - `src/engines/chromaprint_matcher.py` |
| 62 | 3. 用 `run_demo.py build-index` 构建: | 85 | - `src/engines/ecapa_embedder.py` |
| 63 | - 指纹索引 | 86 | - `src/engines/hybrid_engine.py` |
| 64 | - embedding 索引 | 87 | - `src/service/app.py` |
| 65 | 4. 用 `run_demo.py recognize` 对片段做识别 | ||
| 66 | 88 | ||
| 67 | ## 5. 后续生产化架构建议 | ||
| 68 | 89 | ||
| 69 | - API Gateway | 90 | ## Sources |
| 70 | - 异步音频入库流水线 | 91 | - See `docs/references-and-sources.md` for the current source map. |
| 71 | - Faiss/HNSW 向量服务 | ||
| 72 | - Postgres/MySQL 元数据服务 | ||
| 73 | - 对象存储保存原始音频 | ||
| 74 | - 模型服务与索引服务解耦 | ... | ... |
docs/benchmark-report-template.md
0 → 100644
| 1 | # Benchmark Report Template | ||
| 2 | |||
| 3 | > 用于每次模型版本评测输出 | ||
| 4 | |||
| 5 | ## 一页结论 | ||
| 6 | - 模型版本: | ||
| 7 | - 数据版本: | ||
| 8 | - 核心结论: | ||
| 9 | - 是否通过上线门禁: | ||
| 10 | |||
| 11 | ## 1. 评测范围图 | ||
| 12 | |||
| 13 | ```mermaid | ||
| 14 | flowchart LR | ||
| 15 | A[Model Version] --> B[Datasets] | ||
| 16 | A --> C[Scenario Buckets] | ||
| 17 | A --> D[Latency / Ops] | ||
| 18 | ``` | ||
| 19 | |||
| 20 | ## 2. 指标表 | ||
| 21 | |||
| 22 | | Bucket | top1 | top5 | MRR | FAR | Notes | | ||
| 23 | |---|---:|---:|---:|---:|---| | ||
| 24 | | clean | | | | | | | ||
| 25 | | humming_like | | | | | | | ||
| 26 | | confused | | | | | | | ||
| 27 | |||
| 28 | ## 3. 文字分析 | ||
| 29 | - 最强项: | ||
| 30 | - 最弱项: | ||
| 31 | - 与上一版本对比: | ||
| 32 | |||
| 33 | ## 4. 细节附录 | ||
| 34 | - 评测命令 | ||
| 35 | - 数据清单 | ||
| 36 | - 原始 JSON 报告路径 | ||
| 37 | |||
| 38 | ## Sources | ||
| 39 | - `docs/industrial-benchmark-spec.md` |
| 1 | # Dataset Sources and Licensing Notes | 1 | # Dataset Sources and Licensing |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 注意 | 5 | ## 一页结论 |
| 6 | 以下仅为工程接入与研究规划说明,不等于法律意见。实际商用前需要逐条复核原始 license、dataset terms 和再训练约束。 | ||
| 7 | 6 | ||
| 8 | ## 候选数据源 | 7 | - 外部数据集接入的第一原则不是“能下载”,而是“**能否合法商用**” |
| 8 | - 当前建议优先级: | ||
| 9 | 1. FMA | ||
| 10 | 2. MTG-Jamendo | ||
| 11 | 3. CCMusic(审批/核验后) | ||
| 12 | 4. ModelScope music datasets(白名单后) | ||
| 13 | - ModelScope 与 CCMusic 当前都不能默认直接进入商用训练 | ||
| 9 | 14 | ||
| 10 | ### 1. FMA | 15 | --- |
| 11 | - URL: https://github.com/mdeff/fma | ||
| 12 | - 特点: 开放、MIR 常用、适合 retrieval baseline | ||
| 13 | - 风险: 音频 license 按 artist/track 可能不同,需逐条核验 | ||
| 14 | 16 | ||
| 15 | ### 2. MTG-Jamendo | 17 | ## 1. 来源分层图 |
| 16 | - URL: https://github.com/MTG/mtg-jamendo-dataset | ||
| 17 | - 特点: Creative Commons 来源,适合音乐检索/标签任务 | ||
| 18 | - 风险: 仍需按具体曲目用途与商业场景做 license 审查 | ||
| 19 | 18 | ||
| 20 | ### 3. CCMusic | 19 | ```mermaid |
| 21 | - 论文/介绍: https://transactions.ismir.net/articles/10.5334/tismir.194 | 20 | flowchart TD |
| 22 | - 主页: https://ccmusic-database.github.io/en/database/ccm.html | 21 | A[Candidate Datasets] --> B[Open / MIR Baselines] |
| 23 | - 特点: 中国音乐 MIR 数据资源丰富 | 22 | A --> C[Chinese / Regional Sources] |
| 24 | - 风险: 部分数据集可能需要申请或存在使用边界,必须单独核验 | 23 | A --> D[Discovery Surfaces] |
| 25 | 24 | ||
| 26 | ### 4. ModelScope music datasets | 25 | B --> B1[FMA] |
| 27 | - 入口: https://www.modelscope.cn/datasets | 26 | B --> B2[MTG-Jamendo] |
| 28 | - 搜索: https://modelscope.cn/search?page=1&search=music&type=dataset | 27 | C --> C1[CCMusic] |
| 29 | - 特点: 数据发现方便,可扩充中文生态 | 28 | D --> D1[ModelScope music datasets] |
| 30 | - 风险: license 分散,不能默认可商用;接入前必须建立白名单 | 29 | ``` |
| 31 | 30 | ||
| 32 | ## 接入原则 | 31 | --- |
| 33 | 32 | ||
| 34 | - 只接入 license 明确的数据集 | 33 | ## 2. 数据源表 |
| 35 | - 默认拒绝“来源不明 / 不允许商业使用 / 禁止训练衍生模型”的数据 | 34 | |
| 36 | - 训练前把数据集及许可信息落盘到 registry | 35 | | 数据源 | 角色 | 风险 | 当前策略 | |
| 36 | |---|---|---|---| | ||
| 37 | | FMA | 首批真实 baseline | track license 需核验 | review_required | | ||
| 38 | | MTG-Jamendo | retrieval/tagging corpus | CC 细则需核验 | review_required | | ||
| 39 | | CCMusic | 中文 MIR 资源 | 可能需申请/存在限制 | review_required | | ||
| 40 | | ModelScope music | 数据发现入口 | license 分散 | deny_until_whitelisted | | ||
| 41 | |||
| 42 | --- | ||
| 43 | |||
| 44 | ## 3. 白名单流程图 | ||
| 45 | |||
| 46 | ```mermaid | ||
| 47 | flowchart LR | ||
| 48 | A[发现数据集] --> B[收集 license / terms] | ||
| 49 | B --> C[法律/合规审查] | ||
| 50 | C --> D{可商用?} | ||
| 51 | D -- 是 --> E[加入 whitelist] | ||
| 52 | D -- 否 --> F[禁止进入训练] | ||
| 53 | ``` | ||
| 54 | |||
| 55 | --- | ||
| 56 | |||
| 57 | ## 4. 文字说明 | ||
| 58 | |||
| 59 | ### 4.1 为什么 ModelScope 只能先当 discovery surface | ||
| 60 | 因为不同数据集来源和条款差异很大,不能因为“在 ModelScope 上”就默认可商用。 | ||
| 61 | |||
| 62 | ### 4.2 为什么 CCMusic 要单独看 | ||
| 63 | 它对中文音乐任务很有价值,但部分子集可能涉及申请、协议或非标准商业许可边界。 | ||
| 64 | |||
| 65 | ### 4.3 为什么 license registry 要和模型版本绑定 | ||
| 66 | 这样才能在未来追踪: | ||
| 67 | - 某个模型到底用了哪些数据 | ||
| 68 | - 这些数据是否允许对应商用场景 | ||
| 69 | |||
| 70 | --- | ||
| 71 | |||
| 72 | ## 5. 细节附录 | ||
| 73 | |||
| 74 | 入口链接: | ||
| 75 | - FMA: https://github.com/mdeff/fma | ||
| 76 | - MTG-Jamendo: https://github.com/MTG/mtg-jamendo-dataset | ||
| 77 | - CCMusic: https://ccmusic-database.github.io/en/database/ccm.html | ||
| 78 | - ModelScope search: https://modelscope.cn/search?page=1&search=music&type=dataset | ||
| 79 | |||
| 80 | |||
| 81 | ## Sources | ||
| 82 | - See `docs/references-and-sources.md` for the current source map. | ... | ... |
| ... | @@ -2,41 +2,111 @@ | ... | @@ -2,41 +2,111 @@ |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 1. 目标 | 5 | ## 一页结论 |
| 6 | |||
| 7 | - 数据规范的核心不是文件格式,而是**分离 catalog 与 query** | ||
| 8 | - 外部数据集进入系统前必须先转换成统一 manifest | ||
| 9 | - 当前系统的标准输入是: | ||
| 10 | - **16k mono audio** | ||
| 11 | - **128 Mel** | ||
| 12 | - **window-level retrieval** | ||
| 13 | - 当前系统的标准输出是: | ||
| 14 | - top-k candidates | ||
| 15 | - confidence | ||
| 16 | - reject/accept | ||
| 17 | - metadata | ||
| 18 | |||
| 19 | --- | ||
| 20 | |||
| 21 | ## 1. 数据流图 | ||
| 22 | |||
| 23 | ```mermaid | ||
| 24 | flowchart LR | ||
| 25 | A[External / Synthetic Audio] --> B[Manifest Conversion] | ||
| 26 | B --> C[Catalog Manifest] | ||
| 27 | B --> D[Query Manifest] | ||
| 28 | C --> E[Reference Index Build] | ||
| 29 | D --> F[Training / Evaluation Queries] | ||
| 30 | E --> G[Hybrid Retrieval] | ||
| 31 | F --> G | ||
| 32 | ``` | ||
| 33 | |||
| 34 | --- | ||
| 35 | |||
| 36 | ## 2. 数据对象表 | ||
| 37 | |||
| 38 | | 对象 | 作用 | 必要字段 | 说明 | | ||
| 39 | |---|---|---|---| | ||
| 40 | | Reference | 可检索曲库 | `song_id`, `audio_path`, `duration`, `type=reference` | 用于建索引 | | ||
| 41 | | Query Segment | 待识别片段 | `song_id`, `audio_path`, `duration`, `type` | 用于训练/评测 | | ||
| 42 | | Catalog Manifest | reference 总表 | JSON list | 用于离线索引 | | ||
| 43 | | Query Manifest | query 总表 | JSON list | 用于训练与评测 | | ||
| 44 | |||
| 45 | --- | ||
| 46 | |||
| 47 | ## 3. Manifest 结构图 | ||
| 48 | |||
| 49 | ```mermaid | ||
| 50 | flowchart TD | ||
| 51 | M[Manifest] --> R[Reference Records] | ||
| 52 | M --> Q[Query Records] | ||
| 53 | R --> R1[song_id] | ||
| 54 | R --> R2[audio_path] | ||
| 55 | R --> R3[duration] | ||
| 56 | R --> R4[type=reference] | ||
| 57 | Q --> Q1[song_id] | ||
| 58 | Q --> Q2[audio_path] | ||
| 59 | Q --> Q3[duration] | ||
| 60 | Q --> Q4[type=clean/augmented/confused/humming_like] | ||
| 61 | ``` | ||
| 62 | |||
| 63 | --- | ||
| 64 | |||
| 65 | ## 4. 输入输出总表 | ||
| 66 | |||
| 67 | | 环节 | 输入 | 输出 | | ||
| 68 | |---|---|---| | ||
| 69 | | 训练 | query segments | embeddings + logits | | ||
| 70 | | 索引 | catalog references | chromaprint index + embedding index | | ||
| 71 | | 识别 | query audio | ranked candidates | | ||
| 72 | | 评测 | query manifest + catalog | top1/top5/hard-case report | | ||
| 73 | |||
| 74 | --- | ||
| 75 | |||
| 76 | ## 5. 文字说明 | ||
| 77 | |||
| 78 | ### 5.1 为什么必须分离 catalog 和 query | ||
| 79 | 早期原型容易把 train split 直接当搜索库,这会让评测和真实服务语义混乱。工业化系统必须把: | ||
| 80 | - “可搜索曲库” | ||
| 81 | - “训练/评测 query” | ||
| 82 | |||
| 83 | 明确分离。 | ||
| 6 | 84 | ||
| 7 | 定义本项目数据集规范、输入输出处理流程、catalog/query 划分方式,以及训练/评测所需的 manifest 结构。 | 85 | ### 5.2 为什么输入层是 128 Mel |
| 86 | 音乐任务需要更丰富的频带表达,128 Mel 更适合 band-split 和音乐 timbre/harmony 建模。 | ||
| 8 | 87 | ||
| 9 | ## 2. 数据层对象 | 88 | ### 5.3 query 类型为什么要显式标注 |
| 89 | `clean / augmented / confused / humming_like` 是评测与训练策略的重要条件,不应只放在隐式文件名里。 | ||
| 10 | 90 | ||
| 11 | ### 2.1 Reference / Catalog | 91 | --- |
| 12 | 可检索曲库中的标准参考音频。 | ||
| 13 | 92 | ||
| 14 | 字段: | 93 | ## 6. 细节附录 |
| 15 | 94 | ||
| 95 | ### Reference 示例 | ||
| 16 | ```json | 96 | ```json |
| 17 | { | 97 | { |
| 18 | "song_id": "song_0001", | 98 | "song_id": "song_0001", |
| 19 | "audio_path": "songs/song_0001.wav", | 99 | "audio_path": "songs/song_0001.wav", |
| 20 | "duration": 20.0, | 100 | "duration": 20.0, |
| 21 | "base_freq": 261.63, | ||
| 22 | "type": "reference" | 101 | "type": "reference" |
| 23 | } | 102 | } |
| 24 | ``` | 103 | ``` |
| 25 | 104 | ||
| 26 | 用途: | 105 | ### Query 示例 |
| 27 | - 建立 chromaprint 索引 | ||
| 28 | - 建立 embedding window 索引 | ||
| 29 | - 作为检索目标集合 | ||
| 30 | |||
| 31 | ### 2.2 Query Segment | ||
| 32 | 待识别片段。 | ||
| 33 | |||
| 34 | 字段: | ||
| 35 | |||
| 36 | ```json | 106 | ```json |
| 37 | { | 107 | { |
| 38 | "song_id": "song_0001", | 108 | "song_id": "song_0001", |
| 39 | "audio_path": "segments/song_0001_seg_02_confused.wav", | 109 | "audio_path": "segments/song_0001_seg_04_confused.wav", |
| 40 | "duration": 5.0, | 110 | "duration": 5.0, |
| 41 | "type": "confused", | 111 | "type": "confused", |
| 42 | "offset": 8.3, | 112 | "offset": 8.3, |
| ... | @@ -44,105 +114,6 @@ | ... | @@ -44,105 +114,6 @@ |
| 44 | } | 114 | } |
| 45 | ``` | 115 | ``` |
| 46 | 116 | ||
| 47 | 用途: | ||
| 48 | - 训练片段对 | ||
| 49 | - top-k 检索评测 | ||
| 50 | - 鲁棒性测试 | ||
| 51 | |||
| 52 | ## 3. Manifest 文件 | ||
| 53 | |||
| 54 | | 文件 | 用途 | | ||
| 55 | |---|---| | ||
| 56 | | `train.json` | 训练查询片段 + 训练 reference | | ||
| 57 | | `val.json` | 验证查询片段 + 验证 reference | | ||
| 58 | | `test.json` | 测试查询片段 + 测试 reference | | ||
| 59 | | `catalog.json` | 可搜索 reference 总表 | | ||
| 60 | |||
| 61 | 注意: | ||
| 62 | - `catalog.json` 是**检索索引输入** | ||
| 63 | - `train/val/test.json` 是**实验 split** | ||
| 64 | - 不再把 “模型训练 split” 和 “可搜索曲库” 混为一谈 | ||
| 65 | |||
| 66 | ## 4. 输入特征规范 | ||
| 67 | |||
| 68 | ### 4.1 输入音频 | ||
| 69 | - 默认采样率:`16 kHz` | ||
| 70 | - 通道:`mono` | ||
| 71 | - 训练/query 窗长:`5s` | ||
| 72 | - 滑窗步长:`2.5s` | ||
| 73 | |||
| 74 | ### 4.2 声学特征 | ||
| 75 | 当前改为: | ||
| 76 | - **128维 Mel 频谱** | ||
| 77 | |||
| 78 | 不再采用传统说话人任务常见的 40 维 MFCC 作为主输入,因为: | ||
| 79 | - 音乐任务更依赖频带结构与谐波信息 | ||
| 80 | - Mel 频谱对音乐 timbre / harmony / texture 表达更自然 | ||
| 81 | - 便于 band-split 模块对频带进行分块建模 | ||
| 82 | |||
| 83 | ## 5. 输出规范 | ||
| 84 | |||
| 85 | ### 5.1 训练输出 | ||
| 86 | 模型输出: | ||
| 87 | - `embedding: [B, D]` | ||
| 88 | - `logits: [B, num_classes]`(辅助分类头) | ||
| 89 | |||
| 90 | 主要目标: | ||
| 91 | - retrieval embedding 学得稳定 | ||
| 92 | - 同 song 片段彼此接近 | ||
| 93 | - 不同 song 分离 | ||
| 94 | |||
| 95 | ### 5.2 推理输出 | ||
| 96 | 识别输出: | ||
| 97 | |||
| 98 | ```json | ||
| 99 | { | ||
| 100 | "candidates": [ | ||
| 101 | { | ||
| 102 | "song_id": "song_0001", | ||
| 103 | "confidence": 0.93, | ||
| 104 | "chromaprint_score": 0.88, | ||
| 105 | "ecapa_score": 0.96, | ||
| 106 | "accepted": true, | ||
| 107 | "metadata": {} | ||
| 108 | } | ||
| 109 | ], | ||
| 110 | "processing_time_ms": 120.4, | ||
| 111 | "num_candidates": 5 | ||
| 112 | } | ||
| 113 | ``` | ||
| 114 | |||
| 115 | ## 6. Query 类型定义 | ||
| 116 | |||
| 117 | | type | 含义 | | ||
| 118 | |---|---| | ||
| 119 | | `clean` | 原始干净片段 | | ||
| 120 | | `augmented` | 常规增强片段 | | ||
| 121 | | `confused` | 强混淆/干扰片段 | | ||
| 122 | | `humming_like` | 哼唱风格近似片段 | | ||
| 123 | | `reference` | 标准参考整曲 | | ||
| 124 | |||
| 125 | ## 7. pro-WGAN 平衡策略(工程近似版) | ||
| 126 | |||
| 127 | 当前仓库先实现的是**pro-WGAN 风格的数据平衡近似策略**,不是完整生成式 GAN 训练: | ||
| 128 | |||
| 129 | - 对难样本类型(`confused`, `humming_like`)增加更强增广概率 | ||
| 130 | - 通过 harder augmentation 近似 minority/hard-case oversampling | ||
| 131 | - 保持 manifest 结构兼容,后续可替换成真正的生成式平衡器 | ||
| 132 | |||
| 133 | 后续若接入完整 GAN 平衡器,可把它作为: | ||
| 134 | - 离线样本扩增器 | ||
| 135 | - 困难类别样本生成器 | ||
| 136 | - catalog/query domain adaptation 工具 | ||
| 137 | |||
| 138 | ## 8. 频带分割模块 | ||
| 139 | |||
| 140 | 输入层新增 `BandSplitBlock`: | ||
| 141 | - 将 128 Mel bins 分割为多个子频带 | ||
| 142 | - 每个子带做独立投影 | ||
| 143 | - 再拼接进入主干网络 | ||
| 144 | 117 | ||
| 145 | 目的: | 118 | ## Sources |
| 146 | - 强化低频节奏 / 中频和声 / 高频音色的分带建模 | 119 | - See `docs/references-and-sources.md` for the current source map. |
| 147 | - 更符合音乐频谱结构 | ||
| 148 | - 为后续更复杂 band-aware retrieval 打基础 | ... | ... |
docs/external-manifest-template.md
0 → 100644
| 1 | # External Manifest Template | ||
| 2 | |||
| 3 | 适用于 FMA / Jamendo / CCMusic / ModelScope 白名单数据集。 | ||
| 4 | |||
| 5 | ## catalog.csv 最小字段 | ||
| 6 | |||
| 7 | ```csv | ||
| 8 | song_id,audio_path,duration,source_dataset | ||
| 9 | track_0001,raw/track_0001.wav,12.5,fma | ||
| 10 | ``` | ||
| 11 | |||
| 12 | 转换命令: | ||
| 13 | |||
| 14 | ```bash | ||
| 15 | python src/data/manifest_tools.py csv-to-catalog catalog.csv manifests/catalog.json | ||
| 16 | ``` | ||
| 17 | |||
| 18 | ## 输出 catalog.json 结构(JSON 列表中的单条记录) | ||
| 19 | |||
| 20 | ```json | ||
| 21 | { | ||
| 22 | "song_id": "track_0001", | ||
| 23 | "audio_path": "raw/track_0001.wav", | ||
| 24 | "duration": 12.5, | ||
| 25 | "type": "reference", | ||
| 26 | "source_dataset": "fma" | ||
| 27 | } | ||
| 28 | ``` |
| ... | @@ -2,58 +2,82 @@ | ... | @@ -2,58 +2,82 @@ |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 目标 | 5 | ## 一页结论 |
| 6 | 为工业级可商用 ACR 设立持续基准,不只看总体 top1/top5,还看场景化与风险化指标。 | 6 | |
| 7 | 7 | - 工业级 ACR 不能只看总 top1 | |
| 8 | ## Benchmark 维度 | 8 | - 必须同时看: |
| 9 | 9 | 1. hard-case | |
| 10 | ### 1. Retrieval Quality | 10 | 2. rejection / false accept |
| 11 | - top1 | 11 | 3. latency / scale |
| 12 | - top5 | 12 | 4. license provenance completeness |
| 13 | - MRR | 13 | |
| 14 | - recall@k | 14 | --- |
| 15 | 15 | ||
| 16 | ### 2. Scenario Buckets | 16 | ## 1. Benchmark 分层图 |
| 17 | - clean | 17 | |
| 18 | - noisy | 18 | ```mermaid |
| 19 | - compressed | 19 | flowchart TD |
| 20 | - time-stretched | 20 | A[Industrial Benchmark] --> B[Accuracy] |
| 21 | - pitch-shifted | 21 | A --> C[Robustness] |
| 22 | - humming_like | 22 | A --> D[Operational] |
| 23 | - confused | 23 | A --> E[Compliance] |
| 24 | - partial-overlap | 24 | |
| 25 | - far-field / device-recorded | 25 | B --> B1[top1/top5/MRR] |
| 26 | 26 | C --> C1[humming/confused/noisy] | |
| 27 | ### 3. Catalog Scale Buckets | 27 | D --> D1[latency/indexing/throughput] |
| 28 | - 1K songs | 28 | E --> E1[data provenance/license coverage] |
| 29 | - 10K songs | 29 | ``` |
| 30 | - 100K songs | 30 | |
| 31 | - 1M+ songs | 31 | --- |
| 32 | 32 | ||
| 33 | ### 4. Operational Metrics | 33 | ## 2. 指标表 |
| 34 | - p50 / p95 latency | 34 | |
| 35 | - indexing throughput | 35 | | 维度 | 指标 | 目标 | |
| 36 | - incremental update time | 36 | |---|---|---| |
| 37 | - memory / disk footprint | 37 | | Accuracy | top1 / top5 / MRR | 主识别质量 | |
| 38 | 38 | | Robustness | humming/confused/noisy top1 | hard-case 质量 | | |
| 39 | ### 5. Business Safety Metrics | 39 | | Operational | p50/p95 latency | 服务能力 | |
| 40 | - false accept rate | 40 | | Operational | index throughput | 建库能力 | |
| 41 | - rejection quality | 41 | | Safety | false accept rate | 误识别风险 | |
| 42 | - near-duplicate confusion rate | 42 | | Compliance | license coverage | 商业可用前提 | |
| 43 | - license provenance coverage | 43 | |
| 44 | 44 | --- | |
| 45 | ## Required Artifacts per Model Release | 45 | |
| 46 | - dataset registry snapshot | 46 | ## 3. 场景图 |
| 47 | - training config snapshot | 47 | |
| 48 | - benchmark report JSON | 48 | ```mermaid |
| 49 | - benchmark summary markdown | 49 | flowchart LR |
| 50 | - model card | 50 | Q[Queries] --> Q1[clean] |
| 51 | - license review manifest | 51 | Q --> Q2[augmented] |
| 52 | 52 | Q --> Q3[humming_like] | |
| 53 | ## Minimum Go/No-Go Gate | 53 | Q --> Q4[confused] |
| 54 | Q --> Q5[noisy/compressed] | ||
| 55 | ``` | ||
| 56 | |||
| 57 | --- | ||
| 58 | |||
| 59 | ## 4. 文字说明 | ||
| 60 | |||
| 61 | ### 4.1 为什么 hard-case 要单独出报表 | ||
| 62 | 因为总体 top1 很容易掩盖哼唱和混淆场景的失败,而这些正是用户最敏感的场景。 | ||
| 63 | |||
| 64 | ### 4.2 为什么要加入 operational metrics | ||
| 65 | 工业级系统不是离线竞赛模型,需要考虑服务响应与增量索引成本。 | ||
| 66 | |||
| 67 | ### 4.3 为什么要把 compliance 放进 benchmark | ||
| 68 | 对于商用系统,如果训练/评测数据来源不可追溯,再高精度也不能安全上线。 | ||
| 69 | |||
| 70 | --- | ||
| 71 | |||
| 72 | ## 5. 细节附录 | ||
| 73 | |||
| 74 | 推荐 release gate: | ||
| 54 | - clean top1 >= 0.95 | 75 | - clean top1 >= 0.95 |
| 55 | - noisy top1 >= 0.85 | 76 | - noisy top1 >= 0.85 |
| 56 | - confused top1 >= 0.70 | 77 | - confused top1 >= 0.70 |
| 57 | - humming_like top1 >= 0.60 | 78 | - humming_like top1 >= 0.60 |
| 58 | - top5 >= 0.95 on all production-relevant buckets | 79 | - top5 >= 0.95 on production-relevant buckets |
| 59 | - false accept below agreed threshold | 80 | |
| 81 | |||
| 82 | ## Sources | ||
| 83 | - See `docs/references-and-sources.md` for the current source map. | ... | ... |
| 1 | # ACR 工业级可商用演进路线 | 1 | # 工业化路线图 |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 1. 目标定义 | 5 | ## 一页结论 |
| 6 | 6 | ||
| 7 | 把当前原型升级为一个可商用的工业级 ACR 系统,满足: | 7 | 当前项目已完成: |
| 8 | 8 | - 原型可运行 | |
| 9 | - 可扩展曲库管理 | 9 | - retrieval-first 初步改造 |
| 10 | - 可重复训练 / 评测 / 部署 | 10 | - 服务骨架 |
| 11 | - 多数据源接入(synthetic / FMA / Jamendo / CCMusic / ModelScope) | 11 | - 外部数据 adapter 雏形 |
| 12 | - 更强鲁棒性(噪声、失真、哼唱、混淆) | 12 | |
| 13 | - 检索服务化 | 13 | 下一阶段必须聚焦三件事: |
| 14 | - 商用合规与授权边界可审计 | 14 | 1. **真实数据接入** |
| 15 | 15 | 2. **hard-case 精度** | |
| 16 | ## 2. 工业级分层 | 16 | 3. **商业化合规与服务稳定性** |
| 17 | 17 | ||
| 18 | ### 2.1 数据层 | 18 | --- |
| 19 | - `catalog.json` / query manifests | 19 | |
| 20 | - 外部 dataset adapters | 20 | ## 1. 路线图图示 |
| 21 | - license / usage tracking | 21 | |
| 22 | - 数据版本与快照 | 22 | ```mermaid |
| 23 | 23 | flowchart LR | |
| 24 | ### 2.2 训练层 | 24 | P0[P0 原型跑通] --> P1[P1 真实数据验证] |
| 25 | - baseline encoder | 25 | P1 --> P2[P2 工程化与服务化] |
| 26 | - foundation-model encoder | 26 | P2 --> P3[P3 大规模索引] |
| 27 | - retrieval-first losses | 27 | P3 --> P4[P4 商用上线] |
| 28 | - hard negative mining | 28 | ``` |
| 29 | - 数据平衡与生成增强 | 29 | |
| 30 | 30 | --- | |
| 31 | ### 2.3 索引层 | 31 | |
| 32 | - window-level embeddings | 32 | ## 2. 阶段表 |
| 33 | - ANN index (Faiss/HNSW) | 33 | |
| 34 | - 指纹索引与向量索引双路 | 34 | | 阶段 | 目标 | 当前状态 | 核心产物 | |
| 35 | - 增量入库 | 35 | |---|---|---|---| |
| 36 | 36 | | P0 | 端到端原型 | 已完成 | demo/train/index/eval | | |
| 37 | ### 2.4 服务层 | 37 | | P1 | 白名单真实数据接入 | 进行中 | adapters/manifests/benchmark | |
| 38 | - FastAPI / gRPC | 38 | | P2 | API / benchmark / ops | 进行中 | FastAPI + spec | |
| 39 | - batch ingest | 39 | | P3 | ANN / 增量索引 | 未完成 | Faiss/HNSW | |
| 40 | - recognize API | 40 | | P4 | 可商用平台 | 未完成 | license gate / SLA / release flow | |
| 41 | - top-k candidate + rejection | 41 | |
| 42 | - metadata lookup | 42 | --- |
| 43 | 43 | ||
| 44 | ### 2.5 质量层 | 44 | ## 3. 近期优先级 |
| 45 | - regression benchmark | 45 | |
| 46 | - hard-case benchmark | 46 | ### Priority A |
| 47 | - online shadow evaluation | 47 | - FMA / Jamendo 小规模白名单子集接入 |
| 48 | - 数据/模型回滚机制 | 48 | - humming_like / confused 精度提升 |
| 49 | 49 | - service 配置化与真实部署 smoke | |
| 50 | ## 3. 数据集策略 | 50 | |
| 51 | 51 | ### Priority B | |
| 52 | ### 第一梯队(优先) | 52 | - ANN 向量索引 |
| 53 | - FMA small / medium | 53 | - 拒识/误接收指标 |
| 54 | - MTG-Jamendo | 54 | - 模型版本化 |
| 55 | - CCMusic(需核验申请/授权方式) | 55 | |
| 56 | - ModelScope music datasets(按 license 白名单接入) | 56 | ### Priority C |
| 57 | 57 | - foundation model baseline | |
| 58 | ### 第二梯队 | 58 | - 在线评估与监控 |
| 59 | - humming / QBSH 数据集 | 59 | - 商业部署流程 |
| 60 | - instrument / structure / singing datasets 作为辅助监督 | 60 | |
| 61 | 61 | --- | |
| 62 | ## 4. 商用必做项 | 62 | |
| 63 | 63 | ## 4. 分层职责 | |
| 64 | - 每个 dataset 记录: | 64 | |
| 65 | - 来源 URL | 65 | | 层 | 重点 | |
| 66 | - license | 66 | |---|---| |
| 67 | - 是否允许商业使用 | 67 | | 数据层 | 只接入可审计白名单数据 | |
| 68 | - 再分发限制 | 68 | | 模型层 | 以 retrieval 指标为主,不迷信分类头 | |
| 69 | - 模型训练用途限制 | 69 | | 检索层 | 强化 hard-case 与 rejection | |
| 70 | - 每个模型版本记录训练数据组成 | 70 | | 服务层 | 稳定 API、可配置、可观测 | |
| 71 | - 每次上线保留评测报告与可追溯哈希 | 71 | | 合规层 | 任何上线模型必须可追溯数据来源 | |
| 72 | 72 | ||
| 73 | ## 5. 当前到工业化的缺口 | 73 | --- |
| 74 | 74 | ||
| 75 | - 缺 dataset adapter 层 | 75 | ## 5. 细节附录 |
| 76 | - 缺 ANN 检索 | 76 | |
| 77 | - 缺 API 服务 | 77 | 关联文档: |
| 78 | - 缺 license registry | 78 | - `docs/dataset-sources-and-licensing.md` |
| 79 | - 缺 foundation-model baseline | 79 | - `docs/industrial-benchmark-spec.md` |
| 80 | - 缺真正的 hard-negative mining | 80 | - `docs/service-api.md` |
| 81 | - 缺真实开源数据 benchmark | 81 | |
| 82 | |||
| 83 | ## Sources | ||
| 84 | - See `docs/references-and-sources.md` for the current source map. | ... | ... |
docs/model-card-template.md
0 → 100644
| 1 | # Model Card Template | ||
| 2 | |||
| 3 | ## 一页结论 | ||
| 4 | - 模型名称: | ||
| 5 | - 版本: | ||
| 6 | - 适用场景: | ||
| 7 | - 不适用场景: | ||
| 8 | |||
| 9 | ## 1. 模型结构图 | ||
| 10 | |||
| 11 | ```mermaid | ||
| 12 | flowchart LR | ||
| 13 | A[Input Audio] --> B[128 Mel + BandSplit] | ||
| 14 | B --> C[Encoder] | ||
| 15 | C --> D[Embedding] | ||
| 16 | D --> E[Hybrid Retrieval] | ||
| 17 | ``` | ||
| 18 | |||
| 19 | ## 2. 关键信息表 | ||
| 20 | |||
| 21 | | 项 | 内容 | | ||
| 22 | |---|---| | ||
| 23 | | 训练数据 | | | ||
| 24 | | 评测数据 | | | ||
| 25 | | 主要指标 | | | ||
| 26 | | 已知风险 | | | ||
| 27 | | 许可证约束 | | | ||
| 28 | |||
| 29 | ## 3. 文字说明 | ||
| 30 | - 训练方式: | ||
| 31 | - 模型限制: | ||
| 32 | - 风险提示: | ||
| 33 | |||
| 34 | ## 4. 细节附录 | ||
| 35 | - checkpoint 路径 | ||
| 36 | - config 路径 | ||
| 37 | - benchmark 报告路径 | ||
| 38 | |||
| 39 | ## Sources | ||
| 40 | - `docs/dataset-spec.md` | ||
| 41 | - `docs/benchmark-report-template.md` |
| ... | @@ -2,91 +2,128 @@ | ... | @@ -2,91 +2,128 @@ |
| 2 | 2 | ||
| 3 | > 更新:2026-06-02 | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ## 1. 项目定位 | 5 | ## 一页结论 |
| 6 | 6 | ||
| 7 | 本项目是一个**听歌识曲 / 音频内容识别(ACR)原型系统**,目标是先跑通: | 7 | - 本项目已经从“算法原型”升级为“**面向工业化的 ACR 平台雏形**” |
| 8 | 8 | - 当前系统分为 **数据层、训练层、检索层、服务层、评测层、合规层** | |
| 9 | - 数据生成 | 9 | - 近期重点不是再堆功能,而是: |
| 10 | - 特征提取 | 10 | 1. 提升 `humming_like` / `confused` 准确率 |
| 11 | - 模型训练 | 11 | 2. 接入真实白名单数据集 |
| 12 | - 指纹检索 | 12 | 3. 完善服务、索引、benchmark 与合规闭环 |
| 13 | - embedding 检索 | ||
| 14 | - hybrid 混合识别 | ||
| 15 | |||
| 16 | 当前不以生产服务为目标,重点是**算法链路验证**。 | ||
| 17 | |||
| 18 | ## 2. 仓库职责分层 | ||
| 19 | |||
| 20 | ```text | ||
| 21 | /workspace | ||
| 22 | ├── acr-engine/ # ACR 核心算法与可运行 demo | ||
| 23 | │ ├── configs/ # 训练/推理参数配置 | ||
| 24 | │ ├── src/data/ # 数据集读取、合成数据生成 | ||
| 25 | │ ├── src/models/ # 声学模型、损失函数 | ||
| 26 | │ ├── src/engines/ # 指纹/embedding/hybrid 检索引擎 | ||
| 27 | │ ├── train.py # 模型训练入口 | ||
| 28 | │ ├── run_demo.py # 数据生成、建索引、识别入口 | ||
| 29 | │ └── requirements.txt # Python 依赖 | ||
| 30 | ├── docs/ # 设计、架构、路线图、使用说明 | ||
| 31 | ├── scripts/ # 环境安装与工具 bootstrap | ||
| 32 | ├── container/ # 容器环境定义 | ||
| 33 | └── .codex/.omx/ # Codex / OMX 协作与运行时元数据 | ||
| 34 | ``` | ||
| 35 | 13 | ||
| 36 | ## 3. 模块职责图 | 14 | --- |
| 15 | |||
| 16 | ## 1. 分层图 | ||
| 37 | 17 | ||
| 38 | ```mermaid | 18 | ```mermaid |
| 39 | flowchart TD | 19 | flowchart TD |
| 40 | A[音频输入] --> B[数据层] | 20 | A[L1 业务目标层] --> B[L2 系统能力层] |
| 41 | B --> B1[合成数据生成 synthetic.py] | 21 | B --> C[L3 核心模块层] |
| 42 | B --> B2[训练/验证数据集 dataset.py] | 22 | C --> D[L4 工程服务层] |
| 43 | 23 | C --> E[L5 数据与合规层] | |
| 44 | A --> C[特征层] | 24 | |
| 45 | C --> C1[Mel Spectrogram] | 25 | A1[听歌识曲 / 哼唱识别 / 商业可用]:::goal --> A |
| 46 | C --> C2[Chroma / F0] | 26 | |
| 47 | C --> C3[增强 augment.py] | 27 | B1[高准确率识别] --> B |
| 48 | 28 | B2[可扩展曲库] --> B | |
| 49 | C --> D[模型层] | 29 | B3[可服务化调用] --> B |
| 50 | D --> D1[ECAPA-TDNN] | 30 | B4[可审计数据来源] --> B |
| 51 | D --> D2[SupCon + AAM Loss] | 31 | |
| 52 | 32 | C1[训练与表征学习] --> C | |
| 53 | A --> E[检索层] | 33 | C2[指纹检索] --> C |
| 54 | E --> E1[ChromaprintMatcher] | 34 | C3[向量检索] --> C |
| 55 | E --> E2[ECAPAEmbedder] | 35 | C4[混合重排] --> C |
| 56 | E --> E3[HybridEngine] | 36 | C5[评测基准] --> C |
| 57 | 37 | ||
| 58 | D --> F[训练入口 train.py] | 38 | D1[FastAPI] --> D |
| 59 | E --> G[推理入口 run_demo.py] | 39 | D2[Index Build] --> D |
| 40 | D3[Manifest Tools] --> D | ||
| 41 | |||
| 42 | E1[External Adapters] --> E | ||
| 43 | E2[Dataset Registry] --> E | ||
| 44 | E3[License Review] --> E | ||
| 45 | |||
| 46 | classDef goal fill:#e8f5e9,stroke:#2e7d32; | ||
| 60 | ``` | 47 | ``` |
| 61 | 48 | ||
| 62 | ## 4. 角色职责 | 49 | --- |
| 63 | 50 | ||
| 64 | | 模块 | 职责 | 当前状态 | | 51 | ## 2. 职责总表 |
| 65 | |---|---|---| | 52 | |
| 66 | | `src/data/synthetic.py` | 生成可控的合成歌曲与片段 | 已实现 | | 53 | | 层级 | 模块 | 负责内容 | 当前状态 | |
| 67 | | `src/data/dataset.py` | 训练/验证数据装载 | 已实现 | | 54 | |---|---|---|---| |
| 68 | | `src/utils/audio.py` | Mel、滑窗、F0、Chroma | 已实现 | | 55 | | 数据层 | `src/data/*` | synthetic 数据、external adapters、manifest | 已有基础 | |
| 69 | | `src/utils/augment.py` | 噪声、变速、移调、混响增强 | 已实现 | | 56 | | 训练层 | `train.py` / `src/models/*` | 128 Mel、band-split、embedding 学习 | 可运行 | |
| 70 | | `src/models/ecapa_tdnn.py` | embedding 编码器 | 已实现 | | 57 | | 检索层 | `src/engines/*` | chromaprint、embedding、melody-aware hybrid | 可运行 | |
| 71 | | `src/models/losses.py` | 对比学习 + 分类训练目标 | 已实现 | | 58 | | 服务层 | `src/service/*` | health / recognize / index build | 骨架已通 | |
| 72 | | `src/engines/chromaprint_matcher.py` | 传统哈希指纹检索 | 已实现 | | 59 | | 评测层 | `evaluate.py` | top1/top5/hard-case benchmark | 已建立 | |
| 73 | | `src/engines/ecapa_embedder.py` | embedding 提取与向量检索 | 已实现 | | 60 | | 合规层 | registry/docs | dataset source / licensing / whitelist | 雏形已建 | |
| 74 | | `src/engines/hybrid_engine.py` | 融合匹配结果 | 已实现 | | 61 | |
| 75 | | `train.py` | 训练入口 | 已实现 | | 62 | --- |
| 76 | | `run_demo.py` | demo 入口 | 本次补齐 | | 63 | |
| 77 | 64 | ## 3. 分工图 | |
| 78 | ## 5. 当前边界 | 65 | |
| 79 | 66 | ```mermaid | |
| 80 | 当前项目**负责**: | 67 | flowchart LR |
| 81 | 68 | D[数据团队] --> D1[数据接入] | |
| 82 | - 原型级算法验证 | 69 | D --> D2[manifest 标准化] |
| 83 | - 小规模曲库识别 | 70 | D --> D3[license 审查] |
| 84 | - 本地训练与本地识别 demo | 71 | |
| 85 | 72 | M[模型团队] --> M1[特征与模型] | |
| 86 | 当前项目**暂不负责**: | 73 | M --> M2[鲁棒训练] |
| 87 | 74 | M --> M3[hard-case 优化] | |
| 88 | - 在线 API 服务 | 75 | |
| 89 | - 海量曲库 ANN 线上部署 | 76 | R[检索团队] --> R1[指纹索引] |
| 90 | - 权限、账号、计费 | 77 | R --> R2[向量索引] |
| 91 | - 真正版权音频数据治理 | 78 | R --> R3[融合与拒识] |
| 92 | - 生产监控告警 | 79 | |
| 80 | S[平台团队] --> S1[API 服务] | ||
| 81 | S --> S2[部署] | ||
| 82 | S --> S3[监控] | ||
| 83 | |||
| 84 | Q[质量团队] --> Q1[benchmark] | ||
| 85 | Q --> Q2[回归验证] | ||
| 86 | Q --> Q3[上线门禁] | ||
| 87 | ``` | ||
| 88 | |||
| 89 | --- | ||
| 90 | |||
| 91 | ## 4. 文字说明 | ||
| 92 | |||
| 93 | ### 4.1 数据层 | ||
| 94 | 负责把不同来源的数据集(synthetic、FMA、Jamendo、CCMusic、ModelScope 白名单集)转成统一的 `catalog/query manifest`。 | ||
| 95 | |||
| 96 | ### 4.2 训练层 | ||
| 97 | 负责音乐任务特征建模,目前已经从低维说话人风格输入升级到: | ||
| 98 | - 128 Mel | ||
| 99 | - band-split | ||
| 100 | - retrieval-first 训练方向 | ||
| 101 | |||
| 102 | ### 4.3 检索层 | ||
| 103 | 负责三路信息融合: | ||
| 104 | - 指纹匹配 | ||
| 105 | - embedding 匹配 | ||
| 106 | - melody-aware 重排 | ||
| 107 | |||
| 108 | ### 4.4 服务层 | ||
| 109 | 负责把离线原型包装成可调用系统,目前已有 FastAPI 骨架。 | ||
| 110 | |||
| 111 | ### 4.5 评测层 | ||
| 112 | 负责质量门禁,不能只看总体 top1,要看 hard-case、拒识、误接收。 | ||
| 113 | |||
| 114 | ### 4.6 合规层 | ||
| 115 | 负责商用前提,任何外部数据集都必须进入 registry 和白名单流程。 | ||
| 116 | |||
| 117 | --- | ||
| 118 | |||
| 119 | ## 5. 细节附录 | ||
| 120 | |||
| 121 | 关键文档: | ||
| 122 | - `docs/dataset-spec.md` | ||
| 123 | - `docs/industrial-benchmark-spec.md` | ||
| 124 | - `docs/dataset-sources-and-licensing.md` | ||
| 125 | - `docs/industrialization-roadmap.md` | ||
| 126 | |||
| 127 | |||
| 128 | ## Sources | ||
| 129 | - See `docs/references-and-sources.md` for the current source map. | ... | ... |
docs/references-and-sources.md
0 → 100644
| 1 | # References and Sources Map | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 一页结论 | ||
| 6 | |||
| 7 | 当前项目的引用分成四类: | ||
| 8 | 1. **开源数据集来源** | ||
| 9 | 2. **研究/SOTA 来源** | ||
| 10 | 3. **服务与工程规范来源** | ||
| 11 | 4. **项目内部文档来源** | ||
| 12 | |||
| 13 | --- | ||
| 14 | |||
| 15 | ## 1. 引用分层图 | ||
| 16 | |||
| 17 | ```mermaid | ||
| 18 | flowchart TD | ||
| 19 | A[References] --> B[Datasets] | ||
| 20 | A --> C[Research] | ||
| 21 | A --> D[Engineering] | ||
| 22 | A --> E[Internal Docs] | ||
| 23 | |||
| 24 | B --> B1[FMA] | ||
| 25 | B --> B2[MTG-Jamendo] | ||
| 26 | B --> B3[CCMusic] | ||
| 27 | B --> B4[ModelScope] | ||
| 28 | |||
| 29 | C --> C1[Neural AFP] | ||
| 30 | C --> C2[Music Foundation Models] | ||
| 31 | C --> C3[Band-split] | ||
| 32 | C --> C4[Data Balancing] | ||
| 33 | ``` | ||
| 34 | |||
| 35 | --- | ||
| 36 | |||
| 37 | ## 2. 外部来源表 | ||
| 38 | |||
| 39 | | 类别 | 名称 | URL | 当前用途 | | ||
| 40 | |---|---|---|---| | ||
| 41 | | Dataset | FMA | https://github.com/mdeff/fma | 真实 retrieval baseline 候选 | | ||
| 42 | | Dataset | MTG-Jamendo | https://github.com/MTG/mtg-jamendo-dataset | 真实音乐检索候选 | | ||
| 43 | | Dataset | CCMusic | https://ccmusic-database.github.io/en/database/ccm.html | 中文 MIR 数据源候选 | | ||
| 44 | | Dataset | ModelScope music search | https://modelscope.cn/search?page=1&search=music&type=dataset | 数据发现入口 | | ||
| 45 | | Research | MERT | https://arxiv.org/abs/2306.00107 | foundation-model 方向参考 | | ||
| 46 | | Research | MuQ | https://arxiv.org/abs/2501.01108 | music representation 方向参考 | | ||
| 47 | | Research | Band-split RNN | https://arxiv.org/abs/2209.15174 | 频带建模参考 | | ||
| 48 | | Research | BAGAN | https://arxiv.org/abs/1803.09655 | 数据平衡增强参考 | | ||
| 49 | |||
| 50 | --- | ||
| 51 | |||
| 52 | ## 3. 内部文档依赖图 | ||
| 53 | |||
| 54 | ```mermaid | ||
| 55 | flowchart LR | ||
| 56 | A[references-and-sources.md] --> B[dataset-sources-and-licensing.md] | ||
| 57 | A --> C[sota-research-2026.md] | ||
| 58 | A --> D[industrialization-roadmap.md] | ||
| 59 | ``` | ||
| 60 | |||
| 61 | --- | ||
| 62 | |||
| 63 | ## 4. 文字说明 | ||
| 64 | |||
| 65 | ### 4.1 为什么单独做 References Map | ||
| 66 | 因为后续文档会越来越多,如果不把“哪些结论来自哪里”系统整理出来,很快会失去可追溯性。 | ||
| 67 | |||
| 68 | ### 4.2 目前引用质量说明 | ||
| 69 | - dataset 来源:优先官方 repo / 官方主页 | ||
| 70 | - research 来源:优先 arXiv / 论文主页 | ||
| 71 | - service/工程来源:当前以项目内部的工程规范为主 | ||
| 72 | |||
| 73 | ### 4.3 未来要加强的地方 | ||
| 74 | - 在每篇核心文档底部补“Sources”小节 | ||
| 75 | - benchmark 报告与 model card 显式引用训练数据与论文版本 | ||
| 76 | |||
| 77 | --- | ||
| 78 | |||
| 79 | ## 5. 细节附录 | ||
| 80 | |||
| 81 | 建议补充: | ||
| 82 | - 每份文档增加 `Sources` 节 | ||
| 83 | - 每次模型 release 输出引用快照 | ||
| 84 | |||
| 85 | ## Sources | ||
| 86 | - FMA: https://github.com/mdeff/fma | ||
| 87 | - MTG-Jamendo: https://github.com/MTG/mtg-jamendo-dataset | ||
| 88 | - CCMusic: https://ccmusic-database.github.io/en/database/ccm.html | ||
| 89 | - ModelScope music search: https://modelscope.cn/search?page=1&search=music&type=dataset |
docs/release-checklist.md
0 → 100644
| 1 | # Release Checklist | ||
| 2 | |||
| 3 | ## 一页结论 | ||
| 4 | 发布前必须同时满足: | ||
| 5 | - 质量通过 | ||
| 6 | - 合规通过 | ||
| 7 | - 服务通过 | ||
| 8 | - 文档齐全 | ||
| 9 | |||
| 10 | ## 1. 发布门禁图 | ||
| 11 | |||
| 12 | ```mermaid | ||
| 13 | flowchart TD | ||
| 14 | A[Release Candidate] --> B[Benchmark Pass] | ||
| 15 | A --> C[License Review Pass] | ||
| 16 | A --> D[Service Smoke Pass] | ||
| 17 | A --> E[Docs Complete] | ||
| 18 | ``` | ||
| 19 | |||
| 20 | ## 2. Checklist 表 | ||
| 21 | |||
| 22 | | 项目 | 状态 | | ||
| 23 | |---|---| | ||
| 24 | | benchmark report 已生成 | | | ||
| 25 | | model card 已生成 | | | ||
| 26 | | license registry 已更新 | | | ||
| 27 | | service smoke test 通过 | | | ||
| 28 | | dataset whitelist 已确认 | | | ||
| 29 | | changelog 已更新 | | | ||
| 30 | |||
| 31 | ## 3. 文字说明 | ||
| 32 | - 只要上表任何一项缺失,该版本就不能视作商用可发布 | ||
| 33 | |||
| 34 | ## 4. 细节附录 | ||
| 35 | - 发布 commit | ||
| 36 | - benchmark 报告路径 | ||
| 37 | - model card 路径 | ||
| 38 | - license 审查记录路径 | ||
| 39 | |||
| 40 | ## Sources | ||
| 41 | - `docs/dataset-sources-and-licensing.md` | ||
| 42 | - `docs/industrial-benchmark-spec.md` |
| 1 | # ACR Service API | 1 | # ACR Service API |
| 2 | 2 | ||
| 3 | ## Endpoints | 3 | > 更新:2026-06-02 |
| 4 | 4 | ||
| 5 | ### GET /health | 5 | ## 一页结论 |
| 6 | 返回服务健康状态。 | ||
| 7 | 6 | ||
| 8 | ### POST /recognize | 7 | - 当前服务是工业化骨架,不是最终生产网关 |
| 9 | 请求体: | 8 | - 已提供最小可调用能力: |
| 9 | 1. health | ||
| 10 | 2. config | ||
| 11 | 3. recognize | ||
| 12 | 4. index build | ||
| 13 | - 下一阶段重点是:鉴权、异步任务、ANN 索引、监控、错误码规范化 | ||
| 10 | 14 | ||
| 11 | ```json | 15 | --- |
| 12 | { | 16 | |
| 13 | "query_path": "data/synthetic_v2/segments/song_0021_seg_01_augmented.wav", | 17 | ## 1. 服务结构图 |
| 14 | "data_dir": "data/synthetic_v2", | 18 | |
| 15 | "model_path": "data/models_v3/best_model.pt", | 19 | ```mermaid |
| 16 | "index_prefix": "data/index_v3/reference", | 20 | flowchart LR |
| 17 | "top_n": 5, | 21 | C[Client] --> H["/health"] |
| 18 | "device": "cpu" | 22 | C --> G["/config"] |
| 19 | } | 23 | C --> R["/recognize"] |
| 24 | C --> I["/index/build"] | ||
| 25 | |||
| 26 | R --> E[Hybrid Engine] | ||
| 27 | I --> B[Index Builders] | ||
| 20 | ``` | 28 | ``` |
| 21 | 29 | ||
| 22 | ### POST /index/build | 30 | --- |
| 23 | 请求体: | 31 | |
| 32 | ## 2. Endpoint 表 | ||
| 33 | |||
| 34 | | Endpoint | 方法 | 作用 | | ||
| 35 | |---|---|---| | ||
| 36 | | `/health` | GET | 健康检查 | | ||
| 37 | | `/config` | GET | 查看默认配置 | | ||
| 38 | | `/recognize` | POST | 输入 query,输出候选 | | ||
| 39 | | `/index/build` | POST | 触发离线索引构建 | | ||
| 40 | |||
| 41 | --- | ||
| 42 | |||
| 43 | ## 3. 请求流程图 | ||
| 44 | |||
| 45 | ```mermaid | ||
| 46 | sequenceDiagram | ||
| 47 | participant Client | ||
| 48 | participant API | ||
| 49 | participant Engine | ||
| 50 | |||
| 51 | Client->>API: POST /recognize | ||
| 52 | API->>Engine: load matcher/index/model | ||
| 53 | Engine-->>API: top-k candidates | ||
| 54 | API-->>Client: JSON result | ||
| 55 | ``` | ||
| 24 | 56 | ||
| 57 | --- | ||
| 58 | |||
| 59 | ## 4. 文字说明 | ||
| 60 | |||
| 61 | ### 4.1 为什么先暴露文件路径 API | ||
| 62 | 当前阶段优先验证系统闭环,不急于引入上传存储层与异步 job orchestration。 | ||
| 63 | |||
| 64 | ### 4.2 `/config` 的作用 | ||
| 65 | 帮助服务侧和调用侧快速确认当前默认数据目录、模型路径与索引前缀。 | ||
| 66 | |||
| 67 | ### 4.3 后续生产化差距 | ||
| 68 | - 缺鉴权 | ||
| 69 | - 缺对象存储上传 | ||
| 70 | - 缺异步索引任务 | ||
| 71 | - 缺可观测性 | ||
| 72 | - 缺错误码与 SLA 规范 | ||
| 73 | |||
| 74 | --- | ||
| 75 | |||
| 76 | ## 5. 细节附录 | ||
| 77 | |||
| 78 | ### `/health` | ||
| 79 | 返回: | ||
| 80 | ```json | ||
| 81 | {"status":"ok","service":"acr","version":"0.2.0"} | ||
| 82 | ``` | ||
| 83 | |||
| 84 | ### `/config` | ||
| 85 | 返回: | ||
| 25 | ```json | 86 | ```json |
| 26 | { | 87 | { |
| 27 | "data_dir": "data/synthetic_v2", | 88 | "data_dir":"data/synthetic_v2", |
| 28 | "model_path": "data/models_v3/best_model.pt", | 89 | "model_path":"data/models_v3/best_model.pt", |
| 29 | "output_dir": "data/index_v3", | 90 | "index_prefix":"data/index_v3/reference", |
| 30 | "device": "cpu" | 91 | "device":"cpu" |
| 31 | } | 92 | } |
| 32 | ``` | 93 | ``` |
| 94 | |||
| 95 | |||
| 96 | ## Sources | ||
| 97 | - See `docs/references-and-sources.md` for the current source map. | ... | ... |
| ... | @@ -10,6 +10,17 @@ | ... | @@ -10,6 +10,17 @@ |
| 10 | 2. **Music Foundation Model 作为 backbone / teacher** | 10 | 2. **Music Foundation Model 作为 backbone / teacher** |
| 11 | 3. **Band-split / band-aware 结构用于音乐频谱建模** | 11 | 3. **Band-split / band-aware 结构用于音乐频谱建模** |
| 12 | 12 | ||
| 13 | |||
| 14 | ## 0. 方向图 | ||
| 15 | |||
| 16 | ```mermaid | ||
| 17 | flowchart LR | ||
| 18 | A[2026 ACR / MIR SOTA] --> B[Neural AFP Robustness] | ||
| 19 | A --> C[Music Foundation Models] | ||
| 20 | A --> D[Band-aware Architectures] | ||
| 21 | A --> E[Data Balancing / Hard Negatives] | ||
| 22 | ``` | ||
| 23 | |||
| 13 | ## 1. Neural AFP 的更强实践 | 24 | ## 1. Neural AFP 的更强实践 |
| 14 | 25 | ||
| 15 | ### Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification (2025) | 26 | ### Enhancing Neural Audio Fingerprint Robustness to Audio Degradation for Music Identification (2025) | ... | ... |
-
Please register or sign in to post a comment