Make the fused Phase-1 ACR schema concrete with DDL samples
Constraint: Keep the storage design aligned to the current song-centric model while turning the 4-table fused schema into something engineers can directly review and implement.
Rejected: Keep only conceptual docs without concrete SQL | It leaves too much ambiguity about where slices, models, and features actually land.
Confidence: high
Scope-risk: narrow
Directive: Until the repository gains a production SQL file for the fused model, treat postgres_db_schema_samples.md as the authoritative DDL draft for media_entity/audio_object/feature_fact/set_membership.
Tested: git diff --check on touched files; /usr/local/miniconda3/bin/python scripts/check_markdown_links.py --root docs returned OK for 11 active markdown files
Not-tested: Executing the fused DDL against a live PostgreSQL schema
Showing
4 changed files
with
86 additions
and
2 deletions
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 重写 `docs/postgres_db_schema_samples.md` 为当前 song-centric 融合优先方案的 DDL 草案,补齐 4 张核心表(`media_entity` / `audio_object` / `feature_fact` / `set_membership`)、落表说明、流程图与常用 SQL 样例。 | ||
| 4 | |||
| 3 | - 在 `docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`。 | 5 | - 在 `docs/postgresql-data-model.md` 新增“切片数据 / 模型 / feature 具体落哪张表”的表格与流程图,明确当前默认回溯链为 `feature_fact -> audio_object(window) -> audio_object(asset) -> media_entity(song)`。 |
| 4 | - 收敛 `docs/README.md` 为当前 song-centric 设计入口,并清理 docs 目录中与当前设计无关的模板、开放数据、业务导出、历史路线类文档。 | 6 | - 收敛 `docs/README.md` 为当前 song-centric 设计入口,并清理 docs 目录中与当前设计无关的模板、开放数据、业务导出、历史路线类文档。 |
| 5 | 7 | ... | ... |
This diff is collapsed.
Click to expand it.
| ... | @@ -59,7 +59,7 @@ cd /workspace/acr-engine | ... | @@ -59,7 +59,7 @@ cd /workspace/acr-engine |
| 59 | ## 3. 用一句话理解项目 | 59 | ## 3. 用一句话理解项目 |
| 60 | 60 | ||
| 61 | 我们在做的是一个面向 **版权保护 / 听歌识曲 / 版本归属** 的音乐 ACR 系统, | 61 | 我们在做的是一个面向 **版权保护 / 听歌识曲 / 版本归属** 的音乐 ACR 系统, |
| 62 | 目标是从 `100w` 音频、约 `30w` 歌曲中,快速定位正确的 `song_id / work / recording` 归属。 | 62 | 目标是从 `100w` 音频、约 `30w` 歌曲中,快速定位正确的 `song_id` 归属;当前阶段暂不把版本/recording 作为必须返回对象。 |
| 63 | 63 | ||
| 64 | --- | 64 | --- |
| 65 | 65 | ||
| ... | @@ -71,7 +71,12 @@ cd /workspace/acr-engine | ... | @@ -71,7 +71,12 @@ cd /workspace/acr-engine |
| 71 | - semantic lane challenger:`MuQ` | 71 | - semantic lane challenger:`MuQ` |
| 72 | - historical baseline:`ECAPA` | 72 | - historical baseline:`ECAPA` |
| 73 | 73 | ||
| 74 | ### 数据主线 | 74 | ### 当前 Phase-1 最小主线 |
| 75 | ```text | ||
| 76 | song -> asset -> window | ||
| 77 | ``` | ||
| 78 | |||
| 79 | ### 可演进完整版主线 | ||
| 75 | ```text | 80 | ```text |
| 76 | canonical_song -> work -> recording -> recording_asset -> audio_window | 81 | canonical_song -> work -> recording -> recording_asset -> audio_window |
| 77 | ``` | 82 | ``` |
| ... | @@ -139,6 +144,7 @@ model_registry -> feature_set_registry -> audio_embedding / audio_fingerprint -> | ... | @@ -139,6 +144,7 @@ model_registry -> feature_set_registry -> audio_embedding / audio_fingerprint -> |
| 139 | - [README.md](./README.md) | 144 | - [README.md](./README.md) |
| 140 | - [session-handoff.md](./session-handoff.md) | 145 | - [session-handoff.md](./session-handoff.md) |
| 141 | - [postgresql-data-model.md](./postgresql-data-model.md) | 146 | - [postgresql-data-model.md](./postgresql-data-model.md) |
| 147 | - [postgres_db_schema_samples.md](./postgres_db_schema_samples.md) | ||
| 142 | - [phase1-worker-contract.md](./phase1-worker-contract.md) | 148 | - [phase1-worker-contract.md](./phase1-worker-contract.md) |
| 143 | 149 | ||
| 144 | ### 脚本 | 150 | ### 脚本 | ... | ... |
scripts/check_markdown_links.py
0 → 100755
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import fnmatch | ||
| 6 | import re | ||
| 7 | import sys | ||
| 8 | from pathlib import Path | ||
| 9 | |||
# Inline link or image: optional "!", then "[text]", then "(destination)".
# Group 1 captures the link text, group 2 the raw text between the parentheses.
LINK_RE = re.compile(r'!?(?:\[([^\]]*)\])\(([^)]+)\)')
# Destinations starting with one of these are external or in-page references
# and are never resolved against the filesystem.
SKIP_PREFIXES = ('http://', 'https://', 'mailto:', 'tel:', '#')
# Glob patterns (relative to the scan root) that are excluded by default.
DEFAULT_EXCLUDES = ['CHANGELOG.md']


def should_check(target: str) -> bool:
    """Return True when *target* should be resolved as a local file path.

    Empty destinations and destinations that start with one of
    ``SKIP_PREFIXES`` are skipped. The prefix test is case-insensitive and is
    applied after removing an enclosing ``<...>`` pair, so ``<https://x>`` and
    ``HTTPS://x`` are recognised as external links instead of being reported
    later as missing files.
    """
    candidate = target.strip()
    # Markdown allows the destination to be wrapped in angle brackets; look
    # inside them before deciding whether this is an external link.
    if candidate.startswith('<') and candidate.endswith('>'):
        candidate = candidate[1:-1].strip()
    return bool(candidate) and not candidate.lower().startswith(SKIP_PREFIXES)
| 18 | |||
| 19 | |||
def normalize_target(raw: str) -> str:
    """Reduce a raw Markdown link destination to a plain filesystem path.

    Steps, in order:
    1. strip surrounding whitespace;
    2. drop an optional trailing link title (``path "Title"`` / ``path 'Title'``);
    3. remove an enclosing ``<...>`` pair;
    4. cut off the ``#fragment`` and ``?query`` parts;
    5. percent-decode the result (``my%20file.md`` -> ``my file.md``).

    Returns an empty string when nothing is left (e.g. a pure ``#anchor``).
    """
    # Function-scope import: the module's import block does not provide it.
    from urllib.parse import unquote

    target = raw.strip()

    # A title is a quoted string separated from the destination by whitespace.
    # Only strip it when that exact shape is present, so an unquoted path that
    # merely contains spaces is left untouched.
    if target[-1:] in ('"', "'"):
        opening = target.rfind(target[-1], 0, len(target) - 1)
        if opening > 0 and target[opening - 1].isspace():
            target = target[:opening].rstrip()

    if target.startswith('<') and target.endswith('>'):
        target = target[1:-1]

    target = target.split('#', 1)[0].split('?', 1)[0].strip()
    # Decode last so that an encoded "%23" / "%3F" in a file name is not
    # mistaken for a fragment or query separator.
    return unquote(target)
| 26 | |||
| 27 | |||
def iter_markdown_files(root: Path, excludes: list[str]) -> list[Path]:
    """Return the Markdown files under *root*, sorted, minus excluded ones.

    Args:
        root: Directory searched recursively for ``*.md`` entries.
        excludes: ``fnmatch`` patterns matched against each entry's POSIX-style
            path relative to *root*; a match drops the entry.

    Returns:
        Sorted list of paths to regular files. Directories whose name happens
        to end in ``.md`` are skipped, since they cannot be read as text.
    """
    files: list[Path] = []
    for path in sorted(root.rglob('*.md')):
        # rglob matches on name only, so it can also yield directories.
        if not path.is_file():
            continue
        rel = path.relative_to(root).as_posix()
        if any(fnmatch.fnmatch(rel, pattern) for pattern in excludes):
            continue
        files.append(path)
    return files
| 36 | |||
| 37 | |||
def _strip_fenced_code(text: str) -> str:
    """Return *text* without fenced code blocks (fence lines and their content).

    A fence opens on a line whose first non-blank characters are three or more
    backticks or tildes, and closes on a line made only of the same character
    repeated at least as many times. An unclosed fence runs to the end of the
    text. A backtick line whose remainder contains another backtick is an
    inline code span, not a fence, and is kept.
    """
    kept: list[str] = []
    fence_char = ''
    fence_len = 0
    for line in text.splitlines():
        stripped = line.strip()
        char = stripped[:1]
        run = len(stripped) - len(stripped.lstrip(char)) if char in ('`', '~') else 0
        if fence_char:
            if char == fence_char and run >= fence_len and run == len(stripped):
                fence_char = ''
            continue
        if run >= 3 and not (char == '`' and '`' in stripped[run:]):
            fence_char, fence_len = char, run
            continue
        kept.append(line)
    return '\n'.join(kept)


def scan_markdown_file(path: Path, root: Path) -> list[tuple[str, str]]:
    """Collect relative links in *path* whose target file does not exist.

    The file is read as UTF-8 and every ``LINK_RE`` match outside fenced code
    blocks is examined; link-like text inside a fence is sample code, not a
    link, and is ignored. Destinations rejected by ``should_check`` or that
    normalise to an empty path are skipped. The remaining ones are resolved
    relative to the directory containing *path*.

    Returns:
        ``(source, raw_target)`` tuples, where ``source`` is *path* relative to
        *root* in POSIX form and ``raw_target`` is the destination exactly as
        written in the file.
    """
    missing: list[tuple[str, str]] = []
    text = _strip_fenced_code(path.read_text(encoding='utf-8'))
    for _, raw_target in LINK_RE.findall(text):
        if not should_check(raw_target):
            continue
        target = normalize_target(raw_target)
        if not target:
            continue
        resolved = (path.parent / target).resolve()
        if not resolved.exists():
            missing.append((path.relative_to(root).as_posix(), raw_target))
    return missing
| 51 | |||
| 52 | |||
def main(argv: 'list[str] | None' = None) -> int:
    """Run the link check and return the process exit code.

    Args:
        argv: Command-line arguments without the program name. ``None`` lets
            argparse read ``sys.argv``.

    Returns:
        ``0`` when every relative link resolves, ``1`` when at least one target
        is missing, ``2`` when ``--root`` does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(description='Check relative Markdown links for missing files.')
    parser.add_argument('--root', default='docs', help='Root directory containing markdown files')
    parser.add_argument('--exclude', action='append', default=[], help='Glob patterns relative to root to exclude')
    args = parser.parse_args(argv)

    root = Path(args.root).resolve()
    if not root.exists():
        print(f'root not found: {root}', file=sys.stderr)
        return 2
    # A plain file as root would otherwise yield zero Markdown files and a
    # misleading "OK" result.
    if not root.is_dir():
        print(f'root is not a directory: {root}', file=sys.stderr)
        return 2

    excludes = DEFAULT_EXCLUDES + list(args.exclude)
    files = iter_markdown_files(root, excludes)
    failures: list[tuple[str, str]] = []
    for md in files:
        failures.extend(scan_markdown_file(md, root))

    if failures:
        print('Missing relative markdown targets:')
        for source, target in failures:
            print(f'- {source}: {target}')
        return 1

    print(f'OK: checked {len(files)} markdown files under {root} (excluded: {excludes})')
    return 0


if __name__ == '__main__':
    sys.exit(main())
-
Please register or sign in to post a comment