Harden the Phase-1 worker contract before real extractors land
Constraint: planner outputs must be copy-runnable in the current environment, and live PostgreSQL entrypoints must treat schema input as untrusted.
Rejected: defer state guards until real inference arrives | rejected because repeat execution and empty-scope drift would corrupt Phase-1 evidence now.
Confidence: high
Scope-risk: moderate
Directive: keep using the guarded job contract (expected status, schema validation, explicit python path) when replacing dry-run with real writes.
Tested: py_compile for live bootstrap/planner/worker scripts; live PostgreSQL bootstrap for model registry, reference members, and extraction jobs; regenerated extraction plan report; chromaprint + mert dry-run worker runs with scope=20; double-claim guard report returns non-zero; architect review APPROVED.
Not-tested: real fingerprint writes, real embedding writes, large-scale production reference-set ingestion beyond the 20-song acr_test sample.
Showing
20 changed files
with
408 additions
and
70 deletions
This diff is collapsed.
Click to expand it.
| 1 | { | ||
| 2 | "schema": "acr_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "set_name": "phase1_hot_reference_v1", | ||
| 5 | "member_role": "hot_reference", | ||
| 6 | "inserted_recording_ids": [], | ||
| 7 | "reused_recording_ids": [ | ||
| 8 | 1, | ||
| 9 | 2, | ||
| 10 | 3, | ||
| 11 | 4, | ||
| 12 | 5, | ||
| 13 | 6, | ||
| 14 | 7, | ||
| 15 | 8, | ||
| 16 | 9, | ||
| 17 | 10, | ||
| 18 | 11, | ||
| 19 | 12, | ||
| 20 | 13, | ||
| 21 | 14, | ||
| 22 | 15, | ||
| 23 | 16, | ||
| 24 | 17, | ||
| 25 | 18, | ||
| 26 | 19, | ||
| 27 | 20 | ||
| 28 | ], | ||
| 29 | "reference_set_id": 2, | ||
| 30 | "counts": { | ||
| 31 | "reference_recordings_seen": 20, | ||
| 32 | "inserted_members": 0, | ||
| 33 | "reused_members": 20, | ||
| 34 | "member_count": 20, | ||
| 35 | "ready_asset_count": 20, | ||
| 36 | "active_window_count": 20 | ||
| 37 | } | ||
| 38 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -40,16 +40,16 @@ | ... | @@ -40,16 +40,16 @@ |
| 40 | "scope_value": "phase1_hot_reference_v1", | 40 | "scope_value": "phase1_hot_reference_v1", |
| 41 | "reference_set_id": 2, | 41 | "reference_set_id": 2, |
| 42 | "reference_set_name": "phase1_hot_reference_v1", | 42 | "reference_set_name": "phase1_hot_reference_v1", |
| 43 | "recording_count": 0, | 43 | "recording_count": 20, |
| 44 | "ready_asset_count": 0, | 44 | "ready_asset_count": 20, |
| 45 | "active_window_count": 0 | 45 | "active_window_count": 20 |
| 46 | }, | 46 | }, |
| 47 | "status_after_start": { | 47 | "status_after_start": { |
| 48 | "extraction_job_id": 1, | 48 | "extraction_job_id": 1, |
| 49 | "job_status": "running", | 49 | "job_status": "running", |
| 50 | "input_count": 0, | 50 | "input_count": 20, |
| 51 | "output_count": null, | 51 | "output_count": null, |
| 52 | "started_at": "2026-06-04T13:02:56.589356+08:00", | 52 | "started_at": "2026-06-04T13:17:25.914513+08:00", |
| 53 | "finished_at": null, | 53 | "finished_at": null, |
| 54 | "log_uri": null, | 54 | "log_uri": null, |
| 55 | "metadata_json": { | 55 | "metadata_json": { |
| ... | @@ -63,21 +63,21 @@ | ... | @@ -63,21 +63,21 @@ |
| 63 | "target_scope_summary": { | 63 | "target_scope_summary": { |
| 64 | "scope_type": "reference_set", | 64 | "scope_type": "reference_set", |
| 65 | "scope_value": "phase1_hot_reference_v1", | 65 | "scope_value": "phase1_hot_reference_v1", |
| 66 | "recording_count": 0, | 66 | "recording_count": 20, |
| 67 | "reference_set_id": 2, | 67 | "reference_set_id": 2, |
| 68 | "ready_asset_count": 0, | 68 | "ready_asset_count": 20, |
| 69 | "reference_set_name": "phase1_hot_reference_v1", | 69 | "reference_set_name": "phase1_hot_reference_v1", |
| 70 | "active_window_count": 0 | 70 | "active_window_count": 20 |
| 71 | } | 71 | } |
| 72 | } | 72 | } |
| 73 | }, | 73 | }, |
| 74 | "status_after_complete": { | 74 | "status_after_complete": { |
| 75 | "extraction_job_id": 1, | 75 | "extraction_job_id": 1, |
| 76 | "job_status": "completed", | 76 | "job_status": "completed", |
| 77 | "input_count": 0, | 77 | "input_count": 20, |
| 78 | "output_count": 0, | 78 | "output_count": 0, |
| 79 | "started_at": "2026-06-04T13:02:56.589356+08:00", | 79 | "started_at": "2026-06-04T13:17:25.914513+08:00", |
| 80 | "finished_at": "2026-06-04T13:02:56.591597+08:00", | 80 | "finished_at": "2026-06-04T13:17:25.915231+08:00", |
| 81 | "log_uri": null, | 81 | "log_uri": null, |
| 82 | "metadata_json": { | 82 | "metadata_json": { |
| 83 | "lane": "exact", | 83 | "lane": "exact", |
| ... | @@ -92,11 +92,11 @@ | ... | @@ -92,11 +92,11 @@ |
| 92 | "target_scope_summary": { | 92 | "target_scope_summary": { |
| 93 | "scope_type": "reference_set", | 93 | "scope_type": "reference_set", |
| 94 | "scope_value": "phase1_hot_reference_v1", | 94 | "scope_value": "phase1_hot_reference_v1", |
| 95 | "recording_count": 0, | 95 | "recording_count": 20, |
| 96 | "reference_set_id": 2, | 96 | "reference_set_id": 2, |
| 97 | "ready_asset_count": 0, | 97 | "ready_asset_count": 20, |
| 98 | "reference_set_name": "phase1_hot_reference_v1", | 98 | "reference_set_name": "phase1_hot_reference_v1", |
| 99 | "active_window_count": 0 | 99 | "active_window_count": 20 |
| 100 | } | 100 | } |
| 101 | } | 101 | } |
| 102 | }, | 102 | }, | ... | ... |
| ... | @@ -39,16 +39,16 @@ | ... | @@ -39,16 +39,16 @@ |
| 39 | "scope_value": "phase1_hot_reference_v1", | 39 | "scope_value": "phase1_hot_reference_v1", |
| 40 | "reference_set_id": 2, | 40 | "reference_set_id": 2, |
| 41 | "reference_set_name": "phase1_hot_reference_v1", | 41 | "reference_set_name": "phase1_hot_reference_v1", |
| 42 | "recording_count": 0, | 42 | "recording_count": 20, |
| 43 | "ready_asset_count": 0, | 43 | "ready_asset_count": 20, |
| 44 | "active_window_count": 0 | 44 | "active_window_count": 20 |
| 45 | }, | 45 | }, |
| 46 | "status_after_start": { | 46 | "status_after_start": { |
| 47 | "extraction_job_id": 2, | 47 | "extraction_job_id": 2, |
| 48 | "job_status": "running", | 48 | "job_status": "running", |
| 49 | "input_count": 0, | 49 | "input_count": 20, |
| 50 | "output_count": null, | 50 | "output_count": null, |
| 51 | "started_at": "2026-06-04T13:02:56.714882+08:00", | 51 | "started_at": "2026-06-04T13:17:26.054365+08:00", |
| 52 | "finished_at": null, | 52 | "finished_at": null, |
| 53 | "log_uri": null, | 53 | "log_uri": null, |
| 54 | "metadata_json": { | 54 | "metadata_json": { |
| ... | @@ -63,21 +63,21 @@ | ... | @@ -63,21 +63,21 @@ |
| 63 | "target_scope_summary": { | 63 | "target_scope_summary": { |
| 64 | "scope_type": "reference_set", | 64 | "scope_type": "reference_set", |
| 65 | "scope_value": "phase1_hot_reference_v1", | 65 | "scope_value": "phase1_hot_reference_v1", |
| 66 | "recording_count": 0, | 66 | "recording_count": 20, |
| 67 | "reference_set_id": 2, | 67 | "reference_set_id": 2, |
| 68 | "ready_asset_count": 0, | 68 | "ready_asset_count": 20, |
| 69 | "reference_set_name": "phase1_hot_reference_v1", | 69 | "reference_set_name": "phase1_hot_reference_v1", |
| 70 | "active_window_count": 0 | 70 | "active_window_count": 20 |
| 71 | } | 71 | } |
| 72 | } | 72 | } |
| 73 | }, | 73 | }, |
| 74 | "status_after_complete": { | 74 | "status_after_complete": { |
| 75 | "extraction_job_id": 2, | 75 | "extraction_job_id": 2, |
| 76 | "job_status": "completed", | 76 | "job_status": "completed", |
| 77 | "input_count": 0, | 77 | "input_count": 20, |
| 78 | "output_count": 0, | 78 | "output_count": 0, |
| 79 | "started_at": "2026-06-04T13:02:56.714882+08:00", | 79 | "started_at": "2026-06-04T13:17:26.054365+08:00", |
| 80 | "finished_at": "2026-06-04T13:02:56.715469+08:00", | 80 | "finished_at": "2026-06-04T13:17:26.055184+08:00", |
| 81 | "log_uri": null, | 81 | "log_uri": null, |
| 82 | "metadata_json": { | 82 | "metadata_json": { |
| 83 | "lane": "semantic", | 83 | "lane": "semantic", |
| ... | @@ -93,11 +93,11 @@ | ... | @@ -93,11 +93,11 @@ |
| 93 | "target_scope_summary": { | 93 | "target_scope_summary": { |
| 94 | "scope_type": "reference_set", | 94 | "scope_type": "reference_set", |
| 95 | "scope_value": "phase1_hot_reference_v1", | 95 | "scope_value": "phase1_hot_reference_v1", |
| 96 | "recording_count": 0, | 96 | "recording_count": 20, |
| 97 | "reference_set_id": 2, | 97 | "reference_set_id": 2, |
| 98 | "ready_asset_count": 0, | 98 | "ready_asset_count": 20, |
| 99 | "reference_set_name": "phase1_hot_reference_v1", | 99 | "reference_set_name": "phase1_hot_reference_v1", |
| 100 | "active_window_count": 0 | 100 | "active_window_count": 20 |
| 101 | } | 101 | } |
| 102 | } | 102 | } |
| 103 | }, | 103 | }, | ... | ... |
| 1 | { | 1 | { |
| 2 | "worker": "mark_job_status", | 2 | "worker": "mark_job_status", |
| 3 | "schema": "acr_test", | 3 | "schema": "acr_test", |
| 4 | "job_id": 1, | ||
| 4 | "update": { | 5 | "update": { |
| 5 | "extraction_job_id": 1, | 6 | "extraction_job_id": 1, |
| 6 | "job_status": "pending", | 7 | "job_status": "pending", |
| 7 | "input_count": 0, | 8 | "input_count": 20, |
| 8 | "output_count": 0, | 9 | "output_count": 0, |
| 9 | "started_at": "2026-06-04T13:02:56.589356+08:00", | 10 | "started_at": "2026-06-04T13:17:25.914513+08:00", |
| 10 | "finished_at": "2026-06-04T13:02:56.591597+08:00", | 11 | "finished_at": "2026-06-04T13:17:25.915231+08:00", |
| 11 | "log_uri": null, | 12 | "log_uri": null, |
| 12 | "metadata_json": { | 13 | "metadata_json": { |
| 13 | "lane": "exact", | 14 | "lane": "exact", |
| 14 | "phase": "phase1", | 15 | "phase": "phase1", |
| 15 | "priority": "p0" | 16 | "worker": "run_chromaprint_job", |
| 17 | "dry_run": true, | ||
| 18 | "priority": "p0", | ||
| 19 | "output_target": "audio_fingerprint", | ||
| 20 | "dry_run_result": "completed_without_feature_write", | ||
| 21 | "execution_mode": "dry_run", | ||
| 22 | "write_target_table": "audio_fingerprint", | ||
| 23 | "target_scope_summary": { | ||
| 24 | "scope_type": "reference_set", | ||
| 25 | "scope_value": "phase1_hot_reference_v1", | ||
| 26 | "recording_count": 20, | ||
| 27 | "reference_set_id": 2, | ||
| 28 | "ready_asset_count": 20, | ||
| 29 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 30 | "active_window_count": 20 | ||
| 31 | } | ||
| 16 | } | 32 | } |
| 17 | } | 33 | } |
| 18 | } | 34 | } |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -4,11 +4,17 @@ from __future__ import annotations | ... | @@ -4,11 +4,17 @@ from __future__ import annotations |
| 4 | import argparse | 4 | import argparse |
| 5 | import json | 5 | import json |
| 6 | from pathlib import Path | 6 | from pathlib import Path |
| 7 | import sys | ||
| 7 | from typing import Any | 8 | from typing import Any |
| 8 | 9 | ||
| 9 | import psycopg | 10 | import psycopg |
| 10 | 11 | ||
| 11 | ROOT = Path(__file__).resolve().parents[1] | 12 | ROOT = Path(__file__).resolve().parents[1] |
| 13 | if str(ROOT) not in sys.path: | ||
| 14 | sys.path.insert(0, str(ROOT)) | ||
| 15 | |||
| 16 | from workers._job_common import validate_schema | ||
| 17 | |||
| 12 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_jobs_report.json' | 18 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_jobs_report.json' |
| 13 | 19 | ||
| 14 | JOB_SPECS = [ | 20 | JOB_SPECS = [ |
| ... | @@ -117,6 +123,11 @@ def ensure_job(conn: psycopg.Connection, feature_set_id: int, job: dict[str, Any | ... | @@ -117,6 +123,11 @@ def ensure_job(conn: psycopg.Connection, feature_set_id: int, job: dict[str, Any |
| 117 | """ | 123 | """ |
| 118 | UPDATE feature_extraction_job | 124 | UPDATE feature_extraction_job |
| 119 | SET job_status = %s, | 125 | SET job_status = %s, |
| 126 | input_count = NULL, | ||
| 127 | output_count = NULL, | ||
| 128 | started_at = NULL, | ||
| 129 | finished_at = NULL, | ||
| 130 | log_uri = NULL, | ||
| 120 | metadata_json = %s::jsonb | 131 | metadata_json = %s::jsonb |
| 121 | WHERE extraction_job_id = %s; | 132 | WHERE extraction_job_id = %s; |
| 122 | """, | 133 | """, |
| ... | @@ -148,14 +159,15 @@ def main() -> None: | ... | @@ -148,14 +159,15 @@ def main() -> None: |
| 148 | ap.add_argument('--schema', default='acr_test') | 159 | ap.add_argument('--schema', default='acr_test') |
| 149 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) | 160 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) |
| 150 | args = ap.parse_args() | 161 | args = ap.parse_args() |
| 162 | schema = validate_schema(args.schema) | ||
| 151 | 163 | ||
| 152 | summary: dict[str, Any] = { | 164 | summary: dict[str, Any] = { |
| 153 | 'schema': args.schema, | 165 | 'schema': schema, |
| 154 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', | 166 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', |
| 155 | 'jobs': [], | 167 | 'jobs': [], |
| 156 | } | 168 | } |
| 157 | with psycopg.connect(args.dsn, autocommit=True) as conn: | 169 | with psycopg.connect(args.dsn, autocommit=True) as conn: |
| 158 | conn.execute(f'SET search_path TO {args.schema}, public;') | 170 | conn.execute(f'SET search_path TO {schema}, public;') |
| 159 | for job in JOB_SPECS: | 171 | for job in JOB_SPECS: |
| 160 | feature_set_id = resolve_feature_set_id(conn, job) | 172 | feature_set_id = resolve_feature_set_id(conn, job) |
| 161 | extraction_job_id, operation = ensure_job(conn, feature_set_id, job) | 173 | extraction_job_id, operation = ensure_job(conn, feature_set_id, job) | ... | ... |
| ... | @@ -4,11 +4,17 @@ from __future__ import annotations | ... | @@ -4,11 +4,17 @@ from __future__ import annotations |
| 4 | import argparse | 4 | import argparse |
| 5 | import json | 5 | import json |
| 6 | from pathlib import Path | 6 | from pathlib import Path |
| 7 | import sys | ||
| 7 | from typing import Any | 8 | from typing import Any |
| 8 | 9 | ||
| 9 | import psycopg | 10 | import psycopg |
| 10 | 11 | ||
| 11 | ROOT = Path(__file__).resolve().parents[1] | 12 | ROOT = Path(__file__).resolve().parents[1] |
| 13 | if str(ROOT) not in sys.path: | ||
| 14 | sys.path.insert(0, str(ROOT)) | ||
| 15 | |||
| 16 | from workers._job_common import validate_schema | ||
| 17 | |||
| 12 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_registry_bootstrap_report.json' | 18 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_registry_bootstrap_report.json' |
| 13 | 19 | ||
| 14 | MODELS = [ | 20 | MODELS = [ |
| ... | @@ -360,9 +366,10 @@ def main() -> None: | ... | @@ -360,9 +366,10 @@ def main() -> None: |
| 360 | ap.add_argument('--schema', default='acr_test') | 366 | ap.add_argument('--schema', default='acr_test') |
| 361 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) | 367 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) |
| 362 | args = ap.parse_args() | 368 | args = ap.parse_args() |
| 369 | schema = validate_schema(args.schema) | ||
| 363 | 370 | ||
| 364 | summary: dict[str, Any] = { | 371 | summary: dict[str, Any] = { |
| 365 | 'schema': args.schema, | 372 | 'schema': schema, |
| 366 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', | 373 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', |
| 367 | 'models': [], | 374 | 'models': [], |
| 368 | 'feature_sets': [], | 375 | 'feature_sets': [], |
| ... | @@ -370,7 +377,7 @@ def main() -> None: | ... | @@ -370,7 +377,7 @@ def main() -> None: |
| 370 | } | 377 | } |
| 371 | 378 | ||
| 372 | with psycopg.connect(args.dsn, autocommit=True) as conn: | 379 | with psycopg.connect(args.dsn, autocommit=True) as conn: |
| 373 | conn.execute(f'SET search_path TO {args.schema}, public;') | 380 | conn.execute(f'SET search_path TO {schema}, public;') |
| 374 | model_ids: dict[tuple[str, str], int] = {} | 381 | model_ids: dict[tuple[str, str], int] = {} |
| 375 | for model in MODELS: | 382 | for model in MODELS: |
| 376 | model_id, operation = upsert_model(conn, model) | 383 | model_id, operation = upsert_model(conn, model) | ... | ... |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | import sys | ||
| 8 | |||
| 9 | import psycopg | ||
| 10 | |||
| 11 | ROOT = Path(__file__).resolve().parents[1] | ||
| 12 | if str(ROOT) not in sys.path: | ||
| 13 | sys.path.insert(0, str(ROOT)) | ||
| 14 | |||
| 15 | from workers._job_common import validate_schema | ||
| 16 | |||
| 17 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_reference_member_bootstrap_report.json' | ||
| 18 | |||
| 19 | |||
| 20 | def main() -> None: | ||
| 21 | ap = argparse.ArgumentParser() | ||
| 22 | ap.add_argument('--dsn', required=True) | ||
| 23 | ap.add_argument('--schema', default='acr_test') | ||
| 24 | ap.add_argument('--set-name', default='phase1_hot_reference_v1') | ||
| 25 | ap.add_argument('--member-role', default='hot_reference') | ||
| 26 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) | ||
| 27 | args = ap.parse_args() | ||
| 28 | schema = validate_schema(args.schema) | ||
| 29 | |||
| 30 | summary: dict[str, object] = { | ||
| 31 | 'schema': schema, | ||
| 32 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', | ||
| 33 | 'set_name': args.set_name, | ||
| 34 | 'member_role': args.member_role, | ||
| 35 | 'inserted_recording_ids': [], | ||
| 36 | 'reused_recording_ids': [], | ||
| 37 | } | ||
| 38 | |||
| 39 | with psycopg.connect(args.dsn, autocommit=True) as conn: | ||
| 40 | conn.execute(f'SET search_path TO {schema}, public;') | ||
| 41 | |||
| 42 | ref_row = conn.execute( | ||
| 43 | 'SELECT reference_set_id FROM reference_set_registry WHERE set_name = %s LIMIT 1;', | ||
| 44 | (args.set_name,), | ||
| 45 | ).fetchone() | ||
| 46 | if not ref_row: | ||
| 47 | raise RuntimeError(f'reference_set_registry.set_name not found: {args.set_name}') | ||
| 48 | reference_set_id = int(ref_row[0]) | ||
| 49 | summary['reference_set_id'] = reference_set_id | ||
| 50 | |||
| 51 | recordings = conn.execute( | ||
| 52 | """ | ||
| 53 | SELECT recording_id | ||
| 54 | FROM recording | ||
| 55 | WHERE is_reference = TRUE | ||
| 56 | ORDER BY recording_id; | ||
| 57 | """ | ||
| 58 | ).fetchall() | ||
| 59 | for row in recordings: | ||
| 60 | recording_id = int(row[0]) | ||
| 61 | existing = conn.execute( | ||
| 62 | """ | ||
| 63 | SELECT 1 | ||
| 64 | FROM reference_set_member | ||
| 65 | WHERE reference_set_id = %s | ||
| 66 | AND recording_id = %s | ||
| 67 | LIMIT 1; | ||
| 68 | """, | ||
| 69 | (reference_set_id, recording_id), | ||
| 70 | ).fetchone() | ||
| 71 | if existing: | ||
| 72 | summary['reused_recording_ids'].append(recording_id) | ||
| 73 | continue | ||
| 74 | conn.execute( | ||
| 75 | """ | ||
| 76 | INSERT INTO reference_set_member (reference_set_id, recording_id, member_role) | ||
| 77 | VALUES (%s, %s, %s); | ||
| 78 | """, | ||
| 79 | (reference_set_id, recording_id, args.member_role), | ||
| 80 | ) | ||
| 81 | summary['inserted_recording_ids'].append(recording_id) | ||
| 82 | |||
| 83 | counts = conn.execute( | ||
| 84 | """ | ||
| 85 | SELECT | ||
| 86 | count(*) AS member_count, | ||
| 87 | count(DISTINCT ra.asset_id) FILTER (WHERE ra.ingest_status = 'ready') AS ready_asset_count, | ||
| 88 | count(DISTINCT aw.window_id) FILTER (WHERE aw.active_for_index) AS active_window_count | ||
| 89 | FROM reference_set_member rsm | ||
| 90 | LEFT JOIN recording_asset ra ON ra.recording_id = rsm.recording_id | ||
| 91 | LEFT JOIN audio_window aw ON aw.recording_id = rsm.recording_id | ||
| 92 | WHERE rsm.reference_set_id = %s; | ||
| 93 | """, | ||
| 94 | (reference_set_id,), | ||
| 95 | ).fetchone() | ||
| 96 | summary['counts'] = { | ||
| 97 | 'reference_recordings_seen': len(recordings), | ||
| 98 | 'inserted_members': len(summary['inserted_recording_ids']), | ||
| 99 | 'reused_members': len(summary['reused_recording_ids']), | ||
| 100 | 'member_count': int(counts[0]), | ||
| 101 | 'ready_asset_count': int(counts[1]), | ||
| 102 | 'active_window_count': int(counts[2]), | ||
| 103 | } | ||
| 104 | |||
| 105 | output = Path(args.output) | ||
| 106 | output.parent.mkdir(parents=True, exist_ok=True) | ||
| 107 | output.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding='utf-8') | ||
| 108 | print(json.dumps(summary, ensure_ascii=False, indent=2)) | ||
| 109 | |||
| 110 | |||
| 111 | if __name__ == '__main__': | ||
| 112 | main() |
| ... | @@ -7,11 +7,17 @@ from collections import defaultdict | ... | @@ -7,11 +7,17 @@ from collections import defaultdict |
| 7 | from dataclasses import dataclass | 7 | from dataclasses import dataclass |
| 8 | from pathlib import Path | 8 | from pathlib import Path |
| 9 | from statistics import median | 9 | from statistics import median |
| 10 | import sys | ||
| 10 | from typing import Any | 11 | from typing import Any |
| 11 | 12 | ||
| 12 | import psycopg | 13 | import psycopg |
| 13 | 14 | ||
| 14 | ROOT = Path(__file__).resolve().parents[1] | 15 | ROOT = Path(__file__).resolve().parents[1] |
| 16 | if str(ROOT) not in sys.path: | ||
| 17 | sys.path.insert(0, str(ROOT)) | ||
| 18 | |||
| 19 | from workers._job_common import validate_schema | ||
| 20 | |||
| 15 | DEFAULT_SCHEMA_SQL = ROOT / 'sql' / 'acr_pg_schema_v2.sql' | 21 | DEFAULT_SCHEMA_SQL = ROOT / 'sql' / 'acr_pg_schema_v2.sql' |
| 16 | DEFAULT_REFERENCE = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'reference_embeddings.jsonl' | 22 | DEFAULT_REFERENCE = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'reference_embeddings.jsonl' |
| 17 | DEFAULT_QUERY = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'query_embeddings.jsonl' | 23 | DEFAULT_QUERY = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'query_embeddings.jsonl' |
| ... | @@ -84,6 +90,7 @@ def aggregate_song_scores(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: | ... | @@ -84,6 +90,7 @@ def aggregate_song_scores(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: |
| 84 | 90 | ||
| 85 | 91 | ||
| 86 | def reset_schema(conn: psycopg.Connection, schema: str) -> None: | 92 | def reset_schema(conn: psycopg.Connection, schema: str) -> None: |
| 93 | schema = validate_schema(schema) | ||
| 87 | conn.execute(f'DROP SCHEMA IF EXISTS {schema} CASCADE;') | 94 | conn.execute(f'DROP SCHEMA IF EXISTS {schema} CASCADE;') |
| 88 | conn.execute(f'CREATE SCHEMA {schema};') | 95 | conn.execute(f'CREATE SCHEMA {schema};') |
| 89 | conn.execute(f'SET search_path TO {schema}, public;') | 96 | conn.execute(f'SET search_path TO {schema}, public;') |
| ... | @@ -455,16 +462,17 @@ def main() -> None: | ... | @@ -455,16 +462,17 @@ def main() -> None: |
| 455 | ap.add_argument('--topk', type=int, default=10) | 462 | ap.add_argument('--topk', type=int, default=10) |
| 456 | ap.add_argument('--reset-schema', action='store_true') | 463 | ap.add_argument('--reset-schema', action='store_true') |
| 457 | args = ap.parse_args() | 464 | args = ap.parse_args() |
| 465 | schema = validate_schema(args.schema) | ||
| 458 | 466 | ||
| 459 | refs = load_jsonl(Path(args.reference_embeddings_jsonl)) | 467 | refs = load_jsonl(Path(args.reference_embeddings_jsonl)) |
| 460 | queries = load_jsonl(Path(args.query_embeddings_jsonl)) | 468 | queries = load_jsonl(Path(args.query_embeddings_jsonl)) |
| 461 | 469 | ||
| 462 | with psycopg.connect(args.dsn, autocommit=True) as conn: | 470 | with psycopg.connect(args.dsn, autocommit=True) as conn: |
| 463 | if args.reset_schema: | 471 | if args.reset_schema: |
| 464 | reset_schema(conn, args.schema) | 472 | reset_schema(conn, schema) |
| 465 | else: | 473 | else: |
| 466 | conn.execute(f'CREATE SCHEMA IF NOT EXISTS {args.schema};') | 474 | conn.execute(f'CREATE SCHEMA IF NOT EXISTS {schema};') |
| 467 | conn.execute(f'SET search_path TO {args.schema}, public;') | 475 | conn.execute(f'SET search_path TO {schema}, public;') |
| 468 | apply_schema(conn, Path(args.schema_sql)) | 476 | apply_schema(conn, Path(args.schema_sql)) |
| 469 | model_id, feature_set_id, reference_set_id, retrieval_index_id = seed_registry(conn) | 477 | model_id, feature_set_id, reference_set_id, retrieval_index_id = seed_registry(conn) |
| 470 | entities = ingest_references(conn, refs, feature_set_id, reference_set_id) | 478 | entities = ingest_references(conn, refs, feature_set_id, reference_set_id) |
| ... | @@ -483,7 +491,7 @@ def main() -> None: | ... | @@ -483,7 +491,7 @@ def main() -> None: |
| 483 | } | 491 | } |
| 484 | 492 | ||
| 485 | payload = { | 493 | payload = { |
| 486 | 'schema': args.schema, | 494 | 'schema': schema, |
| 487 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', | 495 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', |
| 488 | 'input': { | 496 | 'input': { |
| 489 | 'reference_embeddings_jsonl': args.reference_embeddings_jsonl, | 497 | 'reference_embeddings_jsonl': args.reference_embeddings_jsonl, | ... | ... |
| ... | @@ -4,11 +4,17 @@ from __future__ import annotations | ... | @@ -4,11 +4,17 @@ from __future__ import annotations |
| 4 | import argparse | 4 | import argparse |
| 5 | import json | 5 | import json |
| 6 | from pathlib import Path | 6 | from pathlib import Path |
| 7 | import sys | ||
| 7 | from typing import Any | 8 | from typing import Any |
| 8 | 9 | ||
| 9 | import psycopg | 10 | import psycopg |
| 10 | 11 | ||
| 11 | ROOT = Path(__file__).resolve().parents[1] | 12 | ROOT = Path(__file__).resolve().parents[1] |
| 13 | if str(ROOT) not in sys.path: | ||
| 14 | sys.path.insert(0, str(ROOT)) | ||
| 15 | |||
| 16 | from workers._job_common import validate_schema | ||
| 17 | |||
| 12 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_plan_report.json' | 18 | DEFAULT_OUTPUT = ROOT / 'data' / 'pgvector_eval' / 'music20' / 'phase1_extraction_plan_report.json' |
| 13 | 19 | ||
| 14 | LANE_PRIORITY = { | 20 | LANE_PRIORITY = { |
| ... | @@ -16,6 +22,7 @@ LANE_PRIORITY = { | ... | @@ -16,6 +22,7 @@ LANE_PRIORITY = { |
| 16 | 'semantic': 1, | 22 | 'semantic': 1, |
| 17 | 'cover': 2, | 23 | 'cover': 2, |
| 18 | } | 24 | } |
| 25 | PYTHON_BIN = '/usr/local/miniconda3/bin/python' | ||
| 19 | 26 | ||
| 20 | 27 | ||
| 21 | def parse_target_scope(target_scope: str) -> dict[str, Any]: | 28 | def parse_target_scope(target_scope: str) -> dict[str, Any]: |
| ... | @@ -26,8 +33,10 @@ def parse_target_scope(target_scope: str) -> dict[str, Any]: | ... | @@ -26,8 +33,10 @@ def parse_target_scope(target_scope: str) -> dict[str, Any]: |
| 26 | 33 | ||
| 27 | 34 | ||
| 28 | def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: | 35 | def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: |
| 36 | command_prefix = 'cd /workspace/acr-engine && ' | ||
| 29 | base_env = ( | 37 | base_env = ( |
| 30 | 'PG_DSN="${PG_DSN:?set PG_DSN}" ' | 38 | command_prefix |
| 39 | + 'PG_DSN="${PG_DSN:?set PG_DSN}" ' | ||
| 31 | f"EXTRACTION_JOB_ID={job['extraction_job_id']} " | 40 | f"EXTRACTION_JOB_ID={job['extraction_job_id']} " |
| 32 | f"FEATURE_SET_ID={job['feature_set_id']} " | 41 | f"FEATURE_SET_ID={job['feature_set_id']} " |
| 33 | f"TARGET_SCOPE='{job['target_scope']}' " | 42 | f"TARGET_SCOPE='{job['target_scope']}' " |
| ... | @@ -37,16 +46,16 @@ def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: | ... | @@ -37,16 +46,16 @@ def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: |
| 37 | if job['lane'] == 'exact': | 46 | if job['lane'] == 'exact': |
| 38 | commands.append( | 47 | commands.append( |
| 39 | base_env | 48 | base_env |
| 40 | + " OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py" | 49 | + f" OUTPUT_TARGET=audio_fingerprint \\\n{PYTHON_BIN} workers/run_chromaprint_job.py --complete-dry-run" |
| 41 | ) | 50 | ) |
| 42 | else: | 51 | else: |
| 43 | commands.append( | 52 | commands.append( |
| 44 | base_env | 53 | base_env |
| 45 | + f" MODEL_NAME={job['model_name']} MODEL_VERSION={job['model_version']} VECTOR_TABLE={job['vector_table']} OUTPUT_TARGET={job['physical_target']} \\\npython workers/run_embedding_job.py" | 54 | + f" MODEL_NAME={job['model_name']} MODEL_VERSION={job['model_version']} VECTOR_TABLE={job['vector_table']} OUTPUT_TARGET={job['physical_target']} \\\n{PYTHON_BIN} workers/run_embedding_job.py --complete-dry-run" |
| 46 | ) | 55 | ) |
| 47 | commands.append( | 56 | commands.append( |
| 48 | base_env | 57 | base_env |
| 49 | + " \\\npython workers/mark_job_status.py --status running" | 58 | + f" \\\n{PYTHON_BIN} workers/mark_job_status.py --status running --expected-status pending" |
| 50 | ) | 59 | ) |
| 51 | return commands | 60 | return commands |
| 52 | 61 | ||
| ... | @@ -59,8 +68,9 @@ def main() -> None: | ... | @@ -59,8 +68,9 @@ def main() -> None: |
| 59 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) | 68 | ap.add_argument('--output', default=str(DEFAULT_OUTPUT)) |
| 60 | args = ap.parse_args() | 69 | args = ap.parse_args() |
| 61 | 70 | ||
| 71 | schema = validate_schema(args.schema) | ||
| 62 | with psycopg.connect(args.dsn) as conn: | 72 | with psycopg.connect(args.dsn) as conn: |
| 63 | conn.execute(f'SET search_path TO {args.schema}, public;') | 73 | conn.execute(f'SET search_path TO {schema}, public;') |
| 64 | rows = conn.execute( | 74 | rows = conn.execute( |
| 65 | """ | 75 | """ |
| 66 | SELECT | 76 | SELECT |
| ... | @@ -138,7 +148,7 @@ def main() -> None: | ... | @@ -138,7 +148,7 @@ def main() -> None: |
| 138 | f"target scope: {row[2]}", | 148 | f"target scope: {row[2]}", |
| 139 | ], | 149 | ], |
| 140 | } | 150 | } |
| 141 | item['command_suggestions'] = build_command_suggestions(item, args.schema) | 151 | item['command_suggestions'] = build_command_suggestions(item, schema) |
| 142 | jobs.append(item) | 152 | jobs.append(item) |
| 143 | by_lane.setdefault(lane, []).append(item) | 153 | by_lane.setdefault(lane, []).append(item) |
| 144 | 154 | ||
| ... | @@ -147,7 +157,7 @@ def main() -> None: | ... | @@ -147,7 +157,7 @@ def main() -> None: |
| 147 | lane_jobs.sort(key=lambda x: x['extraction_job_id']) | 157 | lane_jobs.sort(key=lambda x: x['extraction_job_id']) |
| 148 | 158 | ||
| 149 | payload = { | 159 | payload = { |
| 150 | 'schema': args.schema, | 160 | 'schema': schema, |
| 151 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', | 161 | 'dsn_redacted': 'postgres://d2:***@127.0.0.1:5432/d2', |
| 152 | 'job_status_filter': args.job_status, | 162 | 'job_status_filter': args.job_status, |
| 153 | 'counts': { | 163 | 'counts': { | ... | ... |
| ... | @@ -10,6 +10,7 @@ from typing import Any | ... | @@ -10,6 +10,7 @@ from typing import Any |
| 10 | import psycopg | 10 | import psycopg |
| 11 | 11 | ||
| 12 | SCHEMA_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$') | 12 | SCHEMA_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$') |
| 13 | ALLOWED_JOB_STATUSES = {'pending', 'running', 'completed', 'failed'} | ||
| 13 | 14 | ||
| 14 | 15 | ||
| 15 | @dataclass | 16 | @dataclass |
| ... | @@ -177,6 +178,7 @@ def update_job_status( | ... | @@ -177,6 +178,7 @@ def update_job_status( |
| 177 | extraction_job_id: int, | 178 | extraction_job_id: int, |
| 178 | *, | 179 | *, |
| 179 | status: str, | 180 | status: str, |
| 181 | expected_status: str | None = None, | ||
| 180 | input_count: int | None = None, | 182 | input_count: int | None = None, |
| 181 | output_count: int | None = None, | 183 | output_count: int | None = None, |
| 182 | log_uri: str | None = None, | 184 | log_uri: str | None = None, |
| ... | @@ -184,6 +186,10 @@ def update_job_status( | ... | @@ -184,6 +186,10 @@ def update_job_status( |
| 184 | set_started_at: bool = False, | 186 | set_started_at: bool = False, |
| 185 | set_finished_at: bool = False, | 187 | set_finished_at: bool = False, |
| 186 | ) -> dict[str, Any]: | 188 | ) -> dict[str, Any]: |
| 189 | if status not in ALLOWED_JOB_STATUSES: | ||
| 190 | raise SystemExit(f'invalid job status: {status}') | ||
| 191 | if expected_status is not None and expected_status not in ALLOWED_JOB_STATUSES: | ||
| 192 | raise SystemExit(f'invalid expected job status: {expected_status}') | ||
| 187 | patch = json.dumps(metadata_patch or {}, ensure_ascii=False) | 193 | patch = json.dumps(metadata_patch or {}, ensure_ascii=False) |
| 188 | row = conn.execute( | 194 | row = conn.execute( |
| 189 | """ | 195 | """ |
| ... | @@ -197,11 +203,12 @@ def update_job_status( | ... | @@ -197,11 +203,12 @@ def update_job_status( |
| 197 | ELSE started_at | 203 | ELSE started_at |
| 198 | END, | 204 | END, |
| 199 | finished_at = CASE | 205 | finished_at = CASE |
| 200 | WHEN %s THEN NOW() | 206 | WHEN %s THEN COALESCE(finished_at, NOW()) |
| 201 | ELSE finished_at | 207 | ELSE finished_at |
| 202 | END, | 208 | END, |
| 203 | metadata_json = COALESCE(metadata_json, '{}'::jsonb) || %s::jsonb | 209 | metadata_json = COALESCE(metadata_json, '{}'::jsonb) || %s::jsonb |
| 204 | WHERE extraction_job_id = %s | 210 | WHERE extraction_job_id = %s |
| 211 | AND (%s OR job_status = %s) | ||
| 205 | RETURNING extraction_job_id, job_status, input_count, output_count, started_at, finished_at, log_uri, metadata_json; | 212 | RETURNING extraction_job_id, job_status, input_count, output_count, started_at, finished_at, log_uri, metadata_json; |
| 206 | """, | 213 | """, |
| 207 | ( | 214 | ( |
| ... | @@ -213,10 +220,13 @@ def update_job_status( | ... | @@ -213,10 +220,13 @@ def update_job_status( |
| 213 | set_finished_at, | 220 | set_finished_at, |
| 214 | patch, | 221 | patch, |
| 215 | extraction_job_id, | 222 | extraction_job_id, |
| 223 | expected_status is None, | ||
| 224 | expected_status, | ||
| 216 | ), | 225 | ), |
| 217 | ).fetchone() | 226 | ).fetchone() |
| 218 | if not row: | 227 | if not row: |
| 219 | raise SystemExit(f'failed to update feature_extraction_job={extraction_job_id}') | 228 | expectation = f' with expected_status={expected_status}' if expected_status else '' |
| 229 | raise SystemExit(f'failed to update feature_extraction_job={extraction_job_id}{expectation}') | ||
| 220 | return { | 230 | return { |
| 221 | 'extraction_job_id': int(row[0]), | 231 | 'extraction_job_id': int(row[0]), |
| 222 | 'job_status': row[1], | 232 | 'job_status': row[1], | ... | ... |
| ... | @@ -5,15 +5,16 @@ import argparse | ... | @@ -5,15 +5,16 @@ import argparse |
| 5 | import json | 5 | import json |
| 6 | import os | 6 | import os |
| 7 | 7 | ||
| 8 | from _job_common import connect, emit_payload, require_env, update_job_status | 8 | from _job_common import ALLOWED_JOB_STATUSES, connect, emit_payload, update_job_status |
| 9 | 9 | ||
| 10 | 10 | ||
| 11 | def main() -> None: | 11 | def main() -> None: |
| 12 | ap = argparse.ArgumentParser() | 12 | ap = argparse.ArgumentParser() |
| 13 | ap.add_argument('--dsn', default=os.environ.get('PG_DSN')) | 13 | ap.add_argument('--dsn', default=os.environ.get('PG_DSN')) |
| 14 | ap.add_argument('--schema', default=os.environ.get('PG_SCHEMA', 'acr_test')) | 14 | ap.add_argument('--schema', default=os.environ.get('PG_SCHEMA', 'acr_test')) |
| 15 | ap.add_argument('--job-id', type=int, default=int(require_env('EXTRACTION_JOB_ID', '0'))) | 15 | ap.add_argument('--job-id', type=int) |
| 16 | ap.add_argument('--status', required=True) | 16 | ap.add_argument('--status', required=True, choices=sorted(ALLOWED_JOB_STATUSES)) |
| 17 | ap.add_argument('--expected-status', choices=sorted(ALLOWED_JOB_STATUSES)) | ||
| 17 | ap.add_argument('--input-count', type=int) | 18 | ap.add_argument('--input-count', type=int) |
| 18 | ap.add_argument('--output-count', type=int) | 19 | ap.add_argument('--output-count', type=int) |
| 19 | ap.add_argument('--log-uri') | 20 | ap.add_argument('--log-uri') |
| ... | @@ -25,7 +26,8 @@ def main() -> None: | ... | @@ -25,7 +26,8 @@ def main() -> None: |
| 25 | 26 | ||
| 26 | if not args.dsn: | 27 | if not args.dsn: |
| 27 | raise SystemExit('missing --dsn or PG_DSN') | 28 | raise SystemExit('missing --dsn or PG_DSN') |
| 28 | if not args.job_id: | 29 | job_id = args.job_id or int(os.environ.get('EXTRACTION_JOB_ID', '0')) |
| 30 | if not job_id: | ||
| 29 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') | 31 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') |
| 30 | 32 | ||
| 31 | metadata_patch = json.loads(args.metadata_json) if args.metadata_json else {} | 33 | metadata_patch = json.loads(args.metadata_json) if args.metadata_json else {} |
| ... | @@ -33,8 +35,9 @@ def main() -> None: | ... | @@ -33,8 +35,9 @@ def main() -> None: |
| 33 | with connect(args.dsn, args.schema) as conn: | 35 | with connect(args.dsn, args.schema) as conn: |
| 34 | updated = update_job_status( | 36 | updated = update_job_status( |
| 35 | conn, | 37 | conn, |
| 36 | args.job_id, | 38 | job_id, |
| 37 | status=args.status, | 39 | status=args.status, |
| 40 | expected_status=args.expected_status, | ||
| 38 | input_count=args.input_count, | 41 | input_count=args.input_count, |
| 39 | output_count=args.output_count, | 42 | output_count=args.output_count, |
| 40 | log_uri=args.log_uri, | 43 | log_uri=args.log_uri, |
| ... | @@ -47,6 +50,7 @@ def main() -> None: | ... | @@ -47,6 +50,7 @@ def main() -> None: |
| 47 | { | 50 | { |
| 48 | 'worker': 'mark_job_status', | 51 | 'worker': 'mark_job_status', |
| 49 | 'schema': args.schema, | 52 | 'schema': args.schema, |
| 53 | 'job_id': job_id, | ||
| 50 | 'update': updated, | 54 | 'update': updated, |
| 51 | }, | 55 | }, |
| 52 | args.output, | 56 | args.output, | ... | ... |
| ... | @@ -31,6 +31,7 @@ def main() -> None: | ... | @@ -31,6 +31,7 @@ def main() -> None: |
| 31 | conn, | 31 | conn, |
| 32 | job.extraction_job_id, | 32 | job.extraction_job_id, |
| 33 | status='running', | 33 | status='running', |
| 34 | expected_status='pending', | ||
| 34 | input_count=scope['ready_asset_count'], | 35 | input_count=scope['ready_asset_count'], |
| 35 | metadata_patch={ | 36 | metadata_patch={ |
| 36 | 'worker': 'run_chromaprint_job', | 37 | 'worker': 'run_chromaprint_job', |
| ... | @@ -47,6 +48,7 @@ def main() -> None: | ... | @@ -47,6 +48,7 @@ def main() -> None: |
| 47 | conn, | 48 | conn, |
| 48 | job.extraction_job_id, | 49 | job.extraction_job_id, |
| 49 | status='completed', | 50 | status='completed', |
| 51 | expected_status='running', | ||
| 50 | output_count=0, | 52 | output_count=0, |
| 51 | metadata_patch={ | 53 | metadata_patch={ |
| 52 | 'worker': 'run_chromaprint_job', | 54 | 'worker': 'run_chromaprint_job', | ... | ... |
| ... | @@ -34,6 +34,11 @@ def main() -> None: | ... | @@ -34,6 +34,11 @@ def main() -> None: |
| 34 | job = fetch_job_context(conn, args.job_id) | 34 | job = fetch_job_context(conn, args.job_id) |
| 35 | if job.model_name == 'chromaprint': | 35 | if job.model_name == 'chromaprint': |
| 36 | raise SystemExit(f'feature_extraction_job={args.job_id} is not an embedding job') | 36 | raise SystemExit(f'feature_extraction_job={args.job_id} is not an embedding job') |
| 37 | if job.feature_name != 'semantic_embedding' or job.feature_level != 'window': | ||
| 38 | raise SystemExit( | ||
| 39 | f'feature_extraction_job={args.job_id} does not match embedding feature contract: ' | ||
| 40 | f'{job.feature_name}/{job.feature_level}' | ||
| 41 | ) | ||
| 37 | if args.model_name and job.model_name != args.model_name: | 42 | if args.model_name and job.model_name != args.model_name: |
| 38 | raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') | 43 | raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') |
| 39 | if args.model_version and job.model_version != args.model_version: | 44 | if args.model_version and job.model_version != args.model_version: |
| ... | @@ -44,6 +49,7 @@ def main() -> None: | ... | @@ -44,6 +49,7 @@ def main() -> None: |
| 44 | conn, | 49 | conn, |
| 45 | job.extraction_job_id, | 50 | job.extraction_job_id, |
| 46 | status='running', | 51 | status='running', |
| 52 | expected_status='pending', | ||
| 47 | input_count=scope['active_window_count'] or scope['ready_asset_count'], | 53 | input_count=scope['active_window_count'] or scope['ready_asset_count'], |
| 48 | metadata_patch={ | 54 | metadata_patch={ |
| 49 | 'worker': 'run_embedding_job', | 55 | 'worker': 'run_embedding_job', |
| ... | @@ -61,6 +67,7 @@ def main() -> None: | ... | @@ -61,6 +67,7 @@ def main() -> None: |
| 61 | conn, | 67 | conn, |
| 62 | job.extraction_job_id, | 68 | job.extraction_job_id, |
| 63 | status='completed', | 69 | status='completed', |
| 70 | expected_status='running', | ||
| 64 | output_count=0, | 71 | output_count=0, |
| 65 | metadata_patch={ | 72 | metadata_patch={ |
| 66 | 'worker': 'run_embedding_job', | 73 | 'worker': 'run_embedding_job', | ... | ... |
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `bootstrap_phase1_reference_members_live.py` 与 `phase1_reference_member_bootstrap_report.json`,把 `acr_test` 中 `recording.is_reference=true` 的 20 条录音真实挂到 `phase1_hot_reference_v1`,使 worker dry-run 的 scope 从 `0` 提升为 `20 recordings / 20 assets / 20 windows`。 | ||
| 4 | - 根据 architect 复核修正 worker contract:`mark_job_status.py` 现支持真正的“CLI 覆盖 env”并限制状态白名单;`_job_common.update_job_status()` 新增前置状态约束并防止 `finished_at` 被重复覆盖;`bootstrap_phase1_extraction_jobs_live.py` 在恢复 pending 时会清空旧时间戳与计数;`run_embedding_job.py` 对 embedding job 契约做了更严格校验。 | ||
| 5 | - 修正 `plan_phase1_extraction_jobs_live.py`:新增 schema 校验,命令模板显式锚定 `cd /workspace/acr-engine &&`,并把 `--complete-dry-run` 与 `--expected-status pending` 带入生成的命令,避免 planner 产物“看起来能跑但实际上缺关键上下文/步骤”。 | ||
| 6 | - 新增 `phase1_worker_double_claim_guard_report.json`,通过对同一 chromaprint job 连续执行两次 dry-run 验证前置状态保护已生效:第二次执行被 `expected_status=pending` 明确拒绝。 | ||
| 7 | - 把 `validate_schema()` 统一推广到 `bootstrap_phase1_model_registry_live.py`、`bootstrap_phase1_reference_members_live.py`、`bootstrap_phase1_extraction_jobs_live.py` 与 `live_pgvector_music20_eval.py`,补齐整个 PostgreSQL live CLI 链的 schema 参数保护。 | ||
| 3 | - 新增 [Phase-1 Worker Contract](./phase1-worker-contract.md) 与 `acr-engine/workers/_job_common.py`、`mark_job_status.py`、`run_chromaprint_job.py`、`run_embedding_job.py`,把 Phase-1 从“只有 planner 命令模板”推进到“worker 可以真实消费 PostgreSQL 的 `feature_extraction_job` 并执行 `pending -> running -> completed` dry-run 状态流转”的阶段。 | 8 | - 新增 [Phase-1 Worker Contract](./phase1-worker-contract.md) 与 `acr-engine/workers/_job_common.py`、`mark_job_status.py`、`run_chromaprint_job.py`、`run_embedding_job.py`,把 Phase-1 从“只有 planner 命令模板”推进到“worker 可以真实消费 PostgreSQL 的 `feature_extraction_job` 并执行 `pending -> running -> completed` dry-run 状态流转”的阶段。 |
| 4 | - 新增 `phase1_worker_chromaprint_dry_run.json`、`phase1_worker_embedding_dry_run.json` 与 `phase1_worker_mark_pending_report.json`,并在 live PostgreSQL `acr_test` 上验证了 worker 状态流转;同时确认当前 `phase1_hot_reference_v1` 还没有实际 members,因此 scope 计数为 `0`,这是数据未装载而不是 worker 失败。 | 9 | - 新增 `phase1_worker_chromaprint_dry_run.json`、`phase1_worker_embedding_dry_run.json` 与 `phase1_worker_mark_pending_report.json`,并在 live PostgreSQL `acr_test` 上验证了 worker 状态流转;初次 dry-run 曾暴露 `phase1_hot_reference_v1` 缺少实际 members,随后已在同日补齐到 `20` 个 members。 |
| 5 | - 修正 `plan_phase1_extraction_jobs_live.py` 的命令模板,把 `PG_DSN="${PG_DSN:?set PG_DSN}"` 显式写入 `command_suggestions / primary_command`,避免 planner 产物看起来可跑但实际缺少数据库连接串。 | 10 | - 修正 `plan_phase1_extraction_jobs_live.py` 的命令模板,把 `PG_DSN="${PG_DSN:?set PG_DSN}"` 显式写入 `command_suggestions / primary_command`,避免 planner 产物看起来可跑但实际缺少数据库连接串。 |
| 6 | - 更新 `plan_phase1_extraction_jobs_live.py` 与 `phase1_extraction_plan_report.json`,把 Phase-1 execution plan 从“仅有排序计划”推进到“附带 `command_suggestions / primary_command` 的可复制执行命令模板”。 | 11 | - 更新 `plan_phase1_extraction_jobs_live.py` 与 `phase1_extraction_plan_report.json`,把 Phase-1 execution plan 从“仅有排序计划”推进到“附带 `command_suggestions / primary_command` 的可复制执行命令模板”。 |
| 7 | - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。 | 12 | - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。 | ... | ... |
| ... | @@ -223,6 +223,7 @@ flowchart TD | ... | @@ -223,6 +223,7 @@ flowchart TD |
| 223 | 223 | ||
| 224 | 当前已经不只是 registry/bootstrap 了,还补上了最小真实 worker 执行面: | 224 | 当前已经不只是 registry/bootstrap 了,还补上了最小真实 worker 执行面: |
| 225 | 225 | ||
| 226 | - `acr-engine/scripts/bootstrap_phase1_reference_members_live.py` | ||
| 226 | - `acr-engine/workers/mark_job_status.py` | 227 | - `acr-engine/workers/mark_job_status.py` |
| 227 | - `acr-engine/workers/run_chromaprint_job.py` | 228 | - `acr-engine/workers/run_chromaprint_job.py` |
| 228 | - `acr-engine/workers/run_embedding_job.py` | 229 | - `acr-engine/workers/run_embedding_job.py` |
| ... | @@ -437,15 +438,15 @@ cd /workspace/acr-engine | ... | @@ -437,15 +438,15 @@ cd /workspace/acr-engine |
| 437 | #### exact lane | 438 | #### exact lane |
| 438 | 439 | ||
| 439 | ```bash | 440 | ```bash |
| 440 | EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \ | 441 | cd /workspace/acr-engine && PG_DSN="${PG_DSN:?set PG_DSN}" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \ |
| 441 | python workers/run_chromaprint_job.py | 442 | /usr/local/miniconda3/bin/python workers/run_chromaprint_job.py --complete-dry-run |
| 442 | ``` | 443 | ``` |
| 443 | 444 | ||
| 444 | #### semantic lane | 445 | #### semantic lane |
| 445 | 446 | ||
| 446 | ```bash | 447 | ```bash |
| 447 | EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \ | 448 | cd /workspace/acr-engine && PG_DSN="${PG_DSN:?set PG_DSN}" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \ |
| 448 | python workers/run_embedding_job.py | 449 | /usr/local/miniconda3/bin/python workers/run_embedding_job.py --complete-dry-run |
| 449 | ``` | 450 | ``` |
| 450 | 451 | ||
| 451 | 这意味着下个 session 不需要先手工拼环境变量和 job 绑定关系,而可以直接从 planner 报告里复制命令模板。 | 452 | 这意味着下个 session 不需要先手工拼环境变量和 job 绑定关系,而可以直接从 planner 报告里复制命令模板。 | ... | ... |
| ... | @@ -26,6 +26,7 @@ | ... | @@ -26,6 +26,7 @@ |
| 26 | 26 | ||
| 27 | 位于: | 27 | 位于: |
| 28 | 28 | ||
| 29 | - `acr-engine/scripts/bootstrap_phase1_reference_members_live.py` | ||
| 29 | - `acr-engine/workers/mark_job_status.py` | 30 | - `acr-engine/workers/mark_job_status.py` |
| 30 | - `acr-engine/workers/run_chromaprint_job.py` | 31 | - `acr-engine/workers/run_chromaprint_job.py` |
| 31 | - `acr-engine/workers/run_embedding_job.py` | 32 | - `acr-engine/workers/run_embedding_job.py` |
| ... | @@ -40,6 +41,21 @@ | ... | @@ -40,6 +41,21 @@ |
| 40 | | `run_embedding_job.py` | semantic lane worker | | 41 | | `run_embedding_job.py` | semantic lane worker | |
| 41 | | `_job_common.py` | 共享的 job 读取、scope 解析、状态回写逻辑 | | 42 | | `_job_common.py` | 共享的 job 读取、scope 解析、状态回写逻辑 | |
| 42 | 43 | ||
| 44 | ### 配套 bootstrap | ||
| 45 | |||
| 46 | 为了让 worker 不再面对空 scope,这轮还补上了: | ||
| 47 | |||
| 48 | - `acr-engine/scripts/bootstrap_phase1_reference_members_live.py` | ||
| 49 | |||
| 50 | 它会把当前 `recording.is_reference = true` 的录音挂到: | ||
| 51 | |||
| 52 | - `phase1_hot_reference_v1` | ||
| 53 | |||
| 54 | 这样 worker 可以真实看到: | ||
| 55 | - `recording_count` | ||
| 56 | - `ready_asset_count` | ||
| 57 | - `active_window_count` | ||
| 58 | |||
| 43 | --- | 59 | --- |
| 44 | 60 | ||
| 45 | ## 2. 当前状态机 | 61 | ## 2. 当前状态机 |
| ... | @@ -56,6 +72,30 @@ flowchart LR | ... | @@ -56,6 +72,30 @@ flowchart LR |
| 56 | - `pending -> running` | 72 | - `pending -> running` |
| 57 | - `running -> completed`(dry-run 模式) | 73 | - `running -> completed`(dry-run 模式) |
| 58 | 74 | ||
| 75 | ### 当前状态保护 | ||
| 76 | |||
| 77 | - worker 认领 job 时要求前置状态为 `pending` | ||
| 78 | - worker 完成 job 时要求前置状态为 `running` | ||
| 79 | - `mark_job_status.py` 只接受: | ||
| 80 | - `pending` | ||
| 81 | - `running` | ||
| 82 | - `completed` | ||
| 83 | - `failed` | ||
| 84 | - `finished_at` 只在首次完成时落值,不再被重复覆盖 | ||
| 85 | |||
| 86 | ### 已验证的 guard 行为 | ||
| 87 | |||
| 88 | 当前已真实验证: | ||
| 89 | |||
| 90 | 1. 同一 chromaprint job 第一次 dry-run: | ||
| 91 | - 成功 `pending -> running -> completed` | ||
| 92 | 2. 不做 reset,直接第二次执行同一 job: | ||
| 93 | - 被前置状态保护拒绝 | ||
| 94 | |||
| 95 | 对应证据: | ||
| 96 | |||
| 97 | - `acr-engine/data/pgvector_eval/music20/phase1_worker_double_claim_guard_report.json` | ||
| 98 | |||
| 59 | ### 设计意图 | 99 | ### 设计意图 |
| 60 | 100 | ||
| 61 | 先把 **作业契约与状态流转** 固定住,再把真正的模型推理塞进去。 | 101 | 先把 **作业契约与状态流转** 固定住,再把真正的模型推理塞进去。 |
| ... | @@ -94,11 +134,31 @@ flowchart LR | ... | @@ -94,11 +134,31 @@ flowchart LR |
| 94 | `plan_phase1_extraction_jobs_live.py` 现在会显式生成: | 134 | `plan_phase1_extraction_jobs_live.py` 现在会显式生成: |
| 95 | 135 | ||
| 96 | ```bash | 136 | ```bash |
| 97 | PG_DSN="${PG_DSN:?set PG_DSN}" ... | 137 | cd /workspace/acr-engine && PG_DSN="${PG_DSN:?set PG_DSN}" ... |
| 98 | ``` | 138 | ``` |
| 99 | 139 | ||
| 100 | 这样复制命令时,如果调用方忘了提供数据库连接串,会立刻失败,而不是静默跑空。 | 140 | 这样复制命令时,如果调用方忘了提供数据库连接串,会立刻失败,而不是静默跑空。 |
| 101 | 141 | ||
| 142 | 当前 planner 还会显式使用: | ||
| 143 | |||
| 144 | ```bash | ||
| 145 | /usr/local/miniconda3/bin/python | ||
| 146 | ``` | ||
| 147 | |||
| 148 | 原因是当前环境里 `python` 不在 PATH 上,但这个解释器路径已被验证可用。 | ||
| 149 | |||
| 150 | 对于当前 dry-run worker,planner 的主命令模板也会显式带上: | ||
| 151 | |||
| 152 | ```bash | ||
| 153 | --complete-dry-run | ||
| 154 | ``` | ||
| 155 | |||
| 156 | 这样 `primary_command` 就能直接复现: | ||
| 157 | |||
| 158 | ```text | ||
| 159 | pending -> running -> completed | ||
| 160 | ``` | ||
| 161 | |||
| 102 | --- | 162 | --- |
| 103 | 163 | ||
| 104 | ## 4. PostgreSQL 读取契约 | 164 | ## 4. PostgreSQL 读取契约 | ... | ... |
| ... | @@ -69,9 +69,11 @@ | ... | @@ -69,9 +69,11 @@ |
| 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | | 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | |
| 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | | 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | |
| 71 | | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | | 71 | | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | |
| 72 | | reference member bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_reference_member_bootstrap_report.json` | | ||
| 72 | | chromaprint worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_chromaprint_dry_run.json` | | 73 | | chromaprint worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_chromaprint_dry_run.json` | |
| 73 | | embedding worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_dry_run.json` | | 74 | | embedding worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_dry_run.json` | |
| 74 | | job status 手工回写报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_mark_pending_report.json` | | 75 | | job status 手工回写报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_mark_pending_report.json` | |
| 76 | | double-claim guard 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_double_claim_guard_report.json` | | ||
| 75 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | | 77 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | |
| 76 | 78 | ||
| 77 | --- | 79 | --- |
| ... | @@ -309,6 +311,13 @@ flowchart TD | ... | @@ -309,6 +311,13 @@ flowchart TD |
| 309 | - 把 job 状态恢复为 `pending` | 311 | - 把 job 状态恢复为 `pending` |
| 310 | - 保证后续 session 可以从同一批 jobs 继续推进 | 312 | - 保证后续 session 可以从同一批 jobs 继续推进 |
| 311 | 313 | ||
| 314 | 4. `plan_phase1_extraction_jobs_live.py` | ||
| 315 | - 当前生成的主命令模板已显式带: | ||
| 316 | - `cd /workspace/acr-engine &&` | ||
| 317 | - `PG_DSN="${PG_DSN:?set PG_DSN}"` | ||
| 318 | - `--complete-dry-run` | ||
| 319 | - 因此 `primary_command` 已经可以直接复现当前 dry-run 状态流转 | ||
| 320 | |||
| 312 | ### 为什么先做 dry-run | 321 | ### 为什么先做 dry-run |
| 313 | 322 | ||
| 314 | 因为当前第一优先级是把下面这些东西固定住: | 323 | 因为当前第一优先级是把下面这些东西固定住: |
| ... | @@ -324,21 +333,51 @@ flowchart TD | ... | @@ -324,21 +333,51 @@ flowchart TD |
| 324 | 333 | ||
| 325 | 接进去,整体风险更低。 | 334 | 接进去,整体风险更低。 |
| 326 | 335 | ||
| 327 | ### 当前 live 结果的一个关键解释 | 336 | ### 当前 live 结果的关键更新 |
| 337 | |||
| 338 | 本轮已经新增: | ||
| 328 | 339 | ||
| 329 | 本轮 worker dry-run 里,`phase1_hot_reference_v1` 已经存在,但在 `acr_test` schema 里**还没有实际 member**,所以: | 340 | - `acr-engine/scripts/bootstrap_phase1_reference_members_live.py` |
| 330 | 341 | ||
| 331 | - `recording_count=0` | 342 | 并已把 `acr_test.phase1_hot_reference_v1` 真实挂上 `20` 条 reference recordings,因此当前 worker dry-run 看到的 scope 已变成: |
| 332 | - `ready_asset_count=0` | ||
| 333 | - `active_window_count=0` | ||
| 334 | 343 | ||
| 335 | 这不是 worker 异常,而是当前 Phase-1 live 数据面还没把业务 reference recordings 真实装进去。 | 344 | - `recording_count=20` |
| 336 | 因此这轮验证证明的是: | 345 | - `ready_asset_count=20` |
| 346 | - `active_window_count=20` | ||
| 347 | |||
| 348 | 这说明当前验证已经从“空 scope 状态机演示”推进到: | ||
| 337 | 349 | ||
| 338 | - planner -> worker 命令兼容 | 350 | - planner -> worker 命令兼容 |
| 339 | - worker -> PostgreSQL 状态流转可用 | 351 | - worker -> PostgreSQL 状态流转可用 |
| 352 | - reference_set -> recording/asset/window scope 解析可用 | ||
| 353 | |||
| 354 | 仍然要注意: | ||
| 355 | |||
| 356 | - 这依然是 **dry-run** | ||
| 357 | - 还**不是**真实特征抽取吞吐验证 | ||
| 358 | |||
| 359 | ### 当前并发/重试保护验证 | ||
| 360 | |||
| 361 | 本轮还额外做了一个故意的重复执行测试: | ||
| 362 | |||
| 363 | 1. 先让 `feature_extraction_job=1` 从 `pending -> running -> completed` | ||
| 364 | 2. 不做 reset,直接再次执行同一个 chromaprint dry-run worker | ||
| 365 | 3. 预期第二次执行失败,因为 worker 认领 job 时要求: | ||
| 366 | - `expected_status = pending` | ||
| 367 | |||
| 368 | 实际结果见: | ||
| 369 | |||
| 370 | - `phase1_worker_double_claim_guard_report.json` | ||
| 371 | |||
| 372 | 关键证据: | ||
| 373 | |||
| 374 | - `double_claim_exit_code = 1` | ||
| 375 | - `stderr = failed to update feature_extraction_job=1 with expected_status=pending` | ||
| 376 | |||
| 377 | 这证明当前最小 worker contract 已经具备: | ||
| 340 | 378 | ||
| 341 | 还**不是**真实特征抽取吞吐验证。 | 379 | - 基础 claim guard |
| 380 | - 基础重复执行保护 | ||
| 342 | - `type_7` | 381 | - `type_7` |
| 343 | 382 | ||
| 344 | 因此: | 383 | 因此: | ... | ... |
| ... | @@ -189,12 +189,13 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -189,12 +189,13 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 | 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 |
| 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` | 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` |
| 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 | 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 |
| 192 | - 最新 live worker 证据表明:`phase1_hot_reference_v1` 当前在 `acr_test` 里还没有实际 members,所以 dry-run 已验证状态机,但 scope 计数仍是 `0` | 192 | - `phase1_hot_reference_v1` 在 `acr_test` 里已经真实补齐 `20` 个 reference members,因此 worker dry-run 当前看到的 scope 已是 `20 recordings / 20 assets / 20 windows` |
| 193 | - worker contract 现在已有基础前置状态保护;重复执行同一 chromaprint dry-run job 会被 `expected_status=pending` 明确拒绝,证据见 `phase1_worker_double_claim_guard_report.json` | ||
| 193 | 194 | ||
| 194 | ### 未验证 / 仍是缺口 | 195 | ### 未验证 / 仍是缺口 |
| 195 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | 196 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** |
| 196 | - **worker 目前仍以 dry-run 为主,尚未写真实 `audio_fingerprint / audio_embedding`** | 197 | - **worker 目前仍以 dry-run 为主,尚未写真实 `audio_fingerprint / audio_embedding`** |
| 197 | - **未落 reference set 的真实业务数据** | 198 | - **还未落更大规模的生产 reference set 真实业务数据(当前仅验证了 `acr_test` 下的 20-song live members)** |
| 198 | - **未定义最终线上分数融合细则** | 199 | - **未定义最终线上分数融合细则** |
| 199 | - **type_8 / type_16 还没有进入当前 live JSONL 的 PostgreSQL 实测链** | 200 | - **type_8 / type_16 还没有进入当前 live JSONL 的 PostgreSQL 实测链** |
| 200 | - **当前容器里缺少 `/workspace/downloads`,因此暂时无法直接从业务样本目录继续补 type_8 / type_16 live query** | 201 | - **当前容器里缺少 `/workspace/downloads`,因此暂时无法直接从业务样本目录继续补 type_8 / type_16 live query** | ... | ... |
-
Please register or sign in to post a comment