Make Phase-1 extraction jobs executable through PostgreSQL workers
Constraint: Phase-1 must stay encoder-only and use PostgreSQL as the orchestration/state plane before real extractor inference lands. Rejected: implement real MERT/MuQ inference first | rejected because planner/job/state contracts were not yet executable or verified end-to-end. Confidence: high Scope-risk: moderate Directive: preserve the worker job contract and replace dry-run incrementally with real fingerprint/embedding writes. Tested: py_compile for new workers and planner; live PostgreSQL dry-run for chromaprint job 1 and mert job 2; planner report regeneration; bootstrap restore to pending; git diff --check. Not-tested: real chromaprint extraction, real MERT/MuQ/ECAPA embedding writes, failed-job retry handling.
Showing
15 changed files
with
1076 additions
and
33 deletions
| ... | @@ -12,7 +12,7 @@ | ... | @@ -12,7 +12,7 @@ |
| 12 | "hop_sec": 2.5, | 12 | "hop_sec": 2.5, |
| 13 | "target_scope": "reference_set:phase1_hot_reference_v1", | 13 | "target_scope": "reference_set:phase1_hot_reference_v1", |
| 14 | "job_status": "pending", | 14 | "job_status": "pending", |
| 15 | "operation": "inserted" | 15 | "operation": "reused" |
| 16 | }, | 16 | }, |
| 17 | { | 17 | { |
| 18 | "extraction_job_id": 2, | 18 | "extraction_job_id": 2, |
| ... | @@ -24,7 +24,7 @@ | ... | @@ -24,7 +24,7 @@ |
| 24 | "hop_sec": 2.5, | 24 | "hop_sec": 2.5, |
| 25 | "target_scope": "reference_set:phase1_hot_reference_v1", | 25 | "target_scope": "reference_set:phase1_hot_reference_v1", |
| 26 | "job_status": "pending", | 26 | "job_status": "pending", |
| 27 | "operation": "inserted" | 27 | "operation": "reused" |
| 28 | }, | 28 | }, |
| 29 | { | 29 | { |
| 30 | "extraction_job_id": 3, | 30 | "extraction_job_id": 3, |
| ... | @@ -36,7 +36,7 @@ | ... | @@ -36,7 +36,7 @@ |
| 36 | "hop_sec": 5.0, | 36 | "hop_sec": 5.0, |
| 37 | "target_scope": "reference_set:phase1_hot_reference_v1", | 37 | "target_scope": "reference_set:phase1_hot_reference_v1", |
| 38 | "job_status": "pending", | 38 | "job_status": "pending", |
| 39 | "operation": "inserted" | 39 | "operation": "reused" |
| 40 | }, | 40 | }, |
| 41 | { | 41 | { |
| 42 | "extraction_job_id": 4, | 42 | "extraction_job_id": 4, |
| ... | @@ -48,7 +48,7 @@ | ... | @@ -48,7 +48,7 @@ |
| 48 | "hop_sec": 2.5, | 48 | "hop_sec": 2.5, |
| 49 | "target_scope": "reference_set:phase1_hot_reference_v1", | 49 | "target_scope": "reference_set:phase1_hot_reference_v1", |
| 50 | "job_status": "pending", | 50 | "job_status": "pending", |
| 51 | "operation": "inserted" | 51 | "operation": "reused" |
| 52 | }, | 52 | }, |
| 53 | { | 53 | { |
| 54 | "extraction_job_id": 5, | 54 | "extraction_job_id": 5, |
| ... | @@ -60,7 +60,7 @@ | ... | @@ -60,7 +60,7 @@ |
| 60 | "hop_sec": 2.5, | 60 | "hop_sec": 2.5, |
| 61 | "target_scope": "reference_set:phase1_hot_reference_v1", | 61 | "target_scope": "reference_set:phase1_hot_reference_v1", |
| 62 | "job_status": "pending", | 62 | "job_status": "pending", |
| 63 | "operation": "inserted" | 63 | "operation": "reused" |
| 64 | } | 64 | } |
| 65 | ], | 65 | ], |
| 66 | "counts": { | 66 | "counts": { | ... | ... |
| ... | @@ -51,8 +51,8 @@ | ... | @@ -51,8 +51,8 @@ |
| 51 | "target scope: reference_set:phase1_hot_reference_v1" | 51 | "target scope: reference_set:phase1_hot_reference_v1" |
| 52 | ], | 52 | ], |
| 53 | "command_suggestions": [ | 53 | "command_suggestions": [ |
| 54 | "EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py", | 54 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py", |
| 55 | "EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 55 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 56 | ] | 56 | ] |
| 57 | }, | 57 | }, |
| 58 | { | 58 | { |
| ... | @@ -96,8 +96,8 @@ | ... | @@ -96,8 +96,8 @@ |
| 96 | "target scope: reference_set:phase1_hot_reference_v1" | 96 | "target scope: reference_set:phase1_hot_reference_v1" |
| 97 | ], | 97 | ], |
| 98 | "command_suggestions": [ | 98 | "command_suggestions": [ |
| 99 | "EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 99 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 100 | "EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 100 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 101 | ] | 101 | ] |
| 102 | }, | 102 | }, |
| 103 | { | 103 | { |
| ... | @@ -141,8 +141,8 @@ | ... | @@ -141,8 +141,8 @@ |
| 141 | "target scope: reference_set:phase1_hot_reference_v1" | 141 | "target scope: reference_set:phase1_hot_reference_v1" |
| 142 | ], | 142 | ], |
| 143 | "command_suggestions": [ | 143 | "command_suggestions": [ |
| 144 | "EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 144 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 145 | "EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 145 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 146 | ] | 146 | ] |
| 147 | }, | 147 | }, |
| 148 | { | 148 | { |
| ... | @@ -186,8 +186,8 @@ | ... | @@ -186,8 +186,8 @@ |
| 186 | "target scope: reference_set:phase1_hot_reference_v1" | 186 | "target scope: reference_set:phase1_hot_reference_v1" |
| 187 | ], | 187 | ], |
| 188 | "command_suggestions": [ | 188 | "command_suggestions": [ |
| 189 | "EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 189 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 190 | "EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 190 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 191 | ] | 191 | ] |
| 192 | }, | 192 | }, |
| 193 | { | 193 | { |
| ... | @@ -231,8 +231,8 @@ | ... | @@ -231,8 +231,8 @@ |
| 231 | "target scope: reference_set:phase1_hot_reference_v1" | 231 | "target scope: reference_set:phase1_hot_reference_v1" |
| 232 | ], | 232 | ], |
| 233 | "command_suggestions": [ | 233 | "command_suggestions": [ |
| 234 | "EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 234 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 235 | "EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 235 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 236 | ] | 236 | ] |
| 237 | } | 237 | } |
| 238 | ], | 238 | ], |
| ... | @@ -279,8 +279,8 @@ | ... | @@ -279,8 +279,8 @@ |
| 279 | "target scope: reference_set:phase1_hot_reference_v1" | 279 | "target scope: reference_set:phase1_hot_reference_v1" |
| 280 | ], | 280 | ], |
| 281 | "command_suggestions": [ | 281 | "command_suggestions": [ |
| 282 | "EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py", | 282 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py", |
| 283 | "EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 283 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 284 | ] | 284 | ] |
| 285 | } | 285 | } |
| 286 | ], | 286 | ], |
| ... | @@ -326,8 +326,8 @@ | ... | @@ -326,8 +326,8 @@ |
| 326 | "target scope: reference_set:phase1_hot_reference_v1" | 326 | "target scope: reference_set:phase1_hot_reference_v1" |
| 327 | ], | 327 | ], |
| 328 | "command_suggestions": [ | 328 | "command_suggestions": [ |
| 329 | "EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 329 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 330 | "EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 330 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 331 | ] | 331 | ] |
| 332 | }, | 332 | }, |
| 333 | { | 333 | { |
| ... | @@ -371,8 +371,8 @@ | ... | @@ -371,8 +371,8 @@ |
| 371 | "target scope: reference_set:phase1_hot_reference_v1" | 371 | "target scope: reference_set:phase1_hot_reference_v1" |
| 372 | ], | 372 | ], |
| 373 | "command_suggestions": [ | 373 | "command_suggestions": [ |
| 374 | "EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 374 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 375 | "EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 375 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 376 | ] | 376 | ] |
| 377 | }, | 377 | }, |
| 378 | { | 378 | { |
| ... | @@ -416,8 +416,8 @@ | ... | @@ -416,8 +416,8 @@ |
| 416 | "target scope: reference_set:phase1_hot_reference_v1" | 416 | "target scope: reference_set:phase1_hot_reference_v1" |
| 417 | ], | 417 | ], |
| 418 | "command_suggestions": [ | 418 | "command_suggestions": [ |
| 419 | "EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 419 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 420 | "EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 420 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 421 | ] | 421 | ] |
| 422 | }, | 422 | }, |
| 423 | { | 423 | { |
| ... | @@ -461,8 +461,8 @@ | ... | @@ -461,8 +461,8 @@ |
| 461 | "target scope: reference_set:phase1_hot_reference_v1" | 461 | "target scope: reference_set:phase1_hot_reference_v1" |
| 462 | ], | 462 | ], |
| 463 | "command_suggestions": [ | 463 | "command_suggestions": [ |
| 464 | "EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", | 464 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py", |
| 465 | "EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" | 465 | "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test \\\npython workers/mark_job_status.py --status running" |
| 466 | ] | 466 | ] |
| 467 | } | 467 | } |
| 468 | ] | 468 | ] |
| ... | @@ -477,7 +477,7 @@ | ... | @@ -477,7 +477,7 @@ |
| 477 | "window_sec": 5.0, | 477 | "window_sec": 5.0, |
| 478 | "hop_sec": 2.5, | 478 | "hop_sec": 2.5, |
| 479 | "physical_target": "audio_fingerprint", | 479 | "physical_target": "audio_fingerprint", |
| 480 | "primary_command": "EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py" | 480 | "primary_command": "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=1 FEATURE_SET_ID=2 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test OUTPUT_TARGET=audio_fingerprint \\\npython workers/run_chromaprint_job.py" |
| 481 | }, | 481 | }, |
| 482 | { | 482 | { |
| 483 | "order": 2, | 483 | "order": 2, |
| ... | @@ -488,7 +488,7 @@ | ... | @@ -488,7 +488,7 @@ |
| 488 | "window_sec": 5.0, | 488 | "window_sec": 5.0, |
| 489 | "hop_sec": 2.5, | 489 | "hop_sec": 2.5, |
| 490 | "physical_target": "audio_embedding", | 490 | "physical_target": "audio_embedding", |
| 491 | "primary_command": "EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" | 491 | "primary_command": "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=2 FEATURE_SET_ID=3 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" |
| 492 | }, | 492 | }, |
| 493 | { | 493 | { |
| 494 | "order": 3, | 494 | "order": 3, |
| ... | @@ -499,7 +499,7 @@ | ... | @@ -499,7 +499,7 @@ |
| 499 | "window_sec": 10.0, | 499 | "window_sec": 10.0, |
| 500 | "hop_sec": 5.0, | 500 | "hop_sec": 5.0, |
| 501 | "physical_target": "audio_embedding", | 501 | "physical_target": "audio_embedding", |
| 502 | "primary_command": "EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" | 502 | "primary_command": "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=3 FEATURE_SET_ID=4 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=mert MODEL_VERSION=v1-95m VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" |
| 503 | }, | 503 | }, |
| 504 | { | 504 | { |
| 505 | "order": 4, | 505 | "order": 4, |
| ... | @@ -510,7 +510,7 @@ | ... | @@ -510,7 +510,7 @@ |
| 510 | "window_sec": 5.0, | 510 | "window_sec": 5.0, |
| 511 | "hop_sec": 2.5, | 511 | "hop_sec": 2.5, |
| 512 | "physical_target": "audio_embedding", | 512 | "physical_target": "audio_embedding", |
| 513 | "primary_command": "EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" | 513 | "primary_command": "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=4 FEATURE_SET_ID=5 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=muq MODEL_VERSION=large-msd-iter VECTOR_TABLE=audio_embedding_vector_768 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" |
| 514 | }, | 514 | }, |
| 515 | { | 515 | { |
| 516 | "order": 5, | 516 | "order": 5, |
| ... | @@ -521,7 +521,7 @@ | ... | @@ -521,7 +521,7 @@ |
| 521 | "window_sec": 5.0, | 521 | "window_sec": 5.0, |
| 522 | "hop_sec": 2.5, | 522 | "hop_sec": 2.5, |
| 523 | "physical_target": "audio_embedding", | 523 | "physical_target": "audio_embedding", |
| 524 | "primary_command": "EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" | 524 | "primary_command": "PG_DSN=\"${PG_DSN:?set PG_DSN}\" EXTRACTION_JOB_ID=5 FEATURE_SET_ID=6 TARGET_SCOPE='reference_set:phase1_hot_reference_v1' PG_SCHEMA=acr_test MODEL_NAME=ecapa MODEL_VERSION=acr-baseline-v1 VECTOR_TABLE=audio_embedding_vector_192 OUTPUT_TARGET=audio_embedding \\\npython workers/run_embedding_job.py" |
| 525 | } | 525 | } |
| 526 | ] | 526 | ] |
| 527 | } | 527 | } |
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| 1 | { | ||
| 2 | "worker": "run_chromaprint_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 1, | ||
| 6 | "feature_set_id": 2, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/chromaprint/v1", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "exact", | ||
| 12 | "phase": "phase1", | ||
| 13 | "priority": "p0" | ||
| 14 | }, | ||
| 15 | "feature_name": "fingerprint_asset", | ||
| 16 | "feature_level": "asset", | ||
| 17 | "extraction_granularity": "full_asset", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": null, | ||
| 21 | "distance_metric": "hamming", | ||
| 22 | "feature_config": { | ||
| 23 | "lane": "exact", | ||
| 24 | "index_target": "audio_fingerprint" | ||
| 25 | }, | ||
| 26 | "model_id": 2, | ||
| 27 | "model_name": "chromaprint", | ||
| 28 | "model_version": "v1", | ||
| 29 | "model_family": "fingerprint", | ||
| 30 | "input_sample_rate": 16000, | ||
| 31 | "output_embedding_dim": null, | ||
| 32 | "model_metadata": { | ||
| 33 | "lane": "exact", | ||
| 34 | "note": "exact fingerprint lane baseline", | ||
| 35 | "phase": "phase1" | ||
| 36 | } | ||
| 37 | }, | ||
| 38 | "target_scope_summary": { | ||
| 39 | "scope_type": "reference_set", | ||
| 40 | "scope_value": "phase1_hot_reference_v1", | ||
| 41 | "reference_set_id": 2, | ||
| 42 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 43 | "recording_count": 0, | ||
| 44 | "ready_asset_count": 0, | ||
| 45 | "active_window_count": 0 | ||
| 46 | }, | ||
| 47 | "status_after_start": { | ||
| 48 | "extraction_job_id": 1, | ||
| 49 | "job_status": "running", | ||
| 50 | "input_count": 0, | ||
| 51 | "output_count": null, | ||
| 52 | "started_at": "2026-06-04T13:02:56.589356+08:00", | ||
| 53 | "finished_at": null, | ||
| 54 | "log_uri": null, | ||
| 55 | "metadata_json": { | ||
| 56 | "lane": "exact", | ||
| 57 | "phase": "phase1", | ||
| 58 | "worker": "run_chromaprint_job", | ||
| 59 | "dry_run": true, | ||
| 60 | "priority": "p0", | ||
| 61 | "output_target": "audio_fingerprint", | ||
| 62 | "execution_mode": "dry_run", | ||
| 63 | "target_scope_summary": { | ||
| 64 | "scope_type": "reference_set", | ||
| 65 | "scope_value": "phase1_hot_reference_v1", | ||
| 66 | "recording_count": 0, | ||
| 67 | "reference_set_id": 2, | ||
| 68 | "ready_asset_count": 0, | ||
| 69 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 70 | "active_window_count": 0 | ||
| 71 | } | ||
| 72 | } | ||
| 73 | }, | ||
| 74 | "status_after_complete": { | ||
| 75 | "extraction_job_id": 1, | ||
| 76 | "job_status": "completed", | ||
| 77 | "input_count": 0, | ||
| 78 | "output_count": 0, | ||
| 79 | "started_at": "2026-06-04T13:02:56.589356+08:00", | ||
| 80 | "finished_at": "2026-06-04T13:02:56.591597+08:00", | ||
| 81 | "log_uri": null, | ||
| 82 | "metadata_json": { | ||
| 83 | "lane": "exact", | ||
| 84 | "phase": "phase1", | ||
| 85 | "worker": "run_chromaprint_job", | ||
| 86 | "dry_run": true, | ||
| 87 | "priority": "p0", | ||
| 88 | "output_target": "audio_fingerprint", | ||
| 89 | "dry_run_result": "completed_without_feature_write", | ||
| 90 | "execution_mode": "dry_run", | ||
| 91 | "write_target_table": "audio_fingerprint", | ||
| 92 | "target_scope_summary": { | ||
| 93 | "scope_type": "reference_set", | ||
| 94 | "scope_value": "phase1_hot_reference_v1", | ||
| 95 | "recording_count": 0, | ||
| 96 | "reference_set_id": 2, | ||
| 97 | "ready_asset_count": 0, | ||
| 98 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 99 | "active_window_count": 0 | ||
| 100 | } | ||
| 101 | } | ||
| 102 | }, | ||
| 103 | "next_write_target": "audio_fingerprint", | ||
| 104 | "notes": [ | ||
| 105 | "this worker currently validates planner -> job -> PostgreSQL state flow", | ||
| 106 | "real chromaprint extraction can replace dry_run while preserving the same job contract" | ||
| 107 | ] | ||
| 108 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "run_embedding_job", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "job": { | ||
| 5 | "extraction_job_id": 2, | ||
| 6 | "feature_set_id": 3, | ||
| 7 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 8 | "job_status": "pending", | ||
| 9 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 10 | "job_metadata": { | ||
| 11 | "lane": "semantic", | ||
| 12 | "role": "primary_baseline", | ||
| 13 | "phase": "phase1" | ||
| 14 | }, | ||
| 15 | "feature_name": "semantic_embedding", | ||
| 16 | "feature_level": "window", | ||
| 17 | "extraction_granularity": "sliding_window", | ||
| 18 | "window_sec": 5.0, | ||
| 19 | "hop_sec": 2.5, | ||
| 20 | "embedding_dim": 768, | ||
| 21 | "distance_metric": "cosine", | ||
| 22 | "feature_config": { | ||
| 23 | "role": "primary_semantic_baseline" | ||
| 24 | }, | ||
| 25 | "model_id": 3, | ||
| 26 | "model_name": "mert", | ||
| 27 | "model_version": "v1-95m", | ||
| 28 | "model_family": "music_ssl", | ||
| 29 | "input_sample_rate": 24000, | ||
| 30 | "output_embedding_dim": 768, | ||
| 31 | "model_metadata": { | ||
| 32 | "lane": "semantic", | ||
| 33 | "role": "primary_baseline", | ||
| 34 | "phase": "phase1" | ||
| 35 | } | ||
| 36 | }, | ||
| 37 | "target_scope_summary": { | ||
| 38 | "scope_type": "reference_set", | ||
| 39 | "scope_value": "phase1_hot_reference_v1", | ||
| 40 | "reference_set_id": 2, | ||
| 41 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 42 | "recording_count": 0, | ||
| 43 | "ready_asset_count": 0, | ||
| 44 | "active_window_count": 0 | ||
| 45 | }, | ||
| 46 | "status_after_start": { | ||
| 47 | "extraction_job_id": 2, | ||
| 48 | "job_status": "running", | ||
| 49 | "input_count": 0, | ||
| 50 | "output_count": null, | ||
| 51 | "started_at": "2026-06-04T13:02:56.714882+08:00", | ||
| 52 | "finished_at": null, | ||
| 53 | "log_uri": null, | ||
| 54 | "metadata_json": { | ||
| 55 | "lane": "semantic", | ||
| 56 | "role": "primary_baseline", | ||
| 57 | "phase": "phase1", | ||
| 58 | "worker": "run_embedding_job", | ||
| 59 | "dry_run": true, | ||
| 60 | "vector_table": "audio_embedding_vector_768", | ||
| 61 | "output_target": "audio_embedding", | ||
| 62 | "execution_mode": "dry_run", | ||
| 63 | "target_scope_summary": { | ||
| 64 | "scope_type": "reference_set", | ||
| 65 | "scope_value": "phase1_hot_reference_v1", | ||
| 66 | "recording_count": 0, | ||
| 67 | "reference_set_id": 2, | ||
| 68 | "ready_asset_count": 0, | ||
| 69 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 70 | "active_window_count": 0 | ||
| 71 | } | ||
| 72 | } | ||
| 73 | }, | ||
| 74 | "status_after_complete": { | ||
| 75 | "extraction_job_id": 2, | ||
| 76 | "job_status": "completed", | ||
| 77 | "input_count": 0, | ||
| 78 | "output_count": 0, | ||
| 79 | "started_at": "2026-06-04T13:02:56.714882+08:00", | ||
| 80 | "finished_at": "2026-06-04T13:02:56.715469+08:00", | ||
| 81 | "log_uri": null, | ||
| 82 | "metadata_json": { | ||
| 83 | "lane": "semantic", | ||
| 84 | "role": "primary_baseline", | ||
| 85 | "phase": "phase1", | ||
| 86 | "worker": "run_embedding_job", | ||
| 87 | "dry_run": true, | ||
| 88 | "vector_table": "audio_embedding_vector_768", | ||
| 89 | "output_target": "audio_embedding", | ||
| 90 | "dry_run_result": "completed_without_feature_write", | ||
| 91 | "execution_mode": "dry_run", | ||
| 92 | "write_target_table": "audio_embedding", | ||
| 93 | "target_scope_summary": { | ||
| 94 | "scope_type": "reference_set", | ||
| 95 | "scope_value": "phase1_hot_reference_v1", | ||
| 96 | "recording_count": 0, | ||
| 97 | "reference_set_id": 2, | ||
| 98 | "ready_asset_count": 0, | ||
| 99 | "reference_set_name": "phase1_hot_reference_v1", | ||
| 100 | "active_window_count": 0 | ||
| 101 | } | ||
| 102 | } | ||
| 103 | }, | ||
| 104 | "resolved_vector_table": "audio_embedding_vector_768", | ||
| 105 | "notes": [ | ||
| 106 | "this worker currently validates planner -> job -> PostgreSQL state flow", | ||
| 107 | "real encoder inference can replace dry_run while preserving the same job contract" | ||
| 108 | ] | ||
| 109 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "worker": "mark_job_status", | ||
| 3 | "schema": "acr_test", | ||
| 4 | "update": { | ||
| 5 | "extraction_job_id": 1, | ||
| 6 | "job_status": "pending", | ||
| 7 | "input_count": 0, | ||
| 8 | "output_count": 0, | ||
| 9 | "started_at": "2026-06-04T13:02:56.589356+08:00", | ||
| 10 | "finished_at": "2026-06-04T13:02:56.591597+08:00", | ||
| 11 | "log_uri": null, | ||
| 12 | "metadata_json": { | ||
| 13 | "lane": "exact", | ||
| 14 | "phase": "phase1", | ||
| 15 | "priority": "p0" | ||
| 16 | } | ||
| 17 | } | ||
| 18 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| ... | @@ -26,7 +26,13 @@ def parse_target_scope(target_scope: str) -> dict[str, Any]: | ... | @@ -26,7 +26,13 @@ def parse_target_scope(target_scope: str) -> dict[str, Any]: |
| 26 | 26 | ||
| 27 | 27 | ||
| 28 | def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: | 28 | def build_command_suggestions(job: dict[str, Any], schema: str) -> list[str]: |
| 29 | base_env = f"EXTRACTION_JOB_ID={job['extraction_job_id']} FEATURE_SET_ID={job['feature_set_id']} TARGET_SCOPE='{job['target_scope']}' PG_SCHEMA={schema}" | 29 | base_env = ( |
| 30 | 'PG_DSN="${PG_DSN:?set PG_DSN}" ' | ||
| 31 | f"EXTRACTION_JOB_ID={job['extraction_job_id']} " | ||
| 32 | f"FEATURE_SET_ID={job['feature_set_id']} " | ||
| 33 | f"TARGET_SCOPE='{job['target_scope']}' " | ||
| 34 | f"PG_SCHEMA={schema}" | ||
| 35 | ) | ||
| 30 | commands = [] | 36 | commands = [] |
| 31 | if job['lane'] == 'exact': | 37 | if job['lane'] == 'exact': |
| 32 | commands.append( | 38 | commands.append( | ... | ... |
acr-engine/workers/_job_common.py
0 → 100644
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import json | ||
| 4 | import os | ||
| 5 | import re | ||
| 6 | from dataclasses import dataclass | ||
| 7 | from pathlib import Path | ||
| 8 | from typing import Any | ||
| 9 | |||
| 10 | import psycopg | ||
| 11 | |||
| 12 | SCHEMA_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$') | ||
| 13 | |||
| 14 | |||
| 15 | @dataclass | ||
| 16 | class JobContext: | ||
| 17 | extraction_job_id: int | ||
| 18 | feature_set_id: int | ||
| 19 | target_scope: str | ||
| 20 | job_status: str | ||
| 21 | shard_key: str | None | ||
| 22 | job_metadata: dict[str, Any] | ||
| 23 | feature_name: str | ||
| 24 | feature_level: str | ||
| 25 | extraction_granularity: str | ||
| 26 | window_sec: float | None | ||
| 27 | hop_sec: float | None | ||
| 28 | embedding_dim: int | None | ||
| 29 | distance_metric: str | ||
| 30 | feature_config: dict[str, Any] | ||
| 31 | model_id: int | ||
| 32 | model_name: str | ||
| 33 | model_version: str | ||
| 34 | model_family: str | ||
| 35 | input_sample_rate: int | None | ||
| 36 | output_embedding_dim: int | None | ||
| 37 | model_metadata: dict[str, Any] | ||
| 38 | |||
| 39 | |||
| 40 | def require_env(name: str, default: str | None = None) -> str: | ||
| 41 | value = os.environ.get(name, default) | ||
| 42 | if value is None or value == '': | ||
| 43 | raise SystemExit(f'missing required env: {name}') | ||
| 44 | return value | ||
| 45 | |||
| 46 | |||
| 47 | def validate_schema(schema: str) -> str: | ||
| 48 | if not SCHEMA_RE.match(schema): | ||
| 49 | raise SystemExit(f'invalid schema name: {schema}') | ||
| 50 | return schema | ||
| 51 | |||
| 52 | |||
| 53 | def ensure_output_parent(path: str | None) -> Path | None: | ||
| 54 | if not path: | ||
| 55 | return None | ||
| 56 | output = Path(path) | ||
| 57 | output.parent.mkdir(parents=True, exist_ok=True) | ||
| 58 | return output | ||
| 59 | |||
| 60 | |||
| 61 | def connect(dsn: str, schema: str, *, autocommit: bool = True) -> psycopg.Connection: | ||
| 62 | conn = psycopg.connect(dsn, autocommit=autocommit) | ||
| 63 | conn.execute(f'SET search_path TO {validate_schema(schema)}, public;') | ||
| 64 | return conn | ||
| 65 | |||
| 66 | |||
| 67 | def fetch_job_context(conn: psycopg.Connection, extraction_job_id: int) -> JobContext: | ||
| 68 | row = conn.execute( | ||
| 69 | """ | ||
| 70 | SELECT | ||
| 71 | fej.extraction_job_id, | ||
| 72 | fej.feature_set_id, | ||
| 73 | fej.target_scope, | ||
| 74 | fej.job_status, | ||
| 75 | fej.shard_key, | ||
| 76 | fej.metadata_json, | ||
| 77 | fs.feature_name, | ||
| 78 | fs.feature_level, | ||
| 79 | fs.extraction_granularity, | ||
| 80 | fs.window_sec, | ||
| 81 | fs.hop_sec, | ||
| 82 | fs.embedding_dim, | ||
| 83 | fs.distance_metric, | ||
| 84 | fs.config_json, | ||
| 85 | mr.model_id, | ||
| 86 | mr.model_name, | ||
| 87 | mr.model_version, | ||
| 88 | mr.model_family, | ||
| 89 | mr.input_sample_rate, | ||
| 90 | mr.output_embedding_dim, | ||
| 91 | mr.metadata_json | ||
| 92 | FROM feature_extraction_job fej | ||
| 93 | JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id | ||
| 94 | JOIN model_registry mr ON mr.model_id = fs.model_id | ||
| 95 | WHERE fej.extraction_job_id = %s | ||
| 96 | LIMIT 1; | ||
| 97 | """, | ||
| 98 | (extraction_job_id,), | ||
| 99 | ).fetchone() | ||
| 100 | if not row: | ||
| 101 | raise SystemExit(f'feature_extraction_job not found: {extraction_job_id}') | ||
| 102 | return JobContext( | ||
| 103 | extraction_job_id=int(row[0]), | ||
| 104 | feature_set_id=int(row[1]), | ||
| 105 | target_scope=row[2], | ||
| 106 | job_status=row[3], | ||
| 107 | shard_key=row[4], | ||
| 108 | job_metadata=row[5] or {}, | ||
| 109 | feature_name=row[6], | ||
| 110 | feature_level=row[7], | ||
| 111 | extraction_granularity=row[8], | ||
| 112 | window_sec=float(row[9]) if row[9] is not None else None, | ||
| 113 | hop_sec=float(row[10]) if row[10] is not None else None, | ||
| 114 | embedding_dim=int(row[11]) if row[11] is not None else None, | ||
| 115 | distance_metric=row[12], | ||
| 116 | feature_config=row[13] or {}, | ||
| 117 | model_id=int(row[14]), | ||
| 118 | model_name=row[15], | ||
| 119 | model_version=row[16], | ||
| 120 | model_family=row[17], | ||
| 121 | input_sample_rate=int(row[18]) if row[18] is not None else None, | ||
| 122 | output_embedding_dim=int(row[19]) if row[19] is not None else None, | ||
| 123 | model_metadata=row[20] or {}, | ||
| 124 | ) | ||
| 125 | |||
| 126 | |||
| 127 | def parse_target_scope(target_scope: str) -> tuple[str, str]: | ||
| 128 | if ':' in target_scope: | ||
| 129 | scope_type, scope_value = target_scope.split(':', 1) | ||
| 130 | return scope_type, scope_value | ||
| 131 | return 'unknown', target_scope | ||
| 132 | |||
| 133 | |||
| 134 | def resolve_scope_summary(conn: psycopg.Connection, target_scope: str) -> dict[str, Any]: | ||
| 135 | scope_type, scope_value = parse_target_scope(target_scope) | ||
| 136 | if scope_type == 'reference_set': | ||
| 137 | row = conn.execute( | ||
| 138 | """ | ||
| 139 | SELECT | ||
| 140 | rs.reference_set_id, | ||
| 141 | rs.set_name, | ||
| 142 | count(DISTINCT rsm.recording_id) AS recording_count, | ||
| 143 | count(DISTINCT ra.asset_id) FILTER (WHERE ra.ingest_status = 'ready') AS ready_asset_count, | ||
| 144 | count(DISTINCT aw.window_id) FILTER (WHERE aw.active_for_index) AS active_window_count | ||
| 145 | FROM reference_set_registry rs | ||
| 146 | LEFT JOIN reference_set_member rsm ON rsm.reference_set_id = rs.reference_set_id | ||
| 147 | LEFT JOIN recording_asset ra ON ra.recording_id = rsm.recording_id | ||
| 148 | LEFT JOIN audio_window aw ON aw.recording_id = rsm.recording_id | ||
| 149 | WHERE rs.set_name = %s | ||
| 150 | GROUP BY rs.reference_set_id, rs.set_name | ||
| 151 | LIMIT 1; | ||
| 152 | """, | ||
| 153 | (scope_value,), | ||
| 154 | ).fetchone() | ||
| 155 | if not row: | ||
| 156 | raise SystemExit(f'reference set not found for target_scope={target_scope}') | ||
| 157 | return { | ||
| 158 | 'scope_type': scope_type, | ||
| 159 | 'scope_value': scope_value, | ||
| 160 | 'reference_set_id': int(row[0]), | ||
| 161 | 'reference_set_name': row[1], | ||
| 162 | 'recording_count': int(row[2]), | ||
| 163 | 'ready_asset_count': int(row[3]), | ||
| 164 | 'active_window_count': int(row[4]), | ||
| 165 | } | ||
| 166 | return { | ||
| 167 | 'scope_type': scope_type, | ||
| 168 | 'scope_value': scope_value, | ||
| 169 | 'recording_count': 0, | ||
| 170 | 'ready_asset_count': 0, | ||
| 171 | 'active_window_count': 0, | ||
| 172 | } | ||
| 173 | |||
| 174 | |||
| 175 | def update_job_status( | ||
| 176 | conn: psycopg.Connection, | ||
| 177 | extraction_job_id: int, | ||
| 178 | *, | ||
| 179 | status: str, | ||
| 180 | input_count: int | None = None, | ||
| 181 | output_count: int | None = None, | ||
| 182 | log_uri: str | None = None, | ||
| 183 | metadata_patch: dict[str, Any] | None = None, | ||
| 184 | set_started_at: bool = False, | ||
| 185 | set_finished_at: bool = False, | ||
| 186 | ) -> dict[str, Any]: | ||
| 187 | patch = json.dumps(metadata_patch or {}, ensure_ascii=False) | ||
| 188 | row = conn.execute( | ||
| 189 | """ | ||
| 190 | UPDATE feature_extraction_job | ||
| 191 | SET job_status = %s, | ||
| 192 | input_count = COALESCE(%s, input_count), | ||
| 193 | output_count = COALESCE(%s, output_count), | ||
| 194 | log_uri = COALESCE(%s, log_uri), | ||
| 195 | started_at = CASE | ||
| 196 | WHEN %s THEN COALESCE(started_at, NOW()) | ||
| 197 | ELSE started_at | ||
| 198 | END, | ||
| 199 | finished_at = CASE | ||
| 200 | WHEN %s THEN NOW() | ||
| 201 | ELSE finished_at | ||
| 202 | END, | ||
| 203 | metadata_json = COALESCE(metadata_json, '{}'::jsonb) || %s::jsonb | ||
| 204 | WHERE extraction_job_id = %s | ||
| 205 | RETURNING extraction_job_id, job_status, input_count, output_count, started_at, finished_at, log_uri, metadata_json; | ||
| 206 | """, | ||
| 207 | ( | ||
| 208 | status, | ||
| 209 | input_count, | ||
| 210 | output_count, | ||
| 211 | log_uri, | ||
| 212 | set_started_at, | ||
| 213 | set_finished_at, | ||
| 214 | patch, | ||
| 215 | extraction_job_id, | ||
| 216 | ), | ||
| 217 | ).fetchone() | ||
| 218 | if not row: | ||
| 219 | raise SystemExit(f'failed to update feature_extraction_job={extraction_job_id}') | ||
| 220 | return { | ||
| 221 | 'extraction_job_id': int(row[0]), | ||
| 222 | 'job_status': row[1], | ||
| 223 | 'input_count': int(row[2]) if row[2] is not None else None, | ||
| 224 | 'output_count': int(row[3]) if row[3] is not None else None, | ||
| 225 | 'started_at': row[4].isoformat() if row[4] is not None else None, | ||
| 226 | 'finished_at': row[5].isoformat() if row[5] is not None else None, | ||
| 227 | 'log_uri': row[6], | ||
| 228 | 'metadata_json': row[7] or {}, | ||
| 229 | } | ||
| 230 | |||
| 231 | |||
| 232 | def emit_payload(payload: dict[str, Any], output: str | None) -> None: | ||
| 233 | text = json.dumps(payload, ensure_ascii=False, indent=2) | ||
| 234 | if output: | ||
| 235 | target = ensure_output_parent(output) | ||
| 236 | assert target is not None | ||
| 237 | target.write_text(text, encoding='utf-8') | ||
| 238 | print(text) |
acr-engine/workers/mark_job_status.py
0 → 100644
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | import os | ||
| 7 | |||
| 8 | from _job_common import connect, emit_payload, require_env, update_job_status | ||
| 9 | |||
| 10 | |||
| 11 | def main() -> None: | ||
| 12 | ap = argparse.ArgumentParser() | ||
| 13 | ap.add_argument('--dsn', default=os.environ.get('PG_DSN')) | ||
| 14 | ap.add_argument('--schema', default=os.environ.get('PG_SCHEMA', 'acr_test')) | ||
| 15 | ap.add_argument('--job-id', type=int, default=int(require_env('EXTRACTION_JOB_ID', '0'))) | ||
| 16 | ap.add_argument('--status', required=True) | ||
| 17 | ap.add_argument('--input-count', type=int) | ||
| 18 | ap.add_argument('--output-count', type=int) | ||
| 19 | ap.add_argument('--log-uri') | ||
| 20 | ap.add_argument('--metadata-json') | ||
| 21 | ap.add_argument('--set-started-at', action='store_true') | ||
| 22 | ap.add_argument('--set-finished-at', action='store_true') | ||
| 23 | ap.add_argument('--output') | ||
| 24 | args = ap.parse_args() | ||
| 25 | |||
| 26 | if not args.dsn: | ||
| 27 | raise SystemExit('missing --dsn or PG_DSN') | ||
| 28 | if not args.job_id: | ||
| 29 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') | ||
| 30 | |||
| 31 | metadata_patch = json.loads(args.metadata_json) if args.metadata_json else {} | ||
| 32 | |||
| 33 | with connect(args.dsn, args.schema) as conn: | ||
| 34 | updated = update_job_status( | ||
| 35 | conn, | ||
| 36 | args.job_id, | ||
| 37 | status=args.status, | ||
| 38 | input_count=args.input_count, | ||
| 39 | output_count=args.output_count, | ||
| 40 | log_uri=args.log_uri, | ||
| 41 | metadata_patch=metadata_patch, | ||
| 42 | set_started_at=args.set_started_at, | ||
| 43 | set_finished_at=args.set_finished_at, | ||
| 44 | ) | ||
| 45 | |||
| 46 | emit_payload( | ||
| 47 | { | ||
| 48 | 'worker': 'mark_job_status', | ||
| 49 | 'schema': args.schema, | ||
| 50 | 'update': updated, | ||
| 51 | }, | ||
| 52 | args.output, | ||
| 53 | ) | ||
| 54 | |||
| 55 | |||
| 56 | if __name__ == '__main__': | ||
| 57 | main() |
acr-engine/workers/run_chromaprint_job.py
0 → 100644
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import os | ||
| 6 | |||
| 7 | from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status | ||
| 8 | |||
| 9 | |||
| 10 | def main() -> None: | ||
| 11 | ap = argparse.ArgumentParser() | ||
| 12 | ap.add_argument('--dsn', default=os.environ.get('PG_DSN')) | ||
| 13 | ap.add_argument('--schema', default=os.environ.get('PG_SCHEMA', 'acr_test')) | ||
| 14 | ap.add_argument('--job-id', type=int, default=int(os.environ.get('EXTRACTION_JOB_ID', '0'))) | ||
| 15 | ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_fingerprint')) | ||
| 16 | ap.add_argument('--complete-dry-run', action='store_true') | ||
| 17 | ap.add_argument('--output') | ||
| 18 | args = ap.parse_args() | ||
| 19 | |||
| 20 | if not args.dsn: | ||
| 21 | raise SystemExit('missing --dsn or PG_DSN') | ||
| 22 | if not args.job_id: | ||
| 23 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') | ||
| 24 | |||
| 25 | with connect(args.dsn, args.schema) as conn: | ||
| 26 | job = fetch_job_context(conn, args.job_id) | ||
| 27 | if job.model_name != 'chromaprint': | ||
| 28 | raise SystemExit(f'feature_extraction_job={args.job_id} is not a chromaprint job') | ||
| 29 | scope = resolve_scope_summary(conn, job.target_scope) | ||
| 30 | running = update_job_status( | ||
| 31 | conn, | ||
| 32 | job.extraction_job_id, | ||
| 33 | status='running', | ||
| 34 | input_count=scope['ready_asset_count'], | ||
| 35 | metadata_patch={ | ||
| 36 | 'worker': 'run_chromaprint_job', | ||
| 37 | 'output_target': args.output_target, | ||
| 38 | 'dry_run': True, | ||
| 39 | 'target_scope_summary': scope, | ||
| 40 | 'execution_mode': 'dry_run', | ||
| 41 | }, | ||
| 42 | set_started_at=True, | ||
| 43 | ) | ||
| 44 | completed = None | ||
| 45 | if args.complete_dry_run: | ||
| 46 | completed = update_job_status( | ||
| 47 | conn, | ||
| 48 | job.extraction_job_id, | ||
| 49 | status='completed', | ||
| 50 | output_count=0, | ||
| 51 | metadata_patch={ | ||
| 52 | 'worker': 'run_chromaprint_job', | ||
| 53 | 'output_target': args.output_target, | ||
| 54 | 'dry_run': True, | ||
| 55 | 'dry_run_result': 'completed_without_feature_write', | ||
| 56 | 'write_target_table': 'audio_fingerprint', | ||
| 57 | }, | ||
| 58 | set_finished_at=True, | ||
| 59 | ) | ||
| 60 | |||
| 61 | emit_payload( | ||
| 62 | { | ||
| 63 | 'worker': 'run_chromaprint_job', | ||
| 64 | 'schema': args.schema, | ||
| 65 | 'job': job.__dict__, | ||
| 66 | 'target_scope_summary': scope, | ||
| 67 | 'status_after_start': running, | ||
| 68 | 'status_after_complete': completed, | ||
| 69 | 'next_write_target': 'audio_fingerprint', | ||
| 70 | 'notes': [ | ||
| 71 | 'this worker currently validates planner -> job -> PostgreSQL state flow', | ||
| 72 | 'real chromaprint extraction can replace dry_run while preserving the same job contract', | ||
| 73 | ], | ||
| 74 | }, | ||
| 75 | args.output, | ||
| 76 | ) | ||
| 77 | |||
| 78 | |||
| 79 | if __name__ == '__main__': | ||
| 80 | main() |
acr-engine/workers/run_embedding_job.py
0 → 100644
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import os | ||
| 6 | |||
| 7 | from _job_common import connect, emit_payload, fetch_job_context, resolve_scope_summary, update_job_status | ||
| 8 | |||
| 9 | VECTOR_TABLE_BY_DIM = { | ||
| 10 | 192: 'audio_embedding_vector_192', | ||
| 11 | 768: 'audio_embedding_vector_768', | ||
| 12 | } | ||
| 13 | |||
| 14 | |||
| 15 | def main() -> None: | ||
| 16 | ap = argparse.ArgumentParser() | ||
| 17 | ap.add_argument('--dsn', default=os.environ.get('PG_DSN')) | ||
| 18 | ap.add_argument('--schema', default=os.environ.get('PG_SCHEMA', 'acr_test')) | ||
| 19 | ap.add_argument('--job-id', type=int, default=int(os.environ.get('EXTRACTION_JOB_ID', '0'))) | ||
| 20 | ap.add_argument('--model-name', default=os.environ.get('MODEL_NAME')) | ||
| 21 | ap.add_argument('--model-version', default=os.environ.get('MODEL_VERSION')) | ||
| 22 | ap.add_argument('--vector-table', default=os.environ.get('VECTOR_TABLE')) | ||
| 23 | ap.add_argument('--output-target', default=os.environ.get('OUTPUT_TARGET', 'audio_embedding')) | ||
| 24 | ap.add_argument('--complete-dry-run', action='store_true') | ||
| 25 | ap.add_argument('--output') | ||
| 26 | args = ap.parse_args() | ||
| 27 | |||
| 28 | if not args.dsn: | ||
| 29 | raise SystemExit('missing --dsn or PG_DSN') | ||
| 30 | if not args.job_id: | ||
| 31 | raise SystemExit('missing --job-id or EXTRACTION_JOB_ID') | ||
| 32 | |||
| 33 | with connect(args.dsn, args.schema) as conn: | ||
| 34 | job = fetch_job_context(conn, args.job_id) | ||
| 35 | if job.model_name == 'chromaprint': | ||
| 36 | raise SystemExit(f'feature_extraction_job={args.job_id} is not an embedding job') | ||
| 37 | if args.model_name and job.model_name != args.model_name: | ||
| 38 | raise SystemExit(f'model mismatch: job={job.model_name} cli={args.model_name}') | ||
| 39 | if args.model_version and job.model_version != args.model_version: | ||
| 40 | raise SystemExit(f'model version mismatch: job={job.model_version} cli={args.model_version}') | ||
| 41 | resolved_vector_table = args.vector_table or VECTOR_TABLE_BY_DIM.get(job.embedding_dim or job.output_embedding_dim or -1) | ||
| 42 | scope = resolve_scope_summary(conn, job.target_scope) | ||
| 43 | running = update_job_status( | ||
| 44 | conn, | ||
| 45 | job.extraction_job_id, | ||
| 46 | status='running', | ||
| 47 | input_count=scope['active_window_count'] or scope['ready_asset_count'], | ||
| 48 | metadata_patch={ | ||
| 49 | 'worker': 'run_embedding_job', | ||
| 50 | 'output_target': args.output_target, | ||
| 51 | 'vector_table': resolved_vector_table, | ||
| 52 | 'dry_run': True, | ||
| 53 | 'target_scope_summary': scope, | ||
| 54 | 'execution_mode': 'dry_run', | ||
| 55 | }, | ||
| 56 | set_started_at=True, | ||
| 57 | ) | ||
| 58 | completed = None | ||
| 59 | if args.complete_dry_run: | ||
| 60 | completed = update_job_status( | ||
| 61 | conn, | ||
| 62 | job.extraction_job_id, | ||
| 63 | status='completed', | ||
| 64 | output_count=0, | ||
| 65 | metadata_patch={ | ||
| 66 | 'worker': 'run_embedding_job', | ||
| 67 | 'output_target': args.output_target, | ||
| 68 | 'vector_table': resolved_vector_table, | ||
| 69 | 'dry_run': True, | ||
| 70 | 'dry_run_result': 'completed_without_feature_write', | ||
| 71 | 'write_target_table': args.output_target, | ||
| 72 | }, | ||
| 73 | set_finished_at=True, | ||
| 74 | ) | ||
| 75 | |||
| 76 | emit_payload( | ||
| 77 | { | ||
| 78 | 'worker': 'run_embedding_job', | ||
| 79 | 'schema': args.schema, | ||
| 80 | 'job': job.__dict__, | ||
| 81 | 'target_scope_summary': scope, | ||
| 82 | 'status_after_start': running, | ||
| 83 | 'status_after_complete': completed, | ||
| 84 | 'resolved_vector_table': resolved_vector_table, | ||
| 85 | 'notes': [ | ||
| 86 | 'this worker currently validates planner -> job -> PostgreSQL state flow', | ||
| 87 | 'real encoder inference can replace dry_run while preserving the same job contract', | ||
| 88 | ], | ||
| 89 | }, | ||
| 90 | args.output, | ||
| 91 | ) | ||
| 92 | |||
| 93 | |||
| 94 | if __name__ == '__main__': | ||
| 95 | main() |
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 [Phase-1 Worker Contract](./phase1-worker-contract.md) 与 `acr-engine/workers/_job_common.py`、`mark_job_status.py`、`run_chromaprint_job.py`、`run_embedding_job.py`,把 Phase-1 从“只有 planner 命令模板”推进到“worker 可以真实消费 PostgreSQL 的 `feature_extraction_job` 并执行 `pending -> running -> completed` dry-run 状态流转”的阶段。 | ||
| 4 | - 新增 `phase1_worker_chromaprint_dry_run.json`、`phase1_worker_embedding_dry_run.json` 与 `phase1_worker_mark_pending_report.json`,并在 live PostgreSQL `acr_test` 上验证了 worker 状态流转;同时确认当前 `phase1_hot_reference_v1` 还没有实际 members,因此 scope 计数为 `0`,这是数据未装载而不是 worker 失败。 | ||
| 5 | - 修正 `plan_phase1_extraction_jobs_live.py` 的命令模板,把 `PG_DSN=\"${PG_DSN:?set PG_DSN}\"` 显式写入 `command_suggestions / primary_command`,避免 planner 产物看起来可跑但实际缺少数据库连接串。 | ||
| 3 | - 更新 `plan_phase1_extraction_jobs_live.py` 与 `phase1_extraction_plan_report.json`,把 Phase-1 execution plan 从“仅有排序计划”推进到“附带 `command_suggestions / primary_command` 的可复制执行命令模板”。 | 6 | - 更新 `plan_phase1_extraction_jobs_live.py` 与 `phase1_extraction_plan_report.json`,把 Phase-1 execution plan 从“仅有排序计划”推进到“附带 `command_suggestions / primary_command` 的可复制执行命令模板”。 |
| 4 | - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。 | 7 | - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。 |
| 5 | - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 | 8 | - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 | ... | ... |
| ... | @@ -219,6 +219,36 @@ flowchart TD | ... | @@ -219,6 +219,36 @@ flowchart TD |
| 219 | 219 | ||
| 220 | --- | 220 | --- |
| 221 | 221 | ||
| 222 | ## 10. Phase-1 worker contract(新增执行层) | ||
| 223 | |||
| 224 | 当前已经不只是 registry/bootstrap 了,还补上了最小真实 worker 执行面: | ||
| 225 | |||
| 226 | - `acr-engine/workers/mark_job_status.py` | ||
| 227 | - `acr-engine/workers/run_chromaprint_job.py` | ||
| 228 | - `acr-engine/workers/run_embedding_job.py` | ||
| 229 | |||
| 230 | 这层的作用不是立即跑完真实抽特征,而是先把下面这条链打通: | ||
| 231 | |||
| 232 | ```text | ||
| 233 | planner -> feature_extraction_job -> worker -> PostgreSQL status update | ||
| 234 | ``` | ||
| 235 | |||
| 236 | ### 当前能力 | ||
| 237 | |||
| 238 | 1. 读取 `feature_extraction_job` | ||
| 239 | 2. 联表解析 `feature_set_registry + model_registry` | ||
| 240 | 3. 解析 `target_scope` | ||
| 241 | 4. 回写 `pending -> running -> completed` | ||
| 242 | 5. 为后续真模型推理保留稳定契约 | ||
| 243 | |||
| 244 | ### 推荐阅读 | ||
| 245 | |||
| 246 | 详细契约与流程图见: | ||
| 247 | |||
| 248 | - [docs/phase1-worker-contract.md](./phase1-worker-contract.md) | ||
| 249 | |||
| 250 | --- | ||
| 251 | |||
| 222 | ## 8. live PostgreSQL bootstrap 脚本 | 252 | ## 8. live PostgreSQL bootstrap 脚本 |
| 223 | 253 | ||
| 224 | 为了避免每次手工执行 SQL,本仓库现在提供了一个可直接连 PostgreSQL 的 live bootstrap 脚本: | 254 | 为了避免每次手工执行 SQL,本仓库现在提供了一个可直接连 PostgreSQL 的 live bootstrap 脚本: | ... | ... |
docs/phase1-worker-contract.md
0 → 100644
| 1 | # Phase-1 Worker Contract / 作业执行器契约 | ||
| 2 | |||
| 3 | > 更新:2026-06-04 | ||
| 4 | > 目标:把 Phase-1 从“只有 registry / plan”推进到“worker 可以真实消费 PostgreSQL 作业并更新状态”。 | ||
| 5 | |||
| 6 | --- | ||
| 7 | |||
| 8 | ## 一页结论 | ||
| 9 | |||
| 10 | 当前 Phase-1 已经具备一条最小真实执行链: | ||
| 11 | |||
| 12 | 1. planner 从 `feature_extraction_job` 读 pending jobs | ||
| 13 | 2. worker 读取 `extraction_job_id` | ||
| 14 | 3. worker 联表解析 `feature_set_registry + model_registry` | ||
| 15 | 4. worker 解析 `target_scope` | ||
| 16 | 5. worker 回写 `feature_extraction_job.job_status / input_count / output_count / metadata_json` | ||
| 17 | |||
| 18 | 也就是说,现在 PostgreSQL 不只是“数据字典”,已经开始承担: | ||
| 19 | - 作业编排面 | ||
| 20 | - 状态机面 | ||
| 21 | - 执行证据面 | ||
| 22 | |||
| 23 | --- | ||
| 24 | |||
| 25 | ## 1. 当前落地的 worker | ||
| 26 | |||
| 27 | 位于: | ||
| 28 | |||
| 29 | - `acr-engine/workers/mark_job_status.py` | ||
| 30 | - `acr-engine/workers/run_chromaprint_job.py` | ||
| 31 | - `acr-engine/workers/run_embedding_job.py` | ||
| 32 | - `acr-engine/workers/_job_common.py` | ||
| 33 | |||
| 34 | ### 角色划分 | ||
| 35 | |||
| 36 | | worker | 作用 | | ||
| 37 | |---|---| | ||
| 38 | | `mark_job_status.py` | 通用状态推进器 | | ||
| 39 | | `run_chromaprint_job.py` | exact lane worker | | ||
| 40 | | `run_embedding_job.py` | semantic lane worker | | ||
| 41 | | `_job_common.py` | 共享的 job 读取、scope 解析、状态回写逻辑 | | ||
| 42 | |||
| 43 | --- | ||
| 44 | |||
| 45 | ## 2. 当前状态机 | ||
| 46 | |||
| 47 | ```mermaid | ||
| 48 | flowchart LR | ||
| 49 | A[pending] --> B[running] | ||
| 50 | B --> C[completed] | ||
| 51 | B --> D[failed] | ||
| 52 | ``` | ||
| 53 | |||
| 54 | ### 当前已验证的状态流转 | ||
| 55 | |||
| 56 | - `pending -> running` | ||
| 57 | - `running -> completed`(dry-run 模式) | ||
| 58 | |||
| 59 | ### 设计意图 | ||
| 60 | |||
| 61 | 先把 **作业契约与状态流转** 固定住,再把真正的模型推理塞进去。 | ||
| 62 | 这样后续不管换成: | ||
| 63 | - `Chromaprint` | ||
| 64 | - `MERT` | ||
| 65 | - `MuQ` | ||
| 66 | - `CoverHunter encoder` | ||
| 67 | |||
| 68 | 都不需要重做 orchestration 数据结构。 | ||
| 69 | |||
| 70 | --- | ||
| 71 | |||
| 72 | ## 3. worker 输入契约 | ||
| 73 | |||
| 74 | ### 环境变量 | ||
| 75 | |||
| 76 | | 变量 | 说明 | | ||
| 77 | |---|---| | ||
| 78 | | `PG_DSN` | PostgreSQL 连接串 | | ||
| 79 | | `PG_SCHEMA` | 目标 schema | | ||
| 80 | | `EXTRACTION_JOB_ID` | 要执行的作业 id | | ||
| 81 | | `FEATURE_SET_ID` | 规划时附带,worker 可用于一致性检查 | | ||
| 82 | | `TARGET_SCOPE` | 规划时附带,worker 当前以 DB 中 job 记录为准 | | ||
| 83 | | `MODEL_NAME` | embedding worker 用于防错 | | ||
| 84 | | `MODEL_VERSION` | embedding worker 用于防错 | | ||
| 85 | | `VECTOR_TABLE` | embedding worker 目标向量表 | | ||
| 86 | | `OUTPUT_TARGET` | `audio_fingerprint` 或 `audio_embedding` | | ||
| 87 | |||
| 88 | ### CLI 参数 | ||
| 89 | |||
| 90 | 三个 worker 都支持显式 CLI 参数覆盖 env。 | ||
| 91 | |||
| 92 | ### planner 命令模板的当前约定 | ||
| 93 | |||
| 94 | `plan_phase1_extraction_jobs_live.py` 现在会显式生成: | ||
| 95 | |||
| 96 | ```bash | ||
| 97 | PG_DSN="${PG_DSN:?set PG_DSN}" ... | ||
| 98 | ``` | ||
| 99 | |||
| 100 | 这样复制命令时,如果调用方忘了提供数据库连接串,会立刻失败,而不是静默跑空。 | ||
| 101 | |||
| 102 | --- | ||
| 103 | |||
| 104 | ## 4. PostgreSQL 读取契约 | ||
| 105 | |||
| 106 | worker 当前真实读取: | ||
| 107 | |||
| 108 | 1. `feature_extraction_job` | ||
| 109 | 2. `feature_set_registry` | ||
| 110 | 3. `model_registry` | ||
| 111 | 4. `reference_set_registry` / `reference_set_member` | ||
| 112 | 5. `recording_asset` | ||
| 113 | 6. `audio_window` | ||
| 114 | |||
| 115 | ### 为什么要读 scope summary | ||
| 116 | |||
| 117 | 因为 Phase-1 第一阶段的核心不是“立刻抽出 embedding”,而是先确定: | ||
| 118 | |||
| 119 | - 这次 job 面向哪个 reference set | ||
| 120 | - 涉及多少 recording | ||
| 121 | - 涉及多少 ready asset | ||
| 122 | - 涉及多少 active window | ||
| 123 | |||
| 124 | 这样后续做: | ||
| 125 | - 分片 | ||
| 126 | - 并行 | ||
| 127 | - 重试 | ||
| 128 | - SLA 估算 | ||
| 129 | |||
| 130 | 才有稳定基线。 | ||
| 131 | |||
| 132 | --- | ||
| 133 | |||
| 134 | ## 5. 当前 dry-run 的真实意义 | ||
| 135 | |||
| 136 | 当前 worker 还没有真正调用模型做特征提取;它做的是: | ||
| 137 | |||
| 138 | 1. 验证 planner 命令模板可被真实消费 | ||
| 139 | 2. 验证 job -> feature_set -> model 的 join 契约 | ||
| 140 | 3. 验证 target scope 解析 | ||
| 141 | 4. 验证 PostgreSQL 作业状态回写 | ||
| 142 | 5. 为下一步真推理保留稳定入口 | ||
| 143 | |||
| 144 | 所以它不是假文档,而是: | ||
| 145 | |||
| 146 | > **先把工业执行面的骨架打通,再把模型推理填进去。** | ||
| 147 | |||
| 148 | --- | ||
| 149 | |||
| 150 | ## 6. 推荐执行顺序 | ||
| 151 | |||
| 152 | ```mermaid | ||
| 153 | flowchart TD | ||
| 154 | A[bootstrap model/feature/reference registry] --> B[bootstrap feature_extraction_job] | ||
| 155 | B --> C[plan pending jobs] | ||
| 156 | C --> D[run worker dry-run] | ||
| 157 | D --> E[validate status transitions] | ||
| 158 | E --> F[replace dry-run with real extractor] | ||
| 159 | ``` | ||
| 160 | |||
| 161 | --- | ||
| 162 | |||
| 163 | ## 7. exact lane 与 semantic lane 的后续替换点 | ||
| 164 | |||
| 165 | ### 7.1 Chromaprint worker | ||
| 166 | |||
| 167 | 后续把下面逻辑塞进 `run_chromaprint_job.py`: | ||
| 168 | |||
| 169 | 1. 读取 `recording_asset` | ||
| 170 | 2. 调 chromaprint CLI / library | ||
| 171 | 3. 写 `audio_fingerprint` | ||
| 172 | 4. 更新 `output_count` | ||
| 173 | 5. 标记 `completed` | ||
| 174 | |||
| 175 | ### 7.2 Embedding worker | ||
| 176 | |||
| 177 | 后续把下面逻辑塞进 `run_embedding_job.py`: | ||
| 178 | |||
| 179 | 1. 读取 `audio_window` | ||
| 180 | 2. 加载 `MERT` / `MuQ` / `ECAPA` | ||
| 181 | 3. 提取向量 | ||
| 182 | 4. 写 `audio_embedding` | ||
| 183 | 5. 写 `audio_embedding_vector_<dim>` | ||
| 184 | 6. 更新 `output_count` | ||
| 185 | 7. 标记 `completed` | ||
| 186 | |||
| 187 | --- | ||
| 188 | |||
| 189 | ## 8. 解决了什么问题 | ||
| 190 | |||
| 191 | 这次 worker contract 落地,主要解决了 4 个问题: | ||
| 192 | |||
| 193 | 1. **planner 不再只是纸面计划** | ||
| 194 | 2. **job status 有了真实推进器** | ||
| 195 | 3. **后续换模型不用重做 orchestration** | ||
| 196 | 4. **可以先 dry-run 验证执行链,再接入重模型** | ||
| 197 | |||
| 198 | --- | ||
| 199 | |||
| 200 | ## 9. 当前边界 | ||
| 201 | |||
| 202 | 当前还没有完成的部分: | ||
| 203 | |||
| 204 | - 真实 chromaprint 特征写入 | ||
| 205 | - 真实 MERT / MuQ / ECAPA embedding 写入 | ||
| 206 | - `failed` 重试策略 | ||
| 207 | - job 分片执行器 | ||
| 208 | - 幂等去重写入策略 | ||
| 209 | |||
| 210 | 但现在已经足够支撑下一阶段: | ||
| 211 | |||
| 212 | > **把真实 extractor 接到已经验证过的 PostgreSQL worker contract 上。** |
| ... | @@ -69,6 +69,9 @@ | ... | @@ -69,6 +69,9 @@ |
| 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | | 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | |
| 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | | 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | |
| 71 | | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | | 71 | | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | |
| 72 | | chromaprint worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_chromaprint_dry_run.json` | | ||
| 73 | | embedding worker dry-run 报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_embedding_dry_run.json` | | ||
| 74 | | job status 手工回写报告 | `acr-engine/data/pgvector_eval/music20/phase1_worker_mark_pending_report.json` | | ||
| 72 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | | 75 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | |
| 73 | 76 | ||
| 74 | --- | 77 | --- |
| ... | @@ -258,6 +261,84 @@ flowchart LR | ... | @@ -258,6 +261,84 @@ flowchart LR |
| 258 | 261 | ||
| 259 | 当前这次 live 样例里只实际包含: | 262 | 当前这次 live 样例里只实际包含: |
| 260 | - `type_1` | 263 | - `type_1` |
| 264 | |||
| 265 | --- | ||
| 266 | |||
| 267 | ## Phase-1 worker dry-run 测试链路(新增) | ||
| 268 | |||
| 269 | 这一步解决的是: | ||
| 270 | |||
| 271 | > planner 虽然已经能输出可复制命令,但之前仓库里没有真正的 worker 可以消费这些命令。 | ||
| 272 | |||
| 273 | 现在已经补上最小真实 worker: | ||
| 274 | |||
| 275 | - `acr-engine/workers/mark_job_status.py` | ||
| 276 | - `acr-engine/workers/run_chromaprint_job.py` | ||
| 277 | - `acr-engine/workers/run_embedding_job.py` | ||
| 278 | |||
| 279 | ### 测试目标 | ||
| 280 | |||
| 281 | 验证下面这条链是真实可走通的: | ||
| 282 | |||
| 283 | ```mermaid | ||
| 284 | flowchart TD | ||
| 285 | A[feature_extraction_job pending] --> B[planner 生成命令模板] | ||
| 286 | B --> C[worker 读取 extraction_job_id] | ||
| 287 | C --> D[worker 解析 feature/model/scope] | ||
| 288 | D --> E[worker 回写 running/completed] | ||
| 289 | E --> F[bootstrap 脚本可再次恢复 pending] | ||
| 290 | ``` | ||
| 291 | |||
| 292 | ### 当前验证口径 | ||
| 293 | |||
| 294 | 这轮先不跑真实模型推理,而是先验证工业执行面: | ||
| 295 | |||
| 296 | 1. `run_chromaprint_job.py` | ||
| 297 | - 真实连接 PostgreSQL | ||
| 298 | - 读取 `feature_extraction_job=1` | ||
| 299 | - 解析 `reference_set:phase1_hot_reference_v1` | ||
| 300 | - 回写 `running -> completed` | ||
| 301 | |||
| 302 | 2. `run_embedding_job.py` | ||
| 303 | - 真实连接 PostgreSQL | ||
| 304 | - 读取 `feature_extraction_job=2` | ||
| 305 | - 解析 `mert v1-95m` | ||
| 306 | - 回写 `running -> completed` | ||
| 307 | |||
| 308 | 3. 再次执行 `bootstrap_phase1_extraction_jobs_live.py` | ||
| 309 | - 把 job 状态恢复为 `pending` | ||
| 310 | - 保证后续 session 可以从同一批 jobs 继续推进 | ||
| 311 | |||
| 312 | ### 为什么先做 dry-run | ||
| 313 | |||
| 314 | 因为当前第一优先级是把下面这些东西固定住: | ||
| 315 | |||
| 316 | - job contract | ||
| 317 | - status transitions | ||
| 318 | - scope 解析 | ||
| 319 | - planner -> worker 命令兼容性 | ||
| 320 | |||
| 321 | 等这个骨架稳定后,再把真实的: | ||
| 322 | - chromaprint 提取 | ||
| 323 | - MERT / MuQ embedding 提取 | ||
| 324 | |||
| 325 | 接进去,整体风险更低。 | ||
| 326 | |||
| 327 | ### 当前 live 结果的一个关键解释 | ||
| 328 | |||
| 329 | 本轮 worker dry-run 里,`phase1_hot_reference_v1` 已经存在,但在 `acr_test` schema 里**还没有实际 member**,所以: | ||
| 330 | |||
| 331 | - `recording_count=0` | ||
| 332 | - `ready_asset_count=0` | ||
| 333 | - `active_window_count=0` | ||
| 334 | |||
| 335 | 这不是 worker 异常,而是当前 Phase-1 live 数据面还没把业务 reference recordings 真实装进去。 | ||
| 336 | 因此这轮验证证明的是: | ||
| 337 | |||
| 338 | - planner -> worker 命令兼容 | ||
| 339 | - worker -> PostgreSQL 状态流转可用 | ||
| 340 | |||
| 341 | 还**不是**真实特征抽取吞吐验证。 | ||
| 261 | - `type_7` | 342 | - `type_7` |
| 262 | 343 | ||
| 263 | 因此: | 344 | 因此: | ... | ... |
| ... | @@ -44,7 +44,8 @@ | ... | @@ -44,7 +44,8 @@ |
| 44 | 4. [docs/postgresql-data-model.md](./postgresql-data-model.md) | 44 | 4. [docs/postgresql-data-model.md](./postgresql-data-model.md) |
| 45 | 5. [docs/phase1-implementation-checklist.md](./phase1-implementation-checklist.md) | 45 | 5. [docs/phase1-implementation-checklist.md](./phase1-implementation-checklist.md) |
| 46 | 6. [docs/model-feature-registry-bootstrap.md](./model-feature-registry-bootstrap.md) | 46 | 6. [docs/model-feature-registry-bootstrap.md](./model-feature-registry-bootstrap.md) |
| 47 | 7. [docs/CHANGELOG.md](./CHANGELOG.md) | 47 | 7. [docs/phase1-worker-contract.md](./phase1-worker-contract.md) |
| 48 | 8. [docs/CHANGELOG.md](./CHANGELOG.md) | ||
| 48 | 49 | ||
| 49 | 如果只想快速恢复上下文,至少读前 5 个。 | 50 | 如果只想快速恢复上下文,至少读前 5 个。 |
| 50 | 51 | ||
| ... | @@ -186,9 +187,13 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -186,9 +187,13 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 186 | - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 | 187 | - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 |
| 187 | - PostgreSQL `acr_test` schema 上已真实生成 Phase-1 extraction execution plan,当前顺序是 `chromaprint -> mert -> mert-long -> muq -> ecapa` | 188 | - PostgreSQL `acr_test` schema 上已真实生成 Phase-1 extraction execution plan,当前顺序是 `chromaprint -> mert -> mert-long -> muq -> ecapa` |
| 188 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 | 189 | - extraction plan 报告里已包含 `command_suggestions / primary_command`,下次可直接从 plan 抄 worker 命令模板 |
| 190 | - Phase-1 worker 入口已真实落地:`run_chromaprint_job.py / run_embedding_job.py / mark_job_status.py` | ||
| 191 | - 下一阶段已经不是“补 planner”,而是把 dry-run worker 替换为真实 extractor,并把 `audio_fingerprint / audio_embedding` 写入做成幂等执行 | ||
| 192 | - 最新 live worker 证据表明:`phase1_hot_reference_v1` 当前在 `acr_test` 里还没有实际 members,所以 dry-run 已验证状态机,但 scope 计数仍是 `0` | ||
| 189 | 193 | ||
| 190 | ### 未验证 / 仍是缺口 | 194 | ### 未验证 / 仍是缺口 |
| 191 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | 195 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** |
| 196 | - **worker 目前仍以 dry-run 为主,尚未写真实 `audio_fingerprint / audio_embedding`** | ||
| 192 | - **未落 reference set 的真实业务数据** | 197 | - **未落 reference set 的真实业务数据** |
| 193 | - **未定义最终线上分数融合细则** | 198 | - **未定义最终线上分数融合细则** |
| 194 | - **type_8 / type_16 还没有进入当前 live JSONL 的 PostgreSQL 实测链** | 199 | - **type_8 / type_16 还没有进入当前 live JSONL 的 PostgreSQL 实测链** |
| ... | @@ -204,6 +209,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -204,6 +209,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 204 | - `a549d1d` — Clarify the ACR evolution path and freeze a production-grade data model | 209 | - `a549d1d` — Clarify the ACR evolution path and freeze a production-grade data model |
| 205 | - `e514a6c` — Keep the new ACR architecture guide clean for follow-up edits | 210 | - `e514a6c` — Keep the new ACR architecture guide clean for follow-up edits |
| 206 | - `4b23f54` — Make the Phase-1 ACR plan executable for each delivery role | 211 | - `4b23f54` — Make the Phase-1 ACR plan executable for each delivery role |
| 212 | - `0679481` — Attach runnable command templates to the extraction plan | ||
| 207 | 213 | ||
| 208 | 如果下次需要追踪文档补充点,可以从这三个提交开始看。 | 214 | 如果下次需要追踪文档补充点,可以从这三个提交开始看。 |
| 209 | 215 | ||
| ... | @@ -212,4 +218,4 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -212,4 +218,4 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 212 | ## 一句话交接 | 218 | ## 一句话交接 |
| 213 | 219 | ||
| 214 | > **下次启动不要再从“要不要换模型、要不要重构数据结构”开始讨论。** | 220 | > **下次启动不要再从“要不要换模型、要不要重构数据结构”开始讨论。** |
| 215 | > 这些方向已经定了。直接从 **PostgreSQL v2 schema 落库 + Phase-1 encoder-only 执行链** 开始推进。 | 221 | > 这些方向已经定了。直接从 **PostgreSQL v2 schema 落库 + Phase-1 worker/extractor 执行链** 开始推进。 | ... | ... |
-
Please register or sign in to post a comment