Generate a live execution plan from pending extraction jobs
Constraint: Ralph must keep turning PostgreSQL state into concrete next-step artifacts rather than leaving implied manual steps
Rejected: Stop at creating pending jobs only | It still leaves future sessions to infer ordering and physical targets by hand
Confidence: high
Scope-risk: narrow
Directive: Treat the planner report as the canonical bridge between pending jobs and real extraction workers
Tested: /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' --schema acr_test --job-status pending --output data/pgvector_eval/music20/phase1_extraction_plan_report.json; /usr/local/miniconda3/bin/python -m py_compile scripts/plan_phase1_extraction_jobs_live.py; git diff --check -- acr-engine/scripts/plan_phase1_extraction_jobs_live.py acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json docs/model-feature-registry-bootstrap.md docs/postgres_db_schema_samples.md docs/session-handoff.md docs/CHANGELOG.md
Not-tested: Actual worker that consumes the plan to run MERT/MuQ/Chromaprint extraction end-to-end
Showing
6 changed files
with
705 additions
and
0 deletions
| 1 | { | ||
| 2 | "schema": "acr_test", | ||
| 3 | "dsn_redacted": "postgres://d2:***@127.0.0.1:5432/d2", | ||
| 4 | "job_status_filter": "pending", | ||
| 5 | "counts": { | ||
| 6 | "jobs": 5, | ||
| 7 | "lanes": { | ||
| 8 | "exact": 1, | ||
| 9 | "semantic": 4 | ||
| 10 | } | ||
| 11 | }, | ||
| 12 | "ordered_jobs": [ | ||
| 13 | { | ||
| 14 | "priority_rank": 0, | ||
| 15 | "lane": "exact", | ||
| 16 | "extraction_job_id": 1, | ||
| 17 | "feature_set_id": 2, | ||
| 18 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 19 | "scope": { | ||
| 20 | "scope_type": "reference_set", | ||
| 21 | "scope_value": "phase1_hot_reference_v1" | ||
| 22 | }, | ||
| 23 | "job_status": "pending", | ||
| 24 | "shard_key": "phase1/reference/chromaprint/v1", | ||
| 25 | "model_name": "chromaprint", | ||
| 26 | "model_version": "v1", | ||
| 27 | "model_family": "fingerprint", | ||
| 28 | "input_sample_rate": 16000, | ||
| 29 | "feature_name": "fingerprint_asset", | ||
| 30 | "feature_level": "asset", | ||
| 31 | "extraction_granularity": "full_asset", | ||
| 32 | "window_sec": 5.0, | ||
| 33 | "hop_sec": 2.5, | ||
| 34 | "embedding_dim": null, | ||
| 35 | "distance_metric": "hamming", | ||
| 36 | "physical_target": "audio_fingerprint", | ||
| 37 | "vector_table": null, | ||
| 38 | "job_metadata": { | ||
| 39 | "lane": "exact", | ||
| 40 | "phase": "phase1", | ||
| 41 | "priority": "p0" | ||
| 42 | }, | ||
| 43 | "model_metadata": { | ||
| 44 | "lane": "exact", | ||
| 45 | "note": "exact fingerprint lane baseline", | ||
| 46 | "phase": "phase1" | ||
| 47 | }, | ||
| 48 | "execution_notes": [ | ||
| 49 | "run feature extraction for chromaprint v1", | ||
| 50 | "write to audio_fingerprint", | ||
| 51 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 52 | ] | ||
| 53 | }, | ||
| 54 | { | ||
| 55 | "priority_rank": 1, | ||
| 56 | "lane": "semantic", | ||
| 57 | "extraction_job_id": 2, | ||
| 58 | "feature_set_id": 3, | ||
| 59 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 60 | "scope": { | ||
| 61 | "scope_type": "reference_set", | ||
| 62 | "scope_value": "phase1_hot_reference_v1" | ||
| 63 | }, | ||
| 64 | "job_status": "pending", | ||
| 65 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 66 | "model_name": "mert", | ||
| 67 | "model_version": "v1-95m", | ||
| 68 | "model_family": "music_ssl", | ||
| 69 | "input_sample_rate": 24000, | ||
| 70 | "feature_name": "semantic_embedding", | ||
| 71 | "feature_level": "window", | ||
| 72 | "extraction_granularity": "sliding_window", | ||
| 73 | "window_sec": 5.0, | ||
| 74 | "hop_sec": 2.5, | ||
| 75 | "embedding_dim": 768, | ||
| 76 | "distance_metric": "cosine", | ||
| 77 | "physical_target": "audio_embedding", | ||
| 78 | "vector_table": "audio_embedding_vector_768", | ||
| 79 | "job_metadata": { | ||
| 80 | "lane": "semantic", | ||
| 81 | "role": "primary_baseline", | ||
| 82 | "phase": "phase1" | ||
| 83 | }, | ||
| 84 | "model_metadata": { | ||
| 85 | "lane": "semantic", | ||
| 86 | "role": "primary_baseline", | ||
| 87 | "phase": "phase1" | ||
| 88 | }, | ||
| 89 | "execution_notes": [ | ||
| 90 | "run feature extraction for mert v1-95m", | ||
| 91 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 92 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 93 | ] | ||
| 94 | }, | ||
| 95 | { | ||
| 96 | "priority_rank": 1, | ||
| 97 | "lane": "semantic", | ||
| 98 | "extraction_job_id": 3, | ||
| 99 | "feature_set_id": 4, | ||
| 100 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 101 | "scope": { | ||
| 102 | "scope_type": "reference_set", | ||
| 103 | "scope_value": "phase1_hot_reference_v1" | ||
| 104 | }, | ||
| 105 | "job_status": "pending", | ||
| 106 | "shard_key": "phase1/reference/mert/v1-95m/10s_5s", | ||
| 107 | "model_name": "mert", | ||
| 108 | "model_version": "v1-95m", | ||
| 109 | "model_family": "music_ssl", | ||
| 110 | "input_sample_rate": 24000, | ||
| 111 | "feature_name": "semantic_embedding", | ||
| 112 | "feature_level": "window", | ||
| 113 | "extraction_granularity": "sliding_window", | ||
| 114 | "window_sec": 10.0, | ||
| 115 | "hop_sec": 5.0, | ||
| 116 | "embedding_dim": 768, | ||
| 117 | "distance_metric": "cosine", | ||
| 118 | "physical_target": "audio_embedding", | ||
| 119 | "vector_table": "audio_embedding_vector_768", | ||
| 120 | "job_metadata": { | ||
| 121 | "lane": "semantic", | ||
| 122 | "role": "long_context_validation", | ||
| 123 | "phase": "phase1" | ||
| 124 | }, | ||
| 125 | "model_metadata": { | ||
| 126 | "lane": "semantic", | ||
| 127 | "role": "primary_baseline", | ||
| 128 | "phase": "phase1" | ||
| 129 | }, | ||
| 130 | "execution_notes": [ | ||
| 131 | "run feature extraction for mert v1-95m", | ||
| 132 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 133 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 134 | ] | ||
| 135 | }, | ||
| 136 | { | ||
| 137 | "priority_rank": 1, | ||
| 138 | "lane": "semantic", | ||
| 139 | "extraction_job_id": 4, | ||
| 140 | "feature_set_id": 5, | ||
| 141 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 142 | "scope": { | ||
| 143 | "scope_type": "reference_set", | ||
| 144 | "scope_value": "phase1_hot_reference_v1" | ||
| 145 | }, | ||
| 146 | "job_status": "pending", | ||
| 147 | "shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s", | ||
| 148 | "model_name": "muq", | ||
| 149 | "model_version": "large-msd-iter", | ||
| 150 | "model_family": "music_ssl", | ||
| 151 | "input_sample_rate": 24000, | ||
| 152 | "feature_name": "semantic_embedding", | ||
| 153 | "feature_level": "window", | ||
| 154 | "extraction_granularity": "sliding_window", | ||
| 155 | "window_sec": 5.0, | ||
| 156 | "hop_sec": 2.5, | ||
| 157 | "embedding_dim": 768, | ||
| 158 | "distance_metric": "cosine", | ||
| 159 | "physical_target": "audio_embedding", | ||
| 160 | "vector_table": "audio_embedding_vector_768", | ||
| 161 | "job_metadata": { | ||
| 162 | "lane": "semantic", | ||
| 163 | "role": "challenger", | ||
| 164 | "phase": "phase1" | ||
| 165 | }, | ||
| 166 | "model_metadata": { | ||
| 167 | "lane": "semantic", | ||
| 168 | "role": "challenger", | ||
| 169 | "phase": "phase1" | ||
| 170 | }, | ||
| 171 | "execution_notes": [ | ||
| 172 | "run feature extraction for muq large-msd-iter", | ||
| 173 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 174 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 175 | ] | ||
| 176 | }, | ||
| 177 | { | ||
| 178 | "priority_rank": 1, | ||
| 179 | "lane": "semantic", | ||
| 180 | "extraction_job_id": 5, | ||
| 181 | "feature_set_id": 6, | ||
| 182 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 183 | "scope": { | ||
| 184 | "scope_type": "reference_set", | ||
| 185 | "scope_value": "phase1_hot_reference_v1" | ||
| 186 | }, | ||
| 187 | "job_status": "pending", | ||
| 188 | "shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s", | ||
| 189 | "model_name": "ecapa", | ||
| 190 | "model_version": "acr-baseline-v1", | ||
| 191 | "model_family": "speech_derived", | ||
| 192 | "input_sample_rate": 16000, | ||
| 193 | "feature_name": "semantic_embedding", | ||
| 194 | "feature_level": "window", | ||
| 195 | "extraction_granularity": "sliding_window", | ||
| 196 | "window_sec": 5.0, | ||
| 197 | "hop_sec": 2.5, | ||
| 198 | "embedding_dim": 192, | ||
| 199 | "distance_metric": "cosine", | ||
| 200 | "physical_target": "audio_embedding", | ||
| 201 | "vector_table": "audio_embedding_vector_192", | ||
| 202 | "job_metadata": { | ||
| 203 | "lane": "semantic", | ||
| 204 | "role": "historical_baseline", | ||
| 205 | "phase": "phase1" | ||
| 206 | }, | ||
| 207 | "model_metadata": { | ||
| 208 | "lane": "semantic", | ||
| 209 | "role": "historical_baseline", | ||
| 210 | "phase": "phase1" | ||
| 211 | }, | ||
| 212 | "execution_notes": [ | ||
| 213 | "run feature extraction for ecapa acr-baseline-v1", | ||
| 214 | "write to audio_embedding + audio_embedding_vector_192", | ||
| 215 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 216 | ] | ||
| 217 | } | ||
| 218 | ], | ||
| 219 | "by_lane": { | ||
| 220 | "exact": [ | ||
| 221 | { | ||
| 222 | "priority_rank": 0, | ||
| 223 | "lane": "exact", | ||
| 224 | "extraction_job_id": 1, | ||
| 225 | "feature_set_id": 2, | ||
| 226 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 227 | "scope": { | ||
| 228 | "scope_type": "reference_set", | ||
| 229 | "scope_value": "phase1_hot_reference_v1" | ||
| 230 | }, | ||
| 231 | "job_status": "pending", | ||
| 232 | "shard_key": "phase1/reference/chromaprint/v1", | ||
| 233 | "model_name": "chromaprint", | ||
| 234 | "model_version": "v1", | ||
| 235 | "model_family": "fingerprint", | ||
| 236 | "input_sample_rate": 16000, | ||
| 237 | "feature_name": "fingerprint_asset", | ||
| 238 | "feature_level": "asset", | ||
| 239 | "extraction_granularity": "full_asset", | ||
| 240 | "window_sec": 5.0, | ||
| 241 | "hop_sec": 2.5, | ||
| 242 | "embedding_dim": null, | ||
| 243 | "distance_metric": "hamming", | ||
| 244 | "physical_target": "audio_fingerprint", | ||
| 245 | "vector_table": null, | ||
| 246 | "job_metadata": { | ||
| 247 | "lane": "exact", | ||
| 248 | "phase": "phase1", | ||
| 249 | "priority": "p0" | ||
| 250 | }, | ||
| 251 | "model_metadata": { | ||
| 252 | "lane": "exact", | ||
| 253 | "note": "exact fingerprint lane baseline", | ||
| 254 | "phase": "phase1" | ||
| 255 | }, | ||
| 256 | "execution_notes": [ | ||
| 257 | "run feature extraction for chromaprint v1", | ||
| 258 | "write to audio_fingerprint", | ||
| 259 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 260 | ] | ||
| 261 | } | ||
| 262 | ], | ||
| 263 | "semantic": [ | ||
| 264 | { | ||
| 265 | "priority_rank": 1, | ||
| 266 | "lane": "semantic", | ||
| 267 | "extraction_job_id": 2, | ||
| 268 | "feature_set_id": 3, | ||
| 269 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 270 | "scope": { | ||
| 271 | "scope_type": "reference_set", | ||
| 272 | "scope_value": "phase1_hot_reference_v1" | ||
| 273 | }, | ||
| 274 | "job_status": "pending", | ||
| 275 | "shard_key": "phase1/reference/mert/v1-95m/5s_2.5s", | ||
| 276 | "model_name": "mert", | ||
| 277 | "model_version": "v1-95m", | ||
| 278 | "model_family": "music_ssl", | ||
| 279 | "input_sample_rate": 24000, | ||
| 280 | "feature_name": "semantic_embedding", | ||
| 281 | "feature_level": "window", | ||
| 282 | "extraction_granularity": "sliding_window", | ||
| 283 | "window_sec": 5.0, | ||
| 284 | "hop_sec": 2.5, | ||
| 285 | "embedding_dim": 768, | ||
| 286 | "distance_metric": "cosine", | ||
| 287 | "physical_target": "audio_embedding", | ||
| 288 | "vector_table": "audio_embedding_vector_768", | ||
| 289 | "job_metadata": { | ||
| 290 | "lane": "semantic", | ||
| 291 | "role": "primary_baseline", | ||
| 292 | "phase": "phase1" | ||
| 293 | }, | ||
| 294 | "model_metadata": { | ||
| 295 | "lane": "semantic", | ||
| 296 | "role": "primary_baseline", | ||
| 297 | "phase": "phase1" | ||
| 298 | }, | ||
| 299 | "execution_notes": [ | ||
| 300 | "run feature extraction for mert v1-95m", | ||
| 301 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 302 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 303 | ] | ||
| 304 | }, | ||
| 305 | { | ||
| 306 | "priority_rank": 1, | ||
| 307 | "lane": "semantic", | ||
| 308 | "extraction_job_id": 3, | ||
| 309 | "feature_set_id": 4, | ||
| 310 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 311 | "scope": { | ||
| 312 | "scope_type": "reference_set", | ||
| 313 | "scope_value": "phase1_hot_reference_v1" | ||
| 314 | }, | ||
| 315 | "job_status": "pending", | ||
| 316 | "shard_key": "phase1/reference/mert/v1-95m/10s_5s", | ||
| 317 | "model_name": "mert", | ||
| 318 | "model_version": "v1-95m", | ||
| 319 | "model_family": "music_ssl", | ||
| 320 | "input_sample_rate": 24000, | ||
| 321 | "feature_name": "semantic_embedding", | ||
| 322 | "feature_level": "window", | ||
| 323 | "extraction_granularity": "sliding_window", | ||
| 324 | "window_sec": 10.0, | ||
| 325 | "hop_sec": 5.0, | ||
| 326 | "embedding_dim": 768, | ||
| 327 | "distance_metric": "cosine", | ||
| 328 | "physical_target": "audio_embedding", | ||
| 329 | "vector_table": "audio_embedding_vector_768", | ||
| 330 | "job_metadata": { | ||
| 331 | "lane": "semantic", | ||
| 332 | "role": "long_context_validation", | ||
| 333 | "phase": "phase1" | ||
| 334 | }, | ||
| 335 | "model_metadata": { | ||
| 336 | "lane": "semantic", | ||
| 337 | "role": "primary_baseline", | ||
| 338 | "phase": "phase1" | ||
| 339 | }, | ||
| 340 | "execution_notes": [ | ||
| 341 | "run feature extraction for mert v1-95m", | ||
| 342 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 343 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 344 | ] | ||
| 345 | }, | ||
| 346 | { | ||
| 347 | "priority_rank": 1, | ||
| 348 | "lane": "semantic", | ||
| 349 | "extraction_job_id": 4, | ||
| 350 | "feature_set_id": 5, | ||
| 351 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 352 | "scope": { | ||
| 353 | "scope_type": "reference_set", | ||
| 354 | "scope_value": "phase1_hot_reference_v1" | ||
| 355 | }, | ||
| 356 | "job_status": "pending", | ||
| 357 | "shard_key": "phase1/reference/muq/large-msd-iter/5s_2.5s", | ||
| 358 | "model_name": "muq", | ||
| 359 | "model_version": "large-msd-iter", | ||
| 360 | "model_family": "music_ssl", | ||
| 361 | "input_sample_rate": 24000, | ||
| 362 | "feature_name": "semantic_embedding", | ||
| 363 | "feature_level": "window", | ||
| 364 | "extraction_granularity": "sliding_window", | ||
| 365 | "window_sec": 5.0, | ||
| 366 | "hop_sec": 2.5, | ||
| 367 | "embedding_dim": 768, | ||
| 368 | "distance_metric": "cosine", | ||
| 369 | "physical_target": "audio_embedding", | ||
| 370 | "vector_table": "audio_embedding_vector_768", | ||
| 371 | "job_metadata": { | ||
| 372 | "lane": "semantic", | ||
| 373 | "role": "challenger", | ||
| 374 | "phase": "phase1" | ||
| 375 | }, | ||
| 376 | "model_metadata": { | ||
| 377 | "lane": "semantic", | ||
| 378 | "role": "challenger", | ||
| 379 | "phase": "phase1" | ||
| 380 | }, | ||
| 381 | "execution_notes": [ | ||
| 382 | "run feature extraction for muq large-msd-iter", | ||
| 383 | "write to audio_embedding + audio_embedding_vector_768", | ||
| 384 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 385 | ] | ||
| 386 | }, | ||
| 387 | { | ||
| 388 | "priority_rank": 1, | ||
| 389 | "lane": "semantic", | ||
| 390 | "extraction_job_id": 5, | ||
| 391 | "feature_set_id": 6, | ||
| 392 | "target_scope": "reference_set:phase1_hot_reference_v1", | ||
| 393 | "scope": { | ||
| 394 | "scope_type": "reference_set", | ||
| 395 | "scope_value": "phase1_hot_reference_v1" | ||
| 396 | }, | ||
| 397 | "job_status": "pending", | ||
| 398 | "shard_key": "phase1/reference/ecapa/acr-baseline-v1/5s_2.5s", | ||
| 399 | "model_name": "ecapa", | ||
| 400 | "model_version": "acr-baseline-v1", | ||
| 401 | "model_family": "speech_derived", | ||
| 402 | "input_sample_rate": 16000, | ||
| 403 | "feature_name": "semantic_embedding", | ||
| 404 | "feature_level": "window", | ||
| 405 | "extraction_granularity": "sliding_window", | ||
| 406 | "window_sec": 5.0, | ||
| 407 | "hop_sec": 2.5, | ||
| 408 | "embedding_dim": 192, | ||
| 409 | "distance_metric": "cosine", | ||
| 410 | "physical_target": "audio_embedding", | ||
| 411 | "vector_table": "audio_embedding_vector_192", | ||
| 412 | "job_metadata": { | ||
| 413 | "lane": "semantic", | ||
| 414 | "role": "historical_baseline", | ||
| 415 | "phase": "phase1" | ||
| 416 | }, | ||
| 417 | "model_metadata": { | ||
| 418 | "lane": "semantic", | ||
| 419 | "role": "historical_baseline", | ||
| 420 | "phase": "phase1" | ||
| 421 | }, | ||
| 422 | "execution_notes": [ | ||
| 423 | "run feature extraction for ecapa acr-baseline-v1", | ||
| 424 | "write to audio_embedding + audio_embedding_vector_192", | ||
| 425 | "target scope: reference_set:phase1_hot_reference_v1" | ||
| 426 | ] | ||
| 427 | } | ||
| 428 | ] | ||
| 429 | }, | ||
| 430 | "execution_order_summary": [ | ||
| 431 | { | ||
| 432 | "order": 1, | ||
| 433 | "extraction_job_id": 1, | ||
| 434 | "lane": "exact", | ||
| 435 | "model_name": "chromaprint", | ||
| 436 | "feature_name": "fingerprint_asset", | ||
| 437 | "window_sec": 5.0, | ||
| 438 | "hop_sec": 2.5, | ||
| 439 | "physical_target": "audio_fingerprint" | ||
| 440 | }, | ||
| 441 | { | ||
| 442 | "order": 2, | ||
| 443 | "extraction_job_id": 2, | ||
| 444 | "lane": "semantic", | ||
| 445 | "model_name": "mert", | ||
| 446 | "feature_name": "semantic_embedding", | ||
| 447 | "window_sec": 5.0, | ||
| 448 | "hop_sec": 2.5, | ||
| 449 | "physical_target": "audio_embedding" | ||
| 450 | }, | ||
| 451 | { | ||
| 452 | "order": 3, | ||
| 453 | "extraction_job_id": 3, | ||
| 454 | "lane": "semantic", | ||
| 455 | "model_name": "mert", | ||
| 456 | "feature_name": "semantic_embedding", | ||
| 457 | "window_sec": 10.0, | ||
| 458 | "hop_sec": 5.0, | ||
| 459 | "physical_target": "audio_embedding" | ||
| 460 | }, | ||
| 461 | { | ||
| 462 | "order": 4, | ||
| 463 | "extraction_job_id": 4, | ||
| 464 | "lane": "semantic", | ||
| 465 | "model_name": "muq", | ||
| 466 | "feature_name": "semantic_embedding", | ||
| 467 | "window_sec": 5.0, | ||
| 468 | "hop_sec": 2.5, | ||
| 469 | "physical_target": "audio_embedding" | ||
| 470 | }, | ||
| 471 | { | ||
| 472 | "order": 5, | ||
| 473 | "extraction_job_id": 5, | ||
| 474 | "lane": "semantic", | ||
| 475 | "model_name": "ecapa", | ||
| 476 | "feature_name": "semantic_embedding", | ||
| 477 | "window_sec": 5.0, | ||
| 478 | "hop_sec": 2.5, | ||
| 479 | "physical_target": "audio_embedding" | ||
| 480 | } | ||
| 481 | ] | ||
| 482 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env /usr/local/miniconda3/bin/python | ||
| 2 | from __future__ import annotations | ||
| 3 | |||
| 4 | import argparse | ||
| 5 | import json | ||
| 6 | from pathlib import Path | ||
| 7 | from typing import Any | ||
| 8 | |||
| 9 | import psycopg | ||
| 10 | |||
# Parent of the directory that contains this script; the default report path
# is resolved relative to it.
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_OUTPUT = ROOT.joinpath(
    'data',
    'pgvector_eval',
    'music20',
    'phase1_extraction_plan_report.json',
)

# Lower rank sorts first in the generated plan. Lanes that are not listed here
# are ranked by the caller's own fallback value.
LANE_PRIORITY = dict(exact=0, semantic=1, cover=2)
| 19 | |||
| 20 | |||
def parse_target_scope(target_scope: str) -> dict[str, Any]:
    """Split a ``"<type>:<value>"`` scope string into its two components.

    Only the first ``:`` acts as the separator, so the value part may itself
    contain colons. A string without any ``:`` is reported with the scope type
    ``'unknown'`` and the whole input as the scope value.

    Returns:
        A dict with the keys ``scope_type`` and ``scope_value``.
    """
    head, separator, tail = target_scope.partition(':')
    if not separator:
        # No colon at all: the type cannot be determined from the string.
        return {'scope_type': 'unknown', 'scope_value': target_scope}
    return {'scope_type': head, 'scope_value': tail}
| 26 | |||
| 27 | |||
def _redact_dsn(dsn: str) -> str:
    """Return ``dsn`` with every password value replaced by ``***``.

    Handles both connection-string flavours:

    * URI form (``postgres://user:secret@host:5432/db?password=...``): the
      password in the user-info part and any ``password`` / ``sslpassword``
      query parameter are masked.
    * keyword/value form (``host=... password=secret``): the value of every
      ``password`` / ``sslpassword`` keyword is masked, quoted or not.

    A DSN that carries no password is returned unchanged.
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    if '://' in dsn:
        parts = urlsplit(dsn)
        netloc = parts.netloc
        # rpartition: a literal '@' inside the password must not split the host.
        userinfo, at, hostinfo = netloc.rpartition('@')
        if at and ':' in userinfo:
            user = userinfo.split(':', 1)[0]
            netloc = f'{user}:***@{hostinfo}'
        query = re.sub(r'((?:^|&)(?:ssl)?password=)[^&]*', r'\1***', parts.query)
        return urlunsplit(parts._replace(netloc=netloc, query=query))

    # keyword/value conninfo; the value is either single-quoted or a bare token.
    return re.sub(r"(password\s*=\s*)('(?:[^'\\]|\\.)*'|\S+)", r'\1***', dsn)


def _build_plan_item(row: tuple[Any, ...]) -> dict[str, Any]:
    """Turn one joined job/feature-set/model row into a plan entry.

    ``row`` must have the exact column order of the SELECT list in ``main``.
    Unpacking it into names (instead of indexing ``row[13]`` etc.) makes a
    column-count mismatch fail loudly right here.
    """
    (
        extraction_job_id,
        feature_set_id,
        target_scope,
        job_status,
        shard_key,
        job_meta,
        feature_name,
        feature_level,
        extraction_granularity,
        window_sec,
        hop_sec,
        embedding_dim,
        distance_metric,
        model_name,
        model_version,
        model_family,
        _output_embedding_dim,  # selected but not used in the plan
        input_sample_rate,
        _default_window_sec,  # selected but not used in the plan
        _default_hop_sec,  # selected but not used in the plan
        model_meta,
    ) = row

    job_meta = job_meta or {}
    model_meta = model_meta or {}
    # The job's own lane wins over the model's lane; 'unknown' if neither has one.
    lane = job_meta.get('lane') or model_meta.get('lane') or 'unknown'

    physical_target = 'audio_fingerprint' if feature_name == 'fingerprint_asset' else 'audio_embedding'
    # Only these two dimensions map to a vector table; any other stays None.
    vector_table = {
        192: 'audio_embedding_vector_192',
        768: 'audio_embedding_vector_768',
    }.get(embedding_dim)

    return {
        'priority_rank': LANE_PRIORITY.get(lane, 99),
        'lane': lane,
        'extraction_job_id': extraction_job_id,
        'feature_set_id': feature_set_id,
        'target_scope': target_scope,
        'scope': parse_target_scope(target_scope),
        'job_status': job_status,
        'shard_key': shard_key,
        'model_name': model_name,
        'model_version': model_version,
        'model_family': model_family,
        'input_sample_rate': input_sample_rate,
        'feature_name': feature_name,
        'feature_level': feature_level,
        'extraction_granularity': extraction_granularity,
        # float() so the values are JSON-serializable whatever numeric type the driver returns.
        'window_sec': float(window_sec) if window_sec is not None else None,
        'hop_sec': float(hop_sec) if hop_sec is not None else None,
        'embedding_dim': embedding_dim,
        'distance_metric': distance_metric,
        'physical_target': physical_target,
        'vector_table': vector_table,
        'job_metadata': job_meta,
        'model_metadata': model_meta,
        'execution_notes': [
            f"run feature extraction for {model_name} {model_version}",
            f"write to {physical_target}" + (f" + {vector_table}" if vector_table else ''),
            f"target scope: {target_scope}",
        ],
    }


def main() -> None:
    """Build an ordered extraction plan from ``feature_extraction_job`` rows.

    Command-line options:
        --dsn         connection string passed to ``psycopg.connect`` (required)
        --schema      schema placed first on the search_path (default ``acr_test``)
        --job-status  only jobs with this ``job_status`` are planned (default ``pending``)
        --output      path of the JSON report (default ``DEFAULT_OUTPUT``)

    Jobs are ordered by lane priority (``LANE_PRIORITY``, unknown lanes last)
    and then by ``extraction_job_id``. The report is written to ``--output``
    and also printed to stdout.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import list.
    from psycopg import sql

    ap = argparse.ArgumentParser()
    ap.add_argument('--dsn', required=True)
    ap.add_argument('--schema', default='acr_test')
    ap.add_argument('--job-status', default='pending')
    ap.add_argument('--output', default=str(DEFAULT_OUTPUT))
    args = ap.parse_args()

    with psycopg.connect(args.dsn) as conn:
        # Quote the schema as an identifier instead of interpolating raw text,
        # so a crafted --schema value cannot inject SQL. Note: a quoted
        # identifier is case-sensitive.
        conn.execute(
            sql.SQL('SET search_path TO {}, public').format(sql.Identifier(args.schema))
        )
        rows = conn.execute(
            """
            SELECT
                fej.extraction_job_id,
                fej.feature_set_id,
                fej.target_scope,
                fej.job_status,
                fej.shard_key,
                fej.metadata_json,
                fs.feature_name,
                fs.feature_level,
                fs.extraction_granularity,
                fs.window_sec,
                fs.hop_sec,
                fs.embedding_dim,
                fs.distance_metric,
                mr.model_name,
                mr.model_version,
                mr.model_family,
                mr.output_embedding_dim,
                mr.input_sample_rate,
                mr.default_window_sec,
                mr.default_hop_sec,
                mr.metadata_json
            FROM feature_extraction_job fej
            JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
            JOIN model_registry mr ON mr.model_id = fs.model_id
            WHERE fej.job_status = %s
            ORDER BY fej.extraction_job_id;
            """,
            (args.job_status,),
        ).fetchall()

    jobs = sorted(
        (_build_plan_item(row) for row in rows),
        key=lambda item: (item['priority_rank'], item['extraction_job_id']),
    )

    # Grouping the already-sorted list keeps every lane ordered by job id and
    # lists the lanes themselves in priority order.
    by_lane: dict[str, list[dict[str, Any]]] = {}
    for job in jobs:
        by_lane.setdefault(job['lane'], []).append(job)

    payload = {
        'schema': args.schema,
        # Derived from the DSN actually used, never a fixed literal.
        'dsn_redacted': _redact_dsn(args.dsn),
        'job_status_filter': args.job_status,
        'counts': {
            'jobs': len(jobs),
            'lanes': {lane: len(items) for lane, items in sorted(by_lane.items())},
        },
        'ordered_jobs': jobs,
        'by_lane': by_lane,
        'execution_order_summary': [
            {
                'order': order,
                'extraction_job_id': job['extraction_job_id'],
                'lane': job['lane'],
                'model_name': job['model_name'],
                'feature_name': job['feature_name'],
                'window_sec': job['window_sec'],
                'hop_sec': job['hop_sec'],
                'physical_target': job['physical_target'],
            }
            for order, job in enumerate(jobs, start=1)
        ],
    }

    # Serialize once; the same text goes to the file and to stdout.
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Trailing newline so the committed report ends with a newline.
    out.write_text(rendered + '\n', encoding='utf-8')
    print(rendered)


if __name__ == '__main__':
    main()
| 1 | ## 2026-06-04 | 1 | ## 2026-06-04 |
| 2 | 2 | ||
| 3 | - 新增 `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json`,支持从 PostgreSQL 的 `feature_extraction_job` 真实读取 pending jobs,并联表生成按 lane / priority 排序的 Phase-1 execution plan。 | ||
| 3 | - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 | 4 | - 新增 `acr-engine/scripts/bootstrap_phase1_extraction_jobs_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json`,把 Phase-1 的 `feature_extraction_job` 初始化做成可直接连 PostgreSQL 的 live 脚本,并已在 `acr_test` schema 真实创建 5 条 pending jobs。 |
| 4 | - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py` 在 `acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。 | 5 | - 补充 `phase1_registry_bootstrap_idempotency_report.json` 与文档说明,验证 `bootstrap_phase1_model_registry_live.py` 在 `acr_test` schema 上连续执行两次后表计数保持稳定,证明 Phase-1 registry bootstrap 具备可重复执行的幂等性。 |
| 5 | - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 | 6 | - 新增 `acr-engine/scripts/bootstrap_phase1_model_registry_live.py` 与 `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json`,把 Phase-1 的 `chromaprint / mert / muq / ecapa` 与对应 `feature_set_registry / reference_set_registry` 初始化做成可直接连 PostgreSQL 的 live bootstrap 脚本,并已在 `acr_test` schema 验证通过。 | ... | ... |
| ... | @@ -344,3 +344,56 @@ cd /workspace/acr-engine | ... | @@ -344,3 +344,56 @@ cd /workspace/acr-engine |
| 344 | 这意味着: | 344 | 这意味着: |
| 345 | 345 | ||
| 346 | > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。 | 346 | > 现在 PostgreSQL 里已经不只是“模型定义”和“特征定义”,而是连 **下一步该跑哪些抽特征任务** 都已经具备结构化入口了。 |
| 347 | |||
| 348 | --- | ||
| 349 | |||
| 350 | ## 10. Phase-1 extraction plan(从 pending jobs 生成) | ||
| 351 | |||
| 352 | 当 `feature_extraction_job` 已经存在后,下一步通常不是马上手敲命令,而是先从 PostgreSQL 生成一个**统一执行计划**。 | ||
| 353 | |||
| 354 | 本仓库现在已经提供: | ||
| 355 | |||
| 356 | - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` | ||
| 357 | |||
| 358 | 用途: | ||
| 359 | - 读取 `feature_extraction_job` | ||
| 360 | - 过滤 `job_status=pending` | ||
| 361 | - 联表 `feature_set_registry + model_registry` | ||
| 362 | - 生成按 lane / priority 排序的 execution plan | ||
| 363 | |||
| 364 | ### 10.1 执行命令 | ||
| 365 | |||
| 366 | ```bash | ||
| 367 | cd /workspace/acr-engine | ||
| 368 | /usr/local/miniconda3/bin/python scripts/plan_phase1_extraction_jobs_live.py \ | ||
| 369 | --dsn 'postgres://d2:d2pass@127.0.0.1:5432/d2' \ | ||
| 370 | --schema acr_test \ | ||
| 371 | --job-status pending \ | ||
| 372 | --output data/pgvector_eval/music20/phase1_extraction_plan_report.json | ||
| 373 | ``` | ||
| 374 | |||
| 375 | ### 10.2 当前已验证结果(acr_test) | ||
| 376 | |||
| 377 | 本轮已真实生成一份 ordered execution plan: | ||
| 378 | |||
| 379 | | order | lane | model | feature | physical_target | | ||
| 380 | |---|---|---|---|---| | ||
| 381 | | 1 | `exact` | `chromaprint` | `fingerprint_asset` | `audio_fingerprint` | | ||
| 382 | | 2 | `semantic` | `mert` | `semantic_embedding 5s/2.5s` | `audio_embedding` | | ||
| 383 | | 3 | `semantic` | `mert` | `semantic_embedding 10s/5s` | `audio_embedding` | | ||
| 384 | | 4 | `semantic` | `muq` | `semantic_embedding 5s/2.5s` | `audio_embedding` | | ||
| 385 | | 5 | `semantic` | `ecapa` | `semantic_embedding 5s/2.5s` | `audio_embedding` | | ||
| 386 | |||
| 387 | 其中 planner 还会自动给出: | ||
| 388 | - `vector_table` | ||
| 389 | - `audio_embedding_vector_768` | ||
| 390 | - `audio_embedding_vector_192` | ||
| 391 | - `target_scope` | ||
| 392 | - `execution_notes` | ||
| 393 | |||
| 394 | 当前产物: | ||
| 395 | - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | ||
| 396 | |||
| 397 | 结论: | ||
| 398 | |||
| 399 | > 现在 PostgreSQL 里已经不仅能描述“有哪些 job”,还可以直接生成**按执行顺序排好的抽特征计划**。 | ... | ... |
| ... | @@ -68,6 +68,7 @@ | ... | @@ -68,6 +68,7 @@ |
| 68 | | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | | 68 | | registry bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_report.json` | |
| 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | | 69 | | registry bootstrap 幂等性报告 | `acr-engine/data/pgvector_eval/music20/phase1_registry_bootstrap_idempotency_report.json` | |
| 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | | 70 | | extraction job bootstrap 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | |
| 71 | | extraction plan 报告 | `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | | ||
| 71 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | | 72 | | 历史对照报告 | `acr-engine/data/pgvector_eval/music20/songid_eval_report.json` | |
| 72 | 73 | ||
| 73 | --- | 74 | --- |
| ... | @@ -416,6 +417,19 @@ flowchart LR | ... | @@ -416,6 +417,19 @@ flowchart LR |
| 416 | 对应 live 报告: | 417 | 对应 live 报告: |
| 417 | - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` | 418 | - `acr-engine/data/pgvector_eval/music20/phase1_extraction_jobs_report.json` |
| 418 | 419 | ||
| 420 | ### 本轮继续新增:pending jobs 已可生成 live execution plan | ||
| 421 | |||
| 422 | 在 extraction jobs 之后,本轮又新增: | ||
| 423 | |||
| 424 | - `acr-engine/scripts/plan_phase1_extraction_jobs_live.py` | ||
| 425 | |||
| 426 | 它已经在 `acr_test` schema 上真实读取 5 条 `pending` jobs,并生成按执行顺序排列的 plan: | ||
| 427 | - `chromaprint exact lane` 优先 | ||
| 428 | - 然后是 `mert / muq / ecapa` 的 semantic lanes | ||
| 429 | |||
| 430 | 对应 live 报告: | ||
| 431 | - `acr-engine/data/pgvector_eval/music20/phase1_extraction_plan_report.json` | ||
| 432 | |||
| 419 | ### 路线 1:继续做 PostgreSQL 工程化 | 433 | ### 路线 1:继续做 PostgreSQL 工程化 |
| 420 | 434 | ||
| 421 | 1. 把 `live_pgvector_music20_eval.py` 泛化成: | 435 | 1. 把 `live_pgvector_music20_eval.py` 泛化成: | ... | ... |
| ... | @@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql | ... | @@ -184,6 +184,7 @@ sed -n '1,320p' acr-engine/sql/acr_pg_schema_v2.sql |
| 184 | - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` | 184 | - PostgreSQL `acr_test` schema 上已真实写入 Phase-1 registry bootstrap:`chromaprint / mert / muq / ecapa` + 5 组 feature set + `phase1_hot_reference_v1` |
| 185 | - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变 | 185 | - Phase-1 registry bootstrap 已有幂等性证据:同 schema 连续执行两次后,`model_registry=5 / feature_set_registry=6 / reference_set_registry=2` 保持不变 |
| 186 | - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 | 186 | - PostgreSQL `acr_test` schema 上已真实创建 5 条 `feature_extraction_job`,后续 MERT / MuQ 接入可直接从 pending jobs 启动 |
| 187 | - PostgreSQL `acr_test` schema 上已真实生成 Phase-1 extraction execution plan,当前顺序是 `chromaprint -> mert -> mert-long -> muq -> ecapa` | ||
| 187 | 188 | ||
| 188 | ### 未验证 / 仍是缺口 | 189 | ### 未验证 / 仍是缺口 |
| 189 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | 190 | - **未实际跑 MERT / MuQ encoder-only 特征抽取** | ... | ... |
-
Please register or sign in to post a comment