Turn pgvector planning into repo-native ingestion templates
Constraint: The user needs concrete downstream data handling guidance now, and future vector retrieval work should not start from abstract docs alone
Rejected: Leave pgvector support at prose-only guidance | Delays integration by forcing later sessions to reinvent schema and export bridges
Confidence: high
Scope-risk: narrow
Directive: Keep schema/export templates aligned with actual manifest semantics before adding live database loaders
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/scripts/export_manifest_to_pgvector_json.py; /usr/local/miniconda3/bin/python acr-engine/scripts/export_manifest_to_pgvector_json.py --data acr-engine/data/synthetic_v2 --split test --source-dataset synthetic_v2 --output acr-engine/reports/pgvector_manifest_export_test.json
Not-tested: Live PostgreSQL/pgvector ingestion remains pending a real database target
Showing
5 changed files
with
756 additions
and
0 deletions
| 1 | { | ||
| 2 | "songs": [ | ||
| 3 | { | ||
| 4 | "song_id": "song_0000", | ||
| 5 | "title": "song_0000", | ||
| 6 | "artist": null, | ||
| 7 | "version_id": null, | ||
| 8 | "source_dataset": "synthetic_v2", | ||
| 9 | "license": null | ||
| 10 | }, | ||
| 11 | { | ||
| 12 | "song_id": "song_0001", | ||
| 13 | "title": "song_0001", | ||
| 14 | "artist": null, | ||
| 15 | "version_id": null, | ||
| 16 | "source_dataset": "synthetic_v2", | ||
| 17 | "license": null | ||
| 18 | }, | ||
| 19 | { | ||
| 20 | "song_id": "song_0002", | ||
| 21 | "title": "song_0002", | ||
| 22 | "artist": null, | ||
| 23 | "version_id": null, | ||
| 24 | "source_dataset": "synthetic_v2", | ||
| 25 | "license": null | ||
| 26 | }, | ||
| 27 | { | ||
| 28 | "song_id": "song_0003", | ||
| 29 | "title": "song_0003", | ||
| 30 | "artist": null, | ||
| 31 | "version_id": null, | ||
| 32 | "source_dataset": "synthetic_v2", | ||
| 33 | "license": null | ||
| 34 | }, | ||
| 35 | { | ||
| 36 | "song_id": "song_0004", | ||
| 37 | "title": "song_0004", | ||
| 38 | "artist": null, | ||
| 39 | "version_id": null, | ||
| 40 | "source_dataset": "synthetic_v2", | ||
| 41 | "license": null | ||
| 42 | }, | ||
| 43 | { | ||
| 44 | "song_id": "song_0005", | ||
| 45 | "title": "song_0005", | ||
| 46 | "artist": null, | ||
| 47 | "version_id": null, | ||
| 48 | "source_dataset": "synthetic_v2", | ||
| 49 | "license": null | ||
| 50 | }, | ||
| 51 | { | ||
| 52 | "song_id": "song_0006", | ||
| 53 | "title": "song_0006", | ||
| 54 | "artist": null, | ||
| 55 | "version_id": null, | ||
| 56 | "source_dataset": "synthetic_v2", | ||
| 57 | "license": null | ||
| 58 | }, | ||
| 59 | { | ||
| 60 | "song_id": "song_0007", | ||
| 61 | "title": "song_0007", | ||
| 62 | "artist": null, | ||
| 63 | "version_id": null, | ||
| 64 | "source_dataset": "synthetic_v2", | ||
| 65 | "license": null | ||
| 66 | }, | ||
| 67 | { | ||
| 68 | "song_id": "song_0008", | ||
| 69 | "title": "song_0008", | ||
| 70 | "artist": null, | ||
| 71 | "version_id": null, | ||
| 72 | "source_dataset": "synthetic_v2", | ||
| 73 | "license": null | ||
| 74 | }, | ||
| 75 | { | ||
| 76 | "song_id": "song_0009", | ||
| 77 | "title": "song_0009", | ||
| 78 | "artist": null, | ||
| 79 | "version_id": null, | ||
| 80 | "source_dataset": "synthetic_v2", | ||
| 81 | "license": null | ||
| 82 | }, | ||
| 83 | { | ||
| 84 | "song_id": "song_0010", | ||
| 85 | "title": "song_0010", | ||
| 86 | "artist": null, | ||
| 87 | "version_id": null, | ||
| 88 | "source_dataset": "synthetic_v2", | ||
| 89 | "license": null | ||
| 90 | }, | ||
| 91 | { | ||
| 92 | "song_id": "song_0011", | ||
| 93 | "title": "song_0011", | ||
| 94 | "artist": null, | ||
| 95 | "version_id": null, | ||
| 96 | "source_dataset": "synthetic_v2", | ||
| 97 | "license": null | ||
| 98 | }, | ||
| 99 | { | ||
| 100 | "song_id": "song_0012", | ||
| 101 | "title": "song_0012", | ||
| 102 | "artist": null, | ||
| 103 | "version_id": null, | ||
| 104 | "source_dataset": "synthetic_v2", | ||
| 105 | "license": null | ||
| 106 | }, | ||
| 107 | { | ||
| 108 | "song_id": "song_0013", | ||
| 109 | "title": "song_0013", | ||
| 110 | "artist": null, | ||
| 111 | "version_id": null, | ||
| 112 | "source_dataset": "synthetic_v2", | ||
| 113 | "license": null | ||
| 114 | }, | ||
| 115 | { | ||
| 116 | "song_id": "song_0014", | ||
| 117 | "title": "song_0014", | ||
| 118 | "artist": null, | ||
| 119 | "version_id": null, | ||
| 120 | "source_dataset": "synthetic_v2", | ||
| 121 | "license": null | ||
| 122 | }, | ||
| 123 | { | ||
| 124 | "song_id": "song_0015", | ||
| 125 | "title": "song_0015", | ||
| 126 | "artist": null, | ||
| 127 | "version_id": null, | ||
| 128 | "source_dataset": "synthetic_v2", | ||
| 129 | "license": null | ||
| 130 | }, | ||
| 131 | { | ||
| 132 | "song_id": "song_0016", | ||
| 133 | "title": "song_0016", | ||
| 134 | "artist": null, | ||
| 135 | "version_id": null, | ||
| 136 | "source_dataset": "synthetic_v2", | ||
| 137 | "license": null | ||
| 138 | }, | ||
| 139 | { | ||
| 140 | "song_id": "song_0017", | ||
| 141 | "title": "song_0017", | ||
| 142 | "artist": null, | ||
| 143 | "version_id": null, | ||
| 144 | "source_dataset": "synthetic_v2", | ||
| 145 | "license": null | ||
| 146 | }, | ||
| 147 | { | ||
| 148 | "song_id": "song_0018", | ||
| 149 | "title": "song_0018", | ||
| 150 | "artist": null, | ||
| 151 | "version_id": null, | ||
| 152 | "source_dataset": "synthetic_v2", | ||
| 153 | "license": null | ||
| 154 | }, | ||
| 155 | { | ||
| 156 | "song_id": "song_0019", | ||
| 157 | "title": "song_0019", | ||
| 158 | "artist": null, | ||
| 159 | "version_id": null, | ||
| 160 | "source_dataset": "synthetic_v2", | ||
| 161 | "license": null | ||
| 162 | }, | ||
| 163 | { | ||
| 164 | "song_id": "song_0020", | ||
| 165 | "title": "song_0020", | ||
| 166 | "artist": null, | ||
| 167 | "version_id": null, | ||
| 168 | "source_dataset": "synthetic_v2", | ||
| 169 | "license": null | ||
| 170 | }, | ||
| 171 | { | ||
| 172 | "song_id": "song_0021", | ||
| 173 | "title": "song_0021", | ||
| 174 | "artist": null, | ||
| 175 | "version_id": null, | ||
| 176 | "source_dataset": "synthetic_v2", | ||
| 177 | "license": null | ||
| 178 | }, | ||
| 179 | { | ||
| 180 | "song_id": "song_0022", | ||
| 181 | "title": "song_0022", | ||
| 182 | "artist": null, | ||
| 183 | "version_id": null, | ||
| 184 | "source_dataset": "synthetic_v2", | ||
| 185 | "license": null | ||
| 186 | }, | ||
| 187 | { | ||
| 188 | "song_id": "song_0023", | ||
| 189 | "title": "song_0023", | ||
| 190 | "artist": null, | ||
| 191 | "version_id": null, | ||
| 192 | "source_dataset": "synthetic_v2", | ||
| 193 | "license": null | ||
| 194 | } | ||
| 195 | ], | ||
| 196 | "references": [ | ||
| 197 | { | ||
| 198 | "song_id": "song_0000", | ||
| 199 | "audio_uri": "songs/song_0000.wav", | ||
| 200 | "duration_sec": 15.0, | ||
| 201 | "sample_rate": 16000 | ||
| 202 | }, | ||
| 203 | { | ||
| 204 | "song_id": "song_0001", | ||
| 205 | "audio_uri": "songs/song_0001.wav", | ||
| 206 | "duration_sec": 15.0, | ||
| 207 | "sample_rate": 16000 | ||
| 208 | }, | ||
| 209 | { | ||
| 210 | "song_id": "song_0002", | ||
| 211 | "audio_uri": "songs/song_0002.wav", | ||
| 212 | "duration_sec": 15.0, | ||
| 213 | "sample_rate": 16000 | ||
| 214 | }, | ||
| 215 | { | ||
| 216 | "song_id": "song_0003", | ||
| 217 | "audio_uri": "songs/song_0003.wav", | ||
| 218 | "duration_sec": 15.0, | ||
| 219 | "sample_rate": 16000 | ||
| 220 | }, | ||
| 221 | { | ||
| 222 | "song_id": "song_0004", | ||
| 223 | "audio_uri": "songs/song_0004.wav", | ||
| 224 | "duration_sec": 15.0, | ||
| 225 | "sample_rate": 16000 | ||
| 226 | }, | ||
| 227 | { | ||
| 228 | "song_id": "song_0005", | ||
| 229 | "audio_uri": "songs/song_0005.wav", | ||
| 230 | "duration_sec": 15.0, | ||
| 231 | "sample_rate": 16000 | ||
| 232 | }, | ||
| 233 | { | ||
| 234 | "song_id": "song_0006", | ||
| 235 | "audio_uri": "songs/song_0006.wav", | ||
| 236 | "duration_sec": 15.0, | ||
| 237 | "sample_rate": 16000 | ||
| 238 | }, | ||
| 239 | { | ||
| 240 | "song_id": "song_0007", | ||
| 241 | "audio_uri": "songs/song_0007.wav", | ||
| 242 | "duration_sec": 15.0, | ||
| 243 | "sample_rate": 16000 | ||
| 244 | }, | ||
| 245 | { | ||
| 246 | "song_id": "song_0008", | ||
| 247 | "audio_uri": "songs/song_0008.wav", | ||
| 248 | "duration_sec": 15.0, | ||
| 249 | "sample_rate": 16000 | ||
| 250 | }, | ||
| 251 | { | ||
| 252 | "song_id": "song_0009", | ||
| 253 | "audio_uri": "songs/song_0009.wav", | ||
| 254 | "duration_sec": 15.0, | ||
| 255 | "sample_rate": 16000 | ||
| 256 | }, | ||
| 257 | { | ||
| 258 | "song_id": "song_0010", | ||
| 259 | "audio_uri": "songs/song_0010.wav", | ||
| 260 | "duration_sec": 15.0, | ||
| 261 | "sample_rate": 16000 | ||
| 262 | }, | ||
| 263 | { | ||
| 264 | "song_id": "song_0011", | ||
| 265 | "audio_uri": "songs/song_0011.wav", | ||
| 266 | "duration_sec": 15.0, | ||
| 267 | "sample_rate": 16000 | ||
| 268 | }, | ||
| 269 | { | ||
| 270 | "song_id": "song_0012", | ||
| 271 | "audio_uri": "songs/song_0012.wav", | ||
| 272 | "duration_sec": 15.0, | ||
| 273 | "sample_rate": 16000 | ||
| 274 | }, | ||
| 275 | { | ||
| 276 | "song_id": "song_0013", | ||
| 277 | "audio_uri": "songs/song_0013.wav", | ||
| 278 | "duration_sec": 15.0, | ||
| 279 | "sample_rate": 16000 | ||
| 280 | }, | ||
| 281 | { | ||
| 282 | "song_id": "song_0014", | ||
| 283 | "audio_uri": "songs/song_0014.wav", | ||
| 284 | "duration_sec": 15.0, | ||
| 285 | "sample_rate": 16000 | ||
| 286 | }, | ||
| 287 | { | ||
| 288 | "song_id": "song_0015", | ||
| 289 | "audio_uri": "songs/song_0015.wav", | ||
| 290 | "duration_sec": 15.0, | ||
| 291 | "sample_rate": 16000 | ||
| 292 | }, | ||
| 293 | { | ||
| 294 | "song_id": "song_0016", | ||
| 295 | "audio_uri": "songs/song_0016.wav", | ||
| 296 | "duration_sec": 15.0, | ||
| 297 | "sample_rate": 16000 | ||
| 298 | }, | ||
| 299 | { | ||
| 300 | "song_id": "song_0017", | ||
| 301 | "audio_uri": "songs/song_0017.wav", | ||
| 302 | "duration_sec": 15.0, | ||
| 303 | "sample_rate": 16000 | ||
| 304 | }, | ||
| 305 | { | ||
| 306 | "song_id": "song_0018", | ||
| 307 | "audio_uri": "songs/song_0018.wav", | ||
| 308 | "duration_sec": 15.0, | ||
| 309 | "sample_rate": 16000 | ||
| 310 | }, | ||
| 311 | { | ||
| 312 | "song_id": "song_0019", | ||
| 313 | "audio_uri": "songs/song_0019.wav", | ||
| 314 | "duration_sec": 15.0, | ||
| 315 | "sample_rate": 16000 | ||
| 316 | }, | ||
| 317 | { | ||
| 318 | "song_id": "song_0020", | ||
| 319 | "audio_uri": "songs/song_0020.wav", | ||
| 320 | "duration_sec": 15.0, | ||
| 321 | "sample_rate": 16000 | ||
| 322 | }, | ||
| 323 | { | ||
| 324 | "song_id": "song_0021", | ||
| 325 | "audio_uri": "songs/song_0021.wav", | ||
| 326 | "duration_sec": 15.0, | ||
| 327 | "sample_rate": 16000 | ||
| 328 | }, | ||
| 329 | { | ||
| 330 | "song_id": "song_0022", | ||
| 331 | "audio_uri": "songs/song_0022.wav", | ||
| 332 | "duration_sec": 15.0, | ||
| 333 | "sample_rate": 16000 | ||
| 334 | }, | ||
| 335 | { | ||
| 336 | "song_id": "song_0023", | ||
| 337 | "audio_uri": "songs/song_0023.wav", | ||
| 338 | "duration_sec": 15.0, | ||
| 339 | "sample_rate": 16000 | ||
| 340 | } | ||
| 341 | ], | ||
| 342 | "segments": [ | ||
| 343 | { | ||
| 344 | "song_id": "song_0020", | ||
| 345 | "audio_uri": "segments/song_0020_seg_00.wav", | ||
| 346 | "offset_sec": 4.349828784349853, | ||
| 347 | "duration_sec": 5.0, | ||
| 348 | "split": "test", | ||
| 349 | "type": "clean", | ||
| 350 | "segment_type": "mid", | ||
| 351 | "source_dataset": "synthetic_v2" | ||
| 352 | }, | ||
| 353 | { | ||
| 354 | "song_id": "song_0020", | ||
| 355 | "audio_uri": "segments/song_0020_seg_01.wav", | ||
| 356 | "offset_sec": 9.642182747327407, | ||
| 357 | "duration_sec": 5.0, | ||
| 358 | "split": "test", | ||
| 359 | "type": "clean", | ||
| 360 | "segment_type": "mid", | ||
| 361 | "source_dataset": "synthetic_v2" | ||
| 362 | }, | ||
| 363 | { | ||
| 364 | "song_id": "song_0020", | ||
| 365 | "audio_uri": "segments/song_0020_seg_02_augmented.wav", | ||
| 366 | "offset_sec": 2.367717347418965, | ||
| 367 | "duration_sec": 5.0, | ||
| 368 | "split": "test", | ||
| 369 | "type": "augmented", | ||
| 370 | "segment_type": "intro", | ||
| 371 | "source_dataset": "synthetic_v2" | ||
| 372 | }, | ||
| 373 | { | ||
| 374 | "song_id": "song_0020", | ||
| 375 | "audio_uri": "segments/song_0020_seg_03_humming_like.wav", | ||
| 376 | "offset_sec": 3.180577192661006, | ||
| 377 | "duration_sec": 5.0, | ||
| 378 | "split": "test", | ||
| 379 | "type": "humming_like", | ||
| 380 | "segment_type": "mid", | ||
| 381 | "source_dataset": "synthetic_v2" | ||
| 382 | }, | ||
| 383 | { | ||
| 384 | "song_id": "song_0020", | ||
| 385 | "audio_uri": "segments/song_0020_seg_04_confused.wav", | ||
| 386 | "offset_sec": 4.660551124366617, | ||
| 387 | "duration_sec": 5.0, | ||
| 388 | "split": "test", | ||
| 389 | "type": "confused", | ||
| 390 | "segment_type": "mid", | ||
| 391 | "source_dataset": "synthetic_v2" | ||
| 392 | }, | ||
| 393 | { | ||
| 394 | "song_id": "song_0021", | ||
| 395 | "audio_uri": "segments/song_0021_seg_00.wav", | ||
| 396 | "offset_sec": 5.631088908640184, | ||
| 397 | "duration_sec": 5.0, | ||
| 398 | "split": "test", | ||
| 399 | "type": "clean", | ||
| 400 | "segment_type": "mid", | ||
| 401 | "source_dataset": "synthetic_v2" | ||
| 402 | }, | ||
| 403 | { | ||
| 404 | "song_id": "song_0021", | ||
| 405 | "audio_uri": "segments/song_0021_seg_01.wav", | ||
| 406 | "offset_sec": 1.8823366490525628, | ||
| 407 | "duration_sec": 5.0, | ||
| 408 | "split": "test", | ||
| 409 | "type": "clean", | ||
| 410 | "segment_type": "intro", | ||
| 411 | "source_dataset": "synthetic_v2" | ||
| 412 | }, | ||
| 413 | { | ||
| 414 | "song_id": "song_0021", | ||
| 415 | "audio_uri": "segments/song_0021_seg_02_augmented.wav", | ||
| 416 | "offset_sec": 9.88006210404643, | ||
| 417 | "duration_sec": 5.0, | ||
| 418 | "split": "test", | ||
| 419 | "type": "augmented", | ||
| 420 | "segment_type": "mid", | ||
| 421 | "source_dataset": "synthetic_v2" | ||
| 422 | }, | ||
| 423 | { | ||
| 424 | "song_id": "song_0021", | ||
| 425 | "audio_uri": "segments/song_0021_seg_03_humming_like.wav", | ||
| 426 | "offset_sec": 0.9025737685090285, | ||
| 427 | "duration_sec": 5.0, | ||
| 428 | "split": "test", | ||
| 429 | "type": "humming_like", | ||
| 430 | "segment_type": "intro", | ||
| 431 | "source_dataset": "synthetic_v2" | ||
| 432 | }, | ||
| 433 | { | ||
| 434 | "song_id": "song_0021", | ||
| 435 | "audio_uri": "segments/song_0021_seg_04_confused.wav", | ||
| 436 | "offset_sec": 1.3048954561918258, | ||
| 437 | "duration_sec": 5.0, | ||
| 438 | "split": "test", | ||
| 439 | "type": "confused", | ||
| 440 | "segment_type": "intro", | ||
| 441 | "source_dataset": "synthetic_v2" | ||
| 442 | }, | ||
| 443 | { | ||
| 444 | "song_id": "song_0022", | ||
| 445 | "audio_uri": "segments/song_0022_seg_00.wav", | ||
| 446 | "offset_sec": 3.9746734850812295, | ||
| 447 | "duration_sec": 5.0, | ||
| 448 | "split": "test", | ||
| 449 | "type": "clean", | ||
| 450 | "segment_type": "mid", | ||
| 451 | "source_dataset": "synthetic_v2" | ||
| 452 | }, | ||
| 453 | { | ||
| 454 | "song_id": "song_0022", | ||
| 455 | "audio_uri": "segments/song_0022_seg_01.wav", | ||
| 456 | "offset_sec": 4.890968121206573, | ||
| 457 | "duration_sec": 5.0, | ||
| 458 | "split": "test", | ||
| 459 | "type": "clean", | ||
| 460 | "segment_type": "mid", | ||
| 461 | "source_dataset": "synthetic_v2" | ||
| 462 | }, | ||
| 463 | { | ||
| 464 | "song_id": "song_0022", | ||
| 465 | "audio_uri": "segments/song_0022_seg_02_augmented.wav", | ||
| 466 | "offset_sec": 6.610400547460049, | ||
| 467 | "duration_sec": 5.0, | ||
| 468 | "split": "test", | ||
| 469 | "type": "augmented", | ||
| 470 | "segment_type": "mid", | ||
| 471 | "source_dataset": "synthetic_v2" | ||
| 472 | }, | ||
| 473 | { | ||
| 474 | "song_id": "song_0022", | ||
| 475 | "audio_uri": "segments/song_0022_seg_03_humming_like.wav", | ||
| 476 | "offset_sec": 2.6329596668288424, | ||
| 477 | "duration_sec": 5.0, | ||
| 478 | "split": "test", | ||
| 479 | "type": "humming_like", | ||
| 480 | "segment_type": "intro", | ||
| 481 | "source_dataset": "synthetic_v2" | ||
| 482 | }, | ||
| 483 | { | ||
| 484 | "song_id": "song_0022", | ||
| 485 | "audio_uri": "segments/song_0022_seg_04_confused.wav", | ||
| 486 | "offset_sec": 0.8570731183991709, | ||
| 487 | "duration_sec": 5.0, | ||
| 488 | "split": "test", | ||
| 489 | "type": "confused", | ||
| 490 | "segment_type": "intro", | ||
| 491 | "source_dataset": "synthetic_v2" | ||
| 492 | }, | ||
| 493 | { | ||
| 494 | "song_id": "song_0023", | ||
| 495 | "audio_uri": "segments/song_0023_seg_00.wav", | ||
| 496 | "offset_sec": 4.461034326075292, | ||
| 497 | "duration_sec": 5.0, | ||
| 498 | "split": "test", | ||
| 499 | "type": "clean", | ||
| 500 | "segment_type": "mid", | ||
| 501 | "source_dataset": "synthetic_v2" | ||
| 502 | }, | ||
| 503 | { | ||
| 504 | "song_id": "song_0023", | ||
| 505 | "audio_uri": "segments/song_0023_seg_01.wav", | ||
| 506 | "offset_sec": 9.605203782802876, | ||
| 507 | "duration_sec": 5.0, | ||
| 508 | "split": "test", | ||
| 509 | "type": "clean", | ||
| 510 | "segment_type": "mid", | ||
| 511 | "source_dataset": "synthetic_v2" | ||
| 512 | }, | ||
| 513 | { | ||
| 514 | "song_id": "song_0023", | ||
| 515 | "audio_uri": "segments/song_0023_seg_02_augmented.wav", | ||
| 516 | "offset_sec": 4.7458228906154805, | ||
| 517 | "duration_sec": 5.0, | ||
| 518 | "split": "test", | ||
| 519 | "type": "augmented", | ||
| 520 | "segment_type": "mid", | ||
| 521 | "source_dataset": "synthetic_v2" | ||
| 522 | }, | ||
| 523 | { | ||
| 524 | "song_id": "song_0023", | ||
| 525 | "audio_uri": "segments/song_0023_seg_03_humming_like.wav", | ||
| 526 | "offset_sec": 8.308702013555955, | ||
| 527 | "duration_sec": 5.0, | ||
| 528 | "split": "test", | ||
| 529 | "type": "humming_like", | ||
| 530 | "segment_type": "mid", | ||
| 531 | "source_dataset": "synthetic_v2" | ||
| 532 | }, | ||
| 533 | { | ||
| 534 | "song_id": "song_0023", | ||
| 535 | "audio_uri": "segments/song_0023_seg_04_confused.wav", | ||
| 536 | "offset_sec": 2.213510770155481, | ||
| 537 | "duration_sec": 5.0, | ||
| 538 | "split": "test", | ||
| 539 | "type": "confused", | ||
| 540 | "segment_type": "intro", | ||
| 541 | "source_dataset": "synthetic_v2" | ||
| 542 | } | ||
| 543 | ] | ||
| 544 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | #!/usr/bin/env python3 | ||
| 2 | """Export project manifests into a pgvector-friendly JSON payload. | ||
| 3 | |||
| 4 | This does not require PostgreSQL at runtime. It prepares normalized rows so a | ||
| 5 | future loader can bulk ingest them into Postgres/pgvector safely. | ||
| 6 | """ | ||
| 7 | |||
| 8 | from __future__ import annotations | ||
| 9 | |||
| 10 | import argparse | ||
| 11 | import json | ||
| 12 | from pathlib import Path | ||
| 13 | |||
| 14 | |||
def load_json(path: Path):
    """Read the file at ``path`` and return its parsed JSON content.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default locale encoding, so manifests that contain non-ASCII
    text parse the same way on every machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
| 17 | |||
| 18 | |||
def _song_row(row, default_dataset):
    """Build the placeholder ``songs`` row for one manifest entry.

    The manifest carries no title/artist/license metadata, so the song id
    doubles as the title and the remaining descriptive fields are left null.
    """
    song_id = row["song_id"]
    return {
        "song_id": song_id,
        "title": song_id,
        "artist": None,
        "version_id": None,
        "source_dataset": row.get("source_dataset", default_dataset),
        "license": None,
    }


def main():
    """Convert ``catalog.json`` plus one split manifest into a single JSON payload.

    Command-line arguments:
        --data            Directory containing ``catalog.json`` and ``<split>.json``.
        --output          Path of the JSON file to write (parent dirs are created).
        --split           Split manifest to export; defaults to ``train``.
        --source-dataset  Fallback dataset label for rows that carry no
                          ``source_dataset`` key; defaults to ``unknown``.

    The payload has three lists:
        songs       one row per distinct ``song_id`` seen in the catalog or split
        references  catalog rows whose ``type`` is ``"reference"``
        segments    split rows whose ``type`` is not ``"reference"``

    Note that ``songs`` and ``references`` are derived from the whole catalog,
    while ``segments`` only covers the requested split.

    A short JSON summary (status, resolved output path, row counts) is printed
    to stdout once the file has been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="manifest directory")
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--source-dataset", default="unknown")
    args = parser.parse_args()

    data_dir = Path(args.data)
    catalog = load_json(data_dir / "catalog.json")
    split_rows = load_json(data_dir / f"{args.split}.json")

    songs = {}  # song_id -> songs row; first occurrence wins
    references = []
    segments = []

    for row in catalog:
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_row(row, args.source_dataset))
        if row.get("type") == "reference":
            references.append({
                "song_id": song_id,
                "audio_uri": row["audio_path"],
                "duration_sec": row["duration"],
                # Use the manifest's own sample rate when it records one;
                # 16000 stays as the fallback for manifests that do not.
                "sample_rate": row.get("sample_rate", 16000),
            })

    for row in split_rows:
        # Reference rows are exported from the catalog above, not from splits.
        if row.get("type") == "reference":
            continue
        song_id = row["song_id"]
        songs.setdefault(song_id, _song_row(row, args.source_dataset))
        segments.append({
            "song_id": song_id,
            "audio_uri": row["audio_path"],
            "offset_sec": row.get("offset", 0.0),
            "duration_sec": row["duration"],
            "split": args.split,
            "type": row.get("type", "unknown"),
            "segment_type": row.get("segment_type"),
            "source_dataset": row.get("source_dataset", args.source_dataset),
        })

    payload = {
        "songs": list(songs.values()),
        "references": references,
        "segments": segments,
    }

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly; the locale default may be unable to encode them.
    out.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({
        "status": "ok",
        "output": str(out.resolve()),
        "songs": len(payload["songs"]),
        "references": len(payload["references"]),
        "segments": len(payload["segments"]),
    }, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
acr-engine/sql/pgvector_schema.sql
0 → 100644
-- pgvector schema template: song catalog, reference audio, query segments,
-- and one embedding table for each of the two audio tables.
-- Every statement uses IF NOT EXISTS, so the script can be re-run safely.

CREATE EXTENSION IF NOT EXISTS vector;

-- One row per song; all descriptive columns are optional.
CREATE TABLE IF NOT EXISTS songs (
    song_id TEXT PRIMARY KEY,
    title TEXT,
    artist TEXT,
    version_id TEXT,
    source_dataset TEXT,
    license TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- REFERENCES is a reserved key word in PostgreSQL, so an unquoted table named
-- `references` is a syntax error. The identifier is double-quoted here to keep
-- the intended table name; every query against this table must quote it too.
CREATE TABLE IF NOT EXISTS "references" (
    reference_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    duration_sec DOUBLE PRECISION NOT NULL,
    sample_rate INTEGER DEFAULT 16000,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Audio excerpts tied to a song, with their offset, split and type labels.
CREATE TABLE IF NOT EXISTS segments (
    segment_id BIGSERIAL PRIMARY KEY,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    audio_uri TEXT NOT NULL,
    offset_sec DOUBLE PRECISION DEFAULT 0,
    duration_sec DOUBLE PRECISION NOT NULL,
    split TEXT,
    type TEXT NOT NULL,
    segment_type TEXT,
    source_dataset TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings of reference audio. The vector dimension is fixed at 192;
-- NOTE(review): confirm this matches the embedding model's output size.
CREATE TABLE IF NOT EXISTS reference_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    reference_id BIGINT NOT NULL REFERENCES "references"(reference_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Embeddings of segments, same layout as reference_embeddings.
CREATE TABLE IF NOT EXISTS query_embeddings (
    embedding_id BIGSERIAL PRIMARY KEY,
    segment_id BIGINT NOT NULL REFERENCES segments(segment_id) ON DELETE CASCADE,
    song_id TEXT NOT NULL REFERENCES songs(song_id) ON DELETE CASCADE,
    embedding vector(192) NOT NULL,
    model_version TEXT NOT NULL,
    data_version TEXT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- B-tree indexes for lookups and joins by song.
CREATE INDEX IF NOT EXISTS idx_segments_song_id ON segments(song_id);
CREATE INDEX IF NOT EXISTS idx_references_song_id ON "references"(song_id);
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_song_id ON reference_embeddings(song_id);
CREATE INDEX IF NOT EXISTS idx_query_embeddings_song_id ON query_embeddings(song_id);

-- Approximate nearest-neighbour indexes for cosine distance.
-- NOTE(review): IVFFlat derives its lists from the rows present when the index
-- is built; these are created on empty tables, so REINDEX (or drop and
-- recreate) after the first bulk load, and retune `lists` to the row count.
CREATE INDEX IF NOT EXISTS idx_reference_embeddings_vector_cosine
    ON reference_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);

CREATE INDEX IF NOT EXISTS idx_query_embeddings_vector_cosine
    ON query_embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
| ... | @@ -234,6 +234,26 @@ | ... | @@ -234,6 +234,26 @@ |
| 234 | 234 | ||
| 235 | 235 | ||
| 236 | 236 | ||
| 237 | |||
| 238 | ### Stage: pgvector 落库模板 | ||
| 239 | |||
| 240 | 完成项: | ||
| 241 | - 新增 [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql) | ||
| 242 | - 新增 [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py) | ||
| 243 | - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 中补充可执行模板说明 | ||
| 244 | |||
| 245 | 验证结果(以下命令均在 `acr-engine/` 目录下执行): | ||
| 246 | - `/usr/local/miniconda3/bin/python -m py_compile scripts/export_manifest_to_pgvector_json.py` 成功 | ||
| 247 | - `/usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py --data data/synthetic_v2 --split test --source-dataset synthetic_v2 --output reports/pgvector_manifest_export_test.json` 成功 | ||
| 248 | - 当前导出结果: | ||
| 249 | - `songs=24` | ||
| 250 | - `references=24` | ||
| 251 | - `segments=20` | ||
| 252 | |||
| 253 | 结论: | ||
| 254 | - pgvector 方向现在不仅有概念文档,还有可直接复用的 schema 和 manifest 导出桥接脚本 | ||
| 255 | - 后续接 PostgreSQL 时返工成本会显著降低 | ||
| 256 | |||
| 237 | ### Stage: FMA 下载自动守护 | 257 | ### Stage: FMA 下载自动守护 |
| 238 | 258 | ||
| 239 | 完成项: | 259 | 完成项: | ... | ... |
| ... | @@ -508,6 +508,37 @@ val.json | ... | @@ -508,6 +508,37 @@ val.json |
| 508 | - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) | 508 | - [dataset-sources-and-licensing.md](./dataset-sources-and-licensing.md) |
| 509 | - [session-handoff.md](./session-handoff.md) | 509 | - [session-handoff.md](./session-handoff.md) |
| 510 | 510 | ||
| 511 | |||
| 512 | ## 12. 可直接落地的 pgvector 模板 | ||
| 513 | |||
| 514 | 仓库里现在已经补了两个可直接参考的模板: | ||
| 515 | |||
| 516 | - SQL schema: [acr-engine/sql/pgvector_schema.sql](../acr-engine/sql/pgvector_schema.sql) | ||
| 517 | - manifest 导出桥接脚本: [acr-engine/scripts/export_manifest_to_pgvector_json.py](../acr-engine/scripts/export_manifest_to_pgvector_json.py) | ||
| 518 | |||
| 519 | ### 导出示例 | ||
| 520 | |||
| 521 | ```bash | ||
| 522 | cd acr-engine | ||
| 523 | /usr/local/miniconda3/bin/python scripts/export_manifest_to_pgvector_json.py \ | ||
| 524 | --data data/synthetic_v2 \ | ||
| 525 | --split test \ | ||
| 526 | --source-dataset synthetic_v2 \ | ||
| 527 | --output reports/pgvector_manifest_export_test.json | ||
| 528 | ``` | ||
| 529 | |||
| 530 | ### 当前已验证结果 | ||
| 531 | |||
| 532 | - `songs=24` | ||
| 533 | - `references=24` | ||
| 534 | - `segments=20` | ||
| 535 | |||
| 536 | 这一步还不会直接写 PostgreSQL,作用是: | ||
| 537 | |||
| 538 | 1. 先把项目现有 manifest 规范转换成 pgvector-friendly 结构化 JSON | ||
| 539 | 2. 后续你们可以再用 bulk insert / COPY / ETL 把这些行落到 PostgreSQL | ||
| 540 | 3. embedding 生成后再写入 `vector(192)` 列 | ||
| 541 | |||
| 511 | ## Sources | 542 | ## Sources |
| 512 | 543 | ||
| 513 | - Current code behavior from: | 544 | - Current code behavior from: | ... | ... |
-
Please register or sign in to post a comment