add src
Showing
188 changed files
with
1994 additions
and
43 deletions
No preview for this file type
| ... | @@ -38,8 +38,9 @@ engine: | ... | @@ -38,8 +38,9 @@ engine: |
| 38 | n_fft: 1024 | 38 | n_fft: 1024 |
| 39 | hop_length: 256 | 39 | hop_length: 256 |
| 40 | hybrid: | 40 | hybrid: |
| 41 | chroma_weight: 0.3 | 41 | chroma_weight: 0.25 |
| 42 | ecapa_weight: 0.7 | 42 | ecapa_weight: 0.5 |
| 43 | melody_weight: 0.25 | ||
| 43 | reject_threshold: 0.4 | 44 | reject_threshold: 0.4 |
| 44 | 45 | ||
| 45 | augmentation: | 46 | augmentation: | ... | ... |
acr-engine/data/dataset_registry.json
0 → 100644
| 1 | [ | ||
| 2 | { | ||
| 3 | "name": "FMA", | ||
| 4 | "source_url": "https://github.com/mdeff/fma", | ||
| 5 | "license": "Track-dependent / metadata CC BY 4.0; verify per subset", | ||
| 6 | "commercial_use": "review_required", | ||
| 7 | "notes": "Good first realistic MIR baseline" | ||
| 8 | }, | ||
| 9 | { | ||
| 10 | "name": "MTG-Jamendo", | ||
| 11 | "source_url": "https://github.com/MTG/mtg-jamendo-dataset", | ||
| 12 | "license": "Creative Commons source tracks; verify exact subset terms", | ||
| 13 | "commercial_use": "review_required", | ||
| 14 | "notes": "Good retrieval/tagging corpus with scripts" | ||
| 15 | }, | ||
| 16 | { | ||
| 17 | "name": "CCMusic", | ||
| 18 | "source_url": "https://ccmusic-database.github.io/en/database/ccm.html", | ||
| 19 | "license": "varies / application may be required", | ||
| 20 | "commercial_use": "review_required", | ||
| 21 | "notes": "Useful Chinese MIR source, needs permission review" | ||
| 22 | }, | ||
| 23 | { | ||
| 24 | "name": "ModelScope-music", | ||
| 25 | "source_url": "https://modelscope.cn/search?page=1&search=music&type=dataset", | ||
| 26 | "license": "varies by dataset", | ||
| 27 | "commercial_use": "deny_until_whitelisted", | ||
| 28 | "notes": "Discovery surface only until per-dataset review is complete" | ||
| 29 | } | ||
| 30 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 1 | { | ||
| 2 | "dataset": "modelscope_music", | ||
| 3 | "root": "data/external/modelscope_music", | ||
| 4 | "status": "initialized", | ||
| 5 | "next_steps": [ | ||
| 6 | "download raw audio according to upstream license terms", | ||
| 7 | "convert to catalog/query manifests", | ||
| 8 | "record license evidence before training" | ||
| 9 | ] | ||
| 10 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/index_api/chromaprint.pkl
0 → 100644
No preview for this file type
acr-engine/data/index_api/reference_embs.npy
0 → 100644
No preview for this file type
acr-engine/data/index_api/reference_ids.npy
0 → 100644
No preview for this file type
acr-engine/data/index_v3/chromaprint.pkl
0 → 100644
No preview for this file type
acr-engine/data/index_v3/reference_embs.npy
0 → 100644
No preview for this file type
acr-engine/data/index_v3/reference_ids.npy
0 → 100644
No preview for this file type
acr-engine/data/models_v3/best_model.pt
0 → 100644
This file is too large to display.
acr-engine/data/models_v3/song_to_idx.json
0 → 100644
| 1 | { | ||
| 2 | "song_0000": 0, | ||
| 3 | "song_0001": 1, | ||
| 4 | "song_0002": 2, | ||
| 5 | "song_0003": 3, | ||
| 6 | "song_0004": 4, | ||
| 7 | "song_0005": 5, | ||
| 8 | "song_0006": 6, | ||
| 9 | "song_0007": 7, | ||
| 10 | "song_0008": 8, | ||
| 11 | "song_0009": 9, | ||
| 12 | "song_0010": 10, | ||
| 13 | "song_0011": 11, | ||
| 14 | "song_0012": 12, | ||
| 15 | "song_0013": 13, | ||
| 16 | "song_0014": 14, | ||
| 17 | "song_0015": 15 | ||
| 18 | } | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/synthetic_v2/catalog.json
0 → 100644
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "song_0000", | ||
| 4 | "audio_path": "songs/song_0000.wav", | ||
| 5 | "duration": 15.0, | ||
| 6 | "base_freq": 130.81, | ||
| 7 | "type": "reference" | ||
| 8 | }, | ||
| 9 | { | ||
| 10 | "song_id": "song_0001", | ||
| 11 | "audio_path": "songs/song_0001.wav", | ||
| 12 | "duration": 15.0, | ||
| 13 | "base_freq": 146.83, | ||
| 14 | "type": "reference" | ||
| 15 | }, | ||
| 16 | { | ||
| 17 | "song_id": "song_0002", | ||
| 18 | "audio_path": "songs/song_0002.wav", | ||
| 19 | "duration": 15.0, | ||
| 20 | "base_freq": 164.81, | ||
| 21 | "type": "reference" | ||
| 22 | }, | ||
| 23 | { | ||
| 24 | "song_id": "song_0003", | ||
| 25 | "audio_path": "songs/song_0003.wav", | ||
| 26 | "duration": 15.0, | ||
| 27 | "base_freq": 174.61, | ||
| 28 | "type": "reference" | ||
| 29 | }, | ||
| 30 | { | ||
| 31 | "song_id": "song_0004", | ||
| 32 | "audio_path": "songs/song_0004.wav", | ||
| 33 | "duration": 15.0, | ||
| 34 | "base_freq": 196.0, | ||
| 35 | "type": "reference" | ||
| 36 | }, | ||
| 37 | { | ||
| 38 | "song_id": "song_0005", | ||
| 39 | "audio_path": "songs/song_0005.wav", | ||
| 40 | "duration": 15.0, | ||
| 41 | "base_freq": 220.0, | ||
| 42 | "type": "reference" | ||
| 43 | }, | ||
| 44 | { | ||
| 45 | "song_id": "song_0006", | ||
| 46 | "audio_path": "songs/song_0006.wav", | ||
| 47 | "duration": 15.0, | ||
| 48 | "base_freq": 246.94, | ||
| 49 | "type": "reference" | ||
| 50 | }, | ||
| 51 | { | ||
| 52 | "song_id": "song_0007", | ||
| 53 | "audio_path": "songs/song_0007.wav", | ||
| 54 | "duration": 15.0, | ||
| 55 | "base_freq": 261.63, | ||
| 56 | "type": "reference" | ||
| 57 | }, | ||
| 58 | { | ||
| 59 | "song_id": "song_0008", | ||
| 60 | "audio_path": "songs/song_0008.wav", | ||
| 61 | "duration": 15.0, | ||
| 62 | "base_freq": 293.66, | ||
| 63 | "type": "reference" | ||
| 64 | }, | ||
| 65 | { | ||
| 66 | "song_id": "song_0009", | ||
| 67 | "audio_path": "songs/song_0009.wav", | ||
| 68 | "duration": 15.0, | ||
| 69 | "base_freq": 329.63, | ||
| 70 | "type": "reference" | ||
| 71 | }, | ||
| 72 | { | ||
| 73 | "song_id": "song_0010", | ||
| 74 | "audio_path": "songs/song_0010.wav", | ||
| 75 | "duration": 15.0, | ||
| 76 | "base_freq": 349.23, | ||
| 77 | "type": "reference" | ||
| 78 | }, | ||
| 79 | { | ||
| 80 | "song_id": "song_0011", | ||
| 81 | "audio_path": "songs/song_0011.wav", | ||
| 82 | "duration": 15.0, | ||
| 83 | "base_freq": 392.0, | ||
| 84 | "type": "reference" | ||
| 85 | }, | ||
| 86 | { | ||
| 87 | "song_id": "song_0012", | ||
| 88 | "audio_path": "songs/song_0012.wav", | ||
| 89 | "duration": 15.0, | ||
| 90 | "base_freq": 440.0, | ||
| 91 | "type": "reference" | ||
| 92 | }, | ||
| 93 | { | ||
| 94 | "song_id": "song_0013", | ||
| 95 | "audio_path": "songs/song_0013.wav", | ||
| 96 | "duration": 15.0, | ||
| 97 | "base_freq": 493.88, | ||
| 98 | "type": "reference" | ||
| 99 | }, | ||
| 100 | { | ||
| 101 | "song_id": "song_0014", | ||
| 102 | "audio_path": "songs/song_0014.wav", | ||
| 103 | "duration": 15.0, | ||
| 104 | "base_freq": 523.25, | ||
| 105 | "type": "reference" | ||
| 106 | }, | ||
| 107 | { | ||
| 108 | "song_id": "song_0015", | ||
| 109 | "audio_path": "songs/song_0015.wav", | ||
| 110 | "duration": 15.0, | ||
| 111 | "base_freq": 587.33, | ||
| 112 | "type": "reference" | ||
| 113 | }, | ||
| 114 | { | ||
| 115 | "song_id": "song_0016", | ||
| 116 | "audio_path": "songs/song_0016.wav", | ||
| 117 | "duration": 15.0, | ||
| 118 | "base_freq": 659.25, | ||
| 119 | "type": "reference" | ||
| 120 | }, | ||
| 121 | { | ||
| 122 | "song_id": "song_0017", | ||
| 123 | "audio_path": "songs/song_0017.wav", | ||
| 124 | "duration": 15.0, | ||
| 125 | "base_freq": 698.46, | ||
| 126 | "type": "reference" | ||
| 127 | }, | ||
| 128 | { | ||
| 129 | "song_id": "song_0018", | ||
| 130 | "audio_path": "songs/song_0018.wav", | ||
| 131 | "duration": 15.0, | ||
| 132 | "base_freq": 783.99, | ||
| 133 | "type": "reference" | ||
| 134 | }, | ||
| 135 | { | ||
| 136 | "song_id": "song_0019", | ||
| 137 | "audio_path": "songs/song_0019.wav", | ||
| 138 | "duration": 15.0, | ||
| 139 | "base_freq": 880.0, | ||
| 140 | "type": "reference" | ||
| 141 | }, | ||
| 142 | { | ||
| 143 | "song_id": "song_0020", | ||
| 144 | "audio_path": "songs/song_0020.wav", | ||
| 145 | "duration": 15.0, | ||
| 146 | "base_freq": 987.77, | ||
| 147 | "type": "reference" | ||
| 148 | }, | ||
| 149 | { | ||
| 150 | "song_id": "song_0021", | ||
| 151 | "audio_path": "songs/song_0021.wav", | ||
| 152 | "duration": 15.0, | ||
| 153 | "base_freq": 146.8292605393491, | ||
| 154 | "type": "reference" | ||
| 155 | }, | ||
| 156 | { | ||
| 157 | "song_id": "song_0022", | ||
| 158 | "audio_path": "songs/song_0022.wav", | ||
| 159 | "duration": 15.0, | ||
| 160 | "base_freq": 164.81110255326524, | ||
| 161 | "type": "reference" | ||
| 162 | }, | ||
| 163 | { | ||
| 164 | "song_id": "song_0023", | ||
| 165 | "audio_path": "songs/song_0023.wav", | ||
| 166 | "duration": 15.0, | ||
| 167 | "base_freq": 184.99297018186778, | ||
| 168 | "type": "reference" | ||
| 169 | } | ||
| 170 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
acr-engine/data/synthetic_v2/test.json
0 → 100644
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "song_0020", | ||
| 4 | "audio_path": "segments/song_0020_seg_00.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 4.349828784349853, | ||
| 8 | "segment_type": "mid" | ||
| 9 | }, | ||
| 10 | { | ||
| 11 | "song_id": "song_0020", | ||
| 12 | "audio_path": "segments/song_0020_seg_01.wav", | ||
| 13 | "duration": 5.0, | ||
| 14 | "type": "clean", | ||
| 15 | "offset": 9.642182747327407, | ||
| 16 | "segment_type": "mid" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "song_0020", | ||
| 20 | "audio_path": "segments/song_0020_seg_02_augmented.wav", | ||
| 21 | "duration": 5.0, | ||
| 22 | "type": "augmented", | ||
| 23 | "offset": 2.367717347418965, | ||
| 24 | "segment_type": "intro" | ||
| 25 | }, | ||
| 26 | { | ||
| 27 | "song_id": "song_0020", | ||
| 28 | "audio_path": "segments/song_0020_seg_03_humming_like.wav", | ||
| 29 | "duration": 5.0, | ||
| 30 | "type": "humming_like", | ||
| 31 | "offset": 3.180577192661006, | ||
| 32 | "segment_type": "mid" | ||
| 33 | }, | ||
| 34 | { | ||
| 35 | "song_id": "song_0020", | ||
| 36 | "audio_path": "segments/song_0020_seg_04_confused.wav", | ||
| 37 | "duration": 5.0, | ||
| 38 | "type": "confused", | ||
| 39 | "offset": 4.660551124366617, | ||
| 40 | "segment_type": "mid" | ||
| 41 | }, | ||
| 42 | { | ||
| 43 | "song_id": "song_0020", | ||
| 44 | "audio_path": "songs/song_0020.wav", | ||
| 45 | "duration": 15.0, | ||
| 46 | "base_freq": 987.77, | ||
| 47 | "type": "reference" | ||
| 48 | }, | ||
| 49 | { | ||
| 50 | "song_id": "song_0021", | ||
| 51 | "audio_path": "segments/song_0021_seg_00.wav", | ||
| 52 | "duration": 5.0, | ||
| 53 | "type": "clean", | ||
| 54 | "offset": 5.631088908640184, | ||
| 55 | "segment_type": "mid" | ||
| 56 | }, | ||
| 57 | { | ||
| 58 | "song_id": "song_0021", | ||
| 59 | "audio_path": "segments/song_0021_seg_01.wav", | ||
| 60 | "duration": 5.0, | ||
| 61 | "type": "clean", | ||
| 62 | "offset": 1.8823366490525628, | ||
| 63 | "segment_type": "intro" | ||
| 64 | }, | ||
| 65 | { | ||
| 66 | "song_id": "song_0021", | ||
| 67 | "audio_path": "segments/song_0021_seg_02_augmented.wav", | ||
| 68 | "duration": 5.0, | ||
| 69 | "type": "augmented", | ||
| 70 | "offset": 9.88006210404643, | ||
| 71 | "segment_type": "mid" | ||
| 72 | }, | ||
| 73 | { | ||
| 74 | "song_id": "song_0021", | ||
| 75 | "audio_path": "segments/song_0021_seg_03_humming_like.wav", | ||
| 76 | "duration": 5.0, | ||
| 77 | "type": "humming_like", | ||
| 78 | "offset": 0.9025737685090285, | ||
| 79 | "segment_type": "intro" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "song_id": "song_0021", | ||
| 83 | "audio_path": "segments/song_0021_seg_04_confused.wav", | ||
| 84 | "duration": 5.0, | ||
| 85 | "type": "confused", | ||
| 86 | "offset": 1.3048954561918258, | ||
| 87 | "segment_type": "intro" | ||
| 88 | }, | ||
| 89 | { | ||
| 90 | "song_id": "song_0021", | ||
| 91 | "audio_path": "songs/song_0021.wav", | ||
| 92 | "duration": 15.0, | ||
| 93 | "base_freq": 146.8292605393491, | ||
| 94 | "type": "reference" | ||
| 95 | }, | ||
| 96 | { | ||
| 97 | "song_id": "song_0022", | ||
| 98 | "audio_path": "segments/song_0022_seg_00.wav", | ||
| 99 | "duration": 5.0, | ||
| 100 | "type": "clean", | ||
| 101 | "offset": 3.9746734850812295, | ||
| 102 | "segment_type": "mid" | ||
| 103 | }, | ||
| 104 | { | ||
| 105 | "song_id": "song_0022", | ||
| 106 | "audio_path": "segments/song_0022_seg_01.wav", | ||
| 107 | "duration": 5.0, | ||
| 108 | "type": "clean", | ||
| 109 | "offset": 4.890968121206573, | ||
| 110 | "segment_type": "mid" | ||
| 111 | }, | ||
| 112 | { | ||
| 113 | "song_id": "song_0022", | ||
| 114 | "audio_path": "segments/song_0022_seg_02_augmented.wav", | ||
| 115 | "duration": 5.0, | ||
| 116 | "type": "augmented", | ||
| 117 | "offset": 6.610400547460049, | ||
| 118 | "segment_type": "mid" | ||
| 119 | }, | ||
| 120 | { | ||
| 121 | "song_id": "song_0022", | ||
| 122 | "audio_path": "segments/song_0022_seg_03_humming_like.wav", | ||
| 123 | "duration": 5.0, | ||
| 124 | "type": "humming_like", | ||
| 125 | "offset": 2.6329596668288424, | ||
| 126 | "segment_type": "intro" | ||
| 127 | }, | ||
| 128 | { | ||
| 129 | "song_id": "song_0022", | ||
| 130 | "audio_path": "segments/song_0022_seg_04_confused.wav", | ||
| 131 | "duration": 5.0, | ||
| 132 | "type": "confused", | ||
| 133 | "offset": 0.8570731183991709, | ||
| 134 | "segment_type": "intro" | ||
| 135 | }, | ||
| 136 | { | ||
| 137 | "song_id": "song_0022", | ||
| 138 | "audio_path": "songs/song_0022.wav", | ||
| 139 | "duration": 15.0, | ||
| 140 | "base_freq": 164.81110255326524, | ||
| 141 | "type": "reference" | ||
| 142 | }, | ||
| 143 | { | ||
| 144 | "song_id": "song_0023", | ||
| 145 | "audio_path": "segments/song_0023_seg_00.wav", | ||
| 146 | "duration": 5.0, | ||
| 147 | "type": "clean", | ||
| 148 | "offset": 4.461034326075292, | ||
| 149 | "segment_type": "mid" | ||
| 150 | }, | ||
| 151 | { | ||
| 152 | "song_id": "song_0023", | ||
| 153 | "audio_path": "segments/song_0023_seg_01.wav", | ||
| 154 | "duration": 5.0, | ||
| 155 | "type": "clean", | ||
| 156 | "offset": 9.605203782802876, | ||
| 157 | "segment_type": "mid" | ||
| 158 | }, | ||
| 159 | { | ||
| 160 | "song_id": "song_0023", | ||
| 161 | "audio_path": "segments/song_0023_seg_02_augmented.wav", | ||
| 162 | "duration": 5.0, | ||
| 163 | "type": "augmented", | ||
| 164 | "offset": 4.7458228906154805, | ||
| 165 | "segment_type": "mid" | ||
| 166 | }, | ||
| 167 | { | ||
| 168 | "song_id": "song_0023", | ||
| 169 | "audio_path": "segments/song_0023_seg_03_humming_like.wav", | ||
| 170 | "duration": 5.0, | ||
| 171 | "type": "humming_like", | ||
| 172 | "offset": 8.308702013555955, | ||
| 173 | "segment_type": "mid" | ||
| 174 | }, | ||
| 175 | { | ||
| 176 | "song_id": "song_0023", | ||
| 177 | "audio_path": "segments/song_0023_seg_04_confused.wav", | ||
| 178 | "duration": 5.0, | ||
| 179 | "type": "confused", | ||
| 180 | "offset": 2.213510770155481, | ||
| 181 | "segment_type": "intro" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "song_id": "song_0023", | ||
| 185 | "audio_path": "songs/song_0023.wav", | ||
| 186 | "duration": 15.0, | ||
| 187 | "base_freq": 184.99297018186778, | ||
| 188 | "type": "reference" | ||
| 189 | } | ||
| 190 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/synthetic_v2/train.json
0 → 100644
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "song_0000", | ||
| 4 | "audio_path": "segments/song_0000_seg_00.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 9.538159275210802, | ||
| 8 | "segment_type": "mid" | ||
| 9 | }, | ||
| 10 | { | ||
| 11 | "song_id": "song_0000", | ||
| 12 | "audio_path": "segments/song_0000_seg_01.wav", | ||
| 13 | "duration": 5.0, | ||
| 14 | "type": "clean", | ||
| 15 | "offset": 8.75852940378194, | ||
| 16 | "segment_type": "mid" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "song_0000", | ||
| 20 | "audio_path": "segments/song_0000_seg_02_augmented.wav", | ||
| 21 | "duration": 5.0, | ||
| 22 | "type": "augmented", | ||
| 23 | "offset": 2.6338905075109076, | ||
| 24 | "segment_type": "intro" | ||
| 25 | }, | ||
| 26 | { | ||
| 27 | "song_id": "song_0000", | ||
| 28 | "audio_path": "segments/song_0000_seg_03_humming_like.wav", | ||
| 29 | "duration": 5.0, | ||
| 30 | "type": "humming_like", | ||
| 31 | "offset": 6.389494948660052, | ||
| 32 | "segment_type": "mid" | ||
| 33 | }, | ||
| 34 | { | ||
| 35 | "song_id": "song_0000", | ||
| 36 | "audio_path": "segments/song_0000_seg_04_confused.wav", | ||
| 37 | "duration": 5.0, | ||
| 38 | "type": "confused", | ||
| 39 | "offset": 5.303536721951775, | ||
| 40 | "segment_type": "mid" | ||
| 41 | }, | ||
| 42 | { | ||
| 43 | "song_id": "song_0000", | ||
| 44 | "audio_path": "songs/song_0000.wav", | ||
| 45 | "duration": 15.0, | ||
| 46 | "base_freq": 130.81, | ||
| 47 | "type": "reference" | ||
| 48 | }, | ||
| 49 | { | ||
| 50 | "song_id": "song_0001", | ||
| 51 | "audio_path": "segments/song_0001_seg_00.wav", | ||
| 52 | "duration": 5.0, | ||
| 53 | "type": "clean", | ||
| 54 | "offset": 5.227827155319589, | ||
| 55 | "segment_type": "mid" | ||
| 56 | }, | ||
| 57 | { | ||
| 58 | "song_id": "song_0001", | ||
| 59 | "audio_path": "segments/song_0001_seg_01.wav", | ||
| 60 | "duration": 5.0, | ||
| 61 | "type": "clean", | ||
| 62 | "offset": 9.347062577364273, | ||
| 63 | "segment_type": "mid" | ||
| 64 | }, | ||
| 65 | { | ||
| 66 | "song_id": "song_0001", | ||
| 67 | "audio_path": "segments/song_0001_seg_02_augmented.wav", | ||
| 68 | "duration": 5.0, | ||
| 69 | "type": "augmented", | ||
| 70 | "offset": 2.042591994235364, | ||
| 71 | "segment_type": "intro" | ||
| 72 | }, | ||
| 73 | { | ||
| 74 | "song_id": "song_0001", | ||
| 75 | "audio_path": "segments/song_0001_seg_03_humming_like.wav", | ||
| 76 | "duration": 5.0, | ||
| 77 | "type": "humming_like", | ||
| 78 | "offset": 3.1617719627185403, | ||
| 79 | "segment_type": "mid" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "song_id": "song_0001", | ||
| 83 | "audio_path": "segments/song_0001_seg_04_confused.wav", | ||
| 84 | "duration": 5.0, | ||
| 85 | "type": "confused", | ||
| 86 | "offset": 0.73260721099633, | ||
| 87 | "segment_type": "intro" | ||
| 88 | }, | ||
| 89 | { | ||
| 90 | "song_id": "song_0001", | ||
| 91 | "audio_path": "songs/song_0001.wav", | ||
| 92 | "duration": 15.0, | ||
| 93 | "base_freq": 146.83, | ||
| 94 | "type": "reference" | ||
| 95 | }, | ||
| 96 | { | ||
| 97 | "song_id": "song_0002", | ||
| 98 | "audio_path": "segments/song_0002_seg_00.wav", | ||
| 99 | "duration": 5.0, | ||
| 100 | "type": "clean", | ||
| 101 | "offset": 3.0928466220865323, | ||
| 102 | "segment_type": "mid" | ||
| 103 | }, | ||
| 104 | { | ||
| 105 | "song_id": "song_0002", | ||
| 106 | "audio_path": "segments/song_0002_seg_01.wav", | ||
| 107 | "duration": 5.0, | ||
| 108 | "type": "clean", | ||
| 109 | "offset": 4.083929086192168, | ||
| 110 | "segment_type": "mid" | ||
| 111 | }, | ||
| 112 | { | ||
| 113 | "song_id": "song_0002", | ||
| 114 | "audio_path": "segments/song_0002_seg_02_augmented.wav", | ||
| 115 | "duration": 5.0, | ||
| 116 | "type": "augmented", | ||
| 117 | "offset": 4.024003870577246, | ||
| 118 | "segment_type": "mid" | ||
| 119 | }, | ||
| 120 | { | ||
| 121 | "song_id": "song_0002", | ||
| 122 | "audio_path": "segments/song_0002_seg_03_humming_like.wav", | ||
| 123 | "duration": 5.0, | ||
| 124 | "type": "humming_like", | ||
| 125 | "offset": 9.028055457325827, | ||
| 126 | "segment_type": "mid" | ||
| 127 | }, | ||
| 128 | { | ||
| 129 | "song_id": "song_0002", | ||
| 130 | "audio_path": "segments/song_0002_seg_04_confused.wav", | ||
| 131 | "duration": 5.0, | ||
| 132 | "type": "confused", | ||
| 133 | "offset": 4.2988814998983464, | ||
| 134 | "segment_type": "mid" | ||
| 135 | }, | ||
| 136 | { | ||
| 137 | "song_id": "song_0002", | ||
| 138 | "audio_path": "songs/song_0002.wav", | ||
| 139 | "duration": 15.0, | ||
| 140 | "base_freq": 164.81, | ||
| 141 | "type": "reference" | ||
| 142 | }, | ||
| 143 | { | ||
| 144 | "song_id": "song_0003", | ||
| 145 | "audio_path": "segments/song_0003_seg_00.wav", | ||
| 146 | "duration": 5.0, | ||
| 147 | "type": "clean", | ||
| 148 | "offset": 0.1938328705001069, | ||
| 149 | "segment_type": "intro" | ||
| 150 | }, | ||
| 151 | { | ||
| 152 | "song_id": "song_0003", | ||
| 153 | "audio_path": "segments/song_0003_seg_01.wav", | ||
| 154 | "duration": 5.0, | ||
| 155 | "type": "clean", | ||
| 156 | "offset": 5.394190479225337, | ||
| 157 | "segment_type": "mid" | ||
| 158 | }, | ||
| 159 | { | ||
| 160 | "song_id": "song_0003", | ||
| 161 | "audio_path": "segments/song_0003_seg_02_augmented.wav", | ||
| 162 | "duration": 5.0, | ||
| 163 | "type": "augmented", | ||
| 164 | "offset": 9.999078285092093, | ||
| 165 | "segment_type": "mid" | ||
| 166 | }, | ||
| 167 | { | ||
| 168 | "song_id": "song_0003", | ||
| 169 | "audio_path": "segments/song_0003_seg_03_humming_like.wav", | ||
| 170 | "duration": 5.0, | ||
| 171 | "type": "humming_like", | ||
| 172 | "offset": 9.496117327159888, | ||
| 173 | "segment_type": "mid" | ||
| 174 | }, | ||
| 175 | { | ||
| 176 | "song_id": "song_0003", | ||
| 177 | "audio_path": "segments/song_0003_seg_04_confused.wav", | ||
| 178 | "duration": 5.0, | ||
| 179 | "type": "confused", | ||
| 180 | "offset": 2.1796454090650363, | ||
| 181 | "segment_type": "intro" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "song_id": "song_0003", | ||
| 185 | "audio_path": "songs/song_0003.wav", | ||
| 186 | "duration": 15.0, | ||
| 187 | "base_freq": 174.61, | ||
| 188 | "type": "reference" | ||
| 189 | }, | ||
| 190 | { | ||
| 191 | "song_id": "song_0004", | ||
| 192 | "audio_path": "segments/song_0004_seg_00.wav", | ||
| 193 | "duration": 5.0, | ||
| 194 | "type": "clean", | ||
| 195 | "offset": 9.654976431382948, | ||
| 196 | "segment_type": "mid" | ||
| 197 | }, | ||
| 198 | { | ||
| 199 | "song_id": "song_0004", | ||
| 200 | "audio_path": "segments/song_0004_seg_01.wav", | ||
| 201 | "duration": 5.0, | ||
| 202 | "type": "clean", | ||
| 203 | "offset": 2.524783904929726, | ||
| 204 | "segment_type": "intro" | ||
| 205 | }, | ||
| 206 | { | ||
| 207 | "song_id": "song_0004", | ||
| 208 | "audio_path": "segments/song_0004_seg_02_augmented.wav", | ||
| 209 | "duration": 5.0, | ||
| 210 | "type": "augmented", | ||
| 211 | "offset": 8.617229646275131, | ||
| 212 | "segment_type": "mid" | ||
| 213 | }, | ||
| 214 | { | ||
| 215 | "song_id": "song_0004", | ||
| 216 | "audio_path": "segments/song_0004_seg_03_humming_like.wav", | ||
| 217 | "duration": 5.0, | ||
| 218 | "type": "humming_like", | ||
| 219 | "offset": 1.5172700695095642, | ||
| 220 | "segment_type": "intro" | ||
| 221 | }, | ||
| 222 | { | ||
| 223 | "song_id": "song_0004", | ||
| 224 | "audio_path": "segments/song_0004_seg_04_confused.wav", | ||
| 225 | "duration": 5.0, | ||
| 226 | "type": "confused", | ||
| 227 | "offset": 4.161740214103284, | ||
| 228 | "segment_type": "mid" | ||
| 229 | }, | ||
| 230 | { | ||
| 231 | "song_id": "song_0004", | ||
| 232 | "audio_path": "songs/song_0004.wav", | ||
| 233 | "duration": 15.0, | ||
| 234 | "base_freq": 196.0, | ||
| 235 | "type": "reference" | ||
| 236 | }, | ||
| 237 | { | ||
| 238 | "song_id": "song_0005", | ||
| 239 | "audio_path": "segments/song_0005_seg_00.wav", | ||
| 240 | "duration": 5.0, | ||
| 241 | "type": "clean", | ||
| 242 | "offset": 5.088720150695117, | ||
| 243 | "segment_type": "mid" | ||
| 244 | }, | ||
| 245 | { | ||
| 246 | "song_id": "song_0005", | ||
| 247 | "audio_path": "segments/song_0005_seg_01.wav", | ||
| 248 | "duration": 5.0, | ||
| 249 | "type": "clean", | ||
| 250 | "offset": 2.734248967132742, | ||
| 251 | "segment_type": "intro" | ||
| 252 | }, | ||
| 253 | { | ||
| 254 | "song_id": "song_0005", | ||
| 255 | "audio_path": "segments/song_0005_seg_02_augmented.wav", | ||
| 256 | "duration": 5.0, | ||
| 257 | "type": "augmented", | ||
| 258 | "offset": 8.347239455766944, | ||
| 259 | "segment_type": "mid" | ||
| 260 | }, | ||
| 261 | { | ||
| 262 | "song_id": "song_0005", | ||
| 263 | "audio_path": "segments/song_0005_seg_03_humming_like.wav", | ||
| 264 | "duration": 5.0, | ||
| 265 | "type": "humming_like", | ||
| 266 | "offset": 5.08240891592894, | ||
| 267 | "segment_type": "mid" | ||
| 268 | }, | ||
| 269 | { | ||
| 270 | "song_id": "song_0005", | ||
| 271 | "audio_path": "segments/song_0005_seg_04_confused.wav", | ||
| 272 | "duration": 5.0, | ||
| 273 | "type": "confused", | ||
| 274 | "offset": 9.3424839368252, | ||
| 275 | "segment_type": "mid" | ||
| 276 | }, | ||
| 277 | { | ||
| 278 | "song_id": "song_0005", | ||
| 279 | "audio_path": "songs/song_0005.wav", | ||
| 280 | "duration": 15.0, | ||
| 281 | "base_freq": 220.0, | ||
| 282 | "type": "reference" | ||
| 283 | }, | ||
| 284 | { | ||
| 285 | "song_id": "song_0006", | ||
| 286 | "audio_path": "segments/song_0006_seg_00.wav", | ||
| 287 | "duration": 5.0, | ||
| 288 | "type": "clean", | ||
| 289 | "offset": 2.5062680004361604, | ||
| 290 | "segment_type": "intro" | ||
| 291 | }, | ||
| 292 | { | ||
| 293 | "song_id": "song_0006", | ||
| 294 | "audio_path": "segments/song_0006_seg_01.wav", | ||
| 295 | "duration": 5.0, | ||
| 296 | "type": "clean", | ||
| 297 | "offset": 7.555773237416772, | ||
| 298 | "segment_type": "mid" | ||
| 299 | }, | ||
| 300 | { | ||
| 301 | "song_id": "song_0006", | ||
| 302 | "audio_path": "segments/song_0006_seg_02_augmented.wav", | ||
| 303 | "duration": 5.0, | ||
| 304 | "type": "augmented", | ||
| 305 | "offset": 7.674707744954641, | ||
| 306 | "segment_type": "mid" | ||
| 307 | }, | ||
| 308 | { | ||
| 309 | "song_id": "song_0006", | ||
| 310 | "audio_path": "segments/song_0006_seg_03_humming_like.wav", | ||
| 311 | "duration": 5.0, | ||
| 312 | "type": "humming_like", | ||
| 313 | "offset": 0.33364531245632434, | ||
| 314 | "segment_type": "intro" | ||
| 315 | }, | ||
| 316 | { | ||
| 317 | "song_id": "song_0006", | ||
| 318 | "audio_path": "segments/song_0006_seg_04_confused.wav", | ||
| 319 | "duration": 5.0, | ||
| 320 | "type": "confused", | ||
| 321 | "offset": 2.007947946500762, | ||
| 322 | "segment_type": "intro" | ||
| 323 | }, | ||
| 324 | { | ||
| 325 | "song_id": "song_0006", | ||
| 326 | "audio_path": "songs/song_0006.wav", | ||
| 327 | "duration": 15.0, | ||
| 328 | "base_freq": 246.94, | ||
| 329 | "type": "reference" | ||
| 330 | }, | ||
| 331 | { | ||
| 332 | "song_id": "song_0007", | ||
| 333 | "audio_path": "segments/song_0007_seg_00.wav", | ||
| 334 | "duration": 5.0, | ||
| 335 | "type": "clean", | ||
| 336 | "offset": 6.589030736792923, | ||
| 337 | "segment_type": "mid" | ||
| 338 | }, | ||
| 339 | { | ||
| 340 | "song_id": "song_0007", | ||
| 341 | "audio_path": "segments/song_0007_seg_01.wav", | ||
| 342 | "duration": 5.0, | ||
| 343 | "type": "clean", | ||
| 344 | "offset": 3.016303290280887, | ||
| 345 | "segment_type": "mid" | ||
| 346 | }, | ||
| 347 | { | ||
| 348 | "song_id": "song_0007", | ||
| 349 | "audio_path": "segments/song_0007_seg_02_augmented.wav", | ||
| 350 | "duration": 5.0, | ||
| 351 | "type": "augmented", | ||
| 352 | "offset": 6.433406842054888, | ||
| 353 | "segment_type": "mid" | ||
| 354 | }, | ||
| 355 | { | ||
| 356 | "song_id": "song_0007", | ||
| 357 | "audio_path": "segments/song_0007_seg_03_humming_like.wav", | ||
| 358 | "duration": 5.0, | ||
| 359 | "type": "humming_like", | ||
| 360 | "offset": 4.435623293630087, | ||
| 361 | "segment_type": "mid" | ||
| 362 | }, | ||
| 363 | { | ||
| 364 | "song_id": "song_0007", | ||
| 365 | "audio_path": "segments/song_0007_seg_04_confused.wav", | ||
| 366 | "duration": 5.0, | ||
| 367 | "type": "confused", | ||
| 368 | "offset": 5.8536468854812105, | ||
| 369 | "segment_type": "mid" | ||
| 370 | }, | ||
| 371 | { | ||
| 372 | "song_id": "song_0007", | ||
| 373 | "audio_path": "songs/song_0007.wav", | ||
| 374 | "duration": 15.0, | ||
| 375 | "base_freq": 261.63, | ||
| 376 | "type": "reference" | ||
| 377 | }, | ||
| 378 | { | ||
| 379 | "song_id": "song_0008", | ||
| 380 | "audio_path": "segments/song_0008_seg_00.wav", | ||
| 381 | "duration": 5.0, | ||
| 382 | "type": "clean", | ||
| 383 | "offset": 0.42302261562791377, | ||
| 384 | "segment_type": "intro" | ||
| 385 | }, | ||
| 386 | { | ||
| 387 | "song_id": "song_0008", | ||
| 388 | "audio_path": "segments/song_0008_seg_01.wav", | ||
| 389 | "duration": 5.0, | ||
| 390 | "type": "clean", | ||
| 391 | "offset": 0.18741536585645702, | ||
| 392 | "segment_type": "intro" | ||
| 393 | }, | ||
| 394 | { | ||
| 395 | "song_id": "song_0008", | ||
| 396 | "audio_path": "segments/song_0008_seg_02_augmented.wav", | ||
| 397 | "duration": 5.0, | ||
| 398 | "type": "augmented", | ||
| 399 | "offset": 9.211624345024124, | ||
| 400 | "segment_type": "mid" | ||
| 401 | }, | ||
| 402 | { | ||
| 403 | "song_id": "song_0008", | ||
| 404 | "audio_path": "segments/song_0008_seg_03_humming_like.wav", | ||
| 405 | "duration": 5.0, | ||
| 406 | "type": "humming_like", | ||
| 407 | "offset": 4.176939598434806, | ||
| 408 | "segment_type": "mid" | ||
| 409 | }, | ||
| 410 | { | ||
| 411 | "song_id": "song_0008", | ||
| 412 | "audio_path": "segments/song_0008_seg_04_confused.wav", | ||
| 413 | "duration": 5.0, | ||
| 414 | "type": "confused", | ||
| 415 | "offset": 8.320259130717071, | ||
| 416 | "segment_type": "mid" | ||
| 417 | }, | ||
| 418 | { | ||
| 419 | "song_id": "song_0008", | ||
| 420 | "audio_path": "songs/song_0008.wav", | ||
| 421 | "duration": 15.0, | ||
| 422 | "base_freq": 293.66, | ||
| 423 | "type": "reference" | ||
| 424 | }, | ||
| 425 | { | ||
| 426 | "song_id": "song_0009", | ||
| 427 | "audio_path": "segments/song_0009_seg_00.wav", | ||
| 428 | "duration": 5.0, | ||
| 429 | "type": "clean", | ||
| 430 | "offset": 5.076897127246463, | ||
| 431 | "segment_type": "mid" | ||
| 432 | }, | ||
| 433 | { | ||
| 434 | "song_id": "song_0009", | ||
| 435 | "audio_path": "segments/song_0009_seg_01.wav", | ||
| 436 | "duration": 5.0, | ||
| 437 | "type": "clean", | ||
| 438 | "offset": 5.397707584136711, | ||
| 439 | "segment_type": "mid" | ||
| 440 | }, | ||
| 441 | { | ||
| 442 | "song_id": "song_0009", | ||
| 443 | "audio_path": "segments/song_0009_seg_02_augmented.wav", | ||
| 444 | "duration": 5.0, | ||
| 445 | "type": "augmented", | ||
| 446 | "offset": 7.3864400300146755, | ||
| 447 | "segment_type": "mid" | ||
| 448 | }, | ||
| 449 | { | ||
| 450 | "song_id": "song_0009", | ||
| 451 | "audio_path": "segments/song_0009_seg_03_humming_like.wav", | ||
| 452 | "duration": 5.0, | ||
| 453 | "type": "humming_like", | ||
| 454 | "offset": 5.9724644107162845, | ||
| 455 | "segment_type": "mid" | ||
| 456 | }, | ||
| 457 | { | ||
| 458 | "song_id": "song_0009", | ||
| 459 | "audio_path": "segments/song_0009_seg_04_confused.wav", | ||
| 460 | "duration": 5.0, | ||
| 461 | "type": "confused", | ||
| 462 | "offset": 7.21182997805427, | ||
| 463 | "segment_type": "mid" | ||
| 464 | }, | ||
| 465 | { | ||
| 466 | "song_id": "song_0009", | ||
| 467 | "audio_path": "songs/song_0009.wav", | ||
| 468 | "duration": 15.0, | ||
| 469 | "base_freq": 329.63, | ||
| 470 | "type": "reference" | ||
| 471 | }, | ||
| 472 | { | ||
| 473 | "song_id": "song_0010", | ||
| 474 | "audio_path": "segments/song_0010_seg_00.wav", | ||
| 475 | "duration": 5.0, | ||
| 476 | "type": "clean", | ||
| 477 | "offset": 3.1007588293689183, | ||
| 478 | "segment_type": "mid" | ||
| 479 | }, | ||
| 480 | { | ||
| 481 | "song_id": "song_0010", | ||
| 482 | "audio_path": "segments/song_0010_seg_01.wav", | ||
| 483 | "duration": 5.0, | ||
| 484 | "type": "clean", | ||
| 485 | "offset": 3.9822405568601704, | ||
| 486 | "segment_type": "mid" | ||
| 487 | }, | ||
| 488 | { | ||
| 489 | "song_id": "song_0010", | ||
| 490 | "audio_path": "segments/song_0010_seg_02_augmented.wav", | ||
| 491 | "duration": 5.0, | ||
| 492 | "type": "augmented", | ||
| 493 | "offset": 8.154060806559823, | ||
| 494 | "segment_type": "mid" | ||
| 495 | }, | ||
| 496 | { | ||
| 497 | "song_id": "song_0010", | ||
| 498 | "audio_path": "segments/song_0010_seg_03_humming_like.wav", | ||
| 499 | "duration": 5.0, | ||
| 500 | "type": "humming_like", | ||
| 501 | "offset": 2.7321660611387344, | ||
| 502 | "segment_type": "intro" | ||
| 503 | }, | ||
| 504 | { | ||
| 505 | "song_id": "song_0010", | ||
| 506 | "audio_path": "segments/song_0010_seg_04_confused.wav", | ||
| 507 | "duration": 5.0, | ||
| 508 | "type": "confused", | ||
| 509 | "offset": 9.564787178236601, | ||
| 510 | "segment_type": "mid" | ||
| 511 | }, | ||
| 512 | { | ||
| 513 | "song_id": "song_0010", | ||
| 514 | "audio_path": "songs/song_0010.wav", | ||
| 515 | "duration": 15.0, | ||
| 516 | "base_freq": 349.23, | ||
| 517 | "type": "reference" | ||
| 518 | }, | ||
| 519 | { | ||
| 520 | "song_id": "song_0011", | ||
| 521 | "audio_path": "segments/song_0011_seg_00.wav", | ||
| 522 | "duration": 5.0, | ||
| 523 | "type": "clean", | ||
| 524 | "offset": 8.949259168211244, | ||
| 525 | "segment_type": "mid" | ||
| 526 | }, | ||
| 527 | { | ||
| 528 | "song_id": "song_0011", | ||
| 529 | "audio_path": "segments/song_0011_seg_01.wav", | ||
| 530 | "duration": 5.0, | ||
| 531 | "type": "clean", | ||
| 532 | "offset": 8.459337061558657, | ||
| 533 | "segment_type": "mid" | ||
| 534 | }, | ||
| 535 | { | ||
| 536 | "song_id": "song_0011", | ||
| 537 | "audio_path": "segments/song_0011_seg_02_augmented.wav", | ||
| 538 | "duration": 5.0, | ||
| 539 | "type": "augmented", | ||
| 540 | "offset": 2.5060530898199906, | ||
| 541 | "segment_type": "intro" | ||
| 542 | }, | ||
| 543 | { | ||
| 544 | "song_id": "song_0011", | ||
| 545 | "audio_path": "segments/song_0011_seg_03_humming_like.wav", | ||
| 546 | "duration": 5.0, | ||
| 547 | "type": "humming_like", | ||
| 548 | "offset": 5.0257314474126265, | ||
| 549 | "segment_type": "mid" | ||
| 550 | }, | ||
| 551 | { | ||
| 552 | "song_id": "song_0011", | ||
| 553 | "audio_path": "segments/song_0011_seg_04_confused.wav", | ||
| 554 | "duration": 5.0, | ||
| 555 | "type": "confused", | ||
| 556 | "offset": 8.42530004113389, | ||
| 557 | "segment_type": "mid" | ||
| 558 | }, | ||
| 559 | { | ||
| 560 | "song_id": "song_0011", | ||
| 561 | "audio_path": "songs/song_0011.wav", | ||
| 562 | "duration": 15.0, | ||
| 563 | "base_freq": 392.0, | ||
| 564 | "type": "reference" | ||
| 565 | }, | ||
| 566 | { | ||
| 567 | "song_id": "song_0012", | ||
| 568 | "audio_path": "segments/song_0012_seg_00.wav", | ||
| 569 | "duration": 5.0, | ||
| 570 | "type": "clean", | ||
| 571 | "offset": 7.253242125518553, | ||
| 572 | "segment_type": "mid" | ||
| 573 | }, | ||
| 574 | { | ||
| 575 | "song_id": "song_0012", | ||
| 576 | "audio_path": "segments/song_0012_seg_01.wav", | ||
| 577 | "duration": 5.0, | ||
| 578 | "type": "clean", | ||
| 579 | "offset": 6.880436512027717, | ||
| 580 | "segment_type": "mid" | ||
| 581 | }, | ||
| 582 | { | ||
| 583 | "song_id": "song_0012", | ||
| 584 | "audio_path": "segments/song_0012_seg_02_augmented.wav", | ||
| 585 | "duration": 5.0, | ||
| 586 | "type": "augmented", | ||
| 587 | "offset": 0.26647154963833186, | ||
| 588 | "segment_type": "intro" | ||
| 589 | }, | ||
| 590 | { | ||
| 591 | "song_id": "song_0012", | ||
| 592 | "audio_path": "segments/song_0012_seg_03_humming_like.wav", | ||
| 593 | "duration": 5.0, | ||
| 594 | "type": "humming_like", | ||
| 595 | "offset": 7.214001122963067, | ||
| 596 | "segment_type": "mid" | ||
| 597 | }, | ||
| 598 | { | ||
| 599 | "song_id": "song_0012", | ||
| 600 | "audio_path": "segments/song_0012_seg_04_confused.wav", | ||
| 601 | "duration": 5.0, | ||
| 602 | "type": "confused", | ||
| 603 | "offset": 1.4777570830033182, | ||
| 604 | "segment_type": "intro" | ||
| 605 | }, | ||
| 606 | { | ||
| 607 | "song_id": "song_0012", | ||
| 608 | "audio_path": "songs/song_0012.wav", | ||
| 609 | "duration": 15.0, | ||
| 610 | "base_freq": 440.0, | ||
| 611 | "type": "reference" | ||
| 612 | }, | ||
| 613 | { | ||
| 614 | "song_id": "song_0013", | ||
| 615 | "audio_path": "segments/song_0013_seg_00.wav", | ||
| 616 | "duration": 5.0, | ||
| 617 | "type": "clean", | ||
| 618 | "offset": 3.3711217932975037, | ||
| 619 | "segment_type": "mid" | ||
| 620 | }, | ||
| 621 | { | ||
| 622 | "song_id": "song_0013", | ||
| 623 | "audio_path": "segments/song_0013_seg_01.wav", | ||
| 624 | "duration": 5.0, | ||
| 625 | "type": "clean", | ||
| 626 | "offset": 2.95024257658282, | ||
| 627 | "segment_type": "intro" | ||
| 628 | }, | ||
| 629 | { | ||
| 630 | "song_id": "song_0013", | ||
| 631 | "audio_path": "segments/song_0013_seg_02_augmented.wav", | ||
| 632 | "duration": 5.0, | ||
| 633 | "type": "augmented", | ||
| 634 | "offset": 6.7440113989474435, | ||
| 635 | "segment_type": "mid" | ||
| 636 | }, | ||
| 637 | { | ||
| 638 | "song_id": "song_0013", | ||
| 639 | "audio_path": "segments/song_0013_seg_03_humming_like.wav", | ||
| 640 | "duration": 5.0, | ||
| 641 | "type": "humming_like", | ||
| 642 | "offset": 3.27926658740176, | ||
| 643 | "segment_type": "mid" | ||
| 644 | }, | ||
| 645 | { | ||
| 646 | "song_id": "song_0013", | ||
| 647 | "audio_path": "segments/song_0013_seg_04_confused.wav", | ||
| 648 | "duration": 5.0, | ||
| 649 | "type": "confused", | ||
| 650 | "offset": 0.06830120539555451, | ||
| 651 | "segment_type": "intro" | ||
| 652 | }, | ||
| 653 | { | ||
| 654 | "song_id": "song_0013", | ||
| 655 | "audio_path": "songs/song_0013.wav", | ||
| 656 | "duration": 15.0, | ||
| 657 | "base_freq": 493.88, | ||
| 658 | "type": "reference" | ||
| 659 | }, | ||
| 660 | { | ||
| 661 | "song_id": "song_0014", | ||
| 662 | "audio_path": "segments/song_0014_seg_00.wav", | ||
| 663 | "duration": 5.0, | ||
| 664 | "type": "clean", | ||
| 665 | "offset": 4.389628114874606, | ||
| 666 | "segment_type": "mid" | ||
| 667 | }, | ||
| 668 | { | ||
| 669 | "song_id": "song_0014", | ||
| 670 | "audio_path": "segments/song_0014_seg_01.wav", | ||
| 671 | "duration": 5.0, | ||
| 672 | "type": "clean", | ||
| 673 | "offset": 5.397598089074283, | ||
| 674 | "segment_type": "mid" | ||
| 675 | }, | ||
| 676 | { | ||
| 677 | "song_id": "song_0014", | ||
| 678 | "audio_path": "segments/song_0014_seg_02_augmented.wav", | ||
| 679 | "duration": 5.0, | ||
| 680 | "type": "augmented", | ||
| 681 | "offset": 7.543857087472844, | ||
| 682 | "segment_type": "mid" | ||
| 683 | }, | ||
| 684 | { | ||
| 685 | "song_id": "song_0014", | ||
| 686 | "audio_path": "segments/song_0014_seg_03_humming_like.wav", | ||
| 687 | "duration": 5.0, | ||
| 688 | "type": "humming_like", | ||
| 689 | "offset": 5.77474814637882, | ||
| 690 | "segment_type": "mid" | ||
| 691 | }, | ||
| 692 | { | ||
| 693 | "song_id": "song_0014", | ||
| 694 | "audio_path": "segments/song_0014_seg_04_confused.wav", | ||
| 695 | "duration": 5.0, | ||
| 696 | "type": "confused", | ||
| 697 | "offset": 5.212510542649235, | ||
| 698 | "segment_type": "mid" | ||
| 699 | }, | ||
| 700 | { | ||
| 701 | "song_id": "song_0014", | ||
| 702 | "audio_path": "songs/song_0014.wav", | ||
| 703 | "duration": 15.0, | ||
| 704 | "base_freq": 523.25, | ||
| 705 | "type": "reference" | ||
| 706 | }, | ||
| 707 | { | ||
| 708 | "song_id": "song_0015", | ||
| 709 | "audio_path": "segments/song_0015_seg_00.wav", | ||
| 710 | "duration": 5.0, | ||
| 711 | "type": "clean", | ||
| 712 | "offset": 5.3221248501273655, | ||
| 713 | "segment_type": "mid" | ||
| 714 | }, | ||
| 715 | { | ||
| 716 | "song_id": "song_0015", | ||
| 717 | "audio_path": "segments/song_0015_seg_01.wav", | ||
| 718 | "duration": 5.0, | ||
| 719 | "type": "clean", | ||
| 720 | "offset": 4.113385082174164, | ||
| 721 | "segment_type": "mid" | ||
| 722 | }, | ||
| 723 | { | ||
| 724 | "song_id": "song_0015", | ||
| 725 | "audio_path": "segments/song_0015_seg_02_augmented.wav", | ||
| 726 | "duration": 5.0, | ||
| 727 | "type": "augmented", | ||
| 728 | "offset": 0.16726147602629915, | ||
| 729 | "segment_type": "intro" | ||
| 730 | }, | ||
| 731 | { | ||
| 732 | "song_id": "song_0015", | ||
| 733 | "audio_path": "segments/song_0015_seg_03_humming_like.wav", | ||
| 734 | "duration": 5.0, | ||
| 735 | "type": "humming_like", | ||
| 736 | "offset": 4.305732086760379, | ||
| 737 | "segment_type": "mid" | ||
| 738 | }, | ||
| 739 | { | ||
| 740 | "song_id": "song_0015", | ||
| 741 | "audio_path": "segments/song_0015_seg_04_confused.wav", | ||
| 742 | "duration": 5.0, | ||
| 743 | "type": "confused", | ||
| 744 | "offset": 6.197808424119352, | ||
| 745 | "segment_type": "mid" | ||
| 746 | }, | ||
| 747 | { | ||
| 748 | "song_id": "song_0015", | ||
| 749 | "audio_path": "songs/song_0015.wav", | ||
| 750 | "duration": 15.0, | ||
| 751 | "base_freq": 587.33, | ||
| 752 | "type": "reference" | ||
| 753 | } | ||
| 754 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/synthetic_v2/val.json
0 → 100644
| 1 | [ | ||
| 2 | { | ||
| 3 | "song_id": "song_0016", | ||
| 4 | "audio_path": "segments/song_0016_seg_00.wav", | ||
| 5 | "duration": 5.0, | ||
| 6 | "type": "clean", | ||
| 7 | "offset": 7.208994524555927, | ||
| 8 | "segment_type": "mid" | ||
| 9 | }, | ||
| 10 | { | ||
| 11 | "song_id": "song_0016", | ||
| 12 | "audio_path": "segments/song_0016_seg_01.wav", | ||
| 13 | "duration": 5.0, | ||
| 14 | "type": "clean", | ||
| 15 | "offset": 4.958024367228626, | ||
| 16 | "segment_type": "mid" | ||
| 17 | }, | ||
| 18 | { | ||
| 19 | "song_id": "song_0016", | ||
| 20 | "audio_path": "segments/song_0016_seg_02_augmented.wav", | ||
| 21 | "duration": 5.0, | ||
| 22 | "type": "augmented", | ||
| 23 | "offset": 6.1666879203579, | ||
| 24 | "segment_type": "mid" | ||
| 25 | }, | ||
| 26 | { | ||
| 27 | "song_id": "song_0016", | ||
| 28 | "audio_path": "segments/song_0016_seg_03_humming_like.wav", | ||
| 29 | "duration": 5.0, | ||
| 30 | "type": "humming_like", | ||
| 31 | "offset": 8.621983105655142, | ||
| 32 | "segment_type": "mid" | ||
| 33 | }, | ||
| 34 | { | ||
| 35 | "song_id": "song_0016", | ||
| 36 | "audio_path": "segments/song_0016_seg_04_confused.wav", | ||
| 37 | "duration": 5.0, | ||
| 38 | "type": "confused", | ||
| 39 | "offset": 3.004352846791234, | ||
| 40 | "segment_type": "mid" | ||
| 41 | }, | ||
| 42 | { | ||
| 43 | "song_id": "song_0016", | ||
| 44 | "audio_path": "songs/song_0016.wav", | ||
| 45 | "duration": 15.0, | ||
| 46 | "base_freq": 659.25, | ||
| 47 | "type": "reference" | ||
| 48 | }, | ||
| 49 | { | ||
| 50 | "song_id": "song_0017", | ||
| 51 | "audio_path": "segments/song_0017_seg_00.wav", | ||
| 52 | "duration": 5.0, | ||
| 53 | "type": "clean", | ||
| 54 | "offset": 5.277150196277827, | ||
| 55 | "segment_type": "mid" | ||
| 56 | }, | ||
| 57 | { | ||
| 58 | "song_id": "song_0017", | ||
| 59 | "audio_path": "segments/song_0017_seg_01.wav", | ||
| 60 | "duration": 5.0, | ||
| 61 | "type": "clean", | ||
| 62 | "offset": 6.391085856661506, | ||
| 63 | "segment_type": "mid" | ||
| 64 | }, | ||
| 65 | { | ||
| 66 | "song_id": "song_0017", | ||
| 67 | "audio_path": "segments/song_0017_seg_02_augmented.wav", | ||
| 68 | "duration": 5.0, | ||
| 69 | "type": "augmented", | ||
| 70 | "offset": 5.969708292829935, | ||
| 71 | "segment_type": "mid" | ||
| 72 | }, | ||
| 73 | { | ||
| 74 | "song_id": "song_0017", | ||
| 75 | "audio_path": "segments/song_0017_seg_03_humming_like.wav", | ||
| 76 | "duration": 5.0, | ||
| 77 | "type": "humming_like", | ||
| 78 | "offset": 6.1736267933642495, | ||
| 79 | "segment_type": "mid" | ||
| 80 | }, | ||
| 81 | { | ||
| 82 | "song_id": "song_0017", | ||
| 83 | "audio_path": "segments/song_0017_seg_04_confused.wav", | ||
| 84 | "duration": 5.0, | ||
| 85 | "type": "confused", | ||
| 86 | "offset": 1.1786165266165671, | ||
| 87 | "segment_type": "intro" | ||
| 88 | }, | ||
| 89 | { | ||
| 90 | "song_id": "song_0017", | ||
| 91 | "audio_path": "songs/song_0017.wav", | ||
| 92 | "duration": 15.0, | ||
| 93 | "base_freq": 698.46, | ||
| 94 | "type": "reference" | ||
| 95 | }, | ||
| 96 | { | ||
| 97 | "song_id": "song_0018", | ||
| 98 | "audio_path": "segments/song_0018_seg_00.wav", | ||
| 99 | "duration": 5.0, | ||
| 100 | "type": "clean", | ||
| 101 | "offset": 6.641438208318426, | ||
| 102 | "segment_type": "mid" | ||
| 103 | }, | ||
| 104 | { | ||
| 105 | "song_id": "song_0018", | ||
| 106 | "audio_path": "segments/song_0018_seg_01.wav", | ||
| 107 | "duration": 5.0, | ||
| 108 | "type": "clean", | ||
| 109 | "offset": 3.582227293409872, | ||
| 110 | "segment_type": "mid" | ||
| 111 | }, | ||
| 112 | { | ||
| 113 | "song_id": "song_0018", | ||
| 114 | "audio_path": "segments/song_0018_seg_02_augmented.wav", | ||
| 115 | "duration": 5.0, | ||
| 116 | "type": "augmented", | ||
| 117 | "offset": 0.6333068606017467, | ||
| 118 | "segment_type": "intro" | ||
| 119 | }, | ||
| 120 | { | ||
| 121 | "song_id": "song_0018", | ||
| 122 | "audio_path": "segments/song_0018_seg_03_humming_like.wav", | ||
| 123 | "duration": 5.0, | ||
| 124 | "type": "humming_like", | ||
| 125 | "offset": 3.3775515517078736, | ||
| 126 | "segment_type": "mid" | ||
| 127 | }, | ||
| 128 | { | ||
| 129 | "song_id": "song_0018", | ||
| 130 | "audio_path": "segments/song_0018_seg_04_confused.wav", | ||
| 131 | "duration": 5.0, | ||
| 132 | "type": "confused", | ||
| 133 | "offset": 6.825519260932059, | ||
| 134 | "segment_type": "mid" | ||
| 135 | }, | ||
| 136 | { | ||
| 137 | "song_id": "song_0018", | ||
| 138 | "audio_path": "songs/song_0018.wav", | ||
| 139 | "duration": 15.0, | ||
| 140 | "base_freq": 783.99, | ||
| 141 | "type": "reference" | ||
| 142 | }, | ||
| 143 | { | ||
| 144 | "song_id": "song_0019", | ||
| 145 | "audio_path": "segments/song_0019_seg_00.wav", | ||
| 146 | "duration": 5.0, | ||
| 147 | "type": "clean", | ||
| 148 | "offset": 6.405372883123518, | ||
| 149 | "segment_type": "mid" | ||
| 150 | }, | ||
| 151 | { | ||
| 152 | "song_id": "song_0019", | ||
| 153 | "audio_path": "segments/song_0019_seg_01.wav", | ||
| 154 | "duration": 5.0, | ||
| 155 | "type": "clean", | ||
| 156 | "offset": 5.376553581360508, | ||
| 157 | "segment_type": "mid" | ||
| 158 | }, | ||
| 159 | { | ||
| 160 | "song_id": "song_0019", | ||
| 161 | "audio_path": "segments/song_0019_seg_02_augmented.wav", | ||
| 162 | "duration": 5.0, | ||
| 163 | "type": "augmented", | ||
| 164 | "offset": 1.5268044380447066, | ||
| 165 | "segment_type": "intro" | ||
| 166 | }, | ||
| 167 | { | ||
| 168 | "song_id": "song_0019", | ||
| 169 | "audio_path": "segments/song_0019_seg_03_humming_like.wav", | ||
| 170 | "duration": 5.0, | ||
| 171 | "type": "humming_like", | ||
| 172 | "offset": 5.864371630124319, | ||
| 173 | "segment_type": "mid" | ||
| 174 | }, | ||
| 175 | { | ||
| 176 | "song_id": "song_0019", | ||
| 177 | "audio_path": "segments/song_0019_seg_04_confused.wav", | ||
| 178 | "duration": 5.0, | ||
| 179 | "type": "confused", | ||
| 180 | "offset": 4.37486043050575, | ||
| 181 | "segment_type": "mid" | ||
| 182 | }, | ||
| 183 | { | ||
| 184 | "song_id": "song_0019", | ||
| 185 | "audio_path": "songs/song_0019.wav", | ||
| 186 | "duration": 15.0, | ||
| 187 | "base_freq": 880.0, | ||
| 188 | "type": "reference" | ||
| 189 | } | ||
| 190 | ] | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
acr-engine/data/tmp_catalog.json
0 → 100644
| ... | @@ -71,6 +71,7 @@ def main(): | ... | @@ -71,6 +71,7 @@ def main(): |
| 71 | }) | 71 | }) |
| 72 | 72 | ||
| 73 | total = len(queries) | 73 | total = len(queries) |
| 74 | confusion_focus = {k:v for k,v in by_type.items() if k in {"confused", "humming_like"}} | ||
| 74 | report = { | 75 | report = { |
| 75 | "split": args.split, | 76 | "split": args.split, |
| 76 | "num_queries": total, | 77 | "num_queries": total, |
| ... | @@ -84,6 +85,10 @@ def main(): | ... | @@ -84,6 +85,10 @@ def main(): |
| 84 | } | 85 | } |
| 85 | for k, v in by_type.items() | 86 | for k, v in by_type.items() |
| 86 | }, | 87 | }, |
| 88 | "hard_case_summary": { | ||
| 89 | k: {"n": v["n"], "top1": round(v["top1"]/v["n"],4) if v["n"] else 0.0, "topk": round(v["topk"]/v["n"],4) if v["n"] else 0.0} | ||
| 90 | for k,v in confusion_focus.items() | ||
| 91 | }, | ||
| 87 | "sample_failures": failures[:10], | 92 | "sample_failures": failures[:10], |
| 88 | } | 93 | } |
| 89 | print(json.dumps(report, ensure_ascii=False, indent=2)) | 94 | print(json.dumps(report, ensure_ascii=False, indent=2)) | ... | ... |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
acr-engine/src/data/external_adapters.py
0 → 100644
| 1 | """Dataset adapter skeletons for external/open music corpora.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
| 5 | from dataclasses import dataclass, asdict | ||
| 6 | from pathlib import Path | ||
| 7 | from typing import Dict, List | ||
| 8 | import argparse | ||
| 9 | import json | ||
| 10 | |||
| 11 | |||
@dataclass
class DatasetRecord:
    """Registry entry describing one external dataset and its licensing status."""

    name: str  # dataset display name, e.g. "FMA"
    source_url: str  # upstream landing page / repository URL
    license: str  # free-text license summary
    commercial_use: str  # review status; values used in this file: "review_required", "deny_until_whitelisted"
    notes: str  # free-text remark about the dataset
| 19 | |||
| 20 | |||
class BaseAdapter:
    """Common base for external dataset adapters.

    Subclasses set ``name`` and implement :meth:`describe`; the base class
    provides the directory bootstrap shared by every dataset.
    """

    name = "base"

    def describe(self) -> Dict:
        """Return a dict describing the dataset. Must be overridden."""
        raise NotImplementedError

    def init_layout(self, root: Path) -> Dict:
        """Create the working-directory skeleton for this dataset.

        Creates ``root`` (including parents) with the sub-directories ``raw``,
        ``processed``, ``manifests`` and ``licenses``, then writes
        ``manifests/bootstrap.json``.

        Args:
            root: Directory that will hold the dataset.

        Returns:
            The manifest dict that was written to ``bootstrap.json``.
        """
        root.mkdir(parents=True, exist_ok=True)
        for sub in ("raw", "processed", "manifests", "licenses"):
            (root / sub).mkdir(exist_ok=True)
        manifest = {
            "dataset": self.name,
            "root": str(root),
            "status": "initialized",
            "next_steps": [
                "download raw audio according to upstream license terms",
                "convert to catalog/query manifests",
                "record license evidence before training",
            ],
        }
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # opened as UTF-8 explicitly instead of relying on the platform locale.
        with open(root / "manifests" / "bootstrap.json", "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        return manifest
| 44 | |||
| 45 | |||
class FMAAdapter(BaseAdapter):
    """Adapter for the FMA corpus."""

    name = "fma"

    def describe(self) -> Dict:
        """Summarise source, suggested subset, catalog strategy and license policy."""
        return dict(
            name="FMA",
            source_url="https://github.com/mdeff/fma",
            recommended_subset="fma_small",
            catalog_strategy="full tracks as references; random 5-15s crops as queries",
            license_policy="review per subset/track before commercial training",
        )
| 57 | |||
| 58 | |||
class MTGJamendoAdapter(BaseAdapter):
    """Adapter for the MTG-Jamendo corpus."""

    name = "mtg_jamendo"

    def describe(self) -> Dict:
        """Summarise source, suggested subset, catalog strategy and license policy."""
        summary: Dict = {}
        summary["name"] = "MTG-Jamendo"
        summary["source_url"] = "https://github.com/MTG/mtg-jamendo-dataset"
        summary["recommended_subset"] = "small curated slice"
        summary["catalog_strategy"] = "download upstream audio subset then build catalog/query manifests"
        summary["license_policy"] = "verify CC terms for intended commercial use"
        return summary
| 70 | |||
| 71 | |||
class CCMusicAdapter(BaseAdapter):
    """Adapter for the CCMusic database."""

    name = "ccmusic"

    def describe(self) -> Dict:
        """Summarise source, suggested subset, catalog strategy and license policy."""
        return dict(
            name="CCMusic",
            source_url="https://ccmusic-database.github.io/en/database/ccm.html",
            recommended_subset="whitelisted approved subset only",
            catalog_strategy="use approved corpora only; normalize to project manifests",
            license_policy="application/permission review required before use",
        )
| 83 | |||
| 84 | |||
class ModelScopeMusicAdapter(BaseAdapter):
    """Adapter placeholder for music datasets listed on ModelScope."""

    name = "modelscope_music"

    def describe(self) -> Dict:
        """Summarise source, suggested subset, catalog strategy and license policy."""
        fields = (
            ("name", "ModelScope music datasets"),
            ("source_url", "https://modelscope.cn/search?page=1&search=music&type=dataset"),
            ("recommended_subset", "manual whitelist only"),
            ("catalog_strategy", "treat as discovery surface; add per-dataset adapter after legal review"),
            ("license_policy", "deny until whitelisted"),
        )
        return dict(fields)
| 96 | |||
| 97 | |||
# Lookup table from CLI dataset key to a ready-to-use adapter instance;
# each adapter's ``name`` attribute is its key.
ADAPTERS = {
    adapter.name: adapter
    for adapter in (
        FMAAdapter(),
        MTGJamendoAdapter(),
        CCMusicAdapter(),
        ModelScopeMusicAdapter(),
    )
}
| 104 | |||
# Licensing registry for the known external datasets; serialised by write_registry().
REGISTRY: List[DatasetRecord] = [
    DatasetRecord(**entry)
    for entry in (
        dict(
            name="FMA",
            source_url="https://github.com/mdeff/fma",
            license="Track-dependent / metadata CC BY 4.0; verify per subset",
            commercial_use="review_required",
            notes="Good first realistic MIR baseline",
        ),
        dict(
            name="MTG-Jamendo",
            source_url="https://github.com/MTG/mtg-jamendo-dataset",
            license="Creative Commons source tracks; verify exact subset terms",
            commercial_use="review_required",
            notes="Good retrieval/tagging corpus with scripts",
        ),
        dict(
            name="CCMusic",
            source_url="https://ccmusic-database.github.io/en/database/ccm.html",
            license="varies / application may be required",
            commercial_use="review_required",
            notes="Useful Chinese MIR source, needs permission review",
        ),
        dict(
            name="ModelScope-music",
            source_url="https://modelscope.cn/search?page=1&search=music&type=dataset",
            license="varies by dataset",
            commercial_use="deny_until_whitelisted",
            notes="Discovery surface only until per-dataset review is complete",
        ),
    )
]
| 135 | |||
| 136 | |||
def write_registry(output_path: str) -> Path:
    """Serialise ``REGISTRY`` to a JSON file.

    Args:
        output_path: Destination file; missing parent directories are created.

    Returns:
        The destination as a ``Path``.
    """
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes raw non-ASCII characters, so force UTF-8 rather
    # than depending on the platform's default text encoding.
    with open(out, "w", encoding="utf-8") as f:
        json.dump([asdict(x) for x in REGISTRY], f, indent=2, ensure_ascii=False)
    return out
| 143 | |||
| 144 | |||
def main():
    """CLI entry point with three sub-commands.

    ``registry`` writes the dataset registry JSON and prints its path;
    ``init <dataset>`` bootstraps the dataset directory and prints the manifest;
    ``describe <dataset>`` prints the adapter's description.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    registry_cmd = commands.add_parser("registry")
    registry_cmd.add_argument("--output", default="data/dataset_registry.json")

    init_cmd = commands.add_parser("init")
    init_cmd.add_argument("dataset", choices=sorted(ADAPTERS))
    init_cmd.add_argument("--root", default="data/external")

    describe_cmd = commands.add_parser("describe")
    describe_cmd.add_argument("dataset", choices=sorted(ADAPTERS))

    args = parser.parse_args()

    if args.cmd == "registry":
        print(write_registry(args.output))
    elif args.cmd in ("init", "describe"):
        adapter = ADAPTERS[args.dataset]
        if args.cmd == "init":
            payload = adapter.init_layout(Path(args.root) / args.dataset)
        else:
            payload = adapter.describe()
        print(json.dumps(payload, indent=2, ensure_ascii=False))
| 169 | |||
| 170 | if __name__ == "__main__": | ||
| 171 | main() |
acr-engine/src/data/manifest_tools.py
0 → 100644
| 1 | """External dataset manifest conversion templates.""" | ||
| 2 | |||
| 3 | from __future__ import annotations | ||
| 4 | |||
| 5 | import argparse | ||
| 6 | import csv | ||
| 7 | import json | ||
| 8 | from pathlib import Path | ||
| 9 | from typing import List, Dict | ||
| 10 | |||
| 11 | |||
def write_catalog(records: List[Dict], output_path: Path):
    """Write catalog records to ``output_path`` as pretty-printed JSON.

    Args:
        records: JSON-serialisable catalog entries.
        output_path: Destination file; missing parent directories are created.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes raw non-ASCII characters (e.g. song titles or
    # paths), so the file is opened as UTF-8 explicitly instead of using the
    # platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
| 16 | |||
| 17 | |||
def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_path", id_field: str = "song_id"):
    """Convert a CSV listing of tracks into a reference catalog JSON file.

    Every CSV row becomes one record of type ``"reference"``.

    Args:
        csv_path: Input CSV with a header row.
        output_path: Destination JSON file, written via ``write_catalog``.
        path_field: Name of the CSV column holding the audio path.
        id_field: Name of the CSV column holding the song id.

    Returns:
        The number of records written.

    Raises:
        KeyError: If a row lacks ``id_field`` or ``path_field``.
        ValueError: If a non-empty ``duration`` value is not a number.
    """
    records = []
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM (common in
    # spreadsheet exports); otherwise the BOM is glued onto the first header
    # name and the id/path lookup below fails with KeyError.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            records.append(
                {
                    "song_id": row[id_field],
                    "audio_path": row[path_field],
                    # A missing or empty duration falls back to 0.0.
                    "duration": float(row.get("duration", 0.0) or 0.0),
                    "type": "reference",
                    "source_dataset": row.get("source_dataset", "external"),
                }
            )
    write_catalog(records, output_path)
    return len(records)
| 34 | |||
| 35 | |||
def main():
    """CLI entry point: ``csv-to-catalog <csv_path> <output_path>``.

    Converts the CSV into a catalog file and prints a one-line JSON status
    containing the number of records written.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    convert = commands.add_parser("csv-to-catalog")
    convert.add_argument("csv_path")
    convert.add_argument("output_path")
    convert.add_argument("--path-field", default="audio_path")
    convert.add_argument("--id-field", default="song_id")

    args = parser.parse_args()
    if args.cmd != "csv-to-catalog":
        return

    count = csv_to_catalog(
        Path(args.csv_path),
        Path(args.output_path),
        path_field=args.path_field,
        id_field=args.id_field,
    )
    status = {"status": "ok", "records": count}
    print(json.dumps(status, ensure_ascii=False))
| 50 | |||
| 51 | |||
| 52 | if __name__ == "__main__": | ||
| 53 | main() |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
| 1 | """ | 1 | """Hybrid ACR Engine: Chromaprint + ECAPA + melody-aware re-ranking.""" |
| 2 | Hybrid ACR Engine: Chromaprint fast pre-filter + ECAPA-TDNN deep re-ranking. | ||
| 3 | """ | ||
| 4 | 2 | ||
| 5 | import json | 3 | import json |
| 6 | import time | 4 | import time |
| 5 | from pathlib import Path | ||
| 7 | from typing import Dict, List, Optional | 6 | from typing import Dict, List, Optional |
| 8 | 7 | ||
| 9 | import librosa | 8 | import librosa |
| 10 | import numpy as np | 9 | import numpy as np |
| 11 | 10 | ||
| 11 | from src.utils.audio import AudioProcessor | ||
| 12 | |||
| 12 | 13 | ||
| 13 | class Candidate: | 14 | class Candidate: |
| 14 | def __init__(self, song_id: str, chroma_score: float = 0.0, ecapa_score: float = 0.0): | 15 | def __init__(self, song_id: str, chroma_score: float = 0.0, ecapa_score: float = 0.0, melody_score: float = 0.0): |
| 15 | self.song_id = song_id | 16 | self.song_id = song_id |
| 16 | self.chroma_score = chroma_score | 17 | self.chroma_score = chroma_score |
| 17 | self.ecapa_score = ecapa_score | 18 | self.ecapa_score = ecapa_score |
| 19 | self.melody_score = melody_score | ||
| 18 | self.metadata: Dict = {} | 20 | self.metadata: Dict = {} |
| 19 | 21 | ||
| 20 | def combined_score(self, chroma_weight: float, ecapa_weight: float) -> float: | 22 | def combined_score(self, chroma_weight: float, ecapa_weight: float, melody_weight: float) -> float: |
| 21 | return chroma_weight * self.chroma_score + ecapa_weight * self.ecapa_score | 23 | return ( |
| 22 | 24 | chroma_weight * self.chroma_score | |
| 23 | def __repr__(self): | 25 | + ecapa_weight * self.ecapa_score |
| 24 | return f"Candidate({self.song_id}, chroma={self.chroma_score:.3f}, ecapa={self.ecapa_score:.3f})" | 26 | + melody_weight * self.melody_score |
| 27 | ) | ||
| 25 | 28 | ||
| 26 | 29 | ||
| 27 | class HybridEngine: | 30 | class HybridEngine: |
| ... | @@ -32,8 +35,9 @@ class HybridEngine: | ... | @@ -32,8 +35,9 @@ class HybridEngine: |
| 32 | ref_embs: Optional[np.ndarray] = None, | 35 | ref_embs: Optional[np.ndarray] = None, |
| 33 | ref_ids: Optional[List[str]] = None, | 36 | ref_ids: Optional[List[str]] = None, |
| 34 | sr: int = 16000, | 37 | sr: int = 16000, |
| 35 | chroma_weight: float = 0.35, | 38 | chroma_weight: float = 0.25, |
| 36 | ecapa_weight: float = 0.65, | 39 | ecapa_weight: float = 0.5, |
| 40 | melody_weight: float = 0.25, | ||
| 37 | reject_threshold: float = 0.35, | 41 | reject_threshold: float = 0.35, |
| 38 | ): | 42 | ): |
| 39 | self.chroma = chroma_matcher | 43 | self.chroma = chroma_matcher |
| ... | @@ -43,12 +47,16 @@ class HybridEngine: | ... | @@ -43,12 +47,16 @@ class HybridEngine: |
| 43 | self.sr = sr | 47 | self.sr = sr |
| 44 | self.chroma_weight = chroma_weight | 48 | self.chroma_weight = chroma_weight |
| 45 | self.ecapa_weight = ecapa_weight | 49 | self.ecapa_weight = ecapa_weight |
| 50 | self.melody_weight = melody_weight | ||
| 46 | self.reject_threshold = reject_threshold | 51 | self.reject_threshold = reject_threshold |
| 47 | self.song_metadata: Dict[str, Dict] = {} | 52 | self.song_metadata: Dict[str, Dict] = {} |
| 53 | self.song_audio_paths: Dict[str, str] = {} | ||
| 54 | self.audio = AudioProcessor(sr=sr) | ||
| 48 | 55 | ||
| 49 | def load_metadata(self, metadata_path: str): | 56 | def load_metadata(self, metadata_path: str): |
| 50 | with open(metadata_path) as f: | 57 | with open(metadata_path) as f: |
| 51 | items = json.load(f) | 58 | items = json.load(f) |
| 59 | base_dir = str(Path(metadata_path).parent) | ||
| 52 | for item in items: | 60 | for item in items: |
| 53 | sid = item["song_id"] | 61 | sid = item["song_id"] |
| 54 | existing = self.song_metadata.get(sid, {}) | 62 | existing = self.song_metadata.get(sid, {}) |
| ... | @@ -59,15 +67,15 @@ class HybridEngine: | ... | @@ -59,15 +67,15 @@ class HybridEngine: |
| 59 | "audio_path": item.get("audio_path", existing.get("audio_path", "")), | 67 | "audio_path": item.get("audio_path", existing.get("audio_path", "")), |
| 60 | "type": item.get("type", existing.get("type", "unknown")), | 68 | "type": item.get("type", existing.get("type", "unknown")), |
| 61 | } | 69 | } |
| 70 | if item.get("type") == "reference": | ||
| 71 | self.song_audio_paths[sid] = str(Path(base_dir) / item["audio_path"]) | ||
| 62 | 72 | ||
| 63 | @staticmethod | 73 | @staticmethod |
| 64 | def _normalize_scores(score_pairs: List[tuple], invert: bool = False) -> Dict[str, float]: | 74 | def _normalize_scores(score_pairs: List[tuple]) -> Dict[str, float]: |
| 65 | if not score_pairs: | 75 | if not score_pairs: |
| 66 | return {} | 76 | return {} |
| 67 | ids = [sid for sid, _ in score_pairs] | 77 | ids = [sid for sid, _ in score_pairs] |
| 68 | values = np.array([float(score) for _, score in score_pairs], dtype=np.float32) | 78 | values = np.array([float(score) for _, score in score_pairs], dtype=np.float32) |
| 69 | if invert: | ||
| 70 | values = -values | ||
| 71 | if len(values) == 1: | 79 | if len(values) == 1: |
| 72 | return {ids[0]: 1.0} | 80 | return {ids[0]: 1.0} |
| 73 | vmin = float(values.min()) | 81 | vmin = float(values.min()) |
| ... | @@ -77,12 +85,18 @@ class HybridEngine: | ... | @@ -77,12 +85,18 @@ class HybridEngine: |
| 77 | norm = (values - vmin) / (vmax - vmin) | 85 | norm = (values - vmin) / (vmax - vmin) |
| 78 | return {sid: float(score) for sid, score in zip(ids, norm)} | 86 | return {sid: float(score) for sid, score in zip(ids, norm)} |
| 79 | 87 | ||
| 80 | def recognize( | 88 | def _melody_scores(self, query_y: np.ndarray, candidate_ids: List[str]) -> Dict[str, float]: |
| 81 | self, | 89 | scores = [] |
| 82 | audio_path: str, | 90 | for song_id in candidate_ids: |
| 83 | top_n: int = 5, | 91 | ref_path = self.song_audio_paths.get(song_id) |
| 84 | mode: str = "auto", | 92 | if not ref_path or not Path(ref_path).exists(): |
| 85 | ) -> Dict: | 93 | continue |
| 94 | ref_y, _ = librosa.load(ref_path, sr=self.sr, mono=True, duration=8.0) | ||
| 95 | score = self.audio.melody_similarity(query_y, ref_y) | ||
| 96 | scores.append((song_id, score)) | ||
| 97 | return self._normalize_scores(scores) | ||
| 98 | |||
| 99 | def recognize(self, audio_path: str, top_n: int = 5, mode: str = "auto") -> Dict: | ||
| 86 | del mode | 100 | del mode |
| 87 | start = time.time() | 101 | start = time.time() |
| 88 | y, _ = librosa.load(audio_path, sr=self.sr, mono=True) | 102 | y, _ = librosa.load(audio_path, sr=self.sr, mono=True) |
| ... | @@ -96,41 +110,45 @@ class HybridEngine: | ... | @@ -96,41 +110,45 @@ class HybridEngine: |
| 96 | ref_norm = self.ref_embs / (np.linalg.norm(self.ref_embs, axis=1, keepdims=True) + 1e-12) | 110 | ref_norm = self.ref_embs / (np.linalg.norm(self.ref_embs, axis=1, keepdims=True) + 1e-12) |
| 97 | query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-12) | 111 | query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-12) |
| 98 | scores = query_norm @ ref_norm.T | 112 | scores = query_norm @ ref_norm.T |
| 99 | top_indices = np.argsort(-scores)[: max(top_n * 5, 20)] | 113 | top_indices = np.argsort(-scores)[: max(top_n * 10, 30)] |
| 100 | ecapa_matches = [(self.ref_ids[idx], float(scores[idx])) for idx in top_indices] | 114 | ecapa_matches = [(self.ref_ids[idx], float(scores[idx])) for idx in top_indices] |
| 101 | ecapa_norm = self._normalize_scores(ecapa_matches) | 115 | ecapa_norm = self._normalize_scores(ecapa_matches) |
| 102 | 116 | ||
| 103 | all_song_ids = set(chroma_norm) | set(ecapa_norm) | 117 | candidate_pool = list(set(list(chroma_norm.keys())[: top_n * 8] + list(ecapa_norm.keys())[: top_n * 8])) |
| 118 | melody_norm = self._melody_scores(y, candidate_pool) | ||
| 119 | |||
| 120 | all_song_ids = set(candidate_pool) | set(melody_norm) | ||
| 104 | combined: List[Candidate] = [] | 121 | combined: List[Candidate] = [] |
| 105 | for song_id in all_song_ids: | 122 | for song_id in all_song_ids: |
| 106 | candidate = Candidate( | 123 | candidate = Candidate( |
| 107 | song_id=song_id, | 124 | song_id=song_id, |
| 108 | chroma_score=chroma_norm.get(song_id, 0.0), | 125 | chroma_score=chroma_norm.get(song_id, 0.0), |
| 109 | ecapa_score=ecapa_norm.get(song_id, 0.0), | 126 | ecapa_score=ecapa_norm.get(song_id, 0.0), |
| 127 | melody_score=melody_norm.get(song_id, 0.0), | ||
| 110 | ) | 128 | ) |
| 111 | candidate.metadata = self.song_metadata.get(song_id, {}) | 129 | candidate.metadata = self.song_metadata.get(song_id, {}) |
| 112 | combined.append(candidate) | 130 | combined.append(candidate) |
| 113 | 131 | ||
| 114 | combined.sort(key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight), reverse=True) | 132 | combined.sort( |
| 133 | key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight), | ||
| 134 | reverse=True, | ||
| 135 | ) | ||
| 115 | results = combined[:top_n] | 136 | results = combined[:top_n] |
| 116 | elapsed = (time.time() - start) * 1000 | 137 | elapsed = (time.time() - start) * 1000 |
| 117 | 138 | ||
| 118 | output = [] | 139 | output = [] |
| 119 | for c in results: | 140 | for c in results: |
| 120 | fused = c.combined_score(self.chroma_weight, self.ecapa_weight) | 141 | fused = c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight) |
| 121 | output.append( | 142 | output.append( |
| 122 | { | 143 | { |
| 123 | "song_id": c.song_id, | 144 | "song_id": c.song_id, |
| 124 | "confidence": round(fused, 4), | 145 | "confidence": round(fused, 4), |
| 125 | "chromaprint_score": round(c.chroma_score, 4), | 146 | "chromaprint_score": round(c.chroma_score, 4), |
| 126 | "ecapa_score": round(c.ecapa_score, 4), | 147 | "ecapa_score": round(c.ecapa_score, 4), |
| 148 | "melody_score": round(c.melody_score, 4), | ||
| 127 | "accepted": fused >= self.reject_threshold, | 149 | "accepted": fused >= self.reject_threshold, |
| 128 | "metadata": c.metadata, | 150 | "metadata": c.metadata, |
| 129 | } | 151 | } |
| 130 | ) | 152 | ) |
| 131 | 153 | ||
| 132 | return { | 154 | return {"candidates": output, "processing_time_ms": round(elapsed, 1), "num_candidates": len(results)} |
| 133 | "candidates": output, | ||
| 134 | "processing_time_ms": round(elapsed, 1), | ||
| 135 | "num_candidates": len(results), | ||
| 136 | } | ... | ... |
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
acr-engine/src/service/app.py
0 → 100644
| 1 | from pathlib import Path | ||
| 2 | from typing import Optional | ||
| 3 | |||
| 4 | import numpy as np | ||
| 5 | from fastapi import FastAPI, HTTPException | ||
| 6 | from pydantic import BaseModel | ||
| 7 | |||
| 8 | from src.engines.chromaprint_matcher import ChromaprintMatcher | ||
| 9 | from src.engines.ecapa_embedder import ECAPAEmbedder | ||
| 10 | from src.engines.hybrid_engine import HybridEngine | ||
| 11 | |||
| 12 | |||
# Request body for POST /recognize. Every path is resolved on the server's
# filesystem by the handler, not uploaded by the client.
class RecognizeRequest(BaseModel):
    # Audio file to identify; the handler checks that it exists before loading the engine.
    query_path: str
    # Directory searched for catalog.json / train.json / val.json / test.json metadata.
    data_dir: str = "data/synthetic_v2"
    # Checkpoint path handed to ECAPAEmbedder.
    model_path: str = "data/models_v3/best_model.pt"
    # Prefix of the embedding index: "<prefix>_embs.npy" and "<prefix>_ids.npy" are read,
    # and "chromaprint.pkl" is looked up in the prefix's parent directory.
    index_prefix: str = "data/index_v3/reference"
    # Number of ranked candidates requested from the engine.
    top_n: int = 5
    # Device string forwarded to ECAPAEmbedder.
    device: str = "cpu"
| 20 | |||
| 21 | |||
# Request body for POST /index/build. Every path is resolved on the server's filesystem.
class BuildIndexRequest(BaseModel):
    # Dataset directory passed to both index builders.
    data_dir: str
    # Checkpoint path passed to the embedding index builder.
    model_path: str
    # Directory that receives the indexes; created (with parents) if absent.
    output_dir: str
    # Device string forwarded to the embedding index builder.
    device: str = "cpu"
| 27 | |||
| 28 | |||
| 29 | app = FastAPI(title="ACR Service", version="0.1.0") | ||
| 30 | |||
| 31 | |||
def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) -> HybridEngine:
    """Assemble a HybridEngine from on-disk artifacts.

    Nothing is cached: every call re-reads the chromaprint index, the model
    checkpoint, the embedding index and the metadata manifests.

    Args:
        data_dir: Directory searched for ``catalog.json``, ``train.json``,
            ``val.json`` and ``test.json``; each one that exists is fed to
            ``HybridEngine.load_metadata``.
        model_path: Checkpoint path handed to ``ECAPAEmbedder``.
        index_prefix: Prefix of the embedding index. ``<prefix>_embs.npy`` and
            ``<prefix>_ids.npy`` are loaded, and ``chromaprint.pkl`` is looked
            up in the prefix's parent directory.
        device: Device string forwarded to ``ECAPAEmbedder``.

    Returns:
        A HybridEngine wired with the matcher, embedder, reference embeddings
        and reference ids.

    Raises:
        HTTPException: 400 when the chromaprint index, the model or either
            embedding index file is missing, or when the embedding matrix and
            the id list disagree in length.
    """
    matcher = ChromaprintMatcher()
    chroma_path = str(Path(index_prefix).parent / "chromaprint.pkl")
    if not Path(chroma_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing chromaprint index: {chroma_path}")
    matcher.load(chroma_path)

    if not Path(model_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing model: {model_path}")
    embedder = ECAPAEmbedder(model_path=model_path, device=device)

    embs_path = f"{index_prefix}_embs.npy"
    ids_path = f"{index_prefix}_ids.npy"
    if not Path(embs_path).exists() or not Path(ids_path).exists():
        raise HTTPException(status_code=400, detail="Missing embedding index files")

    ref_embs = np.load(embs_path)
    # NOTE(security): allow_pickle=True lets a crafted .npy file execute
    # arbitrary code while loading. Callers pass request-supplied paths here,
    # so index_prefix must only ever point at trusted directories.
    ref_ids = np.load(ids_path, allow_pickle=True).tolist()

    # The engine looks up ref_ids[i] for row i of ref_embs, so a length
    # mismatch would either mislabel matches or fail mid-request. Reject it
    # up front with a clear message instead.
    n_rows = ref_embs.shape[0] if ref_embs.ndim else 0
    if n_rows != len(ref_ids):
        raise HTTPException(
            status_code=400,
            detail=f"Embedding index mismatch: {n_rows} embeddings vs {len(ref_ids)} ids",
        )

    engine = HybridEngine(matcher, embedder, ref_embs, ref_ids)
    for split in ["catalog.json", "train.json", "val.json", "test.json"]:
        p = Path(data_dir) / split
        if p.exists():
            engine.load_metadata(str(p))
    return engine
| 56 | |||
| 57 | |||
@app.get("/health")
def health():
    """Liveness probe; always reports the service as up."""
    payload = {"status": "ok"}
    return payload
| 61 | |||
| 62 | |||
@app.post("/recognize")
def recognize(req: RecognizeRequest):
    """Identify the audio file at ``req.query_path`` and return ranked candidates.

    A HybridEngine is built from the paths in the request on every call and
    the result of its ``recognize`` method is returned unchanged.

    Raises:
        HTTPException: 400 when ``top_n`` is below 1, when the query path is
            not an existing regular file, or when ``_load_engine`` reports a
            missing artifact.
    """
    # The engine truncates its ranking with a plain slice on top_n, so zero
    # yields nothing and a negative value drops the tail instead of limiting.
    if req.top_n < 1:
        raise HTTPException(status_code=400, detail=f"top_n must be >= 1, got {req.top_n}")
    # is_file() rather than exists(): a directory would pass exists() and only
    # fail later inside the audio loader as an unhandled error.
    if not Path(req.query_path).is_file():
        raise HTTPException(status_code=400, detail=f"Missing query file: {req.query_path}")
    engine = _load_engine(req.data_dir, req.model_path, req.index_prefix, req.device)
    return engine.recognize(req.query_path, top_n=req.top_n)
| 69 | |||
| 70 | |||
@app.post("/index/build")
def build_index(req: BuildIndexRequest):
    """Build the chromaprint and embedding indexes for a dataset directory.

    Both indexes are written under ``req.output_dir`` (created if absent); the
    embedding index uses the ``reference`` prefix inside that directory.

    Returns:
        A dict with ``status``, the number of reference windows indexed and
        the embedding dimensionality (0 when the embedding array is not at
        least two-dimensional).

    Raises:
        HTTPException: 400 when the data directory or the model file does not
            exist.
    """
    # Imported lazily so the service module can be loaded without run_demo.
    from run_demo import build_chroma_index, build_embedding_index

    data_dir = Path(req.data_dir)
    model_path = Path(req.model_path)
    # Validate inputs before touching the filesystem, mirroring the 400
    # responses of the recognize path, so a typo does not leave an empty
    # output directory behind or surface as an unhandled error.
    if not data_dir.is_dir():
        raise HTTPException(status_code=400, detail=f"Missing data directory: {req.data_dir}")
    if not model_path.exists():
        raise HTTPException(status_code=400, detail=f"Missing model: {req.model_path}")

    out_dir = Path(req.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    build_chroma_index(data_dir, out_dir)
    _, ref_embs, ref_ids = build_embedding_index(data_dir, model_path, out_dir / "reference", req.device)
    embedding_dim = int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0
    return {
        "status": "ok",
        "num_reference_windows": len(ref_ids),
        "embedding_dim": embedding_dim,
    }
No preview for this file type
No preview for this file type
No preview for this file type
| 1 | import torch | ||
| 2 | import torch.nn as nn | ||
| 3 | import torch.nn.functional as F | ||
| 4 | import numpy as np | ||
| 5 | import librosa | 1 | import librosa |
| 2 | import numpy as np | ||
| 3 | import torch | ||
| 6 | from typing import List, Optional, Tuple | 4 | from typing import List, Optional, Tuple |
| 7 | 5 | ||
| 8 | 6 | ||
| 9 | class AudioProcessor: | 7 | class AudioProcessor: |
| 10 | def __init__(self, sr: int = 16000, n_mels: int = 80, n_fft: int = 512, hop_length: int = 160): | 8 | def __init__(self, sr: int = 16000, n_mels: int = 128, n_fft: int = 512, hop_length: int = 160): |
| 11 | self.sr = sr | 9 | self.sr = sr |
| 12 | self.n_mels = n_mels | 10 | self.n_mels = n_mels |
| 13 | self.n_fft = n_fft | 11 | self.n_fft = n_fft |
| ... | @@ -19,8 +17,7 @@ class AudioProcessor: | ... | @@ -19,8 +17,7 @@ class AudioProcessor: |
| 19 | 17 | ||
| 20 | def to_mel(self, y: np.ndarray) -> np.ndarray: | 18 | def to_mel(self, y: np.ndarray) -> np.ndarray: |
| 21 | mel = librosa.feature.melspectrogram( | 19 | mel = librosa.feature.melspectrogram( |
| 22 | y=y, sr=self.sr, n_mels=self.n_mels, | 20 | y=y, sr=self.sr, n_mels=self.n_mels, n_fft=self.n_fft, hop_length=self.hop_length |
| 23 | n_fft=self.n_fft, hop_length=self.hop_length | ||
| 24 | ) | 21 | ) |
| 25 | return librosa.power_to_db(mel, ref=np.max) | 22 | return librosa.power_to_db(mel, ref=np.max) |
| 26 | 23 | ||
| ... | @@ -36,7 +33,7 @@ class AudioProcessor: | ... | @@ -36,7 +33,7 @@ class AudioProcessor: |
| 36 | y = np.pad(y, (0, pad)) | 33 | y = np.pad(y, (0, pad)) |
| 37 | windows = [] | 34 | windows = [] |
| 38 | for start in range(0, len(y) - win_len + 1, stride): | 35 | for start in range(0, len(y) - win_len + 1, stride): |
| 39 | windows.append(y[start:start + win_len]) | 36 | windows.append(y[start : start + win_len]) |
| 40 | if not windows: | 37 | if not windows: |
| 41 | windows.append(y[:win_len]) | 38 | windows.append(y[:win_len]) |
| 42 | return windows | 39 | return windows |
| ... | @@ -47,10 +44,32 @@ class AudioProcessor: | ... | @@ -47,10 +44,32 @@ class AudioProcessor: |
| 47 | return self.to_mel_tensor(y), duration | 44 | return self.to_mel_tensor(y), duration |
| 48 | 45 | ||
| 49 | def extract_chroma(self, y: np.ndarray) -> np.ndarray: | 46 | def extract_chroma(self, y: np.ndarray) -> np.ndarray: |
| 50 | chroma = librosa.feature.chroma_cqt(y=y, sr=self.sr) | 47 | return librosa.feature.chroma_cqt(y=y, sr=self.sr) |
| 51 | return chroma | ||
| 52 | 48 | ||
| 53 | def extract_f0(self, y: np.ndarray, fmin=65, fmax=2093) -> np.ndarray: | 49 | def extract_f0(self, y: np.ndarray, fmin=65, fmax=2093) -> np.ndarray: |
| 54 | f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=fmin, fmax=fmax) | 50 | f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=fmin, fmax=fmax) |
| 55 | f0 = np.nan_to_num(f0, nan=0.0) | 51 | return np.nan_to_num(f0, nan=0.0) |
| 56 | return f0 | 52 | |
def melody_signature(self, y: np.ndarray) -> np.ndarray:
    """Summarise the pitch contour of ``y`` as a fixed 32-value float32 vector.

    Frames with a non-positive f0 are discarded. The remaining pitches are
    converted to log2 steps between consecutive frames (the first step is 0),
    clipped to [-0.5, 0.5], and then either zero-padded on the right or
    sampled at 32 evenly spaced positions. An all-zero vector is returned
    when no frame has a positive f0.
    """
    size = 32
    pitch = self.extract_f0(y)
    voiced = pitch[pitch > 0]
    if voiced.size == 0:
        # Covers both an empty f0 track and a track with no voiced frames.
        return np.zeros(size, dtype=np.float32)

    log_pitch = np.log2(voiced + 1e-6)
    steps = np.clip(np.diff(log_pitch, prepend=log_pitch[0]), -0.5, 0.5)

    if steps.size >= size:
        picks = np.linspace(0, steps.size - 1, size).astype(int)
        steps = steps[picks]
    else:
        steps = np.pad(steps, (0, size - steps.size))
    return steps.astype(np.float32)
| 68 | |||
def melody_similarity(self, y1: np.ndarray, y2: np.ndarray) -> float:
    """Cosine similarity between the melody signatures of two signals.

    Returns 0.0 when the product of the signature norms (plus a 1e-12
    stabiliser) does not exceed 1e-12, i.e. when either signature is
    effectively all zeros.
    """
    sig_a, sig_b = self.melody_signature(y1), self.melody_signature(y2)
    scale = float(np.linalg.norm(sig_a) * np.linalg.norm(sig_b) + 1e-12)
    return 0.0 if scale <= 1e-12 else float(np.dot(sig_a, sig_b) / scale)
| ... | @@ -53,3 +53,25 @@ | ... | @@ -53,3 +53,25 @@ |
| 53 | 结论: | 53 | 结论: |
| 54 | - 结构性错误(catalog/index/fusion/评测缺失)已明显改善 | 54 | - 结构性错误(catalog/index/fusion/评测缺失)已明显改善 |
| 55 | - 当前主要剩余短板是 humming_like / confused 的鲁棒识别 | 55 | - 当前主要剩余短板是 humming_like / confused 的鲁棒识别 |
| 56 | |||
| 57 | ## 2026-06-02 | ||
| 58 | |||
| 59 | ### Stage: 工业化服务骨架 + 外部 manifest 转换模板 | ||
| 60 | |||
| 61 | 完成项: | ||
| 62 | - 新增 FastAPI 服务骨架:`acr-engine/src/service/app.py` | ||
| 63 | - 新增 manifest 转换工具:`acr-engine/src/data/manifest_tools.py` | ||
| 64 | - 新增工业 benchmark 文档:`docs/industrial-benchmark-spec.md` | ||
| 65 | - 扩展外部 dataset adapter CLI:`acr-engine/src/data/external_adapters.py` | ||
| 66 | - 新增服务 API 文档:`docs/service-api.md` | ||
| 67 | - requirements 增加 FastAPI / uvicorn / pydantic | ||
| 68 | |||
| 69 | 验证结果: | ||
| 70 | - `external_adapters.py registry` 成功 | ||
| 71 | - `external_adapters.py describe ccmusic` 成功 | ||
| 72 | - `external_adapters.py init modelscope_music` 成功 | ||
| 73 | - `manifest_tools.py csv-to-catalog` 成功生成 catalog | ||
| 74 | - `service.app health()` 返回 `{"status":"ok"}` | ||
| 75 | - API `build_index(...)` 成功返回 reference window 数量 | ||
| 76 | - API `recognize(...)` 成功返回候选结果 | ||
| 77 | - `train.py --dry-run` 成功 | ... | ... |
docs/dataset-sources-and-licensing.md
0 → 100644
| 1 | # Dataset Sources and Licensing Notes | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 注意 | ||
| 6 | 以下仅为工程接入与研究规划说明,不等于法律意见。实际商用前需要逐条复核原始 license、dataset terms 和再训练约束。 | ||
| 7 | |||
| 8 | ## 候选数据源 | ||
| 9 | |||
| 10 | ### 1. FMA | ||
| 11 | - URL: https://github.com/mdeff/fma | ||
| 12 | - 特点: 开放、MIR 常用、适合 retrieval baseline | ||
| 13 | - 风险: 音频 license 按 artist/track 可能不同,需逐条核验 | ||
| 14 | |||
| 15 | ### 2. MTG-Jamendo | ||
| 16 | - URL: https://github.com/MTG/mtg-jamendo-dataset | ||
| 17 | - 特点: Creative Commons 来源,适合音乐检索/标签任务 | ||
| 18 | - 风险: 仍需按具体曲目用途与商业场景做 license 审查 | ||
| 19 | |||
| 20 | ### 3. CCMusic | ||
| 21 | - 论文/介绍: https://transactions.ismir.net/articles/10.5334/tismir.194 | ||
| 22 | - 主页: https://ccmusic-database.github.io/en/database/ccm.html | ||
| 23 | - 特点: 中国音乐 MIR 数据资源丰富 | ||
| 24 | - 风险: 部分数据集可能需要申请或存在使用边界,必须单独核验 | ||
| 25 | |||
| 26 | ### 4. ModelScope music datasets | ||
| 27 | - 入口: https://www.modelscope.cn/datasets | ||
| 28 | - 搜索: https://modelscope.cn/search?page=1&search=music&type=dataset | ||
| 29 | - 特点: 数据发现方便,可扩充中文生态 | ||
| 30 | - 风险: license 分散,不能默认可商用;接入前必须建立白名单 | ||
| 31 | |||
| 32 | ## 接入原则 | ||
| 33 | |||
| 34 | - 只接入 license 明确的数据集 | ||
| 35 | - 默认拒绝“来源不明 / 不允许商业使用 / 禁止训练衍生模型”的数据 | ||
| 36 | - 训练前把数据集及许可信息落盘到 registry |
docs/industrial-benchmark-spec.md
0 → 100644
| 1 | # Industrial Benchmark Spec | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 目标 | ||
| 6 | 为工业级可商用 ACR 设立持续基准,不只看总体 top1/top5,还看场景化与风险化指标。 | ||
| 7 | |||
| 8 | ## Benchmark 维度 | ||
| 9 | |||
| 10 | ### 1. Retrieval Quality | ||
| 11 | - top1 | ||
| 12 | - top5 | ||
| 13 | - MRR | ||
| 14 | - recall@k | ||
| 15 | |||
| 16 | ### 2. Scenario Buckets | ||
| 17 | - clean | ||
| 18 | - noisy | ||
| 19 | - compressed | ||
| 20 | - time-stretched | ||
| 21 | - pitch-shifted | ||
| 22 | - humming_like | ||
| 23 | - confused | ||
| 24 | - partial-overlap | ||
| 25 | - far-field / device-recorded | ||
| 26 | |||
| 27 | ### 3. Catalog Scale Buckets | ||
| 28 | - 1K songs | ||
| 29 | - 10K songs | ||
| 30 | - 100K songs | ||
| 31 | - 1M+ songs | ||
| 32 | |||
| 33 | ### 4. Operational Metrics | ||
| 34 | - p50 / p95 latency | ||
| 35 | - indexing throughput | ||
| 36 | - incremental update time | ||
| 37 | - memory / disk footprint | ||
| 38 | |||
| 39 | ### 5. Business Safety Metrics | ||
| 40 | - false accept rate | ||
| 41 | - rejection quality | ||
| 42 | - near-duplicate confusion rate | ||
| 43 | - license provenance coverage | ||
| 44 | |||
| 45 | ## Required Artifacts per Model Release | ||
| 46 | - dataset registry snapshot | ||
| 47 | - training config snapshot | ||
| 48 | - benchmark report JSON | ||
| 49 | - benchmark summary markdown | ||
| 50 | - model card | ||
| 51 | - license review manifest | ||
| 52 | |||
| 53 | ## Minimum Go/No-Go Gate | ||
| 54 | - clean top1 >= 0.95 | ||
| 55 | - noisy top1 >= 0.85 | ||
| 56 | - confused top1 >= 0.70 | ||
| 57 | - humming_like top1 >= 0.60 | ||
| 58 | - top5 >= 0.95 on all production-relevant buckets | ||
| 59 | - false accept below agreed threshold |
docs/industrialization-roadmap.md
0 → 100644
| 1 | # ACR 工业级可商用演进路线 | ||
| 2 | |||
| 3 | > 更新:2026-06-02 | ||
| 4 | |||
| 5 | ## 1. 目标定义 | ||
| 6 | |||
| 7 | 把当前原型升级为一个可商用的工业级 ACR 系统,满足: | ||
| 8 | |||
| 9 | - 可扩展曲库管理 | ||
| 10 | - 可重复训练 / 评测 / 部署 | ||
| 11 | - 多数据源接入(synthetic / FMA / Jamendo / CCMusic / ModelScope) | ||
| 12 | - 更强鲁棒性(噪声、失真、哼唱、混淆) | ||
| 13 | - 检索服务化 | ||
| 14 | - 商用合规与授权边界可审计 | ||
| 15 | |||
| 16 | ## 2. 工业级分层 | ||
| 17 | |||
| 18 | ### 2.1 数据层 | ||
| 19 | - `catalog.json` / query manifests | ||
| 20 | - 外部 dataset adapters | ||
| 21 | - license / usage tracking | ||
| 22 | - 数据版本与快照 | ||
| 23 | |||
| 24 | ### 2.2 训练层 | ||
| 25 | - baseline encoder | ||
| 26 | - foundation-model encoder | ||
| 27 | - retrieval-first losses | ||
| 28 | - hard negative mining | ||
| 29 | - 数据平衡与生成增强 | ||
| 30 | |||
| 31 | ### 2.3 索引层 | ||
| 32 | - window-level embeddings | ||
| 33 | - ANN index (Faiss/HNSW) | ||
| 34 | - 指纹索引与向量索引双路 | ||
| 35 | - 增量入库 | ||
| 36 | |||
| 37 | ### 2.4 服务层 | ||
| 38 | - FastAPI / gRPC | ||
| 39 | - batch ingest | ||
| 40 | - recognize API | ||
| 41 | - top-k candidate + rejection | ||
| 42 | - metadata lookup | ||
| 43 | |||
| 44 | ### 2.5 质量层 | ||
| 45 | - regression benchmark | ||
| 46 | - hard-case benchmark | ||
| 47 | - online shadow evaluation | ||
| 48 | - 数据/模型回滚机制 | ||
| 49 | |||
| 50 | ## 3. 数据集策略 | ||
| 51 | |||
| 52 | ### 第一梯队(优先) | ||
| 53 | - FMA small / medium | ||
| 54 | - MTG-Jamendo | ||
| 55 | - CCMusic(需核验申请/授权方式) | ||
| 56 | - ModelScope music datasets(按 license 白名单接入) | ||
| 57 | |||
| 58 | ### 第二梯队 | ||
| 59 | - humming / QBSH 数据集 | ||
| 60 | - instrument / structure / singing datasets 作为辅助监督 | ||
| 61 | |||
| 62 | ## 4. 商用必做项 | ||
| 63 | |||
| 64 | - 每个 dataset 记录: | ||
| 65 | - 来源 URL | ||
| 66 | - license | ||
| 67 | - 是否允许商业使用 | ||
| 68 | - 再分发限制 | ||
| 69 | - 模型训练用途限制 | ||
| 70 | - 每个模型版本记录训练数据组成 | ||
| 71 | - 每次上线保留评测报告与可追溯哈希 | ||
| 72 | |||
| 73 | ## 5. 当前到工业化的缺口 | ||
| 74 | |||
| 75 | - 缺 dataset adapter 层 | ||
| 76 | - 缺 ANN 检索 | ||
| 77 | - 缺 API 服务 | ||
| 78 | - 缺 license registry | ||
| 79 | - 缺 foundation-model baseline | ||
| 80 | - 缺真正的 hard-negative mining | ||
| 81 | - 缺真实开源数据 benchmark |
docs/service-api.md
0 → 100644
| 1 | # ACR Service API | ||
| 2 | |||
| 3 | ## Endpoints | ||
| 4 | |||
| 5 | ### GET /health | ||
| 6 | 返回服务健康状态。 | ||
| 7 | |||
| 8 | ### POST /recognize | ||
| 9 | 请求体: | ||
| 10 | |||
| 11 | ```json | ||
| 12 | { | ||
| 13 | "query_path": "data/synthetic_v2/segments/song_0021_seg_01_augmented.wav", | ||
| 14 | "data_dir": "data/synthetic_v2", | ||
| 15 | "model_path": "data/models_v3/best_model.pt", | ||
| 16 | "index_prefix": "data/index_v3/reference", | ||
| 17 | "top_n": 5, | ||
| 18 | "device": "cpu" | ||
| 19 | } | ||
| 20 | ``` | ||
| 21 | |||
| 22 | ### POST /index/build | ||
| 23 | 请求体: | ||
| 24 | |||
| 25 | ```json | ||
| 26 | { | ||
| 27 | "data_dir": "data/synthetic_v2", | ||
| 28 | "model_path": "data/models_v3/best_model.pt", | ||
| 29 | "output_dir": "data/index_v3", | ||
| 30 | "device": "cpu" | ||
| 31 | } | ||
| 32 | ``` |
-
Please register or sign in to post a comment