Commit 31a72045 31a720458be99c6fb7c980d570e9db42ed40eed4 by cnb.bofCdSsphPA

add src

1 parent 4b16286e
Showing 188 changed files with 1994 additions and 43 deletions
...@@ -38,8 +38,9 @@ engine: ...@@ -38,8 +38,9 @@ engine:
38 n_fft: 1024 38 n_fft: 1024
39 hop_length: 256 39 hop_length: 256
40 hybrid: 40 hybrid:
41 chroma_weight: 0.3 41 chroma_weight: 0.25
42 ecapa_weight: 0.7 42 ecapa_weight: 0.5
43 melody_weight: 0.25
43 reject_threshold: 0.4 44 reject_threshold: 0.4
44 45
45 augmentation: 46 augmentation:
......
1 [
2 {
3 "name": "FMA",
4 "source_url": "https://github.com/mdeff/fma",
5 "license": "Track-dependent / metadata CC BY 4.0; verify per subset",
6 "commercial_use": "review_required",
7 "notes": "Good first realistic MIR baseline"
8 },
9 {
10 "name": "MTG-Jamendo",
11 "source_url": "https://github.com/MTG/mtg-jamendo-dataset",
12 "license": "Creative Commons source tracks; verify exact subset terms",
13 "commercial_use": "review_required",
14 "notes": "Good retrieval/tagging corpus with scripts"
15 },
16 {
17 "name": "CCMusic",
18 "source_url": "https://ccmusic-database.github.io/en/database/ccm.html",
19 "license": "varies / application may be required",
20 "commercial_use": "review_required",
21 "notes": "Useful Chinese MIR source, needs permission review"
22 },
23 {
24 "name": "ModelScope-music",
25 "source_url": "https://modelscope.cn/search?page=1&search=music&type=dataset",
26 "license": "varies by dataset",
27 "commercial_use": "deny_until_whitelisted",
28 "notes": "Discovery surface only until per-dataset review is complete"
29 }
30 ]
...\ No newline at end of file ...\ No newline at end of file
1 {
2 "dataset": "modelscope_music",
3 "root": "data/external/modelscope_music",
4 "status": "initialized",
5 "next_steps": [
6 "download raw audio according to upstream license terms",
7 "convert to catalog/query manifests",
8 "record license evidence before training"
9 ]
10 }
...\ No newline at end of file ...\ No newline at end of file
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
1 {
2 "song_0000": 0,
3 "song_0001": 1,
4 "song_0002": 2,
5 "song_0003": 3,
6 "song_0004": 4,
7 "song_0005": 5,
8 "song_0006": 6,
9 "song_0007": 7,
10 "song_0008": 8,
11 "song_0009": 9,
12 "song_0010": 10,
13 "song_0011": 11,
14 "song_0012": 12,
15 "song_0013": 13,
16 "song_0014": 14,
17 "song_0015": 15
18 }
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "song_0000",
4 "audio_path": "songs/song_0000.wav",
5 "duration": 15.0,
6 "base_freq": 130.81,
7 "type": "reference"
8 },
9 {
10 "song_id": "song_0001",
11 "audio_path": "songs/song_0001.wav",
12 "duration": 15.0,
13 "base_freq": 146.83,
14 "type": "reference"
15 },
16 {
17 "song_id": "song_0002",
18 "audio_path": "songs/song_0002.wav",
19 "duration": 15.0,
20 "base_freq": 164.81,
21 "type": "reference"
22 },
23 {
24 "song_id": "song_0003",
25 "audio_path": "songs/song_0003.wav",
26 "duration": 15.0,
27 "base_freq": 174.61,
28 "type": "reference"
29 },
30 {
31 "song_id": "song_0004",
32 "audio_path": "songs/song_0004.wav",
33 "duration": 15.0,
34 "base_freq": 196.0,
35 "type": "reference"
36 },
37 {
38 "song_id": "song_0005",
39 "audio_path": "songs/song_0005.wav",
40 "duration": 15.0,
41 "base_freq": 220.0,
42 "type": "reference"
43 },
44 {
45 "song_id": "song_0006",
46 "audio_path": "songs/song_0006.wav",
47 "duration": 15.0,
48 "base_freq": 246.94,
49 "type": "reference"
50 },
51 {
52 "song_id": "song_0007",
53 "audio_path": "songs/song_0007.wav",
54 "duration": 15.0,
55 "base_freq": 261.63,
56 "type": "reference"
57 },
58 {
59 "song_id": "song_0008",
60 "audio_path": "songs/song_0008.wav",
61 "duration": 15.0,
62 "base_freq": 293.66,
63 "type": "reference"
64 },
65 {
66 "song_id": "song_0009",
67 "audio_path": "songs/song_0009.wav",
68 "duration": 15.0,
69 "base_freq": 329.63,
70 "type": "reference"
71 },
72 {
73 "song_id": "song_0010",
74 "audio_path": "songs/song_0010.wav",
75 "duration": 15.0,
76 "base_freq": 349.23,
77 "type": "reference"
78 },
79 {
80 "song_id": "song_0011",
81 "audio_path": "songs/song_0011.wav",
82 "duration": 15.0,
83 "base_freq": 392.0,
84 "type": "reference"
85 },
86 {
87 "song_id": "song_0012",
88 "audio_path": "songs/song_0012.wav",
89 "duration": 15.0,
90 "base_freq": 440.0,
91 "type": "reference"
92 },
93 {
94 "song_id": "song_0013",
95 "audio_path": "songs/song_0013.wav",
96 "duration": 15.0,
97 "base_freq": 493.88,
98 "type": "reference"
99 },
100 {
101 "song_id": "song_0014",
102 "audio_path": "songs/song_0014.wav",
103 "duration": 15.0,
104 "base_freq": 523.25,
105 "type": "reference"
106 },
107 {
108 "song_id": "song_0015",
109 "audio_path": "songs/song_0015.wav",
110 "duration": 15.0,
111 "base_freq": 587.33,
112 "type": "reference"
113 },
114 {
115 "song_id": "song_0016",
116 "audio_path": "songs/song_0016.wav",
117 "duration": 15.0,
118 "base_freq": 659.25,
119 "type": "reference"
120 },
121 {
122 "song_id": "song_0017",
123 "audio_path": "songs/song_0017.wav",
124 "duration": 15.0,
125 "base_freq": 698.46,
126 "type": "reference"
127 },
128 {
129 "song_id": "song_0018",
130 "audio_path": "songs/song_0018.wav",
131 "duration": 15.0,
132 "base_freq": 783.99,
133 "type": "reference"
134 },
135 {
136 "song_id": "song_0019",
137 "audio_path": "songs/song_0019.wav",
138 "duration": 15.0,
139 "base_freq": 880.0,
140 "type": "reference"
141 },
142 {
143 "song_id": "song_0020",
144 "audio_path": "songs/song_0020.wav",
145 "duration": 15.0,
146 "base_freq": 987.77,
147 "type": "reference"
148 },
149 {
150 "song_id": "song_0021",
151 "audio_path": "songs/song_0021.wav",
152 "duration": 15.0,
153 "base_freq": 146.8292605393491,
154 "type": "reference"
155 },
156 {
157 "song_id": "song_0022",
158 "audio_path": "songs/song_0022.wav",
159 "duration": 15.0,
160 "base_freq": 164.81110255326524,
161 "type": "reference"
162 },
163 {
164 "song_id": "song_0023",
165 "audio_path": "songs/song_0023.wav",
166 "duration": 15.0,
167 "base_freq": 184.99297018186778,
168 "type": "reference"
169 }
170 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "song_0020",
4 "audio_path": "segments/song_0020_seg_00.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 4.349828784349853,
8 "segment_type": "mid"
9 },
10 {
11 "song_id": "song_0020",
12 "audio_path": "segments/song_0020_seg_01.wav",
13 "duration": 5.0,
14 "type": "clean",
15 "offset": 9.642182747327407,
16 "segment_type": "mid"
17 },
18 {
19 "song_id": "song_0020",
20 "audio_path": "segments/song_0020_seg_02_augmented.wav",
21 "duration": 5.0,
22 "type": "augmented",
23 "offset": 2.367717347418965,
24 "segment_type": "intro"
25 },
26 {
27 "song_id": "song_0020",
28 "audio_path": "segments/song_0020_seg_03_humming_like.wav",
29 "duration": 5.0,
30 "type": "humming_like",
31 "offset": 3.180577192661006,
32 "segment_type": "mid"
33 },
34 {
35 "song_id": "song_0020",
36 "audio_path": "segments/song_0020_seg_04_confused.wav",
37 "duration": 5.0,
38 "type": "confused",
39 "offset": 4.660551124366617,
40 "segment_type": "mid"
41 },
42 {
43 "song_id": "song_0020",
44 "audio_path": "songs/song_0020.wav",
45 "duration": 15.0,
46 "base_freq": 987.77,
47 "type": "reference"
48 },
49 {
50 "song_id": "song_0021",
51 "audio_path": "segments/song_0021_seg_00.wav",
52 "duration": 5.0,
53 "type": "clean",
54 "offset": 5.631088908640184,
55 "segment_type": "mid"
56 },
57 {
58 "song_id": "song_0021",
59 "audio_path": "segments/song_0021_seg_01.wav",
60 "duration": 5.0,
61 "type": "clean",
62 "offset": 1.8823366490525628,
63 "segment_type": "intro"
64 },
65 {
66 "song_id": "song_0021",
67 "audio_path": "segments/song_0021_seg_02_augmented.wav",
68 "duration": 5.0,
69 "type": "augmented",
70 "offset": 9.88006210404643,
71 "segment_type": "mid"
72 },
73 {
74 "song_id": "song_0021",
75 "audio_path": "segments/song_0021_seg_03_humming_like.wav",
76 "duration": 5.0,
77 "type": "humming_like",
78 "offset": 0.9025737685090285,
79 "segment_type": "intro"
80 },
81 {
82 "song_id": "song_0021",
83 "audio_path": "segments/song_0021_seg_04_confused.wav",
84 "duration": 5.0,
85 "type": "confused",
86 "offset": 1.3048954561918258,
87 "segment_type": "intro"
88 },
89 {
90 "song_id": "song_0021",
91 "audio_path": "songs/song_0021.wav",
92 "duration": 15.0,
93 "base_freq": 146.8292605393491,
94 "type": "reference"
95 },
96 {
97 "song_id": "song_0022",
98 "audio_path": "segments/song_0022_seg_00.wav",
99 "duration": 5.0,
100 "type": "clean",
101 "offset": 3.9746734850812295,
102 "segment_type": "mid"
103 },
104 {
105 "song_id": "song_0022",
106 "audio_path": "segments/song_0022_seg_01.wav",
107 "duration": 5.0,
108 "type": "clean",
109 "offset": 4.890968121206573,
110 "segment_type": "mid"
111 },
112 {
113 "song_id": "song_0022",
114 "audio_path": "segments/song_0022_seg_02_augmented.wav",
115 "duration": 5.0,
116 "type": "augmented",
117 "offset": 6.610400547460049,
118 "segment_type": "mid"
119 },
120 {
121 "song_id": "song_0022",
122 "audio_path": "segments/song_0022_seg_03_humming_like.wav",
123 "duration": 5.0,
124 "type": "humming_like",
125 "offset": 2.6329596668288424,
126 "segment_type": "intro"
127 },
128 {
129 "song_id": "song_0022",
130 "audio_path": "segments/song_0022_seg_04_confused.wav",
131 "duration": 5.0,
132 "type": "confused",
133 "offset": 0.8570731183991709,
134 "segment_type": "intro"
135 },
136 {
137 "song_id": "song_0022",
138 "audio_path": "songs/song_0022.wav",
139 "duration": 15.0,
140 "base_freq": 164.81110255326524,
141 "type": "reference"
142 },
143 {
144 "song_id": "song_0023",
145 "audio_path": "segments/song_0023_seg_00.wav",
146 "duration": 5.0,
147 "type": "clean",
148 "offset": 4.461034326075292,
149 "segment_type": "mid"
150 },
151 {
152 "song_id": "song_0023",
153 "audio_path": "segments/song_0023_seg_01.wav",
154 "duration": 5.0,
155 "type": "clean",
156 "offset": 9.605203782802876,
157 "segment_type": "mid"
158 },
159 {
160 "song_id": "song_0023",
161 "audio_path": "segments/song_0023_seg_02_augmented.wav",
162 "duration": 5.0,
163 "type": "augmented",
164 "offset": 4.7458228906154805,
165 "segment_type": "mid"
166 },
167 {
168 "song_id": "song_0023",
169 "audio_path": "segments/song_0023_seg_03_humming_like.wav",
170 "duration": 5.0,
171 "type": "humming_like",
172 "offset": 8.308702013555955,
173 "segment_type": "mid"
174 },
175 {
176 "song_id": "song_0023",
177 "audio_path": "segments/song_0023_seg_04_confused.wav",
178 "duration": 5.0,
179 "type": "confused",
180 "offset": 2.213510770155481,
181 "segment_type": "intro"
182 },
183 {
184 "song_id": "song_0023",
185 "audio_path": "songs/song_0023.wav",
186 "duration": 15.0,
187 "base_freq": 184.99297018186778,
188 "type": "reference"
189 }
190 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "song_0000",
4 "audio_path": "segments/song_0000_seg_00.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 9.538159275210802,
8 "segment_type": "mid"
9 },
10 {
11 "song_id": "song_0000",
12 "audio_path": "segments/song_0000_seg_01.wav",
13 "duration": 5.0,
14 "type": "clean",
15 "offset": 8.75852940378194,
16 "segment_type": "mid"
17 },
18 {
19 "song_id": "song_0000",
20 "audio_path": "segments/song_0000_seg_02_augmented.wav",
21 "duration": 5.0,
22 "type": "augmented",
23 "offset": 2.6338905075109076,
24 "segment_type": "intro"
25 },
26 {
27 "song_id": "song_0000",
28 "audio_path": "segments/song_0000_seg_03_humming_like.wav",
29 "duration": 5.0,
30 "type": "humming_like",
31 "offset": 6.389494948660052,
32 "segment_type": "mid"
33 },
34 {
35 "song_id": "song_0000",
36 "audio_path": "segments/song_0000_seg_04_confused.wav",
37 "duration": 5.0,
38 "type": "confused",
39 "offset": 5.303536721951775,
40 "segment_type": "mid"
41 },
42 {
43 "song_id": "song_0000",
44 "audio_path": "songs/song_0000.wav",
45 "duration": 15.0,
46 "base_freq": 130.81,
47 "type": "reference"
48 },
49 {
50 "song_id": "song_0001",
51 "audio_path": "segments/song_0001_seg_00.wav",
52 "duration": 5.0,
53 "type": "clean",
54 "offset": 5.227827155319589,
55 "segment_type": "mid"
56 },
57 {
58 "song_id": "song_0001",
59 "audio_path": "segments/song_0001_seg_01.wav",
60 "duration": 5.0,
61 "type": "clean",
62 "offset": 9.347062577364273,
63 "segment_type": "mid"
64 },
65 {
66 "song_id": "song_0001",
67 "audio_path": "segments/song_0001_seg_02_augmented.wav",
68 "duration": 5.0,
69 "type": "augmented",
70 "offset": 2.042591994235364,
71 "segment_type": "intro"
72 },
73 {
74 "song_id": "song_0001",
75 "audio_path": "segments/song_0001_seg_03_humming_like.wav",
76 "duration": 5.0,
77 "type": "humming_like",
78 "offset": 3.1617719627185403,
79 "segment_type": "mid"
80 },
81 {
82 "song_id": "song_0001",
83 "audio_path": "segments/song_0001_seg_04_confused.wav",
84 "duration": 5.0,
85 "type": "confused",
86 "offset": 0.73260721099633,
87 "segment_type": "intro"
88 },
89 {
90 "song_id": "song_0001",
91 "audio_path": "songs/song_0001.wav",
92 "duration": 15.0,
93 "base_freq": 146.83,
94 "type": "reference"
95 },
96 {
97 "song_id": "song_0002",
98 "audio_path": "segments/song_0002_seg_00.wav",
99 "duration": 5.0,
100 "type": "clean",
101 "offset": 3.0928466220865323,
102 "segment_type": "mid"
103 },
104 {
105 "song_id": "song_0002",
106 "audio_path": "segments/song_0002_seg_01.wav",
107 "duration": 5.0,
108 "type": "clean",
109 "offset": 4.083929086192168,
110 "segment_type": "mid"
111 },
112 {
113 "song_id": "song_0002",
114 "audio_path": "segments/song_0002_seg_02_augmented.wav",
115 "duration": 5.0,
116 "type": "augmented",
117 "offset": 4.024003870577246,
118 "segment_type": "mid"
119 },
120 {
121 "song_id": "song_0002",
122 "audio_path": "segments/song_0002_seg_03_humming_like.wav",
123 "duration": 5.0,
124 "type": "humming_like",
125 "offset": 9.028055457325827,
126 "segment_type": "mid"
127 },
128 {
129 "song_id": "song_0002",
130 "audio_path": "segments/song_0002_seg_04_confused.wav",
131 "duration": 5.0,
132 "type": "confused",
133 "offset": 4.2988814998983464,
134 "segment_type": "mid"
135 },
136 {
137 "song_id": "song_0002",
138 "audio_path": "songs/song_0002.wav",
139 "duration": 15.0,
140 "base_freq": 164.81,
141 "type": "reference"
142 },
143 {
144 "song_id": "song_0003",
145 "audio_path": "segments/song_0003_seg_00.wav",
146 "duration": 5.0,
147 "type": "clean",
148 "offset": 0.1938328705001069,
149 "segment_type": "intro"
150 },
151 {
152 "song_id": "song_0003",
153 "audio_path": "segments/song_0003_seg_01.wav",
154 "duration": 5.0,
155 "type": "clean",
156 "offset": 5.394190479225337,
157 "segment_type": "mid"
158 },
159 {
160 "song_id": "song_0003",
161 "audio_path": "segments/song_0003_seg_02_augmented.wav",
162 "duration": 5.0,
163 "type": "augmented",
164 "offset": 9.999078285092093,
165 "segment_type": "mid"
166 },
167 {
168 "song_id": "song_0003",
169 "audio_path": "segments/song_0003_seg_03_humming_like.wav",
170 "duration": 5.0,
171 "type": "humming_like",
172 "offset": 9.496117327159888,
173 "segment_type": "mid"
174 },
175 {
176 "song_id": "song_0003",
177 "audio_path": "segments/song_0003_seg_04_confused.wav",
178 "duration": 5.0,
179 "type": "confused",
180 "offset": 2.1796454090650363,
181 "segment_type": "intro"
182 },
183 {
184 "song_id": "song_0003",
185 "audio_path": "songs/song_0003.wav",
186 "duration": 15.0,
187 "base_freq": 174.61,
188 "type": "reference"
189 },
190 {
191 "song_id": "song_0004",
192 "audio_path": "segments/song_0004_seg_00.wav",
193 "duration": 5.0,
194 "type": "clean",
195 "offset": 9.654976431382948,
196 "segment_type": "mid"
197 },
198 {
199 "song_id": "song_0004",
200 "audio_path": "segments/song_0004_seg_01.wav",
201 "duration": 5.0,
202 "type": "clean",
203 "offset": 2.524783904929726,
204 "segment_type": "intro"
205 },
206 {
207 "song_id": "song_0004",
208 "audio_path": "segments/song_0004_seg_02_augmented.wav",
209 "duration": 5.0,
210 "type": "augmented",
211 "offset": 8.617229646275131,
212 "segment_type": "mid"
213 },
214 {
215 "song_id": "song_0004",
216 "audio_path": "segments/song_0004_seg_03_humming_like.wav",
217 "duration": 5.0,
218 "type": "humming_like",
219 "offset": 1.5172700695095642,
220 "segment_type": "intro"
221 },
222 {
223 "song_id": "song_0004",
224 "audio_path": "segments/song_0004_seg_04_confused.wav",
225 "duration": 5.0,
226 "type": "confused",
227 "offset": 4.161740214103284,
228 "segment_type": "mid"
229 },
230 {
231 "song_id": "song_0004",
232 "audio_path": "songs/song_0004.wav",
233 "duration": 15.0,
234 "base_freq": 196.0,
235 "type": "reference"
236 },
237 {
238 "song_id": "song_0005",
239 "audio_path": "segments/song_0005_seg_00.wav",
240 "duration": 5.0,
241 "type": "clean",
242 "offset": 5.088720150695117,
243 "segment_type": "mid"
244 },
245 {
246 "song_id": "song_0005",
247 "audio_path": "segments/song_0005_seg_01.wav",
248 "duration": 5.0,
249 "type": "clean",
250 "offset": 2.734248967132742,
251 "segment_type": "intro"
252 },
253 {
254 "song_id": "song_0005",
255 "audio_path": "segments/song_0005_seg_02_augmented.wav",
256 "duration": 5.0,
257 "type": "augmented",
258 "offset": 8.347239455766944,
259 "segment_type": "mid"
260 },
261 {
262 "song_id": "song_0005",
263 "audio_path": "segments/song_0005_seg_03_humming_like.wav",
264 "duration": 5.0,
265 "type": "humming_like",
266 "offset": 5.08240891592894,
267 "segment_type": "mid"
268 },
269 {
270 "song_id": "song_0005",
271 "audio_path": "segments/song_0005_seg_04_confused.wav",
272 "duration": 5.0,
273 "type": "confused",
274 "offset": 9.3424839368252,
275 "segment_type": "mid"
276 },
277 {
278 "song_id": "song_0005",
279 "audio_path": "songs/song_0005.wav",
280 "duration": 15.0,
281 "base_freq": 220.0,
282 "type": "reference"
283 },
284 {
285 "song_id": "song_0006",
286 "audio_path": "segments/song_0006_seg_00.wav",
287 "duration": 5.0,
288 "type": "clean",
289 "offset": 2.5062680004361604,
290 "segment_type": "intro"
291 },
292 {
293 "song_id": "song_0006",
294 "audio_path": "segments/song_0006_seg_01.wav",
295 "duration": 5.0,
296 "type": "clean",
297 "offset": 7.555773237416772,
298 "segment_type": "mid"
299 },
300 {
301 "song_id": "song_0006",
302 "audio_path": "segments/song_0006_seg_02_augmented.wav",
303 "duration": 5.0,
304 "type": "augmented",
305 "offset": 7.674707744954641,
306 "segment_type": "mid"
307 },
308 {
309 "song_id": "song_0006",
310 "audio_path": "segments/song_0006_seg_03_humming_like.wav",
311 "duration": 5.0,
312 "type": "humming_like",
313 "offset": 0.33364531245632434,
314 "segment_type": "intro"
315 },
316 {
317 "song_id": "song_0006",
318 "audio_path": "segments/song_0006_seg_04_confused.wav",
319 "duration": 5.0,
320 "type": "confused",
321 "offset": 2.007947946500762,
322 "segment_type": "intro"
323 },
324 {
325 "song_id": "song_0006",
326 "audio_path": "songs/song_0006.wav",
327 "duration": 15.0,
328 "base_freq": 246.94,
329 "type": "reference"
330 },
331 {
332 "song_id": "song_0007",
333 "audio_path": "segments/song_0007_seg_00.wav",
334 "duration": 5.0,
335 "type": "clean",
336 "offset": 6.589030736792923,
337 "segment_type": "mid"
338 },
339 {
340 "song_id": "song_0007",
341 "audio_path": "segments/song_0007_seg_01.wav",
342 "duration": 5.0,
343 "type": "clean",
344 "offset": 3.016303290280887,
345 "segment_type": "mid"
346 },
347 {
348 "song_id": "song_0007",
349 "audio_path": "segments/song_0007_seg_02_augmented.wav",
350 "duration": 5.0,
351 "type": "augmented",
352 "offset": 6.433406842054888,
353 "segment_type": "mid"
354 },
355 {
356 "song_id": "song_0007",
357 "audio_path": "segments/song_0007_seg_03_humming_like.wav",
358 "duration": 5.0,
359 "type": "humming_like",
360 "offset": 4.435623293630087,
361 "segment_type": "mid"
362 },
363 {
364 "song_id": "song_0007",
365 "audio_path": "segments/song_0007_seg_04_confused.wav",
366 "duration": 5.0,
367 "type": "confused",
368 "offset": 5.8536468854812105,
369 "segment_type": "mid"
370 },
371 {
372 "song_id": "song_0007",
373 "audio_path": "songs/song_0007.wav",
374 "duration": 15.0,
375 "base_freq": 261.63,
376 "type": "reference"
377 },
378 {
379 "song_id": "song_0008",
380 "audio_path": "segments/song_0008_seg_00.wav",
381 "duration": 5.0,
382 "type": "clean",
383 "offset": 0.42302261562791377,
384 "segment_type": "intro"
385 },
386 {
387 "song_id": "song_0008",
388 "audio_path": "segments/song_0008_seg_01.wav",
389 "duration": 5.0,
390 "type": "clean",
391 "offset": 0.18741536585645702,
392 "segment_type": "intro"
393 },
394 {
395 "song_id": "song_0008",
396 "audio_path": "segments/song_0008_seg_02_augmented.wav",
397 "duration": 5.0,
398 "type": "augmented",
399 "offset": 9.211624345024124,
400 "segment_type": "mid"
401 },
402 {
403 "song_id": "song_0008",
404 "audio_path": "segments/song_0008_seg_03_humming_like.wav",
405 "duration": 5.0,
406 "type": "humming_like",
407 "offset": 4.176939598434806,
408 "segment_type": "mid"
409 },
410 {
411 "song_id": "song_0008",
412 "audio_path": "segments/song_0008_seg_04_confused.wav",
413 "duration": 5.0,
414 "type": "confused",
415 "offset": 8.320259130717071,
416 "segment_type": "mid"
417 },
418 {
419 "song_id": "song_0008",
420 "audio_path": "songs/song_0008.wav",
421 "duration": 15.0,
422 "base_freq": 293.66,
423 "type": "reference"
424 },
425 {
426 "song_id": "song_0009",
427 "audio_path": "segments/song_0009_seg_00.wav",
428 "duration": 5.0,
429 "type": "clean",
430 "offset": 5.076897127246463,
431 "segment_type": "mid"
432 },
433 {
434 "song_id": "song_0009",
435 "audio_path": "segments/song_0009_seg_01.wav",
436 "duration": 5.0,
437 "type": "clean",
438 "offset": 5.397707584136711,
439 "segment_type": "mid"
440 },
441 {
442 "song_id": "song_0009",
443 "audio_path": "segments/song_0009_seg_02_augmented.wav",
444 "duration": 5.0,
445 "type": "augmented",
446 "offset": 7.3864400300146755,
447 "segment_type": "mid"
448 },
449 {
450 "song_id": "song_0009",
451 "audio_path": "segments/song_0009_seg_03_humming_like.wav",
452 "duration": 5.0,
453 "type": "humming_like",
454 "offset": 5.9724644107162845,
455 "segment_type": "mid"
456 },
457 {
458 "song_id": "song_0009",
459 "audio_path": "segments/song_0009_seg_04_confused.wav",
460 "duration": 5.0,
461 "type": "confused",
462 "offset": 7.21182997805427,
463 "segment_type": "mid"
464 },
465 {
466 "song_id": "song_0009",
467 "audio_path": "songs/song_0009.wav",
468 "duration": 15.0,
469 "base_freq": 329.63,
470 "type": "reference"
471 },
472 {
473 "song_id": "song_0010",
474 "audio_path": "segments/song_0010_seg_00.wav",
475 "duration": 5.0,
476 "type": "clean",
477 "offset": 3.1007588293689183,
478 "segment_type": "mid"
479 },
480 {
481 "song_id": "song_0010",
482 "audio_path": "segments/song_0010_seg_01.wav",
483 "duration": 5.0,
484 "type": "clean",
485 "offset": 3.9822405568601704,
486 "segment_type": "mid"
487 },
488 {
489 "song_id": "song_0010",
490 "audio_path": "segments/song_0010_seg_02_augmented.wav",
491 "duration": 5.0,
492 "type": "augmented",
493 "offset": 8.154060806559823,
494 "segment_type": "mid"
495 },
496 {
497 "song_id": "song_0010",
498 "audio_path": "segments/song_0010_seg_03_humming_like.wav",
499 "duration": 5.0,
500 "type": "humming_like",
501 "offset": 2.7321660611387344,
502 "segment_type": "intro"
503 },
504 {
505 "song_id": "song_0010",
506 "audio_path": "segments/song_0010_seg_04_confused.wav",
507 "duration": 5.0,
508 "type": "confused",
509 "offset": 9.564787178236601,
510 "segment_type": "mid"
511 },
512 {
513 "song_id": "song_0010",
514 "audio_path": "songs/song_0010.wav",
515 "duration": 15.0,
516 "base_freq": 349.23,
517 "type": "reference"
518 },
519 {
520 "song_id": "song_0011",
521 "audio_path": "segments/song_0011_seg_00.wav",
522 "duration": 5.0,
523 "type": "clean",
524 "offset": 8.949259168211244,
525 "segment_type": "mid"
526 },
527 {
528 "song_id": "song_0011",
529 "audio_path": "segments/song_0011_seg_01.wav",
530 "duration": 5.0,
531 "type": "clean",
532 "offset": 8.459337061558657,
533 "segment_type": "mid"
534 },
535 {
536 "song_id": "song_0011",
537 "audio_path": "segments/song_0011_seg_02_augmented.wav",
538 "duration": 5.0,
539 "type": "augmented",
540 "offset": 2.5060530898199906,
541 "segment_type": "intro"
542 },
543 {
544 "song_id": "song_0011",
545 "audio_path": "segments/song_0011_seg_03_humming_like.wav",
546 "duration": 5.0,
547 "type": "humming_like",
548 "offset": 5.0257314474126265,
549 "segment_type": "mid"
550 },
551 {
552 "song_id": "song_0011",
553 "audio_path": "segments/song_0011_seg_04_confused.wav",
554 "duration": 5.0,
555 "type": "confused",
556 "offset": 8.42530004113389,
557 "segment_type": "mid"
558 },
559 {
560 "song_id": "song_0011",
561 "audio_path": "songs/song_0011.wav",
562 "duration": 15.0,
563 "base_freq": 392.0,
564 "type": "reference"
565 },
566 {
567 "song_id": "song_0012",
568 "audio_path": "segments/song_0012_seg_00.wav",
569 "duration": 5.0,
570 "type": "clean",
571 "offset": 7.253242125518553,
572 "segment_type": "mid"
573 },
574 {
575 "song_id": "song_0012",
576 "audio_path": "segments/song_0012_seg_01.wav",
577 "duration": 5.0,
578 "type": "clean",
579 "offset": 6.880436512027717,
580 "segment_type": "mid"
581 },
582 {
583 "song_id": "song_0012",
584 "audio_path": "segments/song_0012_seg_02_augmented.wav",
585 "duration": 5.0,
586 "type": "augmented",
587 "offset": 0.26647154963833186,
588 "segment_type": "intro"
589 },
590 {
591 "song_id": "song_0012",
592 "audio_path": "segments/song_0012_seg_03_humming_like.wav",
593 "duration": 5.0,
594 "type": "humming_like",
595 "offset": 7.214001122963067,
596 "segment_type": "mid"
597 },
598 {
599 "song_id": "song_0012",
600 "audio_path": "segments/song_0012_seg_04_confused.wav",
601 "duration": 5.0,
602 "type": "confused",
603 "offset": 1.4777570830033182,
604 "segment_type": "intro"
605 },
606 {
607 "song_id": "song_0012",
608 "audio_path": "songs/song_0012.wav",
609 "duration": 15.0,
610 "base_freq": 440.0,
611 "type": "reference"
612 },
613 {
614 "song_id": "song_0013",
615 "audio_path": "segments/song_0013_seg_00.wav",
616 "duration": 5.0,
617 "type": "clean",
618 "offset": 3.3711217932975037,
619 "segment_type": "mid"
620 },
621 {
622 "song_id": "song_0013",
623 "audio_path": "segments/song_0013_seg_01.wav",
624 "duration": 5.0,
625 "type": "clean",
626 "offset": 2.95024257658282,
627 "segment_type": "intro"
628 },
629 {
630 "song_id": "song_0013",
631 "audio_path": "segments/song_0013_seg_02_augmented.wav",
632 "duration": 5.0,
633 "type": "augmented",
634 "offset": 6.7440113989474435,
635 "segment_type": "mid"
636 },
637 {
638 "song_id": "song_0013",
639 "audio_path": "segments/song_0013_seg_03_humming_like.wav",
640 "duration": 5.0,
641 "type": "humming_like",
642 "offset": 3.27926658740176,
643 "segment_type": "mid"
644 },
645 {
646 "song_id": "song_0013",
647 "audio_path": "segments/song_0013_seg_04_confused.wav",
648 "duration": 5.0,
649 "type": "confused",
650 "offset": 0.06830120539555451,
651 "segment_type": "intro"
652 },
653 {
654 "song_id": "song_0013",
655 "audio_path": "songs/song_0013.wav",
656 "duration": 15.0,
657 "base_freq": 493.88,
658 "type": "reference"
659 },
660 {
661 "song_id": "song_0014",
662 "audio_path": "segments/song_0014_seg_00.wav",
663 "duration": 5.0,
664 "type": "clean",
665 "offset": 4.389628114874606,
666 "segment_type": "mid"
667 },
668 {
669 "song_id": "song_0014",
670 "audio_path": "segments/song_0014_seg_01.wav",
671 "duration": 5.0,
672 "type": "clean",
673 "offset": 5.397598089074283,
674 "segment_type": "mid"
675 },
676 {
677 "song_id": "song_0014",
678 "audio_path": "segments/song_0014_seg_02_augmented.wav",
679 "duration": 5.0,
680 "type": "augmented",
681 "offset": 7.543857087472844,
682 "segment_type": "mid"
683 },
684 {
685 "song_id": "song_0014",
686 "audio_path": "segments/song_0014_seg_03_humming_like.wav",
687 "duration": 5.0,
688 "type": "humming_like",
689 "offset": 5.77474814637882,
690 "segment_type": "mid"
691 },
692 {
693 "song_id": "song_0014",
694 "audio_path": "segments/song_0014_seg_04_confused.wav",
695 "duration": 5.0,
696 "type": "confused",
697 "offset": 5.212510542649235,
698 "segment_type": "mid"
699 },
700 {
701 "song_id": "song_0014",
702 "audio_path": "songs/song_0014.wav",
703 "duration": 15.0,
704 "base_freq": 523.25,
705 "type": "reference"
706 },
707 {
708 "song_id": "song_0015",
709 "audio_path": "segments/song_0015_seg_00.wav",
710 "duration": 5.0,
711 "type": "clean",
712 "offset": 5.3221248501273655,
713 "segment_type": "mid"
714 },
715 {
716 "song_id": "song_0015",
717 "audio_path": "segments/song_0015_seg_01.wav",
718 "duration": 5.0,
719 "type": "clean",
720 "offset": 4.113385082174164,
721 "segment_type": "mid"
722 },
723 {
724 "song_id": "song_0015",
725 "audio_path": "segments/song_0015_seg_02_augmented.wav",
726 "duration": 5.0,
727 "type": "augmented",
728 "offset": 0.16726147602629915,
729 "segment_type": "intro"
730 },
731 {
732 "song_id": "song_0015",
733 "audio_path": "segments/song_0015_seg_03_humming_like.wav",
734 "duration": 5.0,
735 "type": "humming_like",
736 "offset": 4.305732086760379,
737 "segment_type": "mid"
738 },
739 {
740 "song_id": "song_0015",
741 "audio_path": "segments/song_0015_seg_04_confused.wav",
742 "duration": 5.0,
743 "type": "confused",
744 "offset": 6.197808424119352,
745 "segment_type": "mid"
746 },
747 {
748 "song_id": "song_0015",
749 "audio_path": "songs/song_0015.wav",
750 "duration": 15.0,
751 "base_freq": 587.33,
752 "type": "reference"
753 }
754 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "song_0016",
4 "audio_path": "segments/song_0016_seg_00.wav",
5 "duration": 5.0,
6 "type": "clean",
7 "offset": 7.208994524555927,
8 "segment_type": "mid"
9 },
10 {
11 "song_id": "song_0016",
12 "audio_path": "segments/song_0016_seg_01.wav",
13 "duration": 5.0,
14 "type": "clean",
15 "offset": 4.958024367228626,
16 "segment_type": "mid"
17 },
18 {
19 "song_id": "song_0016",
20 "audio_path": "segments/song_0016_seg_02_augmented.wav",
21 "duration": 5.0,
22 "type": "augmented",
23 "offset": 6.1666879203579,
24 "segment_type": "mid"
25 },
26 {
27 "song_id": "song_0016",
28 "audio_path": "segments/song_0016_seg_03_humming_like.wav",
29 "duration": 5.0,
30 "type": "humming_like",
31 "offset": 8.621983105655142,
32 "segment_type": "mid"
33 },
34 {
35 "song_id": "song_0016",
36 "audio_path": "segments/song_0016_seg_04_confused.wav",
37 "duration": 5.0,
38 "type": "confused",
39 "offset": 3.004352846791234,
40 "segment_type": "mid"
41 },
42 {
43 "song_id": "song_0016",
44 "audio_path": "songs/song_0016.wav",
45 "duration": 15.0,
46 "base_freq": 659.25,
47 "type": "reference"
48 },
49 {
50 "song_id": "song_0017",
51 "audio_path": "segments/song_0017_seg_00.wav",
52 "duration": 5.0,
53 "type": "clean",
54 "offset": 5.277150196277827,
55 "segment_type": "mid"
56 },
57 {
58 "song_id": "song_0017",
59 "audio_path": "segments/song_0017_seg_01.wav",
60 "duration": 5.0,
61 "type": "clean",
62 "offset": 6.391085856661506,
63 "segment_type": "mid"
64 },
65 {
66 "song_id": "song_0017",
67 "audio_path": "segments/song_0017_seg_02_augmented.wav",
68 "duration": 5.0,
69 "type": "augmented",
70 "offset": 5.969708292829935,
71 "segment_type": "mid"
72 },
73 {
74 "song_id": "song_0017",
75 "audio_path": "segments/song_0017_seg_03_humming_like.wav",
76 "duration": 5.0,
77 "type": "humming_like",
78 "offset": 6.1736267933642495,
79 "segment_type": "mid"
80 },
81 {
82 "song_id": "song_0017",
83 "audio_path": "segments/song_0017_seg_04_confused.wav",
84 "duration": 5.0,
85 "type": "confused",
86 "offset": 1.1786165266165671,
87 "segment_type": "intro"
88 },
89 {
90 "song_id": "song_0017",
91 "audio_path": "songs/song_0017.wav",
92 "duration": 15.0,
93 "base_freq": 698.46,
94 "type": "reference"
95 },
96 {
97 "song_id": "song_0018",
98 "audio_path": "segments/song_0018_seg_00.wav",
99 "duration": 5.0,
100 "type": "clean",
101 "offset": 6.641438208318426,
102 "segment_type": "mid"
103 },
104 {
105 "song_id": "song_0018",
106 "audio_path": "segments/song_0018_seg_01.wav",
107 "duration": 5.0,
108 "type": "clean",
109 "offset": 3.582227293409872,
110 "segment_type": "mid"
111 },
112 {
113 "song_id": "song_0018",
114 "audio_path": "segments/song_0018_seg_02_augmented.wav",
115 "duration": 5.0,
116 "type": "augmented",
117 "offset": 0.6333068606017467,
118 "segment_type": "intro"
119 },
120 {
121 "song_id": "song_0018",
122 "audio_path": "segments/song_0018_seg_03_humming_like.wav",
123 "duration": 5.0,
124 "type": "humming_like",
125 "offset": 3.3775515517078736,
126 "segment_type": "mid"
127 },
128 {
129 "song_id": "song_0018",
130 "audio_path": "segments/song_0018_seg_04_confused.wav",
131 "duration": 5.0,
132 "type": "confused",
133 "offset": 6.825519260932059,
134 "segment_type": "mid"
135 },
136 {
137 "song_id": "song_0018",
138 "audio_path": "songs/song_0018.wav",
139 "duration": 15.0,
140 "base_freq": 783.99,
141 "type": "reference"
142 },
143 {
144 "song_id": "song_0019",
145 "audio_path": "segments/song_0019_seg_00.wav",
146 "duration": 5.0,
147 "type": "clean",
148 "offset": 6.405372883123518,
149 "segment_type": "mid"
150 },
151 {
152 "song_id": "song_0019",
153 "audio_path": "segments/song_0019_seg_01.wav",
154 "duration": 5.0,
155 "type": "clean",
156 "offset": 5.376553581360508,
157 "segment_type": "mid"
158 },
159 {
160 "song_id": "song_0019",
161 "audio_path": "segments/song_0019_seg_02_augmented.wav",
162 "duration": 5.0,
163 "type": "augmented",
164 "offset": 1.5268044380447066,
165 "segment_type": "intro"
166 },
167 {
168 "song_id": "song_0019",
169 "audio_path": "segments/song_0019_seg_03_humming_like.wav",
170 "duration": 5.0,
171 "type": "humming_like",
172 "offset": 5.864371630124319,
173 "segment_type": "mid"
174 },
175 {
176 "song_id": "song_0019",
177 "audio_path": "segments/song_0019_seg_04_confused.wav",
178 "duration": 5.0,
179 "type": "confused",
180 "offset": 4.37486043050575,
181 "segment_type": "mid"
182 },
183 {
184 "song_id": "song_0019",
185 "audio_path": "songs/song_0019.wav",
186 "duration": 15.0,
187 "base_freq": 880.0,
188 "type": "reference"
189 }
190 ]
...\ No newline at end of file ...\ No newline at end of file
1 [
2 {
3 "song_id": "foo",
4 "audio_path": "raw/foo.wav",
5 "duration": 10.5,
6 "type": "reference",
7 "source_dataset": "fma"
8 }
9 ]
...\ No newline at end of file ...\ No newline at end of file
...@@ -71,6 +71,7 @@ def main(): ...@@ -71,6 +71,7 @@ def main():
71 }) 71 })
72 72
73 total = len(queries) 73 total = len(queries)
74 confusion_focus = {k:v for k,v in by_type.items() if k in {"confused", "humming_like"}}
74 report = { 75 report = {
75 "split": args.split, 76 "split": args.split,
76 "num_queries": total, 77 "num_queries": total,
...@@ -84,6 +85,10 @@ def main(): ...@@ -84,6 +85,10 @@ def main():
84 } 85 }
85 for k, v in by_type.items() 86 for k, v in by_type.items()
86 }, 87 },
88 "hard_case_summary": {
89 k: {"n": v["n"], "top1": round(v["top1"]/v["n"],4) if v["n"] else 0.0, "topk": round(v["topk"]/v["n"],4) if v["n"] else 0.0}
90 for k,v in confusion_focus.items()
91 },
87 "sample_failures": failures[:10], 92 "sample_failures": failures[:10],
88 } 93 }
89 print(json.dumps(report, ensure_ascii=False, indent=2)) 94 print(json.dumps(report, ensure_ascii=False, indent=2))
......
...@@ -4,3 +4,6 @@ soundfile>=0.12 ...@@ -4,3 +4,6 @@ soundfile>=0.12
4 librosa>=0.10 4 librosa>=0.10
5 tqdm>=4.66 5 tqdm>=4.66
6 torch>=2.3 6 torch>=2.3
7 fastapi>=0.115
8 uvicorn>=0.30
9 pydantic>=2.8
......
1 """Dataset adapter skeletons for external/open music corpora."""
2
3 from __future__ import annotations
4
5 from dataclasses import dataclass, asdict
6 from pathlib import Path
7 from typing import Dict, List
8 import argparse
9 import json
10
11
@dataclass
class DatasetRecord:
    """One entry of the dataset licensing registry.

    All five fields are plain strings; instances are turned into dictionaries
    with ``dataclasses.asdict`` when the registry is written to JSON.
    """

    name: str  # display name of the dataset
    source_url: str  # upstream landing page or repository URL
    license: str  # free-text summary of the licence situation
    commercial_use: str  # policy tag such as "review_required" or "deny_until_whitelisted"
    notes: str  # free-form remarks
19
20
class BaseAdapter:
    """Common behaviour shared by all external dataset adapters.

    Subclasses set ``name`` (used as the ``dataset`` value in the bootstrap
    manifest) and implement ``describe``.
    """

    name = "base"

    def describe(self) -> Dict:
        """Return descriptive metadata for the dataset.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def init_layout(self, root: Path) -> Dict:
        """Create the on-disk directory skeleton for a dataset.

        Creates ``root`` (with parents) and the ``raw``, ``processed``,
        ``manifests`` and ``licenses`` sub-directories, then writes
        ``manifests/bootstrap.json``. Existing directories are reused and an
        existing bootstrap file is overwritten.

        Args:
            root: Target directory; a ``str`` is accepted as well.

        Returns:
            The manifest dictionary that was written to disk.
        """
        root = Path(root)
        root.mkdir(parents=True, exist_ok=True)
        for sub in ["raw", "processed", "manifests", "licenses"]:
            (root / sub).mkdir(exist_ok=True)
        manifest = {
            "dataset": self.name,
            "root": str(root),
            "status": "initialized",
            "next_steps": [
                "download raw audio according to upstream license terms",
                "convert to catalog/query manifests",
                "record license evidence before training",
            ],
        }
        # ensure_ascii=False emits raw non-ASCII characters (e.g. from `root`),
        # so the file encoding must be pinned instead of following the locale.
        with open(root / "manifests" / "bootstrap.json", "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        return manifest
44
45
class FMAAdapter(BaseAdapter):
    """Adapter describing the FMA dataset."""

    name = "fma"

    def describe(self) -> Dict:
        """Return the static description of this source (name, URL, subset, strategy, policy)."""
        return dict(
            name="FMA",
            source_url="https://github.com/mdeff/fma",
            recommended_subset="fma_small",
            catalog_strategy="full tracks as references; random 5-15s crops as queries",
            license_policy="review per subset/track before commercial training",
        )
57
58
class MTGJamendoAdapter(BaseAdapter):
    """Adapter describing the MTG-Jamendo dataset."""

    name = "mtg_jamendo"

    def describe(self) -> Dict:
        """Return the static description of this source (name, URL, subset, strategy, policy)."""
        return dict(
            name="MTG-Jamendo",
            source_url="https://github.com/MTG/mtg-jamendo-dataset",
            recommended_subset="small curated slice",
            catalog_strategy="download upstream audio subset then build catalog/query manifests",
            license_policy="verify CC terms for intended commercial use",
        )
70
71
class CCMusicAdapter(BaseAdapter):
    """Adapter describing the CCMusic database."""

    name = "ccmusic"

    def describe(self) -> Dict:
        """Return the static description of this source (name, URL, subset, strategy, policy)."""
        return dict(
            name="CCMusic",
            source_url="https://ccmusic-database.github.io/en/database/ccm.html",
            recommended_subset="whitelisted approved subset only",
            catalog_strategy="use approved corpora only; normalize to project manifests",
            license_policy="application/permission review required before use",
        )
83
84
class ModelScopeMusicAdapter(BaseAdapter):
    """Adapter describing the ModelScope music dataset search surface."""

    name = "modelscope_music"

    def describe(self) -> Dict:
        """Return the static description of this source (name, URL, subset, strategy, policy)."""
        return dict(
            name="ModelScope music datasets",
            source_url="https://modelscope.cn/search?page=1&search=music&type=dataset",
            recommended_subset="manual whitelist only",
            catalog_strategy="treat as discovery surface; add per-dataset adapter after legal review",
            license_policy="deny until whitelisted",
        )
96
97
# Adapter instances keyed by the dataset identifier accepted on the command line.
ADAPTERS = dict(
    fma=FMAAdapter(),
    mtg_jamendo=MTGJamendoAdapter(),
    ccmusic=CCMusicAdapter(),
    modelscope_music=ModelScopeMusicAdapter(),
)
104
# Licensing registry rows, in DatasetRecord field order:
# (name, source_url, license, commercial_use, notes).
REGISTRY: List[DatasetRecord] = [
    DatasetRecord(
        name=row_name,
        source_url=row_url,
        license=row_license,
        commercial_use=row_policy,
        notes=row_notes,
    )
    for row_name, row_url, row_license, row_policy, row_notes in (
        (
            "FMA",
            "https://github.com/mdeff/fma",
            "Track-dependent / metadata CC BY 4.0; verify per subset",
            "review_required",
            "Good first realistic MIR baseline",
        ),
        (
            "MTG-Jamendo",
            "https://github.com/MTG/mtg-jamendo-dataset",
            "Creative Commons source tracks; verify exact subset terms",
            "review_required",
            "Good retrieval/tagging corpus with scripts",
        ),
        (
            "CCMusic",
            "https://ccmusic-database.github.io/en/database/ccm.html",
            "varies / application may be required",
            "review_required",
            "Useful Chinese MIR source, needs permission review",
        ),
        (
            "ModelScope-music",
            "https://modelscope.cn/search?page=1&search=music&type=dataset",
            "varies by dataset",
            "deny_until_whitelisted",
            "Discovery surface only until per-dataset review is complete",
        ),
    )
]
135
136
def write_registry(output_path: str, records=None):
    """Serialize dataset records to a JSON file.

    Args:
        output_path: Destination file; missing parent directories are created.
        records: Optional iterable of dataclass instances to write. When
            omitted, the module-level ``REGISTRY`` is written.

    Returns:
        The destination as a ``pathlib.Path``.
    """
    if records is None:
        records = REGISTRY
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes non-ASCII text verbatim, so pin the encoding
    # rather than relying on the platform's locale default.
    with open(out, "w", encoding="utf-8") as f:
        json.dump([asdict(x) for x in records], f, indent=2, ensure_ascii=False)
    return out
143
144
def main():
    """Command-line entry point with the ``registry``, ``init`` and ``describe`` sub-commands."""
    dataset_choices = sorted(ADAPTERS)

    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    registry_cmd = commands.add_parser("registry")
    registry_cmd.add_argument("--output", default="data/dataset_registry.json")

    init_cmd = commands.add_parser("init")
    init_cmd.add_argument("dataset", choices=dataset_choices)
    init_cmd.add_argument("--root", default="data/external")

    describe_cmd = commands.add_parser("describe")
    describe_cmd.add_argument("dataset", choices=dataset_choices)

    args = parser.parse_args()

    # `registry` prints the written path; the other commands print a JSON payload.
    if args.cmd == "registry":
        print(write_registry(args.output))
        return

    if args.cmd == "init":
        payload = ADAPTERS[args.dataset].init_layout(Path(args.root) / args.dataset)
    elif args.cmd == "describe":
        payload = ADAPTERS[args.dataset].describe()
    else:
        return
    print(json.dumps(payload, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
1 """External dataset manifest conversion templates."""
2
3 from __future__ import annotations
4
5 import argparse
6 import csv
7 import json
8 from pathlib import Path
9 from typing import List, Dict
10
11
def write_catalog(records: List[Dict], output_path: Path):
    """Write catalog records to ``output_path`` as indented JSON.

    Args:
        records: JSON-serializable record dictionaries.
        output_path: Destination file (``Path`` or ``str``); missing parent
            directories are created.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII ids/paths readable, which only works
    # portably when the file encoding is fixed rather than locale dependent.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
16
17
def csv_to_catalog(csv_path: Path, output_path: Path, path_field: str = "audio_path", id_field: str = "song_id"):
    """Convert a CSV listing of tracks into a reference catalog JSON file.

    Every CSV row becomes one record with ``type`` fixed to ``"reference"``.
    ``duration`` falls back to ``0.0`` and ``source_dataset`` to ``"external"``
    when the column is absent or the cell is empty.

    Args:
        csv_path: Input CSV file with a header row.
        output_path: Destination JSON file, written through ``write_catalog``.
        path_field: Name of the column holding the audio path.
        id_field: Name of the column holding the song identifier.

    Returns:
        The number of records written.

    Raises:
        KeyError: if a data row lacks the ``id_field`` or ``path_field`` column.
        ValueError: if a ``duration`` cell is not a valid float.
    """
    records = []
    # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise become part of the first header name and break the id lookup.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            missing = [field for field in (id_field, path_field) if field not in row]
            if missing:
                raise KeyError(
                    f"{csv_path}: required column(s) {missing} not found "
                    f"(line {reader.line_num}, header: {reader.fieldnames})"
                )
            records.append(
                {
                    "song_id": row[id_field],
                    "audio_path": row[path_field],
                    "duration": float(row.get("duration", 0.0) or 0.0),
                    "type": "reference",
                    # `or` also covers blank cells and short rows, where
                    # DictReader fills in "" / None instead of omitting the key.
                    "source_dataset": row.get("source_dataset") or "external",
                }
            )
    write_catalog(records, output_path)
    return len(records)
34
35
def main():
    """Command-line entry point exposing the ``csv-to-catalog`` sub-command."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    convert_cmd = commands.add_parser("csv-to-catalog")
    convert_cmd.add_argument("csv_path")
    convert_cmd.add_argument("output_path")
    convert_cmd.add_argument("--path-field", default="audio_path")
    convert_cmd.add_argument("--id-field", default="song_id")

    args = parser.parse_args()
    if args.cmd != "csv-to-catalog":
        return

    written = csv_to_catalog(
        Path(args.csv_path),
        Path(args.output_path),
        args.path_field,
        args.id_field,
    )
    # Report the outcome as a single JSON line on stdout.
    summary = {"status": "ok", "records": written}
    print(json.dumps(summary, ensure_ascii=False))


if __name__ == "__main__":
    main()
1 """ 1 """Hybrid ACR Engine: Chromaprint + ECAPA + melody-aware re-ranking."""
2 Hybrid ACR Engine: Chromaprint fast pre-filter + ECAPA-TDNN deep re-ranking.
3 """
4 2
5 import json 3 import json
6 import time 4 import time
5 from pathlib import Path
7 from typing import Dict, List, Optional 6 from typing import Dict, List, Optional
8 7
9 import librosa 8 import librosa
10 import numpy as np 9 import numpy as np
11 10
11 from src.utils.audio import AudioProcessor
12
12 13
13 class Candidate: 14 class Candidate:
14 def __init__(self, song_id: str, chroma_score: float = 0.0, ecapa_score: float = 0.0): 15 def __init__(self, song_id: str, chroma_score: float = 0.0, ecapa_score: float = 0.0, melody_score: float = 0.0):
15 self.song_id = song_id 16 self.song_id = song_id
16 self.chroma_score = chroma_score 17 self.chroma_score = chroma_score
17 self.ecapa_score = ecapa_score 18 self.ecapa_score = ecapa_score
19 self.melody_score = melody_score
18 self.metadata: Dict = {} 20 self.metadata: Dict = {}
19 21
20 def combined_score(self, chroma_weight: float, ecapa_weight: float) -> float: 22 def combined_score(self, chroma_weight: float, ecapa_weight: float, melody_weight: float) -> float:
21 return chroma_weight * self.chroma_score + ecapa_weight * self.ecapa_score 23 return (
22 24 chroma_weight * self.chroma_score
23 def __repr__(self): 25 + ecapa_weight * self.ecapa_score
24 return f"Candidate({self.song_id}, chroma={self.chroma_score:.3f}, ecapa={self.ecapa_score:.3f})" 26 + melody_weight * self.melody_score
27 )
25 28
26 29
27 class HybridEngine: 30 class HybridEngine:
...@@ -32,8 +35,9 @@ class HybridEngine: ...@@ -32,8 +35,9 @@ class HybridEngine:
32 ref_embs: Optional[np.ndarray] = None, 35 ref_embs: Optional[np.ndarray] = None,
33 ref_ids: Optional[List[str]] = None, 36 ref_ids: Optional[List[str]] = None,
34 sr: int = 16000, 37 sr: int = 16000,
35 chroma_weight: float = 0.35, 38 chroma_weight: float = 0.25,
36 ecapa_weight: float = 0.65, 39 ecapa_weight: float = 0.5,
40 melody_weight: float = 0.25,
37 reject_threshold: float = 0.35, 41 reject_threshold: float = 0.35,
38 ): 42 ):
39 self.chroma = chroma_matcher 43 self.chroma = chroma_matcher
...@@ -43,12 +47,16 @@ class HybridEngine: ...@@ -43,12 +47,16 @@ class HybridEngine:
43 self.sr = sr 47 self.sr = sr
44 self.chroma_weight = chroma_weight 48 self.chroma_weight = chroma_weight
45 self.ecapa_weight = ecapa_weight 49 self.ecapa_weight = ecapa_weight
50 self.melody_weight = melody_weight
46 self.reject_threshold = reject_threshold 51 self.reject_threshold = reject_threshold
47 self.song_metadata: Dict[str, Dict] = {} 52 self.song_metadata: Dict[str, Dict] = {}
53 self.song_audio_paths: Dict[str, str] = {}
54 self.audio = AudioProcessor(sr=sr)
48 55
49 def load_metadata(self, metadata_path: str): 56 def load_metadata(self, metadata_path: str):
50 with open(metadata_path) as f: 57 with open(metadata_path) as f:
51 items = json.load(f) 58 items = json.load(f)
59 base_dir = str(Path(metadata_path).parent)
52 for item in items: 60 for item in items:
53 sid = item["song_id"] 61 sid = item["song_id"]
54 existing = self.song_metadata.get(sid, {}) 62 existing = self.song_metadata.get(sid, {})
...@@ -59,15 +67,15 @@ class HybridEngine: ...@@ -59,15 +67,15 @@ class HybridEngine:
59 "audio_path": item.get("audio_path", existing.get("audio_path", "")), 67 "audio_path": item.get("audio_path", existing.get("audio_path", "")),
60 "type": item.get("type", existing.get("type", "unknown")), 68 "type": item.get("type", existing.get("type", "unknown")),
61 } 69 }
70 if item.get("type") == "reference":
71 self.song_audio_paths[sid] = str(Path(base_dir) / item["audio_path"])
62 72
63 @staticmethod 73 @staticmethod
64 def _normalize_scores(score_pairs: List[tuple], invert: bool = False) -> Dict[str, float]: 74 def _normalize_scores(score_pairs: List[tuple]) -> Dict[str, float]:
65 if not score_pairs: 75 if not score_pairs:
66 return {} 76 return {}
67 ids = [sid for sid, _ in score_pairs] 77 ids = [sid for sid, _ in score_pairs]
68 values = np.array([float(score) for _, score in score_pairs], dtype=np.float32) 78 values = np.array([float(score) for _, score in score_pairs], dtype=np.float32)
69 if invert:
70 values = -values
71 if len(values) == 1: 79 if len(values) == 1:
72 return {ids[0]: 1.0} 80 return {ids[0]: 1.0}
73 vmin = float(values.min()) 81 vmin = float(values.min())
...@@ -77,12 +85,18 @@ class HybridEngine: ...@@ -77,12 +85,18 @@ class HybridEngine:
77 norm = (values - vmin) / (vmax - vmin) 85 norm = (values - vmin) / (vmax - vmin)
78 return {sid: float(score) for sid, score in zip(ids, norm)} 86 return {sid: float(score) for sid, score in zip(ids, norm)}
79 87
80 def recognize( 88 def _melody_scores(self, query_y: np.ndarray, candidate_ids: List[str]) -> Dict[str, float]:
81 self, 89 scores = []
82 audio_path: str, 90 for song_id in candidate_ids:
83 top_n: int = 5, 91 ref_path = self.song_audio_paths.get(song_id)
84 mode: str = "auto", 92 if not ref_path or not Path(ref_path).exists():
85 ) -> Dict: 93 continue
94 ref_y, _ = librosa.load(ref_path, sr=self.sr, mono=True, duration=8.0)
95 score = self.audio.melody_similarity(query_y, ref_y)
96 scores.append((song_id, score))
97 return self._normalize_scores(scores)
98
99 def recognize(self, audio_path: str, top_n: int = 5, mode: str = "auto") -> Dict:
86 del mode 100 del mode
87 start = time.time() 101 start = time.time()
88 y, _ = librosa.load(audio_path, sr=self.sr, mono=True) 102 y, _ = librosa.load(audio_path, sr=self.sr, mono=True)
...@@ -96,41 +110,45 @@ class HybridEngine: ...@@ -96,41 +110,45 @@ class HybridEngine:
96 ref_norm = self.ref_embs / (np.linalg.norm(self.ref_embs, axis=1, keepdims=True) + 1e-12) 110 ref_norm = self.ref_embs / (np.linalg.norm(self.ref_embs, axis=1, keepdims=True) + 1e-12)
97 query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-12) 111 query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-12)
98 scores = query_norm @ ref_norm.T 112 scores = query_norm @ ref_norm.T
99 top_indices = np.argsort(-scores)[: max(top_n * 5, 20)] 113 top_indices = np.argsort(-scores)[: max(top_n * 10, 30)]
100 ecapa_matches = [(self.ref_ids[idx], float(scores[idx])) for idx in top_indices] 114 ecapa_matches = [(self.ref_ids[idx], float(scores[idx])) for idx in top_indices]
101 ecapa_norm = self._normalize_scores(ecapa_matches) 115 ecapa_norm = self._normalize_scores(ecapa_matches)
102 116
103 all_song_ids = set(chroma_norm) | set(ecapa_norm) 117 candidate_pool = list(set(list(chroma_norm.keys())[: top_n * 8] + list(ecapa_norm.keys())[: top_n * 8]))
118 melody_norm = self._melody_scores(y, candidate_pool)
119
120 all_song_ids = set(candidate_pool) | set(melody_norm)
104 combined: List[Candidate] = [] 121 combined: List[Candidate] = []
105 for song_id in all_song_ids: 122 for song_id in all_song_ids:
106 candidate = Candidate( 123 candidate = Candidate(
107 song_id=song_id, 124 song_id=song_id,
108 chroma_score=chroma_norm.get(song_id, 0.0), 125 chroma_score=chroma_norm.get(song_id, 0.0),
109 ecapa_score=ecapa_norm.get(song_id, 0.0), 126 ecapa_score=ecapa_norm.get(song_id, 0.0),
127 melody_score=melody_norm.get(song_id, 0.0),
110 ) 128 )
111 candidate.metadata = self.song_metadata.get(song_id, {}) 129 candidate.metadata = self.song_metadata.get(song_id, {})
112 combined.append(candidate) 130 combined.append(candidate)
113 131
114 combined.sort(key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight), reverse=True) 132 combined.sort(
133 key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight),
134 reverse=True,
135 )
115 results = combined[:top_n] 136 results = combined[:top_n]
116 elapsed = (time.time() - start) * 1000 137 elapsed = (time.time() - start) * 1000
117 138
118 output = [] 139 output = []
119 for c in results: 140 for c in results:
120 fused = c.combined_score(self.chroma_weight, self.ecapa_weight) 141 fused = c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight)
121 output.append( 142 output.append(
122 { 143 {
123 "song_id": c.song_id, 144 "song_id": c.song_id,
124 "confidence": round(fused, 4), 145 "confidence": round(fused, 4),
125 "chromaprint_score": round(c.chroma_score, 4), 146 "chromaprint_score": round(c.chroma_score, 4),
126 "ecapa_score": round(c.ecapa_score, 4), 147 "ecapa_score": round(c.ecapa_score, 4),
148 "melody_score": round(c.melody_score, 4),
127 "accepted": fused >= self.reject_threshold, 149 "accepted": fused >= self.reject_threshold,
128 "metadata": c.metadata, 150 "metadata": c.metadata,
129 } 151 }
130 ) 152 )
131 153
132 return { 154 return {"candidates": output, "processing_time_ms": round(elapsed, 1), "num_candidates": len(results)}
133 "candidates": output,
134 "processing_time_ms": round(elapsed, 1),
135 "num_candidates": len(results),
136 }
......
1 from pathlib import Path
2 from typing import Optional
3
4 import numpy as np
5 from fastapi import FastAPI, HTTPException
6 from pydantic import BaseModel
7
8 from src.engines.chromaprint_matcher import ChromaprintMatcher
9 from src.engines.ecapa_embedder import ECAPAEmbedder
10 from src.engines.hybrid_engine import HybridEngine
11
12
class RecognizeRequest(BaseModel):
    """Request body for ``POST /recognize``."""

    # `model_path` starts with pydantic v2's protected "model_" prefix, which
    # triggers a namespace-conflict warning on the pydantic versions this
    # project pins; clearing the protected namespaces keeps the field name
    # (and therefore the public API) unchanged.
    model_config = {"protected_namespaces": ()}

    query_path: str  # audio file to identify
    data_dir: str = "data/synthetic_v2"  # directory searched for metadata manifests
    model_path: str = "data/models_v3/best_model.pt"  # embedder checkpoint
    index_prefix: str = "data/index_v3/reference"  # prefix of the *_embs.npy / *_ids.npy files
    top_n: int = 5  # number of candidates to return
    device: str = "cpu"  # device string forwarded to the embedder
20
21
class BuildIndexRequest(BaseModel):
    """Request body for ``POST /index/build``."""

    # `model_path` starts with pydantic v2's protected "model_" prefix, which
    # triggers a namespace-conflict warning on the pydantic versions this
    # project pins; clearing the protected namespaces keeps the field name
    # (and therefore the public API) unchanged.
    model_config = {"protected_namespaces": ()}

    data_dir: str  # dataset directory to index
    model_path: str  # embedder checkpoint
    output_dir: str  # directory receiving the index files
    device: str = "cpu"  # device string forwarded to the index builder
27
28
29 app = FastAPI(title="ACR Service", version="0.1.0")
30
31
def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) -> HybridEngine:
    """Assemble a ``HybridEngine`` from on-disk artifacts.

    Args:
        data_dir: Directory searched for ``catalog.json``, ``train.json``,
            ``val.json`` and ``test.json``; each one that exists is loaded as
            metadata, missing ones are skipped.
        model_path: Checkpoint handed to ``ECAPAEmbedder``.
        index_prefix: Prefix of the ``<prefix>_embs.npy`` / ``<prefix>_ids.npy``
            files; ``chromaprint.pkl`` is expected in the same directory.
        device: Device string handed to ``ECAPAEmbedder``.

    Returns:
        A ready-to-query engine.

    Raises:
        HTTPException: status 400 when an artifact is missing or when the
            embedding matrix and the id list disagree in length.
    """
    chroma_path = str(Path(index_prefix).parent / "chromaprint.pkl")
    embs_path = f"{index_prefix}_embs.npy"
    ids_path = f"{index_prefix}_ids.npy"

    # Check every artifact up front so a bad request fails before the model
    # checkpoint and fingerprint index are loaded.
    if not Path(chroma_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing chromaprint index: {chroma_path}")
    if not Path(model_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing model: {model_path}")
    if not Path(embs_path).exists() or not Path(ids_path).exists():
        raise HTTPException(status_code=400, detail="Missing embedding index files")

    matcher = ChromaprintMatcher()
    # NOTE(review): the ".pkl" suffix suggests this load unpickles the file; confirm.
    matcher.load(chroma_path)
    embedder = ECAPAEmbedder(model_path=model_path, device=device)

    ref_embs = np.load(embs_path)
    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # loading, and `index_prefix` arrives from the request body. Restrict it to
    # a trusted directory or store the ids in a pickle-free format.
    ref_ids = np.load(ids_path, allow_pickle=True).tolist()

    # The engine maps embedding row i to ref_ids[i]; a size mismatch would
    # otherwise surface later as an IndexError or a wrong song id.
    n_rows = ref_embs.shape[0] if ref_embs.ndim else 0
    if n_rows != len(ref_ids):
        raise HTTPException(
            status_code=400,
            detail=f"Embedding index mismatch: {n_rows} embeddings vs {len(ref_ids)} ids",
        )

    engine = HybridEngine(matcher, embedder, ref_embs, ref_ids)
    for split in ["catalog.json", "train.json", "val.json", "test.json"]:
        p = Path(data_dir) / split
        if p.exists():
            engine.load_metadata(str(p))
    return engine
56
57
@app.get("/health")
def health():
    """Health-check endpoint: always answers with a fixed ``status`` payload."""
    payload = {"status": "ok"}
    return payload
61
62
@app.post("/recognize")
def recognize(req: RecognizeRequest):
    """Identify the song contained in ``req.query_path``.

    Builds an engine from the paths in the request and returns the engine's
    recognition result.

    Raises:
        HTTPException: status 400 when ``top_n`` is below 1, when the query is
            not an existing file, or when an engine artifact is missing.
    """
    # The engine slices its ranked list with [:top_n]; zero returns nothing and
    # a negative value would silently drop candidates from the end instead.
    if req.top_n < 1:
        raise HTTPException(status_code=400, detail=f"top_n must be >= 1, got {req.top_n}")
    # is_file() also rejects directories, which exists() would let through.
    if not Path(req.query_path).is_file():
        raise HTTPException(status_code=400, detail=f"Missing query file: {req.query_path}")
    engine = _load_engine(req.data_dir, req.model_path, req.index_prefix, req.device)
    return engine.recognize(req.query_path, top_n=req.top_n)
69
70
@app.post("/index/build")
def build_index(req: BuildIndexRequest):
    """Build the chromaprint and embedding indexes for a dataset directory.

    Returns:
        A dict with ``status``, ``num_reference_windows`` (number of reference
        ids) and ``embedding_dim`` (second dimension of the embedding matrix,
        or 0 when it has fewer than two dimensions).

    Raises:
        HTTPException: status 400 when the data directory or the model file
            does not exist.
    """
    # Imported lazily, as in the original, so the service module can be
    # imported without pulling in the demo script.
    from run_demo import build_chroma_index, build_embedding_index

    data_dir = Path(req.data_dir)
    model_path = Path(req.model_path)
    # Reject bad input with a 400, matching the engine loader, and do so
    # before creating the output directory.
    if not data_dir.is_dir():
        raise HTTPException(status_code=400, detail=f"Missing data directory: {req.data_dir}")
    if not model_path.exists():
        raise HTTPException(status_code=400, detail=f"Missing model: {req.model_path}")

    out_dir = Path(req.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    build_chroma_index(data_dir, out_dir)
    _, ref_embs, ref_ids = build_embedding_index(data_dir, model_path, out_dir / "reference", req.device)
    embedding_dim = int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0
    return {
        "status": "ok",
        "num_reference_windows": len(ref_ids),
        "embedding_dim": embedding_dim,
    }
1 import torch
2 import torch.nn as nn
3 import torch.nn.functional as F
4 import numpy as np
5 import librosa 1 import librosa
2 import numpy as np
3 import torch
6 from typing import List, Optional, Tuple 4 from typing import List, Optional, Tuple
7 5
8 6
9 class AudioProcessor: 7 class AudioProcessor:
10 def __init__(self, sr: int = 16000, n_mels: int = 80, n_fft: int = 512, hop_length: int = 160): 8 def __init__(self, sr: int = 16000, n_mels: int = 128, n_fft: int = 512, hop_length: int = 160):
11 self.sr = sr 9 self.sr = sr
12 self.n_mels = n_mels 10 self.n_mels = n_mels
13 self.n_fft = n_fft 11 self.n_fft = n_fft
...@@ -19,8 +17,7 @@ class AudioProcessor: ...@@ -19,8 +17,7 @@ class AudioProcessor:
19 17
20 def to_mel(self, y: np.ndarray) -> np.ndarray: 18 def to_mel(self, y: np.ndarray) -> np.ndarray:
21 mel = librosa.feature.melspectrogram( 19 mel = librosa.feature.melspectrogram(
22 y=y, sr=self.sr, n_mels=self.n_mels, 20 y=y, sr=self.sr, n_mels=self.n_mels, n_fft=self.n_fft, hop_length=self.hop_length
23 n_fft=self.n_fft, hop_length=self.hop_length
24 ) 21 )
25 return librosa.power_to_db(mel, ref=np.max) 22 return librosa.power_to_db(mel, ref=np.max)
26 23
...@@ -36,7 +33,7 @@ class AudioProcessor: ...@@ -36,7 +33,7 @@ class AudioProcessor:
36 y = np.pad(y, (0, pad)) 33 y = np.pad(y, (0, pad))
37 windows = [] 34 windows = []
38 for start in range(0, len(y) - win_len + 1, stride): 35 for start in range(0, len(y) - win_len + 1, stride):
39 windows.append(y[start:start + win_len]) 36 windows.append(y[start : start + win_len])
40 if not windows: 37 if not windows:
41 windows.append(y[:win_len]) 38 windows.append(y[:win_len])
42 return windows 39 return windows
...@@ -47,10 +44,32 @@ class AudioProcessor: ...@@ -47,10 +44,32 @@ class AudioProcessor:
47 return self.to_mel_tensor(y), duration 44 return self.to_mel_tensor(y), duration
48 45
49 def extract_chroma(self, y: np.ndarray) -> np.ndarray: 46 def extract_chroma(self, y: np.ndarray) -> np.ndarray:
50 chroma = librosa.feature.chroma_cqt(y=y, sr=self.sr) 47 return librosa.feature.chroma_cqt(y=y, sr=self.sr)
51 return chroma
52 48
53 def extract_f0(self, y: np.ndarray, fmin=65, fmax=2093) -> np.ndarray: 49 def extract_f0(self, y: np.ndarray, fmin=65, fmax=2093) -> np.ndarray:
54 f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=fmin, fmax=fmax) 50 f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=fmin, fmax=fmax)
55 f0 = np.nan_to_num(f0, nan=0.0) 51 return np.nan_to_num(f0, nan=0.0)
56 return f0 52
def melody_signature(self, y: np.ndarray) -> np.ndarray:
    """Summarise the pitch movement of ``y`` as a 32-value float32 vector.

    The F0 track from ``extract_f0`` is reduced to its positive frames, turned
    into frame-to-frame log2 pitch steps clipped to [-0.5, 0.5], and then
    zero-padded (fewer than 32 steps) or sub-sampled at evenly spaced indices
    (32 steps or more). An all-zero vector is returned when there is no
    positive F0 frame.
    """
    pitch = self.extract_f0(y)
    voiced = pitch[pitch > 0] if pitch.size else pitch
    if voiced.size == 0:
        return np.zeros(32, dtype=np.float32)

    log_pitch = np.log2(voiced + 1e-6)
    # Prepending the first value makes the first step exactly zero.
    steps = np.diff(log_pitch, prepend=np.log2(voiced[0] + 1e-6))
    steps = np.clip(steps, -0.5, 0.5)

    if steps.size >= 32:
        picks = np.linspace(0, steps.size - 1, 32).astype(int)
        steps = steps[picks]
    else:
        steps = np.pad(steps, (0, 32 - steps.size))
    return steps.astype(np.float32)
68
def melody_similarity(self, y1: np.ndarray, y2: np.ndarray) -> float:
    """Cosine similarity between the melody signatures of two signals.

    Returns 0.0 when the product of the two signature norms is zero.
    """
    sig_a, sig_b = self.melody_signature(y1), self.melody_signature(y2)
    # The epsilon keeps the division finite; a scale that is still at the
    # epsilon means at least one signature is all zeros.
    scale = float(np.linalg.norm(sig_a) * np.linalg.norm(sig_b) + 1e-12)
    return 0.0 if scale <= 1e-12 else float(np.dot(sig_a, sig_b) / scale)
......
...@@ -53,3 +53,25 @@ ...@@ -53,3 +53,25 @@
53 结论: 53 结论:
54 - 结构性错误(catalog/index/fusion/评测缺失)已明显改善 54 - 结构性错误(catalog/index/fusion/评测缺失)已明显改善
55 - 当前主要剩余短板是 humming_like / confused 的鲁棒识别 55 - 当前主要剩余短板是 humming_like / confused 的鲁棒识别
56
57 ## 2026-06-02
58
59 ### Stage: 工业化服务骨架 + 外部 manifest 转换模板
60
61 完成项:
62 - 新增 FastAPI 服务骨架:`acr-engine/src/service/app.py`
63 - 新增 manifest 转换工具:`acr-engine/src/data/manifest_tools.py`
64 - 新增工业 benchmark 文档:`docs/industrial-benchmark-spec.md`
65 - 扩展外部 dataset adapter CLI:`acr-engine/src/data/external_adapters.py`
66 - 新增服务 API 文档:`docs/service-api.md`
67 - requirements 增加 FastAPI / uvicorn / pydantic
68
69 验证结果:
70 - `external_adapters.py registry` 成功
71 - `external_adapters.py describe ccmusic` 成功
72 - `external_adapters.py init modelscope_music` 成功
73 - `manifest_tools.py csv-to-catalog` 成功生成 catalog
74 - `service.app health()` 返回 `{"status":"ok"}`
75 - API `build_index(...)` 成功返回 reference window 数量
76 - API `recognize(...)` 成功返回候选结果
77 - `train.py --dry-run` 成功
......
1 # Dataset Sources and Licensing Notes
2
3 > 更新:2026-06-02
4
5 ## 注意
6 以下仅为工程接入与研究规划说明,不等于法律意见。实际商用前需要逐条复核原始 license、dataset terms 和再训练约束。
7
8 ## 候选数据源
9
10 ### 1. FMA
11 - URL: https://github.com/mdeff/fma
12 - 特点: 开放、MIR 常用、适合 retrieval baseline
13 - 风险: 音频 license 按 artist/track 可能不同,需逐条核验
14
15 ### 2. MTG-Jamendo
16 - URL: https://github.com/MTG/mtg-jamendo-dataset
17 - 特点: Creative Commons 来源,适合音乐检索/标签任务
18 - 风险: 仍需按具体曲目用途与商业场景做 license 审查
19
20 ### 3. CCMusic
21 - 论文/介绍: https://transactions.ismir.net/articles/10.5334/tismir.194
22 - 主页: https://ccmusic-database.github.io/en/database/ccm.html
23 - 特点: 中国音乐 MIR 数据资源丰富
24 - 风险: 部分数据集可能需要申请或存在使用边界,必须单独核验
25
26 ### 4. ModelScope music datasets
27 - 入口: https://www.modelscope.cn/datasets
28 - 搜索: https://modelscope.cn/search?page=1&search=music&type=dataset
29 - 特点: 数据发现方便,可扩充中文生态
30 - 风险: license 分散,不能默认可商用;接入前必须建立白名单
31
32 ## 接入原则
33
34 - 只接入 license 明确的数据集
35 - 默认拒绝“来源不明 / 不允许商业使用 / 禁止训练衍生模型”的数据
36 - 训练前把数据集及许可信息落盘到 registry
1 # Industrial Benchmark Spec
2
3 > 更新:2026-06-02
4
5 ## 目标
6 为工业级可商用 ACR 设立持续基准,不只看总体 top1/top5,还看场景化与风险化指标。
7
8 ## Benchmark 维度
9
10 ### 1. Retrieval Quality
11 - top1
12 - top5
13 - MRR
14 - recall@k
15
16 ### 2. Scenario Buckets
17 - clean
18 - noisy
19 - compressed
20 - time-stretched
21 - pitch-shifted
22 - humming_like
23 - confused
24 - partial-overlap
25 - far-field / device-recorded
26
27 ### 3. Catalog Scale Buckets
28 - 1K songs
29 - 10K songs
30 - 100K songs
31 - 1M+ songs
32
33 ### 4. Operational Metrics
34 - p50 / p95 latency
35 - indexing throughput
36 - incremental update time
37 - memory / disk footprint
38
39 ### 5. Business Safety Metrics
40 - false accept rate
41 - rejection quality
42 - near-duplicate confusion rate
43 - license provenance coverage
44
45 ## Required Artifacts per Model Release
46 - dataset registry snapshot
47 - training config snapshot
48 - benchmark report JSON
49 - benchmark summary markdown
50 - model card
51 - license review manifest
52
53 ## Minimum Go/No-Go Gate
54 - clean top1 >= 0.95
55 - noisy top1 >= 0.85
56 - confused top1 >= 0.70
57 - humming_like top1 >= 0.60
58 - top5 >= 0.95 on all production-relevant buckets
59 - false accept below agreed threshold
1 # ACR 工业级可商用演进路线
2
3 > 更新:2026-06-02
4
5 ## 1. 目标定义
6
7 把当前原型升级为一个可商用的工业级 ACR 系统,满足:
8
9 - 可扩展曲库管理
10 - 可重复训练 / 评测 / 部署
11 - 多数据源接入(synthetic / FMA / Jamendo / CCMusic / ModelScope)
12 - 更强鲁棒性(噪声、失真、哼唱、混淆)
13 - 检索服务化
14 - 商用合规与授权边界可审计
15
16 ## 2. 工业级分层
17
18 ### 2.1 数据层
19 - `catalog.json` / query manifests
20 - 外部 dataset adapters
21 - license / usage tracking
22 - 数据版本与快照
23
24 ### 2.2 训练层
25 - baseline encoder
26 - foundation-model encoder
27 - retrieval-first losses
28 - hard negative mining
29 - 数据平衡与生成增强
30
31 ### 2.3 索引层
32 - window-level embeddings
33 - ANN index (Faiss/HNSW)
34 - 指纹索引与向量索引双路
35 - 增量入库
36
37 ### 2.4 服务层
38 - FastAPI / gRPC
39 - batch ingest
40 - recognize API
41 - top-k candidate + rejection
42 - metadata lookup
43
44 ### 2.5 质量层
45 - regression benchmark
46 - hard-case benchmark
47 - online shadow evaluation
48 - 数据/模型回滚机制
49
50 ## 3. 数据集策略
51
52 ### 第一梯队(优先)
53 - FMA small / medium
54 - MTG-Jamendo
55 - CCMusic(需核验申请/授权方式)
56 - ModelScope music datasets(按 license 白名单接入)
57
58 ### 第二梯队
59 - humming / QBSH 数据集
60 - instrument / structure / singing datasets 作为辅助监督
61
62 ## 4. 商用必做项
63
64 - 每个 dataset 记录:
65 - 来源 URL
66 - license
67 - 是否允许商业使用
68 - 再分发限制
69 - 模型训练用途限制
70 - 每个模型版本记录训练数据组成
71 - 每次上线保留评测报告与可追溯哈希
72
73 ## 5. 当前到工业化的缺口
74
75 - 缺 dataset adapter 层
76 - 缺 ANN 检索
77 - 缺 API 服务
78 - 缺 license registry
79 - 缺 foundation-model baseline
80 - 缺真正的 hard-negative mining
81 - 缺真实开源数据 benchmark
1 # ACR Service API
2
3 ## Endpoints
4
5 ### GET /health
6 返回服务健康状态。
7
8 ### POST /recognize
9 请求体:
10
11 ```json
12 {
13 "query_path": "data/synthetic_v2/segments/song_0021_seg_01_augmented.wav",
14 "data_dir": "data/synthetic_v2",
15 "model_path": "data/models_v3/best_model.pt",
16 "index_prefix": "data/index_v3/reference",
17 "top_n": 5,
18 "device": "cpu"
19 }
20 ```
21
22 ### POST /index/build
23 请求体:
24
25 ```json
26 {
27 "data_dir": "data/synthetic_v2",
28 "model_path": "data/models_v3/best_model.pt",
29 "output_dir": "data/index_v3",
30 "device": "cpu"
31 }
32 ```