Commit 31a72045 31a720458be99c6fb7c980d570e9db42ed40eed4 by cnb.bofCdSsphPA

add src

1 parent 4b16286e
Showing 188 changed files with 1994 additions and 43 deletions
......@@ -38,8 +38,9 @@ engine:
n_fft: 1024
hop_length: 256
hybrid:
chroma_weight: 0.3
ecapa_weight: 0.7
chroma_weight: 0.25
ecapa_weight: 0.5
melody_weight: 0.25
reject_threshold: 0.4
augmentation:
......
[
{
"name": "FMA",
"source_url": "https://github.com/mdeff/fma",
"license": "Track-dependent / metadata CC BY 4.0; verify per subset",
"commercial_use": "review_required",
"notes": "Good first realistic MIR baseline"
},
{
"name": "MTG-Jamendo",
"source_url": "https://github.com/MTG/mtg-jamendo-dataset",
"license": "Creative Commons source tracks; verify exact subset terms",
"commercial_use": "review_required",
"notes": "Good retrieval/tagging corpus with scripts"
},
{
"name": "CCMusic",
"source_url": "https://ccmusic-database.github.io/en/database/ccm.html",
"license": "varies / application may be required",
"commercial_use": "review_required",
"notes": "Useful Chinese MIR source, needs permission review"
},
{
"name": "ModelScope-music",
"source_url": "https://modelscope.cn/search?page=1&search=music&type=dataset",
"license": "varies by dataset",
"commercial_use": "deny_until_whitelisted",
"notes": "Discovery surface only until per-dataset review is complete"
}
]
\ No newline at end of file
{
"dataset": "modelscope_music",
"root": "data/external/modelscope_music",
"status": "initialized",
"next_steps": [
"download raw audio according to upstream license terms",
"convert to catalog/query manifests",
"record license evidence before training"
]
}
\ No newline at end of file
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
This file is too large to display.
{
"song_0000": 0,
"song_0001": 1,
"song_0002": 2,
"song_0003": 3,
"song_0004": 4,
"song_0005": 5,
"song_0006": 6,
"song_0007": 7,
"song_0008": 8,
"song_0009": 9,
"song_0010": 10,
"song_0011": 11,
"song_0012": 12,
"song_0013": 13,
"song_0014": 14,
"song_0015": 15
}
\ No newline at end of file
[
{
"song_id": "song_0000",
"audio_path": "songs/song_0000.wav",
"duration": 15.0,
"base_freq": 130.81,
"type": "reference"
},
{
"song_id": "song_0001",
"audio_path": "songs/song_0001.wav",
"duration": 15.0,
"base_freq": 146.83,
"type": "reference"
},
{
"song_id": "song_0002",
"audio_path": "songs/song_0002.wav",
"duration": 15.0,
"base_freq": 164.81,
"type": "reference"
},
{
"song_id": "song_0003",
"audio_path": "songs/song_0003.wav",
"duration": 15.0,
"base_freq": 174.61,
"type": "reference"
},
{
"song_id": "song_0004",
"audio_path": "songs/song_0004.wav",
"duration": 15.0,
"base_freq": 196.0,
"type": "reference"
},
{
"song_id": "song_0005",
"audio_path": "songs/song_0005.wav",
"duration": 15.0,
"base_freq": 220.0,
"type": "reference"
},
{
"song_id": "song_0006",
"audio_path": "songs/song_0006.wav",
"duration": 15.0,
"base_freq": 246.94,
"type": "reference"
},
{
"song_id": "song_0007",
"audio_path": "songs/song_0007.wav",
"duration": 15.0,
"base_freq": 261.63,
"type": "reference"
},
{
"song_id": "song_0008",
"audio_path": "songs/song_0008.wav",
"duration": 15.0,
"base_freq": 293.66,
"type": "reference"
},
{
"song_id": "song_0009",
"audio_path": "songs/song_0009.wav",
"duration": 15.0,
"base_freq": 329.63,
"type": "reference"
},
{
"song_id": "song_0010",
"audio_path": "songs/song_0010.wav",
"duration": 15.0,
"base_freq": 349.23,
"type": "reference"
},
{
"song_id": "song_0011",
"audio_path": "songs/song_0011.wav",
"duration": 15.0,
"base_freq": 392.0,
"type": "reference"
},
{
"song_id": "song_0012",
"audio_path": "songs/song_0012.wav",
"duration": 15.0,
"base_freq": 440.0,
"type": "reference"
},
{
"song_id": "song_0013",
"audio_path": "songs/song_0013.wav",
"duration": 15.0,
"base_freq": 493.88,
"type": "reference"
},
{
"song_id": "song_0014",
"audio_path": "songs/song_0014.wav",
"duration": 15.0,
"base_freq": 523.25,
"type": "reference"
},
{
"song_id": "song_0015",
"audio_path": "songs/song_0015.wav",
"duration": 15.0,
"base_freq": 587.33,
"type": "reference"
},
{
"song_id": "song_0016",
"audio_path": "songs/song_0016.wav",
"duration": 15.0,
"base_freq": 659.25,
"type": "reference"
},
{
"song_id": "song_0017",
"audio_path": "songs/song_0017.wav",
"duration": 15.0,
"base_freq": 698.46,
"type": "reference"
},
{
"song_id": "song_0018",
"audio_path": "songs/song_0018.wav",
"duration": 15.0,
"base_freq": 783.99,
"type": "reference"
},
{
"song_id": "song_0019",
"audio_path": "songs/song_0019.wav",
"duration": 15.0,
"base_freq": 880.0,
"type": "reference"
},
{
"song_id": "song_0020",
"audio_path": "songs/song_0020.wav",
"duration": 15.0,
"base_freq": 987.77,
"type": "reference"
},
{
"song_id": "song_0021",
"audio_path": "songs/song_0021.wav",
"duration": 15.0,
"base_freq": 146.8292605393491,
"type": "reference"
},
{
"song_id": "song_0022",
"audio_path": "songs/song_0022.wav",
"duration": 15.0,
"base_freq": 164.81110255326524,
"type": "reference"
},
{
"song_id": "song_0023",
"audio_path": "songs/song_0023.wav",
"duration": 15.0,
"base_freq": 184.99297018186778,
"type": "reference"
}
]
\ No newline at end of file
[
{
"song_id": "song_0020",
"audio_path": "segments/song_0020_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.349828784349853,
"segment_type": "mid"
},
{
"song_id": "song_0020",
"audio_path": "segments/song_0020_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 9.642182747327407,
"segment_type": "mid"
},
{
"song_id": "song_0020",
"audio_path": "segments/song_0020_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 2.367717347418965,
"segment_type": "intro"
},
{
"song_id": "song_0020",
"audio_path": "segments/song_0020_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 3.180577192661006,
"segment_type": "mid"
},
{
"song_id": "song_0020",
"audio_path": "segments/song_0020_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 4.660551124366617,
"segment_type": "mid"
},
{
"song_id": "song_0020",
"audio_path": "songs/song_0020.wav",
"duration": 15.0,
"base_freq": 987.77,
"type": "reference"
},
{
"song_id": "song_0021",
"audio_path": "segments/song_0021_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.631088908640184,
"segment_type": "mid"
},
{
"song_id": "song_0021",
"audio_path": "segments/song_0021_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 1.8823366490525628,
"segment_type": "intro"
},
{
"song_id": "song_0021",
"audio_path": "segments/song_0021_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 9.88006210404643,
"segment_type": "mid"
},
{
"song_id": "song_0021",
"audio_path": "segments/song_0021_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 0.9025737685090285,
"segment_type": "intro"
},
{
"song_id": "song_0021",
"audio_path": "segments/song_0021_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 1.3048954561918258,
"segment_type": "intro"
},
{
"song_id": "song_0021",
"audio_path": "songs/song_0021.wav",
"duration": 15.0,
"base_freq": 146.8292605393491,
"type": "reference"
},
{
"song_id": "song_0022",
"audio_path": "segments/song_0022_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.9746734850812295,
"segment_type": "mid"
},
{
"song_id": "song_0022",
"audio_path": "segments/song_0022_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.890968121206573,
"segment_type": "mid"
},
{
"song_id": "song_0022",
"audio_path": "segments/song_0022_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 6.610400547460049,
"segment_type": "mid"
},
{
"song_id": "song_0022",
"audio_path": "segments/song_0022_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 2.6329596668288424,
"segment_type": "intro"
},
{
"song_id": "song_0022",
"audio_path": "segments/song_0022_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 0.8570731183991709,
"segment_type": "intro"
},
{
"song_id": "song_0022",
"audio_path": "songs/song_0022.wav",
"duration": 15.0,
"base_freq": 164.81110255326524,
"type": "reference"
},
{
"song_id": "song_0023",
"audio_path": "segments/song_0023_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.461034326075292,
"segment_type": "mid"
},
{
"song_id": "song_0023",
"audio_path": "segments/song_0023_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 9.605203782802876,
"segment_type": "mid"
},
{
"song_id": "song_0023",
"audio_path": "segments/song_0023_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 4.7458228906154805,
"segment_type": "mid"
},
{
"song_id": "song_0023",
"audio_path": "segments/song_0023_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 8.308702013555955,
"segment_type": "mid"
},
{
"song_id": "song_0023",
"audio_path": "segments/song_0023_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 2.213510770155481,
"segment_type": "intro"
},
{
"song_id": "song_0023",
"audio_path": "songs/song_0023.wav",
"duration": 15.0,
"base_freq": 184.99297018186778,
"type": "reference"
}
]
\ No newline at end of file
[
{
"song_id": "song_0000",
"audio_path": "segments/song_0000_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 9.538159275210802,
"segment_type": "mid"
},
{
"song_id": "song_0000",
"audio_path": "segments/song_0000_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 8.75852940378194,
"segment_type": "mid"
},
{
"song_id": "song_0000",
"audio_path": "segments/song_0000_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 2.6338905075109076,
"segment_type": "intro"
},
{
"song_id": "song_0000",
"audio_path": "segments/song_0000_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 6.389494948660052,
"segment_type": "mid"
},
{
"song_id": "song_0000",
"audio_path": "segments/song_0000_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 5.303536721951775,
"segment_type": "mid"
},
{
"song_id": "song_0000",
"audio_path": "songs/song_0000.wav",
"duration": 15.0,
"base_freq": 130.81,
"type": "reference"
},
{
"song_id": "song_0001",
"audio_path": "segments/song_0001_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.227827155319589,
"segment_type": "mid"
},
{
"song_id": "song_0001",
"audio_path": "segments/song_0001_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 9.347062577364273,
"segment_type": "mid"
},
{
"song_id": "song_0001",
"audio_path": "segments/song_0001_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 2.042591994235364,
"segment_type": "intro"
},
{
"song_id": "song_0001",
"audio_path": "segments/song_0001_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 3.1617719627185403,
"segment_type": "mid"
},
{
"song_id": "song_0001",
"audio_path": "segments/song_0001_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 0.73260721099633,
"segment_type": "intro"
},
{
"song_id": "song_0001",
"audio_path": "songs/song_0001.wav",
"duration": 15.0,
"base_freq": 146.83,
"type": "reference"
},
{
"song_id": "song_0002",
"audio_path": "segments/song_0002_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.0928466220865323,
"segment_type": "mid"
},
{
"song_id": "song_0002",
"audio_path": "segments/song_0002_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.083929086192168,
"segment_type": "mid"
},
{
"song_id": "song_0002",
"audio_path": "segments/song_0002_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 4.024003870577246,
"segment_type": "mid"
},
{
"song_id": "song_0002",
"audio_path": "segments/song_0002_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 9.028055457325827,
"segment_type": "mid"
},
{
"song_id": "song_0002",
"audio_path": "segments/song_0002_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 4.2988814998983464,
"segment_type": "mid"
},
{
"song_id": "song_0002",
"audio_path": "songs/song_0002.wav",
"duration": 15.0,
"base_freq": 164.81,
"type": "reference"
},
{
"song_id": "song_0003",
"audio_path": "segments/song_0003_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 0.1938328705001069,
"segment_type": "intro"
},
{
"song_id": "song_0003",
"audio_path": "segments/song_0003_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.394190479225337,
"segment_type": "mid"
},
{
"song_id": "song_0003",
"audio_path": "segments/song_0003_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 9.999078285092093,
"segment_type": "mid"
},
{
"song_id": "song_0003",
"audio_path": "segments/song_0003_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 9.496117327159888,
"segment_type": "mid"
},
{
"song_id": "song_0003",
"audio_path": "segments/song_0003_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 2.1796454090650363,
"segment_type": "intro"
},
{
"song_id": "song_0003",
"audio_path": "songs/song_0003.wav",
"duration": 15.0,
"base_freq": 174.61,
"type": "reference"
},
{
"song_id": "song_0004",
"audio_path": "segments/song_0004_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 9.654976431382948,
"segment_type": "mid"
},
{
"song_id": "song_0004",
"audio_path": "segments/song_0004_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 2.524783904929726,
"segment_type": "intro"
},
{
"song_id": "song_0004",
"audio_path": "segments/song_0004_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 8.617229646275131,
"segment_type": "mid"
},
{
"song_id": "song_0004",
"audio_path": "segments/song_0004_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 1.5172700695095642,
"segment_type": "intro"
},
{
"song_id": "song_0004",
"audio_path": "segments/song_0004_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 4.161740214103284,
"segment_type": "mid"
},
{
"song_id": "song_0004",
"audio_path": "songs/song_0004.wav",
"duration": 15.0,
"base_freq": 196.0,
"type": "reference"
},
{
"song_id": "song_0005",
"audio_path": "segments/song_0005_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.088720150695117,
"segment_type": "mid"
},
{
"song_id": "song_0005",
"audio_path": "segments/song_0005_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 2.734248967132742,
"segment_type": "intro"
},
{
"song_id": "song_0005",
"audio_path": "segments/song_0005_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 8.347239455766944,
"segment_type": "mid"
},
{
"song_id": "song_0005",
"audio_path": "segments/song_0005_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 5.08240891592894,
"segment_type": "mid"
},
{
"song_id": "song_0005",
"audio_path": "segments/song_0005_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 9.3424839368252,
"segment_type": "mid"
},
{
"song_id": "song_0005",
"audio_path": "songs/song_0005.wav",
"duration": 15.0,
"base_freq": 220.0,
"type": "reference"
},
{
"song_id": "song_0006",
"audio_path": "segments/song_0006_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 2.5062680004361604,
"segment_type": "intro"
},
{
"song_id": "song_0006",
"audio_path": "segments/song_0006_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 7.555773237416772,
"segment_type": "mid"
},
{
"song_id": "song_0006",
"audio_path": "segments/song_0006_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 7.674707744954641,
"segment_type": "mid"
},
{
"song_id": "song_0006",
"audio_path": "segments/song_0006_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 0.33364531245632434,
"segment_type": "intro"
},
{
"song_id": "song_0006",
"audio_path": "segments/song_0006_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 2.007947946500762,
"segment_type": "intro"
},
{
"song_id": "song_0006",
"audio_path": "songs/song_0006.wav",
"duration": 15.0,
"base_freq": 246.94,
"type": "reference"
},
{
"song_id": "song_0007",
"audio_path": "segments/song_0007_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.589030736792923,
"segment_type": "mid"
},
{
"song_id": "song_0007",
"audio_path": "segments/song_0007_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.016303290280887,
"segment_type": "mid"
},
{
"song_id": "song_0007",
"audio_path": "segments/song_0007_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 6.433406842054888,
"segment_type": "mid"
},
{
"song_id": "song_0007",
"audio_path": "segments/song_0007_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 4.435623293630087,
"segment_type": "mid"
},
{
"song_id": "song_0007",
"audio_path": "segments/song_0007_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 5.8536468854812105,
"segment_type": "mid"
},
{
"song_id": "song_0007",
"audio_path": "songs/song_0007.wav",
"duration": 15.0,
"base_freq": 261.63,
"type": "reference"
},
{
"song_id": "song_0008",
"audio_path": "segments/song_0008_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 0.42302261562791377,
"segment_type": "intro"
},
{
"song_id": "song_0008",
"audio_path": "segments/song_0008_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 0.18741536585645702,
"segment_type": "intro"
},
{
"song_id": "song_0008",
"audio_path": "segments/song_0008_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 9.211624345024124,
"segment_type": "mid"
},
{
"song_id": "song_0008",
"audio_path": "segments/song_0008_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 4.176939598434806,
"segment_type": "mid"
},
{
"song_id": "song_0008",
"audio_path": "segments/song_0008_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 8.320259130717071,
"segment_type": "mid"
},
{
"song_id": "song_0008",
"audio_path": "songs/song_0008.wav",
"duration": 15.0,
"base_freq": 293.66,
"type": "reference"
},
{
"song_id": "song_0009",
"audio_path": "segments/song_0009_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.076897127246463,
"segment_type": "mid"
},
{
"song_id": "song_0009",
"audio_path": "segments/song_0009_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.397707584136711,
"segment_type": "mid"
},
{
"song_id": "song_0009",
"audio_path": "segments/song_0009_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 7.3864400300146755,
"segment_type": "mid"
},
{
"song_id": "song_0009",
"audio_path": "segments/song_0009_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 5.9724644107162845,
"segment_type": "mid"
},
{
"song_id": "song_0009",
"audio_path": "segments/song_0009_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 7.21182997805427,
"segment_type": "mid"
},
{
"song_id": "song_0009",
"audio_path": "songs/song_0009.wav",
"duration": 15.0,
"base_freq": 329.63,
"type": "reference"
},
{
"song_id": "song_0010",
"audio_path": "segments/song_0010_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.1007588293689183,
"segment_type": "mid"
},
{
"song_id": "song_0010",
"audio_path": "segments/song_0010_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.9822405568601704,
"segment_type": "mid"
},
{
"song_id": "song_0010",
"audio_path": "segments/song_0010_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 8.154060806559823,
"segment_type": "mid"
},
{
"song_id": "song_0010",
"audio_path": "segments/song_0010_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 2.7321660611387344,
"segment_type": "intro"
},
{
"song_id": "song_0010",
"audio_path": "segments/song_0010_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 9.564787178236601,
"segment_type": "mid"
},
{
"song_id": "song_0010",
"audio_path": "songs/song_0010.wav",
"duration": 15.0,
"base_freq": 349.23,
"type": "reference"
},
{
"song_id": "song_0011",
"audio_path": "segments/song_0011_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 8.949259168211244,
"segment_type": "mid"
},
{
"song_id": "song_0011",
"audio_path": "segments/song_0011_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 8.459337061558657,
"segment_type": "mid"
},
{
"song_id": "song_0011",
"audio_path": "segments/song_0011_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 2.5060530898199906,
"segment_type": "intro"
},
{
"song_id": "song_0011",
"audio_path": "segments/song_0011_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 5.0257314474126265,
"segment_type": "mid"
},
{
"song_id": "song_0011",
"audio_path": "segments/song_0011_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 8.42530004113389,
"segment_type": "mid"
},
{
"song_id": "song_0011",
"audio_path": "songs/song_0011.wav",
"duration": 15.0,
"base_freq": 392.0,
"type": "reference"
},
{
"song_id": "song_0012",
"audio_path": "segments/song_0012_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 7.253242125518553,
"segment_type": "mid"
},
{
"song_id": "song_0012",
"audio_path": "segments/song_0012_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.880436512027717,
"segment_type": "mid"
},
{
"song_id": "song_0012",
"audio_path": "segments/song_0012_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 0.26647154963833186,
"segment_type": "intro"
},
{
"song_id": "song_0012",
"audio_path": "segments/song_0012_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 7.214001122963067,
"segment_type": "mid"
},
{
"song_id": "song_0012",
"audio_path": "segments/song_0012_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 1.4777570830033182,
"segment_type": "intro"
},
{
"song_id": "song_0012",
"audio_path": "songs/song_0012.wav",
"duration": 15.0,
"base_freq": 440.0,
"type": "reference"
},
{
"song_id": "song_0013",
"audio_path": "segments/song_0013_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.3711217932975037,
"segment_type": "mid"
},
{
"song_id": "song_0013",
"audio_path": "segments/song_0013_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 2.95024257658282,
"segment_type": "intro"
},
{
"song_id": "song_0013",
"audio_path": "segments/song_0013_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 6.7440113989474435,
"segment_type": "mid"
},
{
"song_id": "song_0013",
"audio_path": "segments/song_0013_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 3.27926658740176,
"segment_type": "mid"
},
{
"song_id": "song_0013",
"audio_path": "segments/song_0013_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 0.06830120539555451,
"segment_type": "intro"
},
{
"song_id": "song_0013",
"audio_path": "songs/song_0013.wav",
"duration": 15.0,
"base_freq": 493.88,
"type": "reference"
},
{
"song_id": "song_0014",
"audio_path": "segments/song_0014_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.389628114874606,
"segment_type": "mid"
},
{
"song_id": "song_0014",
"audio_path": "segments/song_0014_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.397598089074283,
"segment_type": "mid"
},
{
"song_id": "song_0014",
"audio_path": "segments/song_0014_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 7.543857087472844,
"segment_type": "mid"
},
{
"song_id": "song_0014",
"audio_path": "segments/song_0014_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 5.77474814637882,
"segment_type": "mid"
},
{
"song_id": "song_0014",
"audio_path": "segments/song_0014_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 5.212510542649235,
"segment_type": "mid"
},
{
"song_id": "song_0014",
"audio_path": "songs/song_0014.wav",
"duration": 15.0,
"base_freq": 523.25,
"type": "reference"
},
{
"song_id": "song_0015",
"audio_path": "segments/song_0015_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.3221248501273655,
"segment_type": "mid"
},
{
"song_id": "song_0015",
"audio_path": "segments/song_0015_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.113385082174164,
"segment_type": "mid"
},
{
"song_id": "song_0015",
"audio_path": "segments/song_0015_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 0.16726147602629915,
"segment_type": "intro"
},
{
"song_id": "song_0015",
"audio_path": "segments/song_0015_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 4.305732086760379,
"segment_type": "mid"
},
{
"song_id": "song_0015",
"audio_path": "segments/song_0015_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 6.197808424119352,
"segment_type": "mid"
},
{
"song_id": "song_0015",
"audio_path": "songs/song_0015.wav",
"duration": 15.0,
"base_freq": 587.33,
"type": "reference"
}
]
\ No newline at end of file
[
{
"song_id": "song_0016",
"audio_path": "segments/song_0016_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 7.208994524555927,
"segment_type": "mid"
},
{
"song_id": "song_0016",
"audio_path": "segments/song_0016_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 4.958024367228626,
"segment_type": "mid"
},
{
"song_id": "song_0016",
"audio_path": "segments/song_0016_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 6.1666879203579,
"segment_type": "mid"
},
{
"song_id": "song_0016",
"audio_path": "segments/song_0016_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 8.621983105655142,
"segment_type": "mid"
},
{
"song_id": "song_0016",
"audio_path": "segments/song_0016_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 3.004352846791234,
"segment_type": "mid"
},
{
"song_id": "song_0016",
"audio_path": "songs/song_0016.wav",
"duration": 15.0,
"base_freq": 659.25,
"type": "reference"
},
{
"song_id": "song_0017",
"audio_path": "segments/song_0017_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.277150196277827,
"segment_type": "mid"
},
{
"song_id": "song_0017",
"audio_path": "segments/song_0017_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.391085856661506,
"segment_type": "mid"
},
{
"song_id": "song_0017",
"audio_path": "segments/song_0017_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 5.969708292829935,
"segment_type": "mid"
},
{
"song_id": "song_0017",
"audio_path": "segments/song_0017_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 6.1736267933642495,
"segment_type": "mid"
},
{
"song_id": "song_0017",
"audio_path": "segments/song_0017_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 1.1786165266165671,
"segment_type": "intro"
},
{
"song_id": "song_0017",
"audio_path": "songs/song_0017.wav",
"duration": 15.0,
"base_freq": 698.46,
"type": "reference"
},
{
"song_id": "song_0018",
"audio_path": "segments/song_0018_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.641438208318426,
"segment_type": "mid"
},
{
"song_id": "song_0018",
"audio_path": "segments/song_0018_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 3.582227293409872,
"segment_type": "mid"
},
{
"song_id": "song_0018",
"audio_path": "segments/song_0018_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 0.6333068606017467,
"segment_type": "intro"
},
{
"song_id": "song_0018",
"audio_path": "segments/song_0018_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 3.3775515517078736,
"segment_type": "mid"
},
{
"song_id": "song_0018",
"audio_path": "segments/song_0018_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 6.825519260932059,
"segment_type": "mid"
},
{
"song_id": "song_0018",
"audio_path": "songs/song_0018.wav",
"duration": 15.0,
"base_freq": 783.99,
"type": "reference"
},
{
"song_id": "song_0019",
"audio_path": "segments/song_0019_seg_00.wav",
"duration": 5.0,
"type": "clean",
"offset": 6.405372883123518,
"segment_type": "mid"
},
{
"song_id": "song_0019",
"audio_path": "segments/song_0019_seg_01.wav",
"duration": 5.0,
"type": "clean",
"offset": 5.376553581360508,
"segment_type": "mid"
},
{
"song_id": "song_0019",
"audio_path": "segments/song_0019_seg_02_augmented.wav",
"duration": 5.0,
"type": "augmented",
"offset": 1.5268044380447066,
"segment_type": "intro"
},
{
"song_id": "song_0019",
"audio_path": "segments/song_0019_seg_03_humming_like.wav",
"duration": 5.0,
"type": "humming_like",
"offset": 5.864371630124319,
"segment_type": "mid"
},
{
"song_id": "song_0019",
"audio_path": "segments/song_0019_seg_04_confused.wav",
"duration": 5.0,
"type": "confused",
"offset": 4.37486043050575,
"segment_type": "mid"
},
{
"song_id": "song_0019",
"audio_path": "songs/song_0019.wav",
"duration": 15.0,
"base_freq": 880.0,
"type": "reference"
}
]
\ No newline at end of file
[
{
"song_id": "foo",
"audio_path": "raw/foo.wav",
"duration": 10.5,
"type": "reference",
"source_dataset": "fma"
}
]
\ No newline at end of file
......@@ -71,6 +71,7 @@ def main():
})
total = len(queries)
confusion_focus = {k:v for k,v in by_type.items() if k in {"confused", "humming_like"}}
report = {
"split": args.split,
"num_queries": total,
......@@ -84,6 +85,10 @@ def main():
}
for k, v in by_type.items()
},
"hard_case_summary": {
k: {"n": v["n"], "top1": round(v["top1"]/v["n"],4) if v["n"] else 0.0, "topk": round(v["topk"]/v["n"],4) if v["n"] else 0.0}
for k,v in confusion_focus.items()
},
"sample_failures": failures[:10],
}
print(json.dumps(report, ensure_ascii=False, indent=2))
......
......@@ -4,3 +4,6 @@ soundfile>=0.12
librosa>=0.10
tqdm>=4.66
torch>=2.3
fastapi>=0.115
uvicorn>=0.30
pydantic>=2.8
......
"""Dataset adapter skeletons for external/open music corpora."""
from __future__ import annotations
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List
import argparse
import json
@dataclass
class DatasetRecord:
    """A single entry of the dataset registry.

    Every field is a plain string so the record serialises directly to JSON
    through ``dataclasses.asdict``.
    """

    # Display name of the dataset.
    name: str
    # Upstream location of the dataset.
    source_url: str
    # Free-text summary of the licensing situation.
    license: str
    # Policy flag; the registry in this file uses "review_required" and
    # "deny_until_whitelisted".
    commercial_use: str
    # Free-text remarks about the dataset.
    notes: str
class BaseAdapter:
    """Shared behaviour for external-dataset adapters.

    Subclasses override ``name`` and implement :meth:`describe`; the directory
    bootstrap in :meth:`init_layout` is common to all of them.
    """

    # Identifier recorded in the bootstrap manifest; overridden by subclasses.
    name = "base"

    def describe(self) -> Dict:
        """Return a dict describing the dataset. Must be overridden.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError

    def init_layout(self, root: Path) -> Dict:
        """Create the dataset directory skeleton and write a bootstrap manifest.

        Args:
            root: Dataset root directory. It is created (with parents) when
                missing. A ``str`` or other path-like value is accepted too.

        Returns:
            The manifest dict, which is also written to
            ``<root>/manifests/bootstrap.json``.
        """
        # Accept str/PathLike as well as Path; a Path passes through unchanged.
        root = Path(root)
        root.mkdir(parents=True, exist_ok=True)
        for sub in ("raw", "processed", "manifests", "licenses"):
            (root / sub).mkdir(exist_ok=True)
        manifest = {
            "dataset": self.name,
            "root": str(root),
            "status": "initialized",
            "next_steps": [
                "download raw audio according to upstream license terms",
                "convert to catalog/query manifests",
                "record license evidence before training",
            ],
        }
        # ensure_ascii=False can emit non-ASCII characters (e.g. from the path),
        # so pin UTF-8 rather than relying on the platform's default encoding.
        with open(root / "manifests" / "bootstrap.json", "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        return manifest
class FMAAdapter(BaseAdapter):
    """Adapter carrying the static description of the FMA dataset."""

    name = "fma"

    def describe(self) -> Dict:
        """Return the FMA descriptor: source, subset, catalog strategy, license policy."""
        info: Dict = dict(
            name="FMA",
            source_url="https://github.com/mdeff/fma",
            recommended_subset="fma_small",
            catalog_strategy="full tracks as references; random 5-15s crops as queries",
            license_policy="review per subset/track before commercial training",
        )
        return info
class MTGJamendoAdapter(BaseAdapter):
    """Adapter carrying the static description of the MTG-Jamendo dataset."""

    name = "mtg_jamendo"

    def describe(self) -> Dict:
        """Return the MTG-Jamendo descriptor: source, subset, catalog strategy, license policy."""
        info: Dict = dict(
            name="MTG-Jamendo",
            source_url="https://github.com/MTG/mtg-jamendo-dataset",
            recommended_subset="small curated slice",
            catalog_strategy="download upstream audio subset then build catalog/query manifests",
            license_policy="verify CC terms for intended commercial use",
        )
        return info
class CCMusicAdapter(BaseAdapter):
    """Adapter carrying the static description of the CCMusic dataset."""

    name = "ccmusic"

    def describe(self) -> Dict:
        """Return the CCMusic descriptor: source, subset, catalog strategy, license policy."""
        info: Dict = dict(
            name="CCMusic",
            source_url="https://ccmusic-database.github.io/en/database/ccm.html",
            recommended_subset="whitelisted approved subset only",
            catalog_strategy="use approved corpora only; normalize to project manifests",
            license_policy="application/permission review required before use",
        )
        return info
class ModelScopeMusicAdapter(BaseAdapter):
    """Adapter carrying the static description of ModelScope music datasets."""

    name = "modelscope_music"

    def describe(self) -> Dict:
        """Return the ModelScope descriptor: source, subset, catalog strategy, license policy."""
        info: Dict = dict(
            name="ModelScope music datasets",
            source_url="https://modelscope.cn/search?page=1&search=music&type=dataset",
            recommended_subset="manual whitelist only",
            catalog_strategy="treat as discovery surface; add per-dataset adapter after legal review",
            license_policy="deny until whitelisted",
        )
        return info
# Adapter instances keyed by their ``name``; the keys double as the dataset
# choices offered by the command-line interface.
ADAPTERS = {
    adapter.name: adapter
    for adapter in (
        FMAAdapter(),
        MTGJamendoAdapter(),
        CCMusicAdapter(),
        ModelScopeMusicAdapter(),
    )
}

# Licensing registry. Each row is given positionally in DatasetRecord field
# order: (name, source_url, license, commercial_use, notes).
REGISTRY: List[DatasetRecord] = [
    DatasetRecord(*row)
    for row in (
        (
            "FMA",
            "https://github.com/mdeff/fma",
            "Track-dependent / metadata CC BY 4.0; verify per subset",
            "review_required",
            "Good first realistic MIR baseline",
        ),
        (
            "MTG-Jamendo",
            "https://github.com/MTG/mtg-jamendo-dataset",
            "Creative Commons source tracks; verify exact subset terms",
            "review_required",
            "Good retrieval/tagging corpus with scripts",
        ),
        (
            "CCMusic",
            "https://ccmusic-database.github.io/en/database/ccm.html",
            "varies / application may be required",
            "review_required",
            "Useful Chinese MIR source, needs permission review",
        ),
        (
            "ModelScope-music",
            "https://modelscope.cn/search?page=1&search=music&type=dataset",
            "varies by dataset",
            "deny_until_whitelisted",
            "Discovery surface only until per-dataset review is complete",
        ),
    )
]
def write_registry(output_path: str) -> Path:
    """Serialize ``REGISTRY`` to a JSON file and return the path written.

    Args:
        output_path: Destination file; missing parent directories are created.

    Returns:
        The destination as a ``Path``.
    """
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    payload = [asdict(record) for record in REGISTRY]
    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding must be
    # explicit; JSON interchange is UTF-8 regardless of the platform locale.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
        f.write("\n")  # keep the generated file newline-terminated
    return out
def main():
    """Command-line entry point with ``registry``, ``init`` and ``describe`` sub-commands."""
    dataset_choices = sorted(ADAPTERS)

    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    registry_cmd = commands.add_parser("registry")
    registry_cmd.add_argument("--output", default="data/dataset_registry.json")

    init_cmd = commands.add_parser("init")
    init_cmd.add_argument("dataset", choices=dataset_choices)
    init_cmd.add_argument("--root", default="data/external")

    describe_cmd = commands.add_parser("describe")
    describe_cmd.add_argument("dataset", choices=dataset_choices)

    args = parser.parse_args()

    if args.cmd == "registry":
        print(write_registry(args.output))
        return

    # The remaining sub-commands both act on one adapter and print a JSON dict.
    adapter = ADAPTERS[args.dataset]
    if args.cmd == "init":
        result = adapter.init_layout(Path(args.root) / args.dataset)
    else:  # "describe" is the only other sub-command the parser accepts
        result = adapter.describe()
    print(json.dumps(result, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
"""External dataset manifest conversion templates."""
from __future__ import annotations
import argparse
import csv
import json
from pathlib import Path
from typing import List, Dict
def write_catalog(records: List[Dict], output_path: Path) -> None:
    """Write catalog records to ``output_path`` as pretty-printed UTF-8 JSON.

    Args:
        records: JSON-serializable record dictionaries.
        output_path: Destination file (``Path`` or ``str``); missing parent
            directories are created.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False writes non-ASCII titles/paths verbatim, so the encoding
    # is pinned rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
        f.write("\n")  # keep the generated file newline-terminated
def csv_to_catalog(
    csv_path: Path,
    output_path: Path,
    path_field: str = "audio_path",
    id_field: str = "song_id",
    encoding: str = "utf-8-sig",
) -> int:
    """Convert a CSV listing of reference tracks into a catalog JSON file.

    Args:
        csv_path: CSV file with a header row.
        output_path: Catalog JSON file to write (via ``write_catalog``).
        path_field: Name of the CSV column holding the audio path.
        id_field: Name of the CSV column holding the song identifier.
        encoding: Text encoding of the CSV. The default ``utf-8-sig`` reads
            plain UTF-8 and also strips a leading byte-order mark, which would
            otherwise become part of the first column name.

    Returns:
        Number of records written.

    Raises:
        KeyError: If the header lacks ``id_field`` or ``path_field``.
        ValueError: If a ``duration`` cell is not numeric.
    """
    records = []
    with open(csv_path, newline="", encoding=encoding) as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        # A completely empty file has no header and yields no rows; it still
        # produces an empty catalog. Otherwise report missing columns by name
        # up front instead of failing on the first data row.
        if header is not None:
            missing = [col for col in (id_field, path_field) if col not in header]
            if missing:
                raise KeyError(
                    f"{csv_path}: missing required column(s) {missing}; found {list(header)}"
                )
        for row in reader:
            records.append(
                {
                    "song_id": row[id_field],
                    "audio_path": row[path_field],
                    # Blank or absent duration cells count as 0.0.
                    "duration": float(row.get("duration", 0.0) or 0.0),
                    "type": "reference",
                    # Blank cells fall back the same way an absent column does.
                    "source_dataset": row.get("source_dataset") or "external",
                }
            )
    write_catalog(records, output_path)
    return len(records)
def main():
    """Command-line entry point exposing the ``csv-to-catalog`` sub-command."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    convert = commands.add_parser("csv-to-catalog")
    convert.add_argument("csv_path")
    convert.add_argument("output_path")
    convert.add_argument("--path-field", default="audio_path")
    convert.add_argument("--id-field", default="song_id")

    args = parser.parse_args()
    if args.cmd != "csv-to-catalog":
        return

    source = Path(args.csv_path)
    target = Path(args.output_path)
    count = csv_to_catalog(source, target, args.path_field, args.id_field)
    summary = {"status": "ok", "records": count}
    print(json.dumps(summary, ensure_ascii=False))


if __name__ == "__main__":
    main()
"""
Hybrid ACR Engine: Chromaprint fast pre-filter + ECAPA-TDNN deep re-ranking.
"""
"""Hybrid ACR Engine: Chromaprint + ECAPA + melody-aware re-ranking."""
import json
import time
from pathlib import Path
from typing import Dict, List, Optional
import librosa
import numpy as np
from src.utils.audio import AudioProcessor
class Candidate:
    """A catalog song together with the per-matcher scores obtained for a query.

    Every score defaults to 0.0, so a song surfaced by only one matcher can
    still be fused with the others.
    """

    def __init__(
        self,
        song_id: str,
        chroma_score: float = 0.0,
        ecapa_score: float = 0.0,
        melody_score: float = 0.0,
    ):
        self.song_id = song_id
        self.chroma_score = chroma_score
        self.ecapa_score = ecapa_score
        self.melody_score = melody_score
        self.metadata: Dict = {}

    def combined_score(self, chroma_weight: float, ecapa_weight: float, melody_weight: float = 0.0) -> float:
        """Return the weighted sum of the three scores.

        ``melody_weight`` defaults to 0.0 so the earlier two-weight call shape
        keeps producing the chroma + ECAPA fusion.
        """
        return (
            chroma_weight * self.chroma_score
            + ecapa_weight * self.ecapa_score
            + melody_weight * self.melody_score
        )

    def __repr__(self):
        return f"Candidate({self.song_id}, chroma={self.chroma_score:.3f}, ecapa={self.ecapa_score:.3f})"
class HybridEngine:
......@@ -32,8 +35,9 @@ class HybridEngine:
ref_embs: Optional[np.ndarray] = None,
ref_ids: Optional[List[str]] = None,
sr: int = 16000,
chroma_weight: float = 0.35,
ecapa_weight: float = 0.65,
chroma_weight: float = 0.25,
ecapa_weight: float = 0.5,
melody_weight: float = 0.25,
reject_threshold: float = 0.35,
):
self.chroma = chroma_matcher
......@@ -43,12 +47,16 @@ class HybridEngine:
self.sr = sr
self.chroma_weight = chroma_weight
self.ecapa_weight = ecapa_weight
self.melody_weight = melody_weight
self.reject_threshold = reject_threshold
self.song_metadata: Dict[str, Dict] = {}
self.song_audio_paths: Dict[str, str] = {}
self.audio = AudioProcessor(sr=sr)
def load_metadata(self, metadata_path: str):
with open(metadata_path) as f:
items = json.load(f)
base_dir = str(Path(metadata_path).parent)
for item in items:
sid = item["song_id"]
existing = self.song_metadata.get(sid, {})
......@@ -59,15 +67,15 @@ class HybridEngine:
"audio_path": item.get("audio_path", existing.get("audio_path", "")),
"type": item.get("type", existing.get("type", "unknown")),
}
if item.get("type") == "reference":
self.song_audio_paths[sid] = str(Path(base_dir) / item["audio_path"])
@staticmethod
def _normalize_scores(score_pairs: List[tuple], invert: bool = False) -> Dict[str, float]:
def _normalize_scores(score_pairs: List[tuple]) -> Dict[str, float]:
if not score_pairs:
return {}
ids = [sid for sid, _ in score_pairs]
values = np.array([float(score) for _, score in score_pairs], dtype=np.float32)
if invert:
values = -values
if len(values) == 1:
return {ids[0]: 1.0}
vmin = float(values.min())
......@@ -77,12 +85,18 @@ class HybridEngine:
norm = (values - vmin) / (vmax - vmin)
return {sid: float(score) for sid, score in zip(ids, norm)}
def recognize(
self,
audio_path: str,
top_n: int = 5,
mode: str = "auto",
) -> Dict:
def _melody_scores(self, query_y: np.ndarray, candidate_ids: List[str]) -> Dict[str, float]:
scores = []
for song_id in candidate_ids:
ref_path = self.song_audio_paths.get(song_id)
if not ref_path or not Path(ref_path).exists():
continue
ref_y, _ = librosa.load(ref_path, sr=self.sr, mono=True, duration=8.0)
score = self.audio.melody_similarity(query_y, ref_y)
scores.append((song_id, score))
return self._normalize_scores(scores)
def recognize(self, audio_path: str, top_n: int = 5, mode: str = "auto") -> Dict:
del mode
start = time.time()
y, _ = librosa.load(audio_path, sr=self.sr, mono=True)
......@@ -96,41 +110,45 @@ class HybridEngine:
ref_norm = self.ref_embs / (np.linalg.norm(self.ref_embs, axis=1, keepdims=True) + 1e-12)
query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-12)
scores = query_norm @ ref_norm.T
top_indices = np.argsort(-scores)[: max(top_n * 5, 20)]
top_indices = np.argsort(-scores)[: max(top_n * 10, 30)]
ecapa_matches = [(self.ref_ids[idx], float(scores[idx])) for idx in top_indices]
ecapa_norm = self._normalize_scores(ecapa_matches)
all_song_ids = set(chroma_norm) | set(ecapa_norm)
candidate_pool = list(set(list(chroma_norm.keys())[: top_n * 8] + list(ecapa_norm.keys())[: top_n * 8]))
melody_norm = self._melody_scores(y, candidate_pool)
all_song_ids = set(candidate_pool) | set(melody_norm)
combined: List[Candidate] = []
for song_id in all_song_ids:
candidate = Candidate(
song_id=song_id,
chroma_score=chroma_norm.get(song_id, 0.0),
ecapa_score=ecapa_norm.get(song_id, 0.0),
melody_score=melody_norm.get(song_id, 0.0),
)
candidate.metadata = self.song_metadata.get(song_id, {})
combined.append(candidate)
combined.sort(key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight), reverse=True)
combined.sort(
key=lambda c: c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight),
reverse=True,
)
results = combined[:top_n]
elapsed = (time.time() - start) * 1000
output = []
for c in results:
fused = c.combined_score(self.chroma_weight, self.ecapa_weight)
fused = c.combined_score(self.chroma_weight, self.ecapa_weight, self.melody_weight)
output.append(
{
"song_id": c.song_id,
"confidence": round(fused, 4),
"chromaprint_score": round(c.chroma_score, 4),
"ecapa_score": round(c.ecapa_score, 4),
"melody_score": round(c.melody_score, 4),
"accepted": fused >= self.reject_threshold,
"metadata": c.metadata,
}
)
return {
"candidates": output,
"processing_time_ms": round(elapsed, 1),
"num_candidates": len(results),
}
return {"candidates": output, "processing_time_ms": round(elapsed, 1), "num_candidates": len(results)}
......
from pathlib import Path
from typing import Optional
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from src.engines.chromaprint_matcher import ChromaprintMatcher
from src.engines.ecapa_embedder import ECAPAEmbedder
from src.engines.hybrid_engine import HybridEngine
class RecognizeRequest(BaseModel):
    # Request body of POST /recognize. All paths are resolved on the server.
    query_path: str  # audio file to identify; must exist on the server
    data_dir: str = "data/synthetic_v2"  # directory scanned for catalog/train/val/test JSON metadata
    model_path: str = "data/models_v3/best_model.pt"  # checkpoint handed to ECAPAEmbedder
    index_prefix: str = "data/index_v3/reference"  # prefix of "<prefix>_embs.npy" / "<prefix>_ids.npy"; its parent dir holds chromaprint.pkl
    top_n: int = 5  # number of candidates requested from the engine
    device: str = "cpu"  # device string handed to ECAPAEmbedder
class BuildIndexRequest(BaseModel):
    # Request body of POST /index/build. All paths are resolved on the server.
    data_dir: str  # dataset directory passed to both index builders
    model_path: str  # checkpoint passed to build_embedding_index
    output_dir: str  # created if missing; embeddings are written under the "<output_dir>/reference" prefix
    device: str = "cpu"  # device string passed to build_embedding_index
# FastAPI application object; the route decorators below register endpoints on it.
app = FastAPI(title="ACR Service", version="0.1.0")
def _load_engine(data_dir: str, model_path: str, index_prefix: str, device: str) -> HybridEngine:
    """Assemble a ``HybridEngine`` from on-disk artefacts.

    Args:
        data_dir: Directory searched for ``catalog.json``, ``train.json``,
            ``val.json`` and ``test.json``; each file found is loaded as metadata.
        model_path: Embedding model checkpoint.
        index_prefix: Prefix of ``<prefix>_embs.npy`` / ``<prefix>_ids.npy``;
            ``chromaprint.pkl`` is expected in the prefix's parent directory.
        device: Device string forwarded to ``ECAPAEmbedder``.

    Returns:
        An engine with both indexes and all available metadata loaded.

    Raises:
        HTTPException: Status 400 when a required file is missing or the
            embedding index is inconsistent.

    NOTE(security): ``np.load(..., allow_pickle=True)`` executes pickled
    payloads, and ``chromaprint.pkl`` is presumably unpickled by
    ``ChromaprintMatcher.load`` as well (confirm). Both paths derive from the
    request body, so callers must only be allowed to point at trusted files.
    NOTE(review): the engine is rebuilt on every call; consider caching it.
    """
    chroma_path = str(Path(index_prefix).parent / "chromaprint.pkl")
    embs_path = f"{index_prefix}_embs.npy"
    ids_path = f"{index_prefix}_ids.npy"

    # Validate every input (in the original reporting order) before doing any
    # loading, so a bad request fails fast without loading the model.
    if not Path(chroma_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing chromaprint index: {chroma_path}")
    if not Path(model_path).exists():
        raise HTTPException(status_code=400, detail=f"Missing model: {model_path}")
    if not Path(embs_path).exists() or not Path(ids_path).exists():
        raise HTTPException(status_code=400, detail="Missing embedding index files")

    matcher = ChromaprintMatcher()
    matcher.load(chroma_path)
    embedder = ECAPAEmbedder(model_path=model_path, device=device)
    ref_embs = np.load(embs_path)
    ref_ids = np.load(ids_path, allow_pickle=True).tolist()

    # Row i of the embeddings is looked up as ref_ids[i]; a length mismatch
    # would yield wrong ids or an IndexError during recognition.
    num_rows = ref_embs.shape[0] if ref_embs.ndim else 0
    if num_rows != len(ref_ids):
        raise HTTPException(
            status_code=400,
            detail=f"Embedding index mismatch: {num_rows} embeddings vs {len(ref_ids)} ids",
        )

    engine = HybridEngine(matcher, embedder, ref_embs, ref_ids)
    for split in ["catalog.json", "train.json", "val.json", "test.json"]:
        p = Path(data_dir) / split
        if p.exists():
            engine.load_metadata(str(p))
    return engine
@app.get("/health")
def health():
    """Liveness probe: unconditionally reports the service as up."""
    return dict(status="ok")
@app.post("/recognize")
def recognize(req: RecognizeRequest):
    """Identify the audio file named in the request and return ranked candidates.

    Returns:
        The dictionary produced by ``HybridEngine.recognize``.

    Raises:
        HTTPException: Status 400 for a missing query file, a non-positive
            ``top_n``, or any missing model/index artefact.
    """
    # is_file() rather than exists(): a directory passes exists() and would
    # only fail later, inside audio decoding, as an unhandled server error.
    if not Path(req.query_path).is_file():
        raise HTTPException(status_code=400, detail=f"Missing query file: {req.query_path}")
    # Zero or negative values would be used as slice bounds by the engine and
    # silently return empty or truncated rankings.
    if req.top_n < 1:
        raise HTTPException(status_code=400, detail=f"top_n must be >= 1, got {req.top_n}")
    engine = _load_engine(req.data_dir, req.model_path, req.index_prefix, req.device)
    return engine.recognize(req.query_path, top_n=req.top_n)
@app.post("/index/build")
def build_index(req: BuildIndexRequest):
    """Build the chromaprint and embedding indexes for a dataset directory.

    Returns:
        ``status``, the number of reference windows indexed and the embedding
        dimensionality (0 when the embedding array is not 2-D).

    Raises:
        HTTPException: Status 400 when ``data_dir`` is not an existing directory.
    """
    # Imported at call time, as in the original; presumably so the service can
    # start without run_demo being importable (TODO confirm).
    from run_demo import build_chroma_index, build_embedding_index

    data_dir = Path(req.data_dir)
    # Reject a bad data_dir before creating the output directory, mirroring the
    # 400 responses of /recognize. model_path is left for build_embedding_index
    # to validate.
    if not data_dir.is_dir():
        raise HTTPException(status_code=400, detail=f"Missing data directory: {data_dir}")

    out_dir = Path(req.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    build_chroma_index(data_dir, out_dir)
    _, ref_embs, ref_ids = build_embedding_index(data_dir, Path(req.model_path), out_dir / "reference", req.device)

    embedding_dim = int(ref_embs.shape[1]) if len(ref_embs.shape) > 1 else 0
    return {
        "status": "ok",
        "num_reference_windows": len(ref_ids),
        "embedding_dim": embedding_dim,
    }
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import librosa
import numpy as np
import torch
from typing import List, Optional, Tuple
class AudioProcessor:
def __init__(self, sr: int = 16000, n_mels: int = 80, n_fft: int = 512, hop_length: int = 160):
def __init__(self, sr: int = 16000, n_mels: int = 128, n_fft: int = 512, hop_length: int = 160):
self.sr = sr
self.n_mels = n_mels
self.n_fft = n_fft
......@@ -19,8 +17,7 @@ class AudioProcessor:
def to_mel(self, y: np.ndarray) -> np.ndarray:
    """Return the mel spectrogram of ``y`` converted to decibels relative to its maximum."""
    # The pre-change and post-change argument lines were both present, which
    # repeated every keyword argument; only one argument list is kept.
    mel = librosa.feature.melspectrogram(
        y=y, sr=self.sr, n_mels=self.n_mels, n_fft=self.n_fft, hop_length=self.hop_length
    )
    return librosa.power_to_db(mel, ref=np.max)
......@@ -36,7 +33,7 @@ class AudioProcessor:
y = np.pad(y, (0, pad))
windows = []
for start in range(0, len(y) - win_len + 1, stride):
windows.append(y[start:start + win_len])
windows.append(y[start : start + win_len])
if not windows:
windows.append(y[:win_len])
return windows
......@@ -47,10 +44,32 @@ class AudioProcessor:
return self.to_mel_tensor(y), duration
def extract_chroma(self, y: np.ndarray) -> np.ndarray:
    """Return the chromagram of ``y`` as computed by ``librosa.feature.chroma_cqt``."""
    return librosa.feature.chroma_cqt(y=y, sr=self.sr)
def extract_f0(self, y: np.ndarray, fmin=65, fmax=2093) -> np.ndarray:
    """Estimate the fundamental-frequency track of ``y`` with ``librosa.pyin``.

    Args:
        y: Waveform sampled at ``self.sr``.
        fmin: Lower bound of the pitch search, forwarded to ``librosa.pyin``.
        fmax: Upper bound of the pitch search, forwarded to ``librosa.pyin``.

    Returns:
        The f0 array with NaN entries replaced by 0.0.
    """
    f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=fmin, fmax=fmax)
    return np.nan_to_num(f0, nan=0.0)
def melody_signature(self, y: np.ndarray) -> np.ndarray:
f0 = self.extract_f0(y)
if f0.size == 0:
return np.zeros(32, dtype=np.float32)
nonzero = f0[f0 > 0]
if nonzero.size == 0:
return np.zeros(32, dtype=np.float32)
contour = np.diff(np.log2(nonzero + 1e-6), prepend=np.log2(nonzero[0] + 1e-6))
contour = np.clip(contour, -0.5, 0.5)
if contour.size < 32:
contour = np.pad(contour, (0, 32 - contour.size))
else:
idx = np.linspace(0, contour.size - 1, 32).astype(int)
contour = contour[idx]
return contour.astype(np.float32)
def melody_similarity(self, y1: np.ndarray, y2: np.ndarray) -> float:
s1 = self.melody_signature(y1)
s2 = self.melody_signature(y2)
denom = float(np.linalg.norm(s1) * np.linalg.norm(s2) + 1e-12)
if denom <= 1e-12:
return 0.0
return float(np.dot(s1, s2) / denom)
......
......@@ -53,3 +53,25 @@
结论:
- 结构性错误(catalog/index/fusion/评测缺失)已明显改善
- 当前主要剩余短板是 humming_like / confused 的鲁棒识别
## 2026-06-02
### Stage: 工业化服务骨架 + 外部 manifest 转换模板
完成项:
- 新增 FastAPI 服务骨架:`acr-engine/src/service/app.py`
- 新增 manifest 转换工具:`acr-engine/src/data/manifest_tools.py`
- 新增工业 benchmark 文档:`docs/industrial-benchmark-spec.md`
- 扩展外部 dataset adapter CLI:`acr-engine/src/data/external_adapters.py`
- 新增服务 API 文档:`docs/service-api.md`
- requirements 增加 FastAPI / uvicorn / pydantic
验证结果:
- `external_adapters.py registry` 成功
- `external_adapters.py describe ccmusic` 成功
- `external_adapters.py init modelscope_music` 成功
- `manifest_tools.py csv-to-catalog` 成功生成 catalog
- `service.app health()` 返回 `{"status":"ok"}`
- API `build_index(...)` 成功返回 reference window 数量
- API `recognize(...)` 成功返回候选结果
- `train.py --dry-run` 成功
......
# Dataset Sources and Licensing Notes
> 更新:2026-06-02
## 注意
以下仅为工程接入与研究规划说明,不等于法律意见。实际商用前需要逐条复核原始 license、dataset terms 和再训练约束。
## 候选数据源
### 1. FMA
- URL: https://github.com/mdeff/fma
- 特点: 开放、MIR 常用、适合 retrieval baseline
- 风险: 音频 license 按 artist/track 可能不同,需逐条核验
### 2. MTG-Jamendo
- URL: https://github.com/MTG/mtg-jamendo-dataset
- 特点: Creative Commons 来源,适合音乐检索/标签任务
- 风险: 仍需按具体曲目用途与商业场景做 license 审查
### 3. CCMusic
- 论文/介绍: https://transactions.ismir.net/articles/10.5334/tismir.194
- 主页: https://ccmusic-database.github.io/en/database/ccm.html
- 特点: 中国音乐 MIR 数据资源丰富
- 风险: 部分数据集可能需要申请或存在使用边界,必须单独核验
### 4. ModelScope music datasets
- 入口: https://www.modelscope.cn/datasets
- 搜索: https://modelscope.cn/search?page=1&search=music&type=dataset
- 特点: 数据发现方便,可扩充中文生态
- 风险: license 分散,不能默认可商用;接入前必须建立白名单
## 接入原则
- 只接入 license 明确的数据集
- 默认拒绝“来源不明 / 不允许商业使用 / 禁止训练衍生模型”的数据
- 训练前把数据集及许可信息落盘到 registry
# Industrial Benchmark Spec
> 更新:2026-06-02
## 目标
为工业级可商用 ACR 设立持续基准,不只看总体 top1/top5,还看场景化与风险化指标。
## Benchmark 维度
### 1. Retrieval Quality
- top1
- top5
- MRR
- recall@k
### 2. Scenario Buckets
- clean
- noisy
- compressed
- time-stretched
- pitch-shifted
- humming_like
- confused
- partial-overlap
- far-field / device-recorded
### 3. Catalog Scale Buckets
- 1K songs
- 10K songs
- 100K songs
- 1M+ songs
### 4. Operational Metrics
- p50 / p95 latency
- indexing throughput
- incremental update time
- memory / disk footprint
### 5. Business Safety Metrics
- false accept rate
- rejection quality
- near-duplicate confusion rate
- license provenance coverage
## Required Artifacts per Model Release
- dataset registry snapshot
- training config snapshot
- benchmark report JSON
- benchmark summary markdown
- model card
- license review manifest
## Minimum Go/No-Go Gate
- clean top1 >= 0.95
- noisy top1 >= 0.85
- confused top1 >= 0.70
- humming_like top1 >= 0.60
- top5 >= 0.95 on all production-relevant buckets
- false accept rate (see "Business Safety Metrics") below the threshold agreed for the release
# ACR 工业级可商用演进路线
> 更新:2026-06-02
## 1. 目标定义
把当前原型升级为一个可商用的工业级 ACR 系统,满足:
- 可扩展曲库管理
- 可重复训练 / 评测 / 部署
- 多数据源接入(synthetic / FMA / Jamendo / CCMusic / ModelScope)
- 更强鲁棒性(噪声、失真、哼唱、混淆)
- 检索服务化
- 商用合规与授权边界可审计
## 2. 工业级分层
### 2.1 数据层
- `catalog.json` / query manifests
- 外部 dataset adapters
- license / usage tracking
- 数据版本与快照
### 2.2 训练层
- baseline encoder
- foundation-model encoder
- retrieval-first losses
- hard negative mining
- 数据平衡与生成增强
### 2.3 索引层
- window-level embeddings
- ANN index (Faiss/HNSW)
- 指纹索引与向量索引双路
- 增量入库
### 2.4 服务层
- FastAPI / gRPC
- batch ingest
- recognize API
- top-k candidate + rejection
- metadata lookup
### 2.5 质量层
- regression benchmark
- hard-case benchmark
- online shadow evaluation
- 数据/模型回滚机制
## 3. 数据集策略
### 第一梯队(优先)
- FMA small / medium
- MTG-Jamendo
- CCMusic(需核验申请/授权方式)
- ModelScope music datasets(按 license 白名单接入)
### 第二梯队
- humming / QBSH 数据集
- instrument / structure / singing datasets 作为辅助监督
## 4. 商用必做项
- 每个 dataset 记录:
- 来源 URL
- license
- 是否允许商业使用
- 再分发限制
- 模型训练用途限制
- 每个模型版本记录训练数据组成
- 每次上线保留评测报告与可追溯哈希
## 5. 当前到工业化的缺口
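- dataset adapter 层目前只有骨架(`external_adapters.py` 仅做目录初始化与静态描述),缺真实的下载/转换实现
- 缺 ANN 检索
- API 服务目前只有骨架(`service/app.py`),缺鉴权,且每次请求都会重新加载引擎
- license registry 目前只是静态清单,缺逐数据集的许可证据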
- 缺 foundation-model baseline
- 缺真正的 hard-negative mining
- 缺真实开源数据 benchmark
# ACR Service API
## Endpoints
### GET /health
返回服务健康状态。
### POST /recognize
请求体:
```json
{
"query_path": "data/synthetic_v2/segments/song_0021_seg_01_augmented.wav",
"data_dir": "data/synthetic_v2",
"model_path": "data/models_v3/best_model.pt",
"index_prefix": "data/index_v3/reference",
"top_n": 5,
"device": "cpu"
}
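```
响应体:`candidates`(每项包含 `song_id`、`confidence`、`chromaprint_score`、`ecapa_score`、`melody_score`、`accepted`、`metadata`)、`processing_time_ms`、`num_candidates`。查询文件、模型或索引文件缺失时返回 HTTP 400。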
### POST /index/build
请求体:
```json
{
"data_dir": "data/synthetic_v2",
"model_path": "data/models_v3/best_model.pt",
"output_dir": "data/index_v3",
"device": "cpu"
}
```
响应体:`status`、`num_reference_windows`、`embedding_dim`。