build_business_project_manifests.py
2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from pathlib import Path
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the JSONL file. The file is decoded as UTF-8.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file object rather than calling ``str.splitlines()``:
    # ``splitlines`` also breaks on U+2028, U+2029, U+0085 and similar
    # separators, which may legally appear unescaped inside a JSON string and
    # would cut a record in half. Text-mode iteration only splits on real
    # newlines (LF, CR, CRLF).
    # The encoding is pinned so the result does not depend on the locale.
    with path.open(encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]
def write_json(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as an indented JSON array encoded in UTF-8.

    Missing parent directories are created. An existing file at ``path`` is
    overwritten.

    Args:
        path: Destination file.
        rows: JSON-serializable records to dump.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If ``rows`` contains a value ``json.dumps`` cannot encode.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    # ``ensure_ascii=False`` leaves non-ASCII characters unescaped, so the
    # file encoding must be explicit; the platform default may be unable to
    # encode them or would produce JSON that is not UTF-8.
    path.write_text(payload, encoding='utf-8')
def build_reference(row: dict) -> dict:
    """Turn one manifest-ready row into a catalog reference entry.

    ``song_id`` and ``audio_path`` are mandatory; a missing key raises
    ``KeyError``. A missing or falsy ``duration_sec`` becomes ``0.0``, and a
    missing ``source_dataset`` key becomes ``'business_music'``.
    """
    entry = {'song_id': row['song_id'], 'audio_path': row['audio_path']}
    length = row.get('duration_sec')
    entry.update(
        duration=length if length else 0.0,
        type='reference',
        source_dataset=row.get('source_dataset', 'business_music'),
    )
    return entry
def build_query(row: dict) -> dict:
    """Turn one manifest-ready row into a query entry.

    ``song_id`` and ``audio_path`` are mandatory; a missing key raises
    ``KeyError``. A missing or falsy ``duration_sec`` becomes ``8.0``, a
    missing or falsy ``offset_sec`` becomes ``0.0``, and a missing
    ``source_dataset`` key becomes ``'business_music'``.
    """
    entry = {'song_id': row['song_id'], 'audio_path': row['audio_path']}
    length = row.get('duration_sec')
    start = row.get('offset_sec')
    entry.update(
        duration=length if length else 8.0,
        type='clean',
        offset=start if start else 0.0,
        segment_type='external_query',
        source_dataset=row.get('source_dataset', 'business_music'),
    )
    return entry
def dedupe_refs(rows: list[dict]) -> list[dict]:
    """Drop repeated references, keeping the first of each pair.

    Two rows are duplicates when they share both ``song_id`` and
    ``audio_path``. The relative order of the surviving rows is unchanged,
    and the input list is not modified.
    """
    first_seen = {}
    for entry in rows:
        # setdefault keeps the earliest row stored for a given identity, and
        # dict insertion order preserves the original ordering.
        first_seen.setdefault((entry['song_id'], entry['audio_path']), entry)
    return list(first_seen.values())
def main() -> None:
    """Command-line entry point: build project manifests from a JSONL export.

    Reads the ``--input`` JSONL, keeps rows whose ``role`` is ``'reference'``
    or ``'query'``, and writes four files under ``--output-dir``:

    * ``catalog.json`` - de-duplicated references
    * ``train.json``   - queries with ``split == 'train'`` followed by the
      references
    * ``test.json``    - queries with ``split == 'test'`` followed by the
      references
    * ``val.json``     - queries with ``split == 'val'`` only, plus
      ``'holdout'`` queries when ``--include-holdout-in-val`` is passed

    A JSON summary of the resulting counts is printed to stdout.
    """
    cli = argparse.ArgumentParser(
        description='Build project manifests from business manifest-ready JSONL',
    )
    cli.add_argument(
        '--input',
        required=True,
        help='manifest-ready JSONL from normalize_business_export.py',
    )
    cli.add_argument('--output-dir', required=True, help='output manifests dir')
    cli.add_argument(
        '--include-holdout-in-val',
        action='store_true',
        help='map holdout queries into val.json',
    )
    opts = cli.parse_args()

    # Partition the input by role; rows with any other role are ignored.
    reference_rows = []
    query_rows = []
    for record in load_jsonl(Path(opts.input).resolve()):
        role = record.get('role')
        if role == 'reference':
            reference_rows.append(record)
        elif role == 'query':
            query_rows.append(record)

    def queries_for(split_name):
        """Build query entries for the rows assigned to ``split_name``."""
        return [
            build_query(record)
            for record in query_rows
            if record.get('split') == split_name
        ]

    catalog = dedupe_refs([build_reference(record) for record in reference_rows])
    train = queries_for('train')
    test = queries_for('test')
    val = queries_for('val')
    if opts.include_holdout_in_val:
        val += queries_for('holdout')

    target_dir = Path(opts.output_dir).resolve()
    manifests = (
        ('catalog.json', catalog),
        ('train.json', train + catalog),
        ('test.json', test + catalog),
        ('val.json', val),
    )
    for filename, payload in manifests:
        write_json(target_dir / filename, payload)

    report = {
        'catalog_refs': len(catalog),
        'train_queries': len(train),
        'test_queries': len(test),
        'val_queries': len(val),
        'output_dir': str(target_dir),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()