_job_common.py
8.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from __future__ import annotations
import json
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import psycopg
SCHEMA_RE = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*$')
ALLOWED_JOB_STATUSES = {'pending', 'running', 'completed', 'failed'}
@dataclass
class JobContext:
extraction_job_id: int
feature_set_id: int
target_scope: str
job_status: str
shard_key: str | None
job_metadata: dict[str, Any]
feature_name: str
feature_level: str
extraction_granularity: str
window_sec: float | None
hop_sec: float | None
embedding_dim: int | None
distance_metric: str
feature_config: dict[str, Any]
model_id: int
model_name: str
model_version: str
model_family: str
input_sample_rate: int | None
output_embedding_dim: int | None
model_metadata: dict[str, Any]
def require_env(name: str, default: str | None = None) -> str:
value = os.environ.get(name, default)
if value is None or value == '':
raise SystemExit(f'missing required env: {name}')
return value
def validate_schema(schema: str) -> str:
if not SCHEMA_RE.match(schema):
raise SystemExit(f'invalid schema name: {schema}')
return schema
def ensure_output_parent(path: str | None) -> Path | None:
if not path:
return None
output = Path(path)
output.parent.mkdir(parents=True, exist_ok=True)
return output
def connect(dsn: str, schema: str, *, autocommit: bool = True) -> psycopg.Connection:
conn = psycopg.connect(dsn, autocommit=autocommit)
conn.execute(f'SET search_path TO {validate_schema(schema)}, public;')
return conn
def fetch_job_context(conn: psycopg.Connection, extraction_job_id: int) -> JobContext:
row = conn.execute(
"""
SELECT
fej.extraction_job_id,
fej.feature_set_id,
fej.target_scope,
fej.job_status,
fej.shard_key,
fej.metadata_json,
fs.feature_name,
fs.feature_level,
fs.extraction_granularity,
fs.window_sec,
fs.hop_sec,
fs.embedding_dim,
fs.distance_metric,
fs.config_json,
mr.model_id,
mr.model_name,
mr.model_version,
mr.model_family,
mr.input_sample_rate,
mr.output_embedding_dim,
mr.metadata_json
FROM feature_extraction_job fej
JOIN feature_set_registry fs ON fs.feature_set_id = fej.feature_set_id
JOIN model_registry mr ON mr.model_id = fs.model_id
WHERE fej.extraction_job_id = %s
LIMIT 1;
""",
(extraction_job_id,),
).fetchone()
if not row:
raise SystemExit(f'feature_extraction_job not found: {extraction_job_id}')
return JobContext(
extraction_job_id=int(row[0]),
feature_set_id=int(row[1]),
target_scope=row[2],
job_status=row[3],
shard_key=row[4],
job_metadata=row[5] or {},
feature_name=row[6],
feature_level=row[7],
extraction_granularity=row[8],
window_sec=float(row[9]) if row[9] is not None else None,
hop_sec=float(row[10]) if row[10] is not None else None,
embedding_dim=int(row[11]) if row[11] is not None else None,
distance_metric=row[12],
feature_config=row[13] or {},
model_id=int(row[14]),
model_name=row[15],
model_version=row[16],
model_family=row[17],
input_sample_rate=int(row[18]) if row[18] is not None else None,
output_embedding_dim=int(row[19]) if row[19] is not None else None,
model_metadata=row[20] or {},
)
def parse_target_scope(target_scope: str) -> tuple[str, str]:
if ':' in target_scope:
scope_type, scope_value = target_scope.split(':', 1)
return scope_type, scope_value
return 'unknown', target_scope
def resolve_scope_summary(conn: psycopg.Connection, target_scope: str) -> dict[str, Any]:
scope_type, scope_value = parse_target_scope(target_scope)
if scope_type == 'reference_set':
row = conn.execute(
"""
SELECT
rs.reference_set_id,
rs.set_name,
count(DISTINCT rsm.recording_id) AS recording_count,
count(DISTINCT ra.asset_id) FILTER (WHERE ra.ingest_status = 'ready') AS ready_asset_count,
count(DISTINCT aw.window_id) FILTER (WHERE aw.active_for_index) AS active_window_count
FROM reference_set_registry rs
LEFT JOIN reference_set_member rsm ON rsm.reference_set_id = rs.reference_set_id
LEFT JOIN recording_asset ra ON ra.recording_id = rsm.recording_id
LEFT JOIN audio_window aw ON aw.recording_id = rsm.recording_id
WHERE rs.set_name = %s
GROUP BY rs.reference_set_id, rs.set_name
LIMIT 1;
""",
(scope_value,),
).fetchone()
if not row:
raise SystemExit(f'reference set not found for target_scope={target_scope}')
return {
'scope_type': scope_type,
'scope_value': scope_value,
'reference_set_id': int(row[0]),
'reference_set_name': row[1],
'recording_count': int(row[2]),
'ready_asset_count': int(row[3]),
'active_window_count': int(row[4]),
}
return {
'scope_type': scope_type,
'scope_value': scope_value,
'recording_count': 0,
'ready_asset_count': 0,
'active_window_count': 0,
}
def update_job_status(
conn: psycopg.Connection,
extraction_job_id: int,
*,
status: str,
expected_status: str | None = None,
input_count: int | None = None,
output_count: int | None = None,
log_uri: str | None = None,
metadata_patch: dict[str, Any] | None = None,
set_started_at: bool = False,
set_finished_at: bool = False,
) -> dict[str, Any]:
if status not in ALLOWED_JOB_STATUSES:
raise SystemExit(f'invalid job status: {status}')
if expected_status is not None and expected_status not in ALLOWED_JOB_STATUSES:
raise SystemExit(f'invalid expected job status: {expected_status}')
patch = json.dumps(metadata_patch or {}, ensure_ascii=False)
row = conn.execute(
"""
UPDATE feature_extraction_job
SET job_status = %s,
input_count = COALESCE(%s, input_count),
output_count = COALESCE(%s, output_count),
log_uri = COALESCE(%s, log_uri),
started_at = CASE
WHEN %s THEN COALESCE(started_at, NOW())
ELSE started_at
END,
finished_at = CASE
WHEN %s THEN COALESCE(finished_at, NOW())
ELSE finished_at
END,
metadata_json = COALESCE(metadata_json, '{}'::jsonb) || %s::jsonb
WHERE extraction_job_id = %s
AND (%s OR job_status = %s)
RETURNING extraction_job_id, job_status, input_count, output_count, started_at, finished_at, log_uri, metadata_json;
""",
(
status,
input_count,
output_count,
log_uri,
set_started_at,
set_finished_at,
patch,
extraction_job_id,
expected_status is None,
expected_status,
),
).fetchone()
if not row:
expectation = f' with expected_status={expected_status}' if expected_status else ''
raise SystemExit(f'failed to update feature_extraction_job={extraction_job_id}{expectation}')
return {
'extraction_job_id': int(row[0]),
'job_status': row[1],
'input_count': int(row[2]) if row[2] is not None else None,
'output_count': int(row[3]) if row[3] is not None else None,
'started_at': row[4].isoformat() if row[4] is not None else None,
'finished_at': row[5].isoformat() if row[5] is not None else None,
'log_uri': row[6],
'metadata_json': row[7] or {},
}
def emit_payload(payload: dict[str, Any], output: str | None) -> None:
text = json.dumps(payload, ensure_ascii=False, indent=2)
if output:
target = ensure_output_parent(output)
assert target is not None
target.write_text(text, encoding='utf-8')
print(text)