Coverage for polars_analysis / plotting / qc_db_plotting.py: 76%

261 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-13 13:37 -0400

1import logging 

2from itertools import product 

3from pathlib import Path 

4from typing import Dict, List, Literal, Optional, Union, cast 

5 

6import matplotlib 

7 

8matplotlib.use("agg") 

9import os 

10 

11import matplotlib.pyplot as plt 

12import numpy as np 

13import polars as pl 

14from matplotlib import colormaps as cm 

15from polars._typing import PythonLiteral 

16from psycopg import sql 

17from rich.progress import track 

18 

19from polars_analysis.plotting import helper 

20 

21# Instantiate logger 

22log = logging.getLogger(__name__) 

23 

24""" 

25Functions for plotting from the QC database. 

26""" 

27 

28 

29def _check_none(value: Union[PythonLiteral, None], default=-999) -> PythonLiteral: 

30 return value if value is not None else default 

31 

32 

33def plot_qc_db( 

34 plot_dir: Path, postgres_uri: str, batch_mode: bool = False, board_type: Literal["EM", "HEC"] = "EM" 

35) -> tuple[List[str], List[str]]: 

36 """ 

37 Example to run: ./typer_feb2.py plot-qc --plot-dir plots 

38 

39 Args: 

40 plot_dir: location to save plots 

41 postgres_uri: uri to database with QC values 

42 batch_mode: turn off track() 

43 """ 

44 

45 plot_dir = plot_dir 

46 if not plot_dir.exists(): 

47 plot_dir.mkdir(parents=True, exist_ok=True) 

48 os.chmod(plot_dir, 0o775) 

49 

50 df = ( 

51 pl.read_database_uri( 

52 sql.SQL(""" 

53 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label 

54 FROM latest_derived as d 

55 JOIN threshold_labels as l on d.label_id=l.id 

56 JOIN boards as b on b.board_id=d.board_id 

57 WHERE b.board_type = {} 

58 """) 

59 .format(board_type) 

60 .as_string(), 

61 postgres_uri, 

62 ) 

63 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None)) 

64 .unnest("label") 

65 ) 

66 

67 df = df.with_columns( 

68 pl.when(pl.col("variable") == "mean") 

69 .then(pl.lit("pedestal_mean")) 

70 .otherwise(pl.col("variable")) 

71 .alias("variable") 

72 ) 

73 df = df.with_columns( 

74 pl.when(pl.col("variable") == "std") 

75 .then(pl.lit("pedestal_std")) 

76 .otherwise(pl.col("variable")) 

77 .alias("variable") 

78 ) 

79 

80 # when you use savefig, use plot_dir to put them all in a folder 

81 # eg plt.savefig( plot_dir / f'{variable}_Histogram.png') 

82 # also pro-tip, your life will be easier if you don't put spaces in the filenames 

83 

84 # Histograms by variable 

85 filtered_df = df.filter( 

86 pl.col("board_id") != "1234", 

87 pl.col("board_id") != "E190324", 

88 pl.col("board_id") != "194169", # Changed to E... 

89 pl.col("board_id") != "E191700", 

90 ~((pl.col("board_id") == "E190322") & (pl.col("channel") == 119)).fill_null(False), # Bad channel 

91 pl.col("run_number") != 1428, # Doesn't appear to have loads? 

92 ~((pl.col("variable") == "gain_ratio") & (pl.col("awg_amp") == 6.0)).fill_null( 

93 False 

94 ), # ratio we want has awp_amp = 1.5, for reasons 

95 ) 

96 

97 variables = filtered_df["variable"].unique().drop_nulls().sort().to_list() 

98 boards = filtered_df["board_id"].unique().sort().to_list() 

99 

100 inf_data_warnings_dict: Dict[str, list] = {} 

101 nan_data_warnings_dict: Dict[str, list] = {} 

102 null_data_warnings_dict: Dict[str, list] = {} 

103 missing_data_warnings_dict: Dict[str, list] = {} 

104 

105 value = pl.col("value") 

106 outlier_filter = (value < value.mean() - 3 * value.std()) | (value > value.mean() + 3 * value.std()) 

107 for variable in variables if batch_mode else track(variables, description="Making QC plots..."): 

108 var_df = filtered_df.filter(pl.col("variable") == variable) 

109 inf_boards: List[str] = var_df.filter(pl.col("value").is_infinite())["board_id"].unique().to_list() 

110 if len(inf_boards) > 0: 

111 inf_data_warnings_dict[str(variable)] = inf_boards 

112 nan_boards: List[str] = var_df.filter(pl.col("value").is_nan())["board_id"].unique().to_list() 

113 if len(nan_boards) > 0: 

114 nan_data_warnings_dict[variable] = nan_boards 

115 null_boards: List[str] = var_df.filter(pl.col("value").is_null())["board_id"].unique().to_list() 

116 if len(null_boards) > 0: 

117 null_data_warnings_dict[variable] = null_boards 

118 

119 # TODO could try to infer variable length to find incomplete boards 

120 missing_boards = [id for id in boards if id not in var_df["board_id"].unique().to_list()] 

121 if len(missing_boards) > 0: 

122 missing_data_warnings_dict[variable] = missing_boards 

123 

124 var_df = var_df.filter( 

125 ~pl.col("value").is_infinite(), 

126 ~pl.col("value").is_null(), 

127 ~pl.col("value").is_nan(), 

128 ) 

129 

130 all_null = var_df["gain"].is_null().all() 

131 some_null = var_df["gain"].has_nulls() 

132 if some_null and not all_null: 

133 log.warning(f"Some null gain values for {variable}! That's not good.") 

134 

135 # variable is not split into hi and lo gain 

136 if all_null: 

137 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}) 

138 

139 var_mean = cast(float, _check_none(var_df["value"].mean())) 

140 var_std = cast(float, _check_none(var_df["value"].std())) 

141 var_n = var_df["value"].len() 

142 

143 var_hist.hist( 

144 var_df["value"], 

145 bins=100, 

146 histtype="step", 

147 color="purple", 

148 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}", 

149 ) 

150 var_hist.set( 

151 xlabel=f"{variable}", 

152 ylabel="Entries", 

153 yscale="log", 

154 ) 

155 var_hist.legend() 

156 

157 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

158 var_outliers = var_outliers.rename( 

159 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

160 ) 

161 var_tab.axis("off") 

162 if var_outliers.is_empty(): 

163 var_tab.set_title("No Outliers") 

164 else: 

165 var_table = var_tab.table( 

166 cellText=var_outliers.to_pandas().values, 

167 colLabels=var_outliers.columns, 

168 loc="best", 

169 cellLoc="center", 

170 ) 

171 var_table.auto_set_font_size(False) 

172 var_table.set_fontsize(7) 

173 var_tab.set_title("Outliers") 

174 else: 

175 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi") 

176 

177 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo") 

178 

179 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots( 

180 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]} 

181 ) 

182 

183 hi_mean = cast(float, _check_none(hi["value"].mean())) 

184 hi_std = cast(float, _check_none(hi["value"].std())) 

185 hi_n = hi["value"].len() 

186 

187 hi_hist.hist( 

188 hi["value"], 

189 bins=100, 

190 histtype="step", 

191 color="r", 

192 label=f"Mean: {hi_mean:.02g}\nSTD: {hi_std:.02g}\nN: {hi_n}", 

193 ) 

194 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries") 

195 hi_hist.set_yscale("log") 

196 hi_hist.legend() 

197 

198 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

199 hi_outliers = hi_outliers.rename( 

200 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

201 ) 

202 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2)) 

203 hi_tab.axis("off") 

204 if hi_outliers.is_empty(): 

205 hi_tab.set_title("No High Gain Outliers") 

206 else: 

207 hi_table = hi_tab.table( 

208 cellText=hi_outliers.to_pandas().values, colLabels=hi_outliers.columns, loc="best", cellLoc="center" 

209 ) 

210 hi_table.auto_set_font_size(False) 

211 hi_table.set_fontsize(7) 

212 hi_tab.set_title("High Gain Outliers") 

213 

214 lo_mean = cast(float, _check_none(lo["value"].mean())) 

215 lo_std = cast(float, _check_none(lo["value"].std())) 

216 lo_n = lo["value"].len() 

217 

218 lo_hist.hist( 

219 lo["value"], 

220 bins=100, 

221 histtype="step", 

222 color="b", 

223 label=f"Mean: {lo_mean:.02g}\nSTD: {lo_std:.02g}\nN: {lo_n}", 

224 ) 

225 lo_hist.set(title="Low Gain, 6.0 V, 7.0 dB", xlabel=f"{variable}", ylabel="Entries") 

226 lo_hist.set_yscale("log") 

227 lo_hist.legend() 

228 

229 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

230 lo_outliers = lo_outliers.rename( 

231 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

232 ) 

233 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2)) 

234 lo_tab.axis("off") 

235 if lo_outliers.is_empty(): 

236 lo_tab.set_title("No Low Gain Outliers") 

237 else: 

238 lo_table = lo_tab.table( 

239 cellText=lo_outliers.to_pandas().values, colLabels=lo_outliers.columns, loc="best", cellLoc="center" 

240 ) 

241 lo_table.auto_set_font_size(False) 

242 lo_table.set_fontsize(7) 

243 lo_tab.set_title("Low Gain Outliers") 

244 

245 fig.suptitle(f"{variable}") 

246 

247 plt.savefig(plot_dir / f"{variable}_Histogram.png") 

248 

249 plt.close() 

250 

251 # Histograms by variable sliced by board 

252 if var_df["gain"].has_nulls(): 

253 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}) 

254 

255 var_min = cast(float, _check_none(var_df["value"].min())) 

256 var_max = cast(float, _check_none(var_df["value"].max())) 

257 

258 var_mean = cast(float, _check_none(var_df["value"].mean())) 

259 var_std = cast(float, _check_none(var_df["value"].std())) 

260 var_n = var_df["value"].len() 

261 

262 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

263 var_outliers = var_outliers.rename( 

264 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

265 ) 

266 var_tab.axis("off") 

267 if var_outliers.is_empty(): 

268 var_tab.set_title("No Outliers") 

269 else: 

270 var_table = var_tab.table( 

271 cellText=var_outliers.to_pandas().values, 

272 colLabels=var_outliers.columns, 

273 loc="best", 

274 cellLoc="center", 

275 ) 

276 var_table.auto_set_font_size(False) 

277 var_table.set_fontsize(7) 

278 var_tab.set_title("Outliers") 

279 

280 var_hist.hist( 

281 var_df["value"], 

282 bins=100, 

283 histtype="step", 

284 color="purple", 

285 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}", 

286 ) 

287 var_hist.legend() 

288 

289 board_ids = filtered_df["board_id"].unique(maintain_order=True) 

290 for i, board in enumerate(board_ids): 

291 board_gr = var_df.filter(pl.col("board_id") == board) 

292 if board_gr.is_empty(): 

293 continue 

294 board_var_mean = cast(float, _check_none(board_gr["value"].mean())) 

295 var_hist.hist( 

296 board_gr["value"], 

297 bins=np.linspace(var_min, var_max, 100), 

298 histtype="step", 

299 color=cm.get_cmap("tab10")(i), 

300 label=f"{board} (mean: {board_var_mean:.04g})", 

301 ) 

302 var_hist.set( 

303 xlabel=f"{variable}", 

304 ylabel="Entries", 

305 yscale="log", 

306 ) 

307 var_hist.legend(fontsize="x-small", loc="best") 

308 else: 

309 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots( 

310 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]} 

311 ) 

312 

313 # Set default values that should never get used to ensure variable is defined 

314 hi_min, hi_max = -999.0, 999.0 

315 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi") 

316 if not hi.is_empty(): 

317 hi_min = cast(float, _check_none(hi["value"].min())) 

318 hi_max = cast(float, _check_none(hi["value"].max())) 

319 

320 hi_mean = cast(float, _check_none(hi["value"].mean())) 

321 hi_std = cast(float, _check_none(hi["value"].std())) 

322 hi_n = hi["value"].len() 

323 

324 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

325 hi_outliers = hi_outliers.rename( 

326 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

327 ) 

328 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2)) 

329 hi_tab.axis("off") 

330 if hi_outliers.is_empty(): 

331 hi_tab.set_title("No High Gain Outliers") 

332 else: 

333 hi_table = hi_tab.table( 

334 cellText=hi_outliers.to_pandas().values, 

335 colLabels=hi_outliers.columns, 

336 loc="best", 

337 cellLoc="center", 

338 ) 

339 hi_table.auto_set_font_size(False) 

340 hi_table.set_fontsize(7) 

341 hi_tab.set_title("High Gain Outliers") 

342 else: 

343 log.warning(f"No hi gain data for {variable}, skipping") 

344 pass 

345 

346 # Set default values that should never get used to ensure variable is defined 

347 lo_min, lo_max = -999.0, 999.0 

348 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo") 

349 if not lo.is_empty(): 

350 lo_min = cast(float, _check_none(lo["value"].min())) 

351 lo_max = cast(float, _check_none(lo["value"].max())) 

352 

353 lo_mean = cast(float, _check_none(lo["value"].mean())) 

354 lo_std = cast(float, _check_none(lo["value"].std())) 

355 lo_n = lo["value"].len() 

356 

357 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"] 

358 lo_outliers = lo_outliers.rename( 

359 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"} 

360 ) 

361 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2)) 

362 lo_tab.axis("off") 

363 if lo_outliers.is_empty(): 

364 lo_tab.set_title("No Low Gain Outliers") 

365 else: 

366 lo_table = lo_tab.table( 

367 cellText=lo_outliers.to_pandas().values, 

368 colLabels=lo_outliers.columns, 

369 loc="best", 

370 cellLoc="center", 

371 ) 

372 lo_table.auto_set_font_size(False) 

373 lo_table.set_fontsize(7) 

374 lo_tab.set_title("Low Gain Outliers") 

375 else: 

376 log.warning(f"No lo gain data for {variable}, skipping") 

377 pass 

378 

379 board_ids = filtered_df["board_id"].unique(maintain_order=True) 

380 for i, board in enumerate(board_ids): 

381 board_hi = hi.filter(pl.col("board_id") == board) 

382 if not board_hi.is_empty(): 

383 board_hi_mean = cast(float, board_hi["value"].mean()) 

384 hi_hist.hist( 

385 board_hi["value"], 

386 bins=np.linspace(hi_min, hi_max, 100), 

387 histtype="step", 

388 color=cm.get_cmap("tab10")(i), 

389 label=f"{board} (mean: {board_hi_mean:.04g})", 

390 ) 

391 

392 board_lo = lo.filter(pl.col("board_id") == board) 

393 if not board_lo.is_empty(): 

394 board_lo_mean = cast(float, board_lo["value"].mean()) 

395 lo_hist.hist( 

396 board_lo["value"], 

397 bins=np.linspace(lo_min, lo_max, 100), 

398 histtype="step", 

399 color=cm.get_cmap("tab10")(i), 

400 label=f"{board} (mean: {board_lo_mean:.04g})", 

401 ) 

402 

403 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries") 

404 hi_hist.set_yscale("log") 

405 hi_hist.legend(fontsize="x-small", loc="best") 

406 

407 lo_hist.set( 

408 title="Low Gain, 6.0 V, 7.0 dB", 

409 xlabel=f"{variable}", 

410 ylabel="Entries", 

411 ) 

412 lo_hist.set_yscale("log") 

413 lo_hist.legend(fontsize="x-small", loc="best") 

414 

415 fig.suptitle(f"{variable} Sliced by Board") 

416 

417 plt.savefig(plot_dir / f"{variable}_Histogram_Sliced.png") 

418 

419 plt.close() 

420 

421 # Print warnings 

422 if len(inf_data_warnings_dict) > 0: 

423 log.warning(f"Boards with inf values:\n{inf_data_warnings_dict}") 

424 if len(nan_data_warnings_dict) > 0: 

425 log.warning(f"Boards with nan values:\n{nan_data_warnings_dict}") 

426 if len(null_data_warnings_dict) > 0: 

427 log.warning(f"Boards with null values:\n{null_data_warnings_dict}") 

428 if len(missing_data_warnings_dict) > 0: 

429 log.warning(f"Missing Boards:\n{missing_data_warnings_dict}") 

430 

431 return variables, boards 

432 

433 

434def plot_qc_db_per_board( 

435 plot_dir: Path, postgres_uri: str, boards: Optional[List[str]] = None, batch_mode: bool = False 

436) -> tuple[List[str], List[str]]: 

437 """ 

438 Example to run: ./typer_feb2.py plot-qc --plot-dir plots 

439 

440 Args: 

441 plot_dir: location to save plots 

442 postgres_uri: uri to database with QC values 

443 batch_mode: turn off track() 

444 

445 """ 

446 if boards: 

447 available_boards = ( 

448 pl.read_database_uri( 

449 """ 

450 SELECT DISTINCT board_id 

451 FROM latest_derived 

452 """, 

453 postgres_uri, 

454 ) 

455 .to_series() 

456 .to_list() 

457 ) 

458 

459 bad_boards = set(boards).difference(set(available_boards)) 

460 if bad_boards: 

461 log.error(f"Board IDs {bad_boards} not found in derived database") 

462 raise Exception(f"Board IDs {bad_boards} not found in derived database") 

463 

464 plot_dir = plot_dir 

465 if not plot_dir.exists(): 

466 plot_dir.mkdir(parents=True, exist_ok=True) 

467 os.chmod(plot_dir, 0o775) 

468 

469 query = """ 

470 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label 

471 FROM latest_derived as d 

472 JOIN threshold_labels as l on d.label_id=l.id 

473 """ 

474 if boards: 

475 board_id_string = ", ".join(f"'{b}'" for b in boards) 

476 query += f"\nWHERE board_id IN ({board_id_string})" 

477 

478 df = ( 

479 pl.read_database_uri(query, postgres_uri) 

480 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None)) 

481 .unnest("label") 

482 ) 

483 df = df.with_columns( 

484 pl.when(pl.col("variable") == "mean") 

485 .then(pl.lit("pedestal_mean")) 

486 .otherwise(pl.col("variable")) 

487 .alias("variable") 

488 ) 

489 df = df.with_columns( 

490 pl.when(pl.col("variable") == "std") 

491 .then(pl.lit("pedestal_std")) 

492 .otherwise(pl.col("variable")) 

493 .alias("variable") 

494 ) 

495 

496 if boards is None: 

497 boards = df["board_id"].unique().to_list() 

498 variables = df["variable"].unique().drop_nulls().to_list() 

499 

500 for variable, board in ( 

501 list(product(variables, boards)) 

502 if batch_mode 

503 else track(list(product(variables, boards)), description="Making per board QC plots...") 

504 ): 

505 log.debug(f"{variable}, {board}") 

506 _, ax = plt.subplots(constrained_layout=True) 

507 

508 var_df = df.filter(pl.col("variable") == variable, pl.col("board_id") == board) 

509 

510 runs = helper.list_to_text_ranges(var_df["run_number"].unique().sort().to_list()) 

511 title = f"{variable} for Board {board}\nRuns {runs}" 

512 

513 if "channel" in var_df.columns and not var_df["channel"].has_nulls(): 

514 x_axis = "channel" 

515 elif "adc" in var_df.columns and not var_df["adc"].has_nulls(): 

516 x_axis = "adc" 

517 elif "link" in var_df.columns and not var_df["link"].has_nulls(): 

518 x_axis = "link" 

519 else: 

520 x_axis = None 

521 if var_df["gain"].has_nulls(): 

522 if x_axis: 

523 ax.bar(var_df[x_axis], var_df["value"], fill=False, edgecolor="purple") 

524 ax.set(title=title, xlabel=x_axis, ylabel=variable) 

525 else: 

526 ax.hist(var_df["value"], fill=False, edgecolor="purple") 

527 ax.set(title=title, xlabel=variable, ylabel="Counts") 

528 else: 

529 hi = var_df.filter(pl.col("gain") == "hi") 

530 lo = var_df.filter(pl.col("gain") == "lo") 

531 if x_axis: 

532 ax.bar(hi[x_axis], hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB") 

533 ax.bar(lo[x_axis], lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB") 

534 ax.set(title=title, xlabel=x_axis, ylabel=variable) 

535 else: 

536 ax.hist(hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB") 

537 ax.hist(lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB") 

538 ax.set(title=title, xlabel=variable, ylabel="Counts") 

539 ax.legend() 

540 

541 plt.savefig(plot_dir / f"{variable}_{board}.png") 

542 plt.close() 

543 

544 return variables, boards