Coverage for polars_analysis / plotting / qc_db_plotting.py: 76%
261 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-13 13:37 -0400
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-13 13:37 -0400
1import logging
2from itertools import product
3from pathlib import Path
4from typing import Dict, List, Literal, Optional, Union, cast
6import matplotlib
8matplotlib.use("agg")
9import os
11import matplotlib.pyplot as plt
12import numpy as np
13import polars as pl
14from matplotlib import colormaps as cm
15from polars._typing import PythonLiteral
16from psycopg import sql
17from rich.progress import track
19from polars_analysis.plotting import helper
21# Instantiate logger
22log = logging.getLogger(__name__)
24"""
25Functions for plotting from the QC database.
26"""
29def _check_none(value: Union[PythonLiteral, None], default=-999) -> PythonLiteral:
30 return value if value is not None else default
33def plot_qc_db(
34 plot_dir: Path, postgres_uri: str, batch_mode: bool = False, board_type: Literal["EM", "HEC"] = "EM"
35) -> tuple[List[str], List[str]]:
36 """
37 Example to run: ./typer_feb2.py plot-qc --plot-dir plots
39 Args:
40 plot_dir: location to save plots
41 postgres_uri: uri to database with QC values
42 batch_mode: turn off track()
43 """
45 plot_dir = plot_dir
46 if not plot_dir.exists():
47 plot_dir.mkdir(parents=True, exist_ok=True)
48 os.chmod(plot_dir, 0o775)
50 df = (
51 pl.read_database_uri(
52 sql.SQL("""
53 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label
54 FROM latest_derived as d
55 JOIN threshold_labels as l on d.label_id=l.id
56 JOIN boards as b on b.board_id=d.board_id
57 WHERE b.board_type = {}
58 """)
59 .format(board_type)
60 .as_string(),
61 postgres_uri,
62 )
63 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None))
64 .unnest("label")
65 )
67 df = df.with_columns(
68 pl.when(pl.col("variable") == "mean")
69 .then(pl.lit("pedestal_mean"))
70 .otherwise(pl.col("variable"))
71 .alias("variable")
72 )
73 df = df.with_columns(
74 pl.when(pl.col("variable") == "std")
75 .then(pl.lit("pedestal_std"))
76 .otherwise(pl.col("variable"))
77 .alias("variable")
78 )
80 # when you use savefig, use plot_dir to put them all in a folder
81 # eg plt.savefig( plot_dir / f'{variable}_Histogram.png')
82 # also pro-tip, your life will be easier if you don't put spaces in the filenames
84 # Histograms by variable
85 filtered_df = df.filter(
86 pl.col("board_id") != "1234",
87 pl.col("board_id") != "E190324",
88 pl.col("board_id") != "194169", # Changed to E...
89 pl.col("board_id") != "E191700",
90 ~((pl.col("board_id") == "E190322") & (pl.col("channel") == 119)).fill_null(False), # Bad channel
91 pl.col("run_number") != 1428, # Doesn't appear to have loads?
92 ~((pl.col("variable") == "gain_ratio") & (pl.col("awg_amp") == 6.0)).fill_null(
93 False
94 ), # ratio we want has awp_amp = 1.5, for reasons
95 )
97 variables = filtered_df["variable"].unique().drop_nulls().sort().to_list()
98 boards = filtered_df["board_id"].unique().sort().to_list()
100 inf_data_warnings_dict: Dict[str, list] = {}
101 nan_data_warnings_dict: Dict[str, list] = {}
102 null_data_warnings_dict: Dict[str, list] = {}
103 missing_data_warnings_dict: Dict[str, list] = {}
105 value = pl.col("value")
106 outlier_filter = (value < value.mean() - 3 * value.std()) | (value > value.mean() + 3 * value.std())
107 for variable in variables if batch_mode else track(variables, description="Making QC plots..."):
108 var_df = filtered_df.filter(pl.col("variable") == variable)
109 inf_boards: List[str] = var_df.filter(pl.col("value").is_infinite())["board_id"].unique().to_list()
110 if len(inf_boards) > 0:
111 inf_data_warnings_dict[str(variable)] = inf_boards
112 nan_boards: List[str] = var_df.filter(pl.col("value").is_nan())["board_id"].unique().to_list()
113 if len(nan_boards) > 0:
114 nan_data_warnings_dict[variable] = nan_boards
115 null_boards: List[str] = var_df.filter(pl.col("value").is_null())["board_id"].unique().to_list()
116 if len(null_boards) > 0:
117 null_data_warnings_dict[variable] = null_boards
119 # TODO could try to infer variable length to find incomplete boards
120 missing_boards = [id for id in boards if id not in var_df["board_id"].unique().to_list()]
121 if len(missing_boards) > 0:
122 missing_data_warnings_dict[variable] = missing_boards
124 var_df = var_df.filter(
125 ~pl.col("value").is_infinite(),
126 ~pl.col("value").is_null(),
127 ~pl.col("value").is_nan(),
128 )
130 all_null = var_df["gain"].is_null().all()
131 some_null = var_df["gain"].has_nulls()
132 if some_null and not all_null:
133 log.warning(f"Some null gain values for {variable}! That's not good.")
135 # variable is not split into hi and lo gain
136 if all_null:
137 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]})
139 var_mean = cast(float, _check_none(var_df["value"].mean()))
140 var_std = cast(float, _check_none(var_df["value"].std()))
141 var_n = var_df["value"].len()
143 var_hist.hist(
144 var_df["value"],
145 bins=100,
146 histtype="step",
147 color="purple",
148 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}",
149 )
150 var_hist.set(
151 xlabel=f"{variable}",
152 ylabel="Entries",
153 yscale="log",
154 )
155 var_hist.legend()
157 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
158 var_outliers = var_outliers.rename(
159 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
160 )
161 var_tab.axis("off")
162 if var_outliers.is_empty():
163 var_tab.set_title("No Outliers")
164 else:
165 var_table = var_tab.table(
166 cellText=var_outliers.to_pandas().values,
167 colLabels=var_outliers.columns,
168 loc="best",
169 cellLoc="center",
170 )
171 var_table.auto_set_font_size(False)
172 var_table.set_fontsize(7)
173 var_tab.set_title("Outliers")
174 else:
175 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi")
177 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo")
179 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots(
180 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}
181 )
183 hi_mean = cast(float, _check_none(hi["value"].mean()))
184 hi_std = cast(float, _check_none(hi["value"].std()))
185 hi_n = hi["value"].len()
187 hi_hist.hist(
188 hi["value"],
189 bins=100,
190 histtype="step",
191 color="r",
192 label=f"Mean: {hi_mean:.02g}\nSTD: {hi_std:.02g}\nN: {hi_n}",
193 )
194 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries")
195 hi_hist.set_yscale("log")
196 hi_hist.legend()
198 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
199 hi_outliers = hi_outliers.rename(
200 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
201 )
202 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2))
203 hi_tab.axis("off")
204 if hi_outliers.is_empty():
205 hi_tab.set_title("No High Gain Outliers")
206 else:
207 hi_table = hi_tab.table(
208 cellText=hi_outliers.to_pandas().values, colLabels=hi_outliers.columns, loc="best", cellLoc="center"
209 )
210 hi_table.auto_set_font_size(False)
211 hi_table.set_fontsize(7)
212 hi_tab.set_title("High Gain Outliers")
214 lo_mean = cast(float, _check_none(lo["value"].mean()))
215 lo_std = cast(float, _check_none(lo["value"].std()))
216 lo_n = lo["value"].len()
218 lo_hist.hist(
219 lo["value"],
220 bins=100,
221 histtype="step",
222 color="b",
223 label=f"Mean: {lo_mean:.02g}\nSTD: {lo_std:.02g}\nN: {lo_n}",
224 )
225 lo_hist.set(title="Low Gain, 6.0 V, 7.0 dB", xlabel=f"{variable}", ylabel="Entries")
226 lo_hist.set_yscale("log")
227 lo_hist.legend()
229 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
230 lo_outliers = lo_outliers.rename(
231 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
232 )
233 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2))
234 lo_tab.axis("off")
235 if lo_outliers.is_empty():
236 lo_tab.set_title("No Low Gain Outliers")
237 else:
238 lo_table = lo_tab.table(
239 cellText=lo_outliers.to_pandas().values, colLabels=lo_outliers.columns, loc="best", cellLoc="center"
240 )
241 lo_table.auto_set_font_size(False)
242 lo_table.set_fontsize(7)
243 lo_tab.set_title("Low Gain Outliers")
245 fig.suptitle(f"{variable}")
247 plt.savefig(plot_dir / f"{variable}_Histogram.png")
249 plt.close()
251 # Histograms by variable sliced by board
252 if var_df["gain"].has_nulls():
253 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]})
255 var_min = cast(float, _check_none(var_df["value"].min()))
256 var_max = cast(float, _check_none(var_df["value"].max()))
258 var_mean = cast(float, _check_none(var_df["value"].mean()))
259 var_std = cast(float, _check_none(var_df["value"].std()))
260 var_n = var_df["value"].len()
262 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
263 var_outliers = var_outliers.rename(
264 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
265 )
266 var_tab.axis("off")
267 if var_outliers.is_empty():
268 var_tab.set_title("No Outliers")
269 else:
270 var_table = var_tab.table(
271 cellText=var_outliers.to_pandas().values,
272 colLabels=var_outliers.columns,
273 loc="best",
274 cellLoc="center",
275 )
276 var_table.auto_set_font_size(False)
277 var_table.set_fontsize(7)
278 var_tab.set_title("Outliers")
280 var_hist.hist(
281 var_df["value"],
282 bins=100,
283 histtype="step",
284 color="purple",
285 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}",
286 )
287 var_hist.legend()
289 board_ids = filtered_df["board_id"].unique(maintain_order=True)
290 for i, board in enumerate(board_ids):
291 board_gr = var_df.filter(pl.col("board_id") == board)
292 if board_gr.is_empty():
293 continue
294 board_var_mean = cast(float, _check_none(board_gr["value"].mean()))
295 var_hist.hist(
296 board_gr["value"],
297 bins=np.linspace(var_min, var_max, 100),
298 histtype="step",
299 color=cm.get_cmap("tab10")(i),
300 label=f"{board} (mean: {board_var_mean:.04g})",
301 )
302 var_hist.set(
303 xlabel=f"{variable}",
304 ylabel="Entries",
305 yscale="log",
306 )
307 var_hist.legend(fontsize="x-small", loc="best")
308 else:
309 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots(
310 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}
311 )
313 # Set default values that should never get used to ensure variable is defined
314 hi_min, hi_max = -999.0, 999.0
315 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi")
316 if not hi.is_empty():
317 hi_min = cast(float, _check_none(hi["value"].min()))
318 hi_max = cast(float, _check_none(hi["value"].max()))
320 hi_mean = cast(float, _check_none(hi["value"].mean()))
321 hi_std = cast(float, _check_none(hi["value"].std()))
322 hi_n = hi["value"].len()
324 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
325 hi_outliers = hi_outliers.rename(
326 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
327 )
328 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2))
329 hi_tab.axis("off")
330 if hi_outliers.is_empty():
331 hi_tab.set_title("No High Gain Outliers")
332 else:
333 hi_table = hi_tab.table(
334 cellText=hi_outliers.to_pandas().values,
335 colLabels=hi_outliers.columns,
336 loc="best",
337 cellLoc="center",
338 )
339 hi_table.auto_set_font_size(False)
340 hi_table.set_fontsize(7)
341 hi_tab.set_title("High Gain Outliers")
342 else:
343 log.warning(f"No hi gain data for {variable}, skipping")
344 pass
346 # Set default values that should never get used to ensure variable is defined
347 lo_min, lo_max = -999.0, 999.0
348 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo")
349 if not lo.is_empty():
350 lo_min = cast(float, _check_none(lo["value"].min()))
351 lo_max = cast(float, _check_none(lo["value"].max()))
353 lo_mean = cast(float, _check_none(lo["value"].mean()))
354 lo_std = cast(float, _check_none(lo["value"].std()))
355 lo_n = lo["value"].len()
357 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"]
358 lo_outliers = lo_outliers.rename(
359 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}
360 )
361 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2))
362 lo_tab.axis("off")
363 if lo_outliers.is_empty():
364 lo_tab.set_title("No Low Gain Outliers")
365 else:
366 lo_table = lo_tab.table(
367 cellText=lo_outliers.to_pandas().values,
368 colLabels=lo_outliers.columns,
369 loc="best",
370 cellLoc="center",
371 )
372 lo_table.auto_set_font_size(False)
373 lo_table.set_fontsize(7)
374 lo_tab.set_title("Low Gain Outliers")
375 else:
376 log.warning(f"No lo gain data for {variable}, skipping")
377 pass
379 board_ids = filtered_df["board_id"].unique(maintain_order=True)
380 for i, board in enumerate(board_ids):
381 board_hi = hi.filter(pl.col("board_id") == board)
382 if not board_hi.is_empty():
383 board_hi_mean = cast(float, board_hi["value"].mean())
384 hi_hist.hist(
385 board_hi["value"],
386 bins=np.linspace(hi_min, hi_max, 100),
387 histtype="step",
388 color=cm.get_cmap("tab10")(i),
389 label=f"{board} (mean: {board_hi_mean:.04g})",
390 )
392 board_lo = lo.filter(pl.col("board_id") == board)
393 if not board_lo.is_empty():
394 board_lo_mean = cast(float, board_lo["value"].mean())
395 lo_hist.hist(
396 board_lo["value"],
397 bins=np.linspace(lo_min, lo_max, 100),
398 histtype="step",
399 color=cm.get_cmap("tab10")(i),
400 label=f"{board} (mean: {board_lo_mean:.04g})",
401 )
403 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries")
404 hi_hist.set_yscale("log")
405 hi_hist.legend(fontsize="x-small", loc="best")
407 lo_hist.set(
408 title="Low Gain, 6.0 V, 7.0 dB",
409 xlabel=f"{variable}",
410 ylabel="Entries",
411 )
412 lo_hist.set_yscale("log")
413 lo_hist.legend(fontsize="x-small", loc="best")
415 fig.suptitle(f"{variable} Sliced by Board")
417 plt.savefig(plot_dir / f"{variable}_Histogram_Sliced.png")
419 plt.close()
421 # Print warnings
422 if len(inf_data_warnings_dict) > 0:
423 log.warning(f"Boards with inf values:\n{inf_data_warnings_dict}")
424 if len(nan_data_warnings_dict) > 0:
425 log.warning(f"Boards with nan values:\n{nan_data_warnings_dict}")
426 if len(null_data_warnings_dict) > 0:
427 log.warning(f"Boards with null values:\n{null_data_warnings_dict}")
428 if len(missing_data_warnings_dict) > 0:
429 log.warning(f"Missing Boards:\n{missing_data_warnings_dict}")
431 return variables, boards
434def plot_qc_db_per_board(
435 plot_dir: Path, postgres_uri: str, boards: Optional[List[str]] = None, batch_mode: bool = False
436) -> tuple[List[str], List[str]]:
437 """
438 Example to run: ./typer_feb2.py plot-qc --plot-dir plots
440 Args:
441 plot_dir: location to save plots
442 postgres_uri: uri to database with QC values
443 batch_mode: turn off track()
445 """
446 if boards:
447 available_boards = (
448 pl.read_database_uri(
449 """
450 SELECT DISTINCT board_id
451 FROM latest_derived
452 """,
453 postgres_uri,
454 )
455 .to_series()
456 .to_list()
457 )
459 bad_boards = set(boards).difference(set(available_boards))
460 if bad_boards:
461 log.error(f"Board IDs {bad_boards} not found in derived database")
462 raise Exception(f"Board IDs {bad_boards} not found in derived database")
464 plot_dir = plot_dir
465 if not plot_dir.exists():
466 plot_dir.mkdir(parents=True, exist_ok=True)
467 os.chmod(plot_dir, 0o775)
469 query = """
470 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label
471 FROM latest_derived as d
472 JOIN threshold_labels as l on d.label_id=l.id
473 """
474 if boards:
475 board_id_string = ", ".join(f"'{b}'" for b in boards)
476 query += f"\nWHERE board_id IN ({board_id_string})"
478 df = (
479 pl.read_database_uri(query, postgres_uri)
480 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None))
481 .unnest("label")
482 )
483 df = df.with_columns(
484 pl.when(pl.col("variable") == "mean")
485 .then(pl.lit("pedestal_mean"))
486 .otherwise(pl.col("variable"))
487 .alias("variable")
488 )
489 df = df.with_columns(
490 pl.when(pl.col("variable") == "std")
491 .then(pl.lit("pedestal_std"))
492 .otherwise(pl.col("variable"))
493 .alias("variable")
494 )
496 if boards is None:
497 boards = df["board_id"].unique().to_list()
498 variables = df["variable"].unique().drop_nulls().to_list()
500 for variable, board in (
501 list(product(variables, boards))
502 if batch_mode
503 else track(list(product(variables, boards)), description="Making per board QC plots...")
504 ):
505 log.debug(f"{variable}, {board}")
506 _, ax = plt.subplots(constrained_layout=True)
508 var_df = df.filter(pl.col("variable") == variable, pl.col("board_id") == board)
510 runs = helper.list_to_text_ranges(var_df["run_number"].unique().sort().to_list())
511 title = f"{variable} for Board {board}\nRuns {runs}"
513 if "channel" in var_df.columns and not var_df["channel"].has_nulls():
514 x_axis = "channel"
515 elif "adc" in var_df.columns and not var_df["adc"].has_nulls():
516 x_axis = "adc"
517 elif "link" in var_df.columns and not var_df["link"].has_nulls():
518 x_axis = "link"
519 else:
520 x_axis = None
521 if var_df["gain"].has_nulls():
522 if x_axis:
523 ax.bar(var_df[x_axis], var_df["value"], fill=False, edgecolor="purple")
524 ax.set(title=title, xlabel=x_axis, ylabel=variable)
525 else:
526 ax.hist(var_df["value"], fill=False, edgecolor="purple")
527 ax.set(title=title, xlabel=variable, ylabel="Counts")
528 else:
529 hi = var_df.filter(pl.col("gain") == "hi")
530 lo = var_df.filter(pl.col("gain") == "lo")
531 if x_axis:
532 ax.bar(hi[x_axis], hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB")
533 ax.bar(lo[x_axis], lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB")
534 ax.set(title=title, xlabel=x_axis, ylabel=variable)
535 else:
536 ax.hist(hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB")
537 ax.hist(lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB")
538 ax.set(title=title, xlabel=variable, ylabel="Counts")
539 ax.legend()
541 plt.savefig(plot_dir / f"{variable}_{board}.png")
542 plt.close()
544 return variables, boards