Coverage for polars_analysis/plotting/qc_db

1import logging

2from itertools import product

3from pathlib import Path

4from typing import Dict, List, Literal, Optional, Union, cast

6import matplotlib

8matplotlib.use("agg")

9import os

11import matplotlib.pyplot as plt

12import numpy as np

13import polars as pl

14from matplotlib import colormaps as cm

15from polars._typing import PythonLiteral

16from psycopg import sql

17from rich.progress import track

19from polars_analysis.plotting import helper

21# Instantiate logger

22log = logging.getLogger(__name__)

24"""

25Functions for plotting from the QC database.

26"""

29def _check_none(value: Union[PythonLiteral, None], default=-999) -> PythonLiteral:

30 return value if value is not None else default

33def plot_qc_db(

34 plot_dir: Path, postgres_uri: str, batch_mode: bool = False, board_type: Literal["EM", "HEC"] = "EM"

35) -> tuple[List[str], List[str]]:

36 """

37 Example to run: ./typer_feb2.py plot-qc --plot-dir plots

39 Args:

40 plot_dir: location to save plots

41 postgres_uri: uri to database with QC values

42 batch_mode: turn off track()

43 """

45 plot_dir = plot_dir

46 if not plot_dir.exists():

47 plot_dir.mkdir(parents=True, exist_ok=True)

48 os.chmod(plot_dir, 0o775)

50 df = (

51 pl.read_database_uri(

52 sql.SQL("""

53 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label

54 FROM latest_derived as d

55 JOIN threshold_labels as l on d.label_id=l.id

56 JOIN boards as b on b.board_id=d.board_id

57 WHERE b.board_type = {}

58 """)

59 .format(board_type)

60 .as_string(),

61 postgres_uri,

62 )

63 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None))

64 .unnest("label")

65 )

67 df = df.with_columns(

68 pl.when(pl.col("variable") == "mean")

69 .then(pl.lit("pedestal_mean"))

70 .otherwise(pl.col("variable"))

71 .alias("variable")

72 )

73 df = df.with_columns(

74 pl.when(pl.col("variable") == "std")

75 .then(pl.lit("pedestal_std"))

76 .otherwise(pl.col("variable"))

77 .alias("variable")

78 )

80 # when you use savefig, use plot_dir to put them all in a folder

81 # eg plt.savefig( plot_dir / f'{variable}_Histogram.png')

82 # also pro-tip, your life will be easier if you don't put spaces in the filenames

84 # Histograms by variable

85 filtered_df = df.filter(

86 pl.col("board_id") != "1234",

87 pl.col("board_id") != "E190324",

88 pl.col("board_id") != "194169", # Changed to E...

89 pl.col("board_id") != "E191700",

90 ~((pl.col("board_id") == "E190322") & (pl.col("channel") == 119)).fill_null(False), # Bad channel

91 pl.col("run_number") != 1428, # Doesn't appear to have loads?

92 ~((pl.col("variable") == "gain_ratio") & (pl.col("awg_amp") == 6.0)).fill_null(

93 False

94 ), # ratio we want has awp_amp = 1.5, for reasons

95 )

97 variables = filtered_df["variable"].unique().drop_nulls().sort().to_list()

98 boards = filtered_df["board_id"].unique().sort().to_list()

100 inf_data_warnings_dict: Dict[str, list] = {}

101 nan_data_warnings_dict: Dict[str, list] = {}

102 null_data_warnings_dict: Dict[str, list] = {}

103 missing_data_warnings_dict: Dict[str, list] = {}

104

105 value = pl.col("value")

106 outlier_filter = (value < value.mean() - 3 * value.std()) | (value > value.mean() + 3 * value.std())

107 for variable in variables if batch_mode else track(variables, description="Making QC plots..."):

108 var_df = filtered_df.filter(pl.col("variable") == variable)

109 inf_boards: List[str] = var_df.filter(pl.col("value").is_infinite())["board_id"].unique().to_list()

110 if len(inf_boards) > 0:

111 inf_data_warnings_dict[str(variable)] = inf_boards

112 nan_boards: List[str] = var_df.filter(pl.col("value").is_nan())["board_id"].unique().to_list()

113 if len(nan_boards) > 0:

114 nan_data_warnings_dict[variable] = nan_boards

115 null_boards: List[str] = var_df.filter(pl.col("value").is_null())["board_id"].unique().to_list()

116 if len(null_boards) > 0:

117 null_data_warnings_dict[variable] = null_boards

118

119 # TODO could try to infer variable length to find incomplete boards

120 missing_boards = [id for id in boards if id not in var_df["board_id"].unique().to_list()]

121 if len(missing_boards) > 0:

122 missing_data_warnings_dict[variable] = missing_boards

123

124 var_df = var_df.filter(

125 ~pl.col("value").is_infinite(),

126 ~pl.col("value").is_null(),

127 ~pl.col("value").is_nan(),

128 )

129

130 all_null = var_df["gain"].is_null().all()

131 some_null = var_df["gain"].has_nulls()

132 if some_null and not all_null:

133 log.warning(f"Some null gain values for {variable}! That's not good.")

134

135 # variable is not split into hi and lo gain

136 if all_null:

137 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]})

138

139 var_mean = cast(float, _check_none(var_df["value"].mean()))

140 var_std = cast(float, _check_none(var_df["value"].std()))

141 var_n = var_df["value"].len()

142

143 var_hist.hist(

144 var_df["value"],

145 bins=100,

146 histtype="step",

147 color="purple",

148 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}",

149 )

150 var_hist.set(

151 xlabel=f"{variable}",

152 ylabel="Entries",

153 yscale="log",

154 )

155 var_hist.legend()

156

157 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

158 var_outliers = var_outliers.rename(

159 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

160 )

161 var_tab.axis("off")

162 if var_outliers.is_empty():

163 var_tab.set_title("No Outliers")

164 else:

165 var_table = var_tab.table(

166 cellText=var_outliers.to_pandas().values,

167 colLabels=var_outliers.columns,

168 loc="best",

169 cellLoc="center",

170 )

171 var_table.auto_set_font_size(False)

172 var_table.set_fontsize(7)

173 var_tab.set_title("Outliers")

174 else:

175 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi")

176

177 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo")

178

179 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots(

180 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}

181 )

182

183 hi_mean = cast(float, _check_none(hi["value"].mean()))

184 hi_std = cast(float, _check_none(hi["value"].std()))

185 hi_n = hi["value"].len()

186

187 hi_hist.hist(

188 hi["value"],

189 bins=100,

190 histtype="step",

191 color="r",

192 label=f"Mean: {hi_mean:.02g}\nSTD: {hi_std:.02g}\nN: {hi_n}",

193 )

194 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries")

195 hi_hist.set_yscale("log")

196 hi_hist.legend()

197

198 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

199 hi_outliers = hi_outliers.rename(

200 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

201 )

202 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2))

203 hi_tab.axis("off")

204 if hi_outliers.is_empty():

205 hi_tab.set_title("No High Gain Outliers")

206 else:

207 hi_table = hi_tab.table(

208 cellText=hi_outliers.to_pandas().values, colLabels=hi_outliers.columns, loc="best", cellLoc="center"

209 )

210 hi_table.auto_set_font_size(False)

211 hi_table.set_fontsize(7)

212 hi_tab.set_title("High Gain Outliers")

213

214 lo_mean = cast(float, _check_none(lo["value"].mean()))

215 lo_std = cast(float, _check_none(lo["value"].std()))

216 lo_n = lo["value"].len()

217

218 lo_hist.hist(

219 lo["value"],

220 bins=100,

221 histtype="step",

222 color="b",

223 label=f"Mean: {lo_mean:.02g}\nSTD: {lo_std:.02g}\nN: {lo_n}",

224 )

225 lo_hist.set(title="Low Gain, 6.0 V, 7.0 dB", xlabel=f"{variable}", ylabel="Entries")

226 lo_hist.set_yscale("log")

227 lo_hist.legend()

228

229 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

230 lo_outliers = lo_outliers.rename(

231 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

232 )

233 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2))

234 lo_tab.axis("off")

235 if lo_outliers.is_empty():

236 lo_tab.set_title("No Low Gain Outliers")

237 else:

238 lo_table = lo_tab.table(

239 cellText=lo_outliers.to_pandas().values, colLabels=lo_outliers.columns, loc="best", cellLoc="center"

240 )

241 lo_table.auto_set_font_size(False)

242 lo_table.set_fontsize(7)

243 lo_tab.set_title("Low Gain Outliers")

244

245 fig.suptitle(f"{variable}")

246

247 plt.savefig(plot_dir / f"{variable}_Histogram.png")

248

249 plt.close()

250

251 # Histograms by variable sliced by board

252 if var_df["gain"].has_nulls():

253 fig, (var_hist, var_tab) = plt.subplots(1, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]})

254

255 var_min = cast(float, _check_none(var_df["value"].min()))

256 var_max = cast(float, _check_none(var_df["value"].max()))

257

258 var_mean = cast(float, _check_none(var_df["value"].mean()))

259 var_std = cast(float, _check_none(var_df["value"].std()))

260 var_n = var_df["value"].len()

261

262 var_outliers = var_df.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

263 var_outliers = var_outliers.rename(

264 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

265 )

266 var_tab.axis("off")

267 if var_outliers.is_empty():

268 var_tab.set_title("No Outliers")

269 else:

270 var_table = var_tab.table(

271 cellText=var_outliers.to_pandas().values,

272 colLabels=var_outliers.columns,

273 loc="best",

274 cellLoc="center",

275 )

276 var_table.auto_set_font_size(False)

277 var_table.set_fontsize(7)

278 var_tab.set_title("Outliers")

279

280 var_hist.hist(

281 var_df["value"],

282 bins=100,

283 histtype="step",

284 color="purple",

285 label=f"Mean: {var_mean:.02g}\nSTD: {var_std:.02g}\nN: {var_n}",

286 )

287 var_hist.legend()

288

289 board_ids = filtered_df["board_id"].unique(maintain_order=True)

290 for i, board in enumerate(board_ids):

291 board_gr = var_df.filter(pl.col("board_id") == board)

292 if board_gr.is_empty():

293 continue

294 board_var_mean = cast(float, _check_none(board_gr["value"].mean()))

295 var_hist.hist(

296 board_gr["value"],

297 bins=np.linspace(var_min, var_max, 100),

298 histtype="step",

299 color=cm.get_cmap("tab10")(i),

300 label=f"{board} (mean: {board_var_mean:.04g})",

301 )

302 var_hist.set(

303 xlabel=f"{variable}",

304 ylabel="Entries",

305 yscale="log",

306 )

307 var_hist.legend(fontsize="x-small", loc="best")

308 else:

309 fig, ((hi_hist, hi_tab), (lo_hist, lo_tab)) = plt.subplots(

310 2, 2, constrained_layout=True, gridspec_kw={"width_ratios": [2, 1]}

311 )

312

313 # Set default values that should never get used to ensure variable is defined

314 hi_min, hi_max = -999.0, 999.0

315 hi = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "hi")

316 if not hi.is_empty():

317 hi_min = cast(float, _check_none(hi["value"].min()))

318 hi_max = cast(float, _check_none(hi["value"].max()))

319

320 hi_mean = cast(float, _check_none(hi["value"].mean()))

321 hi_std = cast(float, _check_none(hi["value"].std()))

322 hi_n = hi["value"].len()

323

324 hi_outliers = hi.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

325 hi_outliers = hi_outliers.rename(

326 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

327 )

328 hi_outliers = hi_outliers.with_columns(pl.col("Value").round(2))

329 hi_tab.axis("off")

330 if hi_outliers.is_empty():

331 hi_tab.set_title("No High Gain Outliers")

332 else:

333 hi_table = hi_tab.table(

334 cellText=hi_outliers.to_pandas().values,

335 colLabels=hi_outliers.columns,

336 loc="best",

337 cellLoc="center",

338 )

339 hi_table.auto_set_font_size(False)

340 hi_table.set_fontsize(7)

341 hi_tab.set_title("High Gain Outliers")

342 else:

343 log.warning(f"No hi gain data for {variable}, skipping")

344 pass

345

346 # Set default values that should never get used to ensure variable is defined

347 lo_min, lo_max = -999.0, 999.0

348 lo = filtered_df.filter(pl.col("variable") == variable, pl.col("gain") == "lo")

349 if not lo.is_empty():

350 lo_min = cast(float, _check_none(lo["value"].min()))

351 lo_max = cast(float, _check_none(lo["value"].max()))

352

353 lo_mean = cast(float, _check_none(lo["value"].mean()))

354 lo_std = cast(float, _check_none(lo["value"].std()))

355 lo_n = lo["value"].len()

356

357 lo_outliers = lo.filter(outlier_filter)["board_id", "channel", "run_number", "value"]

358 lo_outliers = lo_outliers.rename(

359 {"board_id": "Board", "channel": "Channel", "run_number": "Run", "value": "Value"}

360 )

361 lo_outliers = lo_outliers.with_columns(pl.col("Value").round(2))

362 lo_tab.axis("off")

363 if lo_outliers.is_empty():

364 lo_tab.set_title("No Low Gain Outliers")

365 else:

366 lo_table = lo_tab.table(

367 cellText=lo_outliers.to_pandas().values,

368 colLabels=lo_outliers.columns,

369 loc="best",

370 cellLoc="center",

371 )

372 lo_table.auto_set_font_size(False)

373 lo_table.set_fontsize(7)

374 lo_tab.set_title("Low Gain Outliers")

375 else:

376 log.warning(f"No lo gain data for {variable}, skipping")

377 pass

378

379 board_ids = filtered_df["board_id"].unique(maintain_order=True)

380 for i, board in enumerate(board_ids):

381 board_hi = hi.filter(pl.col("board_id") == board)

382 if not board_hi.is_empty():

383 board_hi_mean = cast(float, board_hi["value"].mean())

384 hi_hist.hist(

385 board_hi["value"],

386 bins=np.linspace(hi_min, hi_max, 100),

387 histtype="step",

388 color=cm.get_cmap("tab10")(i),

389 label=f"{board} (mean: {board_hi_mean:.04g})",

390 )

391

392 board_lo = lo.filter(pl.col("board_id") == board)

393 if not board_lo.is_empty():

394 board_lo_mean = cast(float, board_lo["value"].mean())

395 lo_hist.hist(

396 board_lo["value"],

397 bins=np.linspace(lo_min, lo_max, 100),

398 histtype="step",

399 color=cm.get_cmap("tab10")(i),

400 label=f"{board} (mean: {board_lo_mean:.04g})",

401 )

402

403 hi_hist.set(title="High Gain, 1.5 V, 20.0 dB", xlabel=f"{variable}", ylabel="Entries")

404 hi_hist.set_yscale("log")

405 hi_hist.legend(fontsize="x-small", loc="best")

406

407 lo_hist.set(

408 title="Low Gain, 6.0 V, 7.0 dB",

409 xlabel=f"{variable}",

410 ylabel="Entries",

411 )

412 lo_hist.set_yscale("log")

413 lo_hist.legend(fontsize="x-small", loc="best")

414

415 fig.suptitle(f"{variable} Sliced by Board")

416

417 plt.savefig(plot_dir / f"{variable}_Histogram_Sliced.png")

418

419 plt.close()

420

421 # Print warnings

422 if len(inf_data_warnings_dict) > 0:

423 log.warning(f"Boards with inf values:\n{inf_data_warnings_dict}")

424 if len(nan_data_warnings_dict) > 0:

425 log.warning(f"Boards with nan values:\n{nan_data_warnings_dict}")

426 if len(null_data_warnings_dict) > 0:

427 log.warning(f"Boards with null values:\n{null_data_warnings_dict}")

428 if len(missing_data_warnings_dict) > 0:

429 log.warning(f"Missing Boards:\n{missing_data_warnings_dict}")

430

431 return variables, boards

432

433

434def plot_qc_db_per_board(

435 plot_dir: Path, postgres_uri: str, boards: Optional[List[str]] = None, batch_mode: bool = False

436) -> tuple[List[str], List[str]]:

437 """

438 Example to run: ./typer_feb2.py plot-qc --plot-dir plots

439

440 Args:

441 plot_dir: location to save plots

442 postgres_uri: uri to database with QC values

443 batch_mode: turn off track()

444

445 """

446 if boards:

447 available_boards = (

448 pl.read_database_uri(

449 """

450 SELECT DISTINCT board_id

451 FROM latest_derived

452 """,

453 postgres_uri,

454 )

455 .to_series()

456 .to_list()

457 )

458

459 bad_boards = set(boards).difference(set(available_boards))

460 if bad_boards:

461 log.error(f"Board IDs {bad_boards} not found in derived database")

462 raise Exception(f"Board IDs {bad_boards} not found in derived database")

463

464 plot_dir = plot_dir

465 if not plot_dir.exists():

466 plot_dir.mkdir(parents=True, exist_ok=True)

467 os.chmod(plot_dir, 0o775)

468

469 query = """

470 SELECT d.created_at, d.run_number, d.board_id, d.value, l.label

471 FROM latest_derived as d

472 JOIN threshold_labels as l on d.label_id=l.id

473 """

474 if boards:

475 board_id_string = ", ".join(f"'{b}'" for b in boards)

476 query += f"\nWHERE board_id IN ({board_id_string})"

477

478 df = (

479 pl.read_database_uri(query, postgres_uri)

480 .with_columns(pl.col("label").str.json_decode(infer_schema_length=None))

481 .unnest("label")

482 )

483 df = df.with_columns(

484 pl.when(pl.col("variable") == "mean")

485 .then(pl.lit("pedestal_mean"))

486 .otherwise(pl.col("variable"))

487 .alias("variable")

488 )

489 df = df.with_columns(

490 pl.when(pl.col("variable") == "std")

491 .then(pl.lit("pedestal_std"))

492 .otherwise(pl.col("variable"))

493 .alias("variable")

494 )

495

496 if boards is None:

497 boards = df["board_id"].unique().to_list()

498 variables = df["variable"].unique().drop_nulls().to_list()

499

500 for variable, board in (

501 list(product(variables, boards))

502 if batch_mode

503 else track(list(product(variables, boards)), description="Making per board QC plots...")

504 ):

505 log.debug(f"{variable}, {board}")

506 _, ax = plt.subplots(constrained_layout=True)

507

508 var_df = df.filter(pl.col("variable") == variable, pl.col("board_id") == board)

509

510 runs = helper.list_to_text_ranges(var_df["run_number"].unique().sort().to_list())

511 title = f"{variable} for Board {board}\nRuns {runs}"

512

513 if "channel" in var_df.columns and not var_df["channel"].has_nulls():

514 x_axis = "channel"

515 elif "adc" in var_df.columns and not var_df["adc"].has_nulls():

516 x_axis = "adc"

517 elif "link" in var_df.columns and not var_df["link"].has_nulls():

518 x_axis = "link"

519 else:

520 x_axis = None

521 if var_df["gain"].has_nulls():

522 if x_axis:

523 ax.bar(var_df[x_axis], var_df["value"], fill=False, edgecolor="purple")

524 ax.set(title=title, xlabel=x_axis, ylabel=variable)

525 else:

526 ax.hist(var_df["value"], fill=False, edgecolor="purple")

527 ax.set(title=title, xlabel=variable, ylabel="Counts")

528 else:

529 hi = var_df.filter(pl.col("gain") == "hi")

530 lo = var_df.filter(pl.col("gain") == "lo")

531 if x_axis:

532 ax.bar(hi[x_axis], hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB")

533 ax.bar(lo[x_axis], lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB")

534 ax.set(title=title, xlabel=x_axis, ylabel=variable)

535 else:

536 ax.hist(hi["value"], fill=False, edgecolor="r", label="HG, 1.5 V, 20.0 dB")

537 ax.hist(lo["value"], fill=False, edgecolor="b", label="LG, 6.0 V, 7.0 dB")

538 ax.set(title=title, xlabel=variable, ylabel="Counts")

539 ax.legend()

540

541 plt.savefig(plot_dir / f"{variable}_{board}.png")

542 plt.close()

543

544 return variables, boards

Coverage for polars_analysis / plotting / qc_db_plotting.py: 76%

261 statements