Coverage for polars_analysis / utils.py: 40%

193 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-16 14:47 -0400

1import json 

2import logging 

3import os 

4import re 

5import sys 

6from itertools import combinations 

7from pathlib import Path 

8from typing import Any, Dict, List, Literal, Optional, Tuple 

9 

10import polars as pl 

11import requests 

12from dotenv import dotenv_values 

13 

14from polars_analysis import data_sources 

15 

16# Instantiate logger 

17log = logging.getLogger(__name__) 

18 

19 

def parse_skip_channels(input: List[str]) -> Optional[Tuple[List[int], List[int]]]:
    """Parse user-supplied channel specifiers into (lo, hi) skip lists.

    Each entry must contain a channel number and may be tagged with 'l'
    (skip lo gain) and/or 'h' (skip hi gain); an entry with neither tag
    skips both gains. Returns None as soon as an entry contains no digits.
    """
    lo_skips: List[int] = []
    hi_skips: List[int] = []
    for entry in input:
        digits = re.findall(r"[0-9]+", entry)
        if not digits:
            return None  # a malformed specifier aborts the whole parse
        channel = int(digits[0])
        tag = entry.lower()
        wants_lo = "l" in tag
        wants_hi = "h" in tag
        if wants_lo:
            lo_skips.append(channel)
        if wants_hi:
            hi_skips.append(channel)
        if not wants_lo and not wants_hi:
            # untagged entry: skip the channel on both gains
            lo_skips.append(channel)
            hi_skips.append(channel)
    if lo_skips:
        log.info(f"Skipping lo channels: {lo_skips}")
    if hi_skips:
        log.info(f"Skipping hi channels: {hi_skips}")

    return lo_skips, hi_skips

41 

42 

def get_columns_or_exit(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
    """Select *columns* from *df*, terminating the program if any are missing.

    On a missing column the expected/found/missing sets are logged at
    critical level and the process exits with status 1.
    """
    try:
        return df.select(columns)
    except pl.exceptions.ColumnNotFoundError:
        log.critical("Could not find all needed columns in dataframe")
        log.critical(f"Expected: {columns}")
        log.critical(f"Found: {df.columns}")
        log.critical(f"Missing: {set(columns).difference(df.columns)}")
        sys.exit(1)

53 

54 

def check_missing_www_runs(run_numbers: List[int], www_dir: Path) -> List[int]:
    """Return the runs from *run_numbers* that have no HTML page in *www_dir*.

    Only directory entries whose name contains "html" (case-insensitive) are
    considered; the first run of digits in such a name is taken as its run
    number. The result is sorted ascending.
    """
    # Build the set of published runs directly instead of list -> set later.
    www_runs = set()
    for entry in os.listdir(www_dir):
        if "html" not in entry.lower():
            continue  # anything that is not an HTML page is irrelevant
        match = re.search(r"\d+", entry)
        if match:
            www_runs.add(int(match.group(0)))

    # set difference, then sort; sorted() already returns a list
    return sorted(set(run_numbers) - www_runs)

66 

67 

def clear_run_info(board_id: str, plot_dir: Path) -> None:
    """Delete the per-board run_info_<board_id>.json file; no-op if absent."""
    json_path = Path(plot_dir) / f"run_info_{board_id}.json"
    # missing_ok avoids the check-then-remove race of os.path.exists + os.remove
    json_path.unlink(missing_ok=True)

72 

73 

def add_run_info(name: str, info, board_id: str, plot_dir: Path, print_to_website: bool = True) -> None:
    """Record one named entry in the per-board run_info_<board_id>.json file.

    Existing entries are preserved; a corrupt (undecodable) file is replaced
    with a fresh one. ``print_to_website`` marks whether the entry should be
    rendered on the web page later.
    """
    json_path = os.path.join(plot_dir, f"run_info_{board_id}.json")

    run_info: Dict[str, Any] = {}
    if os.path.exists(json_path):
        try:
            with open(json_path, "r") as f:
                run_info = json.load(f)
        except json.decoder.JSONDecodeError:
            # keep the empty dict and start the file over
            log.warning(f"Could not decode {json_path}, creating new file")

    run_info[name] = {"info": info, "print_to_website": print_to_website}

    with open(json_path, "w") as f:
        json.dump(run_info, f, indent=4)

90 

91 

def get_run_info_str(json_file_path: Path) -> str:
    """Render the run-info JSON file as an HTML snippet.

    Each entry flagged print_to_website contributes one "name: info<br>"
    segment, in file order. Returns "" (after a warning) when the file is
    missing; entries without the flag are warned about and skipped.
    """
    if not os.path.exists(json_file_path):
        log.warning(f"Could not find {json_file_path}")
        return ""

    with open(json_file_path, "r") as f:
        entries = json.load(f)

    segments = []
    for name, entry in entries.items():
        try:
            if entry["print_to_website"]:
                segments.append(f"{name}: {entry['info']}<br>")
        except KeyError:
            # also triggered when 'info' is absent on a flagged entry
            log.warning(f"Key 'print_to_website' not found in {name}")
            continue
    return "".join(segments)

107 

108 

def _flagged_gain_channels(df: pl.DataFrame, board_id: str, predicate: pl.Expr) -> List[str]:
    """Return sorted 'channel-gain' labels for rows of *df* on *board_id* matching *predicate*."""
    return [
        f"{row['channel']}-{row['gain']}"
        for row in df.filter(predicate)
        .filter(pl.col("board_id") == board_id)
        .select(pl.col("channel"), pl.col("gain"))
        # NOTE(review): unique(subset="channel") keeps a single gain per
        # channel — confirm a channel flagged on both gains should appear once.
        .unique(subset="channel")
        .sort("channel")
        .iter_rows(named=True)
    ]


def check_bad_samples(loader: data_sources.DataSource, run_number: int, run_plot_dir: Path):
    """Scan the raw samples of *run_number* for pathological values per board.

    Writes a blank run_info.json first, then for every board flags channels
    whose sample maxima saturate a 16-bit range (>= 2**15), whose minima go
    negative, or whose minima touch zero; each non-empty finding is logged
    and recorded via add_run_info.
    """
    # create a blank run_info.json file
    with open(os.path.join(run_plot_dir, "run_info.json"), "w") as f:
        json.dump({}, f, indent=4)

    df_raw_samples = loader.get_bad_samples_check(run_number)

    # (label, predicate) pairs, applied per board in this order; the shared
    # query logic lives in _flagged_gain_channels.
    checks = [
        ("Saturated Channels", pl.col("samples_max") >= 2**15),
        ("Negative (Saturated?) Channels", pl.col("samples_min") < 0),
        ("Zero Value Samples Channels", pl.col("samples_min") == 0),
    ]
    # An "empty channel" check (samples_len == 0) is deliberately omitted:
    # it would flag single-ADC and crosstalk runs until something more
    # specific is done.
    for board_id in df_raw_samples["board_id"].unique().to_list():
        for label, predicate in checks:
            flagged = _flagged_gain_channels(df_raw_samples, board_id, predicate)
            if flagged:
                log.warning(f"{label}: {flagged}")
                add_run_info(label, flagged, board_id, run_plot_dir, True)

169 

170 

def get_board_combinations(boards: List[str], ignore_boards: Optional[List[str]] = None) -> List[List[str]]:
    """
    Get all combinations of boards with the first entry being all boards together.

    Returns [] for a single board and just the sorted pair for two boards.
    With more than three boards and a non-empty *ignore_boards*, the second
    entry is the board list with the ignored boards removed; all unordered
    pairs follow in either case.
    """
    # None sentinel instead of a mutable [] default (shared-state pitfall).
    if ignore_boards is None:
        ignore_boards = []

    boards_sorted = sorted(boards)
    if len(boards_sorted) == 1:
        return []
    if len(boards_sorted) == 2:
        return [boards_sorted]

    pairs = [list(c) for c in combinations(boards_sorted, 2)]
    if len(boards_sorted) > 3 and ignore_boards:
        # NOTE(review): the ignore list only applies for >3 boards — with
        # exactly 3 boards it is silently ignored; confirm that is intended.
        log.warning(f"Ignoring boards {ignore_boards} in combinations")
        filtered = [board for board in boards_sorted if board not in ignore_boards]
        return [boards_sorted, filtered, *pairs]
    return [boards_sorted, *pairs]

190 

191 

def notify_mattermost(url: str, msg: str, timeout: float = 30.0):
    """Post *msg* to a Mattermost incoming webhook at *url*.

    Failures are logged, never raised, so a notification problem cannot
    take down the caller.

    Args:
        url: Incoming-webhook URL.
        msg: Message text for the attachment body.
        timeout: Seconds to wait for the HTTP round-trip; without one,
            requests.post can block forever on a stuck server.
    """
    payload = {
        "attachments": [
            {
                "title": "Analysis Webserver",
                "title_link": "https://www.nevis.columbia.edu/feb2/FEB2/feb2_home.html",
                "text": msg,
            }
        ]
    }

    try:
        # json= serializes the payload and sets the Content-Type header,
        # replacing the manual data=json.dumps(...) form.
        r = requests.post(url, json=payload, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        log.error(f"Mattermost HTTP error occurred: {http_err}")
    except Exception as err:
        log.error(f"Mattermost: other error occurred: {err}")

210 

211 

def non_nevis_config_overrider(data_location: Literal["bnl", "cern"]) -> tuple:
    """Override directory configuration from a site-specific dotenv file.

    Loads ``.env-bnl`` or ``.env-cern`` (depending on *data_location*) and
    returns the 11-tuple (bnl_data, data_dir, derived_dir, plot_dir,
    rendered_dir, frame_dir, monitoring_dir, postgres_uri, postgres_prod_uri,
    upload_to_prod_db, data_source). Database URIs are forced to None,
    prod-DB upload is disabled, and the data source is "deltalake".

    Raises:
        Exception: unknown *data_location*, or a required key missing/empty
            in the env file.
        FileNotFoundError: the site env file does not exist.
    """
    loc = data_location.lower()
    if loc not in ("bnl", "cern"):
        raise Exception(f"Incorrect data_location {data_location} for config override.")

    bnl_data = loc == "bnl"
    env_file = f".env-{loc}"
    log.warning(f"{loc}_data flag does not overwite env variables loaded in sub function calls.")

    if not Path(env_file).exists():
        log.error(f"{loc}_data is true, but {env_file} does not exist")
        raise FileNotFoundError(env_file)

    config = dotenv_values(env_file)

    def _required_dir(key: str) -> Path:
        # .get (not []) so a key absent from the env file is reported the
        # same way as an empty one, instead of raising an unlogged KeyError.
        value = config.get(key)
        if not value:
            log.error(f"{key} not set in {env_file}")
            raise Exception(f"{key} not set in {env_file}")
        return Path(value)

    data_dir = _required_dir("DATA_DIR")
    derived_dir = _required_dir("DERIVED_DIR")
    plot_dir = _required_dir("RUNS_PLOT_DIR")
    rendered_dir = _required_dir("RENDERED_DIR")
    frame_dir = _required_dir("FRAME_DIR")
    monitoring_dir = _required_dir("MONITORING_DIR")

    return (
        bnl_data,
        data_dir,
        derived_dir,
        plot_dir,
        rendered_dir,
        frame_dir,
        monitoring_dir,
        None,  # postgres_uri
        None,  # postgres_prod_uri
        False,  # upload_to_prod_db
        "deltalake",  # data_source
    )