Coverage for polars_analysis/data

1import logging

2import os

3import sys

4from collections import defaultdict

5from datetime import datetime

6from glob import glob

7from pathlib import Path

8from typing import Any, Dict, List, Optional, Protocol, cast

10import numpy as np

11import polars as pl

12import polars.selectors as cs

14from polars_analysis.analysis.constants import MIDDLE_ATTENUATIONS

16# Instantiate logger

17log = logging.getLogger(__name__)

class DataSource(Protocol):
    """
    Structural interface shared by every data backend in this module.

    Concrete sources (Delta tables, single parquet files, HDF5 files, SQL)
    provide these methods; analysis code should depend only on this protocol.
    """

    def load_raw_data(
        self,
        *run_numbers: int,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
        # The default list is only ever read, never mutated, by the sources in this module.
        ignore_boards: List[str] = [],
    ) -> pl.DataFrame:
        """
        Load raw sample rows for one or more runs.

        Args:
            run_numbers: runs to load
            require_positive: keep only rows whose minimum sample is > 0
            require_unsaturated: keep only rows whose maximum sample is < 2**15
            require_nonempty: keep only rows with a non-empty samples list
            ignore_boards: board ids to drop from the result
        """
        ...

    def load_derived_data(
        self,
        run_number: int,
        meas_type: str,
    ) -> pl.DataFrame:
        """Load previously saved derived values for a run and measurement type."""
        ...

    def load_coherent_noise_data(
        self,
        run_number: int,
    ) -> pl.DataFrame:
        """Load previously saved coherent noise results for a run."""
        ...

    def load_frame_data(
        self,
        *run_numbers: int,
        reject_single_adc: bool = False,
        non_empty: bool = True,
    ) -> pl.DataFrame:
        """Load frame data for one or more runs, optionally filtered on frame length."""
        ...

    def load_monitoring_data(
        self,
        *run_numbers: int,
    ) -> pl.DataFrame:
        """Load monitoring data for one or more runs."""
        ...

    def load_lab_environment_data(self) -> pl.DataFrame:
        """Load the lab environment monitoring data."""
        ...

    def save_derived_data(
        self,
        derived_data: pl.DataFrame,
        run_number: int,
        meas_type: str,
    ) -> None:
        """Persist derived values for a run and measurement type."""
        ...

    def save_coherent_noise_data(
        self,
        coherent_noise_data: pl.DataFrame,
        run_number: int,
    ) -> None:
        """Persist coherent noise results for a run."""
        ...

    def check_run_exists(
        self,
        run_number: int,
    ) -> bool:
        """Return True if the run is available in this source."""
        ...

    def check_measurement_exists(
        self,
        run_number: int,
        measurement: int,
    ) -> bool:
        """Return True if the measurement is available within the run."""
        ...

    def get_runs_summary(self) -> pl.DataFrame:
        """Return a per-run summary table."""
        ...

    def get_boards_summary(self) -> pl.DataFrame:
        """Return a per-board summary table."""
        ...

    def get_boards_list(
        self,
        run_number: Optional[int] = None,
        meas_type: Optional[str] = None,
    ) -> pl.DataFrame:
        """Return the distinct boards, optionally restricted to a run and/or measurement type."""
        ...

    def get_runs_list(self) -> pl.DataFrame:
        """Return the available runs."""
        ...

    def get_channels_list(
        self,
        run_number: int,
        meas_type: str,
        require_pulsed: bool = False,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
    ) -> pl.DataFrame:
        """Return the distinct channels of a run that pass the requested sample-quality filters."""
        ...

    def get_amplitudes_list(
        self,
        run_number: int,
        meas_type: str,
    ) -> pl.DataFrame:
        """Return the distinct amplitudes for a run and measurement type."""
        ...

    def get_measurements_list(
        self,
        run_number: int,
    ) -> List[int]:
        """Return the distinct measurement numbers of a run."""
        ...

    def get_measurement_types(
        self,
        run_number: int,
        measurement: Optional[int] = None,
    ) -> List[str]:
        """Return the distinct measurement types of a run, optionally for a single measurement."""
        ...

    def get_bad_samples_check(
        self,
        run_number: int,
    ) -> pl.DataFrame:
        """
        Returns a Polars DataFrame with columns [channel, gain, samples_min, samples_max, samples_len]
        For use when checking if any samples arrays contain values outside the expected range
        """
        ...

    def get_middle_attenuation_run(
        self,
        run_numbers: List[int],
    ) -> int:
        """Return the run, among those given, that was taken at a middle attenuation value."""
        ...

class DeltaSource(DataSource):
    """
    Data source backed by Delta tables.

    Raw, frame and monitoring data are read with ``pl.scan_delta`` from their
    respective directories; derived and coherent-noise data are parquet files
    inside ``derived_dir``; lab environment data comes from two CSV files.
    Missing configuration or missing data is reported through the module
    logger and terminates the process with ``sys.exit(1)``.
    """

    def __init__(
        self,
        raw_data_dir: Optional[Path],
        derived_dir: Optional[Path],
        frame_dir: Optional[Path] = None,
        monitoring_dir: Optional[Path] = None,
        crate_lab_path: Path = Path("/data/feb2/lab-env/env_monitoring_crate_lab.csv"),
        test_stand_path: Path = Path("/data/feb2/lab-env/env_monitoring_test_stand.csv"),
    ) -> None:
        """
        Store the data locations and create ``derived_dir`` if it does not exist yet.

        Args:
            raw_data_dir: Delta table with raw samples, or None if raw data is not needed
            derived_dir: directory for derived / coherent-noise parquet files, or None
            frame_dir: Delta table with frame data, or None
            monitoring_dir: Delta table with monitoring data, or None
            crate_lab_path: CSV file with crate lab environment data
            test_stand_path: CSV file with test stand environment data
        """
        self.raw_dir = raw_data_dir
        self.derived_dir = derived_dir
        self.frame_dir = frame_dir
        self.monitoring_dir = monitoring_dir
        self.crate_lab_path = crate_lab_path
        self.test_stand_path = test_stand_path

        if derived_dir is not None and not derived_dir.exists():
            derived_dir.mkdir(exist_ok=True)
            # Group-writable so other users can save derived data into the same directory
            os.chmod(derived_dir, 0o775)

    def _scan_raw(self, error_message: str = "Raw data directory must be set") -> pl.LazyFrame:
        """
        Return a lazy scan of the raw Delta table, exiting if ``raw_dir`` is unset.

        Args:
            error_message: message logged before exiting when ``raw_dir`` is None

        Returns:
            LazyFrame over the raw data table
        """
        if self.raw_dir is None:
            log.error(error_message)
            sys.exit(1)
        return pl.scan_delta(str(self.raw_dir))

    def load_raw_data(
        self,
        *run_numbers: int,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
        ignore_boards: List[str] = [],
    ) -> pl.DataFrame:
        """
        Load raw sample rows for the given runs.

        Adds the columns ``board_version`` (empty string) and ``hps_ps_gain``
        (null) when they are missing, adds ``pas_mode`` and converts
        ``timestamp`` from America/New_York to UTC.

        Args:
            run_numbers: runs to load
            require_positive: keep only rows whose minimum sample is > 0
            require_unsaturated: keep only rows whose maximum sample is < 2**15
            require_nonempty: keep only rows with a non-empty samples list
            ignore_boards: board ids to drop; the default list is only read, never mutated

        Returns:
            The raw data; the process exits if nothing is left after the sample filters
        """
        log.info(f"Loading raw data from DeltaSource {self.raw_dir}")
        raw_data = (
            self._scan_raw("Raw data directory must be set to load raw data")
            .filter(pl.col("run_number").is_in(run_numbers))
            .collect()
        )

        samples = pl.col("samples")
        if require_positive:
            raw_data = raw_data.filter(samples.list.min() > 0)
        if require_unsaturated:
            raw_data = raw_data.filter(samples.list.max() < 2**15)
        if require_nonempty:
            raw_data = raw_data.filter(samples.list.len() != 0)

        if "board_version" not in raw_data.columns:
            raw_data = raw_data.with_columns(pl.lit("").alias("board_version"))

        if raw_data.is_empty():
            log.error(f"No raw data for {run_numbers=} found in {self.raw_dir}, exiting...")
            sys.exit(1)

        # Add Null hps_ps_gain to handle old data without this column
        if "hps_ps_gain" not in raw_data.columns:
            raw_data = raw_data.with_columns(pl.lit(None).alias("hps_ps_gain"))

        # Merge alfe_mode and hps gain into pas_mode, preferring alfe_mode
        raw_data = raw_data.with_columns(
            pl.col("timestamp").dt.replace_time_zone("America/New_York").dt.convert_time_zone("UTC"),
            pl.coalesce(cs.by_name("alfe_mode") | cs.by_name("hps_ps_gain")).alias("pas_mode"),
        )

        # Example of filtering out specific channels per board (kept for reference only):
        #   raw_data = raw_data.filter(
        #       ~(
        #           ((pl.col("board_id") == "2000015530001") & pl.col("channel").is_in(list(range(64, 85))))
        #           | ((pl.col("board_id") == "E190322") & pl.col("channel").is_in([119]))
        #           | (pl.col("board_id") == "E191703")
        #       )
        #   )

        if ignore_boards:
            log.warning(f"Will skip loading boards {ignore_boards}")
            return raw_data.filter(~pl.col("board_id").is_in(ignore_boards))
        return raw_data

    def load_derived_data(
        self,
        run_number: int,
        meas_type: str,
    ) -> pl.DataFrame:
        """
        Load derived values of one measurement type for a run from ``derived_dir``.

        Args:
            run_number: the run number
            meas_type: measurement type, used as the file name prefix

        Returns:
            The derived data; the process exits if no rows exist for the run
        """
        log.debug(f"Loading derived data from {self.derived_dir}")
        if self.derived_dir is None:
            log.error("Derived directory must be set to load derived data")
            sys.exit(1)

        # The glob can match other runs whose number ends in the same digits,
        # so the rows are filtered on run_number as well.
        derived_df = (
            pl.scan_parquet(self.derived_dir / f"{meas_type}_derived_values*{run_number}.parquet")
            .filter(pl.col("run_number") == run_number)
            .collect()
        )

        if derived_df.is_empty():
            log.error(f"No derived {meas_type} data for run {run_number} in {self.derived_dir}, exiting...")
            log.error("Have you run [yellow]calc-save-runs[/yellow]?")
            sys.exit(1)

        return derived_df

    def load_coherent_noise_data(self, run_number: int) -> pl.DataFrame:
        """
        Load coherent noise results for a run from ``derived_dir``.

        Args:
            run_number: the run number

        Returns:
            The coherent noise data; the process exits if no file or no rows exist for the run
        """
        log.debug(f"Loading coherent noise data from {self.derived_dir}")
        if self.derived_dir is None:
            log.error("Derived directory must be set to load coherent noise data")
            sys.exit(1)

        frames = [
            pl.scan_parquet(p).filter(pl.col("run_number") == run_number).collect()
            for p in Path(self.derived_dir).glob(f"coherent_noise_*{run_number}.parquet")
        ]
        # pl.concat raises on an empty list; fall through to the error below instead
        coherent_noise_df = pl.concat(frames, how="diagonal") if frames else pl.DataFrame()

        if coherent_noise_df.is_empty():
            log.error(f"No coherent noise data for run {run_number} found in {self.derived_dir}, exiting...")
            log.error("Have you run [yellow]calc-save-runs[/yellow]?")
            log.error(f"Is {run_number} a pedestal run?")
            sys.exit(1)

        return coherent_noise_df

    def load_frame_data(
        self,
        *run_numbers: int,
        reject_single_adc: bool = False,
        non_empty: bool = True,
    ) -> pl.DataFrame:
        """
        Load frame data for the given runs.

        The Delta table is tried first (for fewer than 50 runs); if that yields
        nothing or hits a schema error, the per-run parquet files are read directly.
        The ``adc`` column is converted to an integer by dropping its first three characters.

        Args:
            run_numbers: runs to load
            reject_single_adc: keep only rows whose ``frame8`` list is shorter than 1e5
            non_empty: keep only rows whose ``frame8`` list is non-empty

        Returns:
            The frame data; the process exits if nothing was found
        """
        log.debug(f"Loading frame data from {self.frame_dir} with {reject_single_adc=} and {non_empty=}")
        if self.frame_dir is None:
            log.error("Frame directory must be set to load frame data")
            sys.exit(1)

        # Collect Filters
        frame_len = pl.col("frame8").list.len()
        filters = []
        if reject_single_adc:
            filters.append(frame_len < 1e5)
        if non_empty:
            filters.append(frame_len != 0)

        frame_data = pl.DataFrame()

        if len(run_numbers) < 50:
            # Collecting a bunch of runs with scan_delta (and maybe pyarrow) uses a lot of memory
            # So don't do this if we're trying to get many runs. 50 is ad-hoc from testing
            try:
                # use_pyarrow=True avoids schema errors
                log.debug("Scan_delta")
                df_lazy = pl.scan_delta(str(self.frame_dir), use_pyarrow=True)
                if filters:
                    df_lazy = df_lazy.filter(filters)

                frame_data = df_lazy.filter(pl.col("run_number").is_in(run_numbers)).collect()
            except pl.exceptions.SchemaError:
                log.warning(
                    f"Runs {run_numbers} caused a schema error with scan_delta. "
                    "Trying scan_parquet and picking largest file"
                )

        if frame_data.is_empty():  # pragma: no cover
            log.debug("Empty frame, trying scan_parquet")
            frame_data = self._load_frame_data_from_parquet(run_numbers, filters)

        if frame_data.is_empty():
            log.error(
                f"No frame data for run {run_numbers} in {self.frame_dir} with {reject_single_adc=} and {non_empty=}, exiting..."  # noqa: E501
            )
            sys.exit(1)

        return frame_data.with_columns(pl.col("adc").str.slice(3).str.to_integer().alias("adc"))

    def _load_frame_data_from_parquet(self, run_numbers, filters) -> pl.DataFrame:  # pragma: no cover
        """
        Read frame data straight from the ``run_number=N`` sub-directories of ``frame_dir``.

        Args:
            run_numbers: runs to load
            filters: list of polars predicates to apply to every file (may be empty)

        Returns:
            Diagonal concatenation of the per-run frames, or an empty DataFrame if none loaded
        """
        loaded = []
        for run_dir in Path(str(self.frame_dir)).glob("*run*"):
            try:
                run_number = int(str(run_dir).split("run_number=")[-1])
            except ValueError:
                # Matches "*run*" but is not a run_number=N partition directory
                continue
            if run_number not in run_numbers:
                continue

            log.debug(f"Scanning: {run_dir}")
            files = glob(f"{str(run_dir)}/*")
            if not files:
                log.warning(f"No files in {run_dir}")
                log.warning(f"Skipping run: {run_number}")
                continue
            if len(files) == 1:
                file_path = files[0]
            else:
                # single ADC data is merged into one file
                log.info(f"Multiple files in {run_dir}, picking the largest for frame data.")
                file_path = max(files, key=os.path.getsize)

            try:
                lazy = pl.scan_parquet(file_path)
                if filters:
                    lazy = lazy.filter(filters)
                loaded.append(
                    lazy.with_columns(
                        # felix_bcid and felix_evt_count are UInt in a few runs around 2170 ish
                        cs.by_dtype(pl.List(pl.UInt16())).cast(pl.List(pl.Int16())),
                        pl.lit(run_number).alias("run_number"),
                    ).collect()
                )
            except pl.exceptions.SchemaError as e:
                # This run can't be loaded due to the datatype in the felix_evt_count column (eg 2204)
                log.warning(rf"{e}")
                log.warning(f"Schema error when scanning {file_path}")
                log.warning(f"Skipping run: {run_number}")

        return pl.concat(loaded, how="diagonal") if loaded else pl.DataFrame()

    def load_monitoring_data(
        self,
        *run_numbers: int,
    ) -> pl.DataFrame:
        """
        Load monitoring data for the given runs, with ``timestamp`` converted
        from America/New_York to UTC.

        Args:
            run_numbers: runs to load

        Returns:
            The monitoring data; the process exits if nothing was found
        """
        log.debug(f"Loading monitoring data from {self.monitoring_dir}")
        if self.monitoring_dir is None:
            log.error("Monitoring data directory must be set to load monitoring data")
            sys.exit(1)

        monitoring_data = (
            pl.scan_delta(str(self.monitoring_dir)).filter(pl.col("run_number").is_in(run_numbers)).collect()
        )
        if monitoring_data.is_empty():
            log.error(f"No monitoring data for {run_numbers=} found in {self.monitoring_dir}, exiting...")
            sys.exit(1)

        return monitoring_data.with_columns(
            pl.col("timestamp").dt.replace_time_zone("America/New_York").dt.convert_time_zone("UTC")
        )

    def save_derived_data(self, derived_data: pl.DataFrame, run_number: int, meas_type: str) -> None:
        """
        Write derived values to ``<meas_type>_derived_values_run<NNNN>.parquet`` in ``derived_dir``.

        Args:
            derived_data: the data to save
            run_number: the run number, zero-padded to four digits in the file name
            meas_type: measurement type, used as the file name prefix
        """
        if self.derived_dir is None:
            log.error("Derived directory must be set to save derived data")
            sys.exit(1)
        derived_data.write_parquet(
            self.derived_dir / f"{meas_type}_derived_values_run{run_number:04}.parquet", compression="zstd"
        )

    def load_lab_environment_data(self) -> pl.DataFrame:
        """
        Load the crate lab and test stand CSV files as one frame.

        Returns:
            The concatenated data with ``datetime_utc`` parsed into a UTC ``timestamp`` column
        """
        lab_env_data_UTC = pl.concat([pl.read_csv(self.crate_lab_path), pl.read_csv(self.test_stand_path)])
        return lab_env_data_UTC.with_columns(
            pl.col("datetime_utc").str.to_datetime(time_zone="UTC").alias("timestamp")
        ).drop("datetime_utc")

    def save_coherent_noise_data(self, coherent_noise_data: pl.DataFrame, run_number: int) -> None:
        """
        Write coherent noise results to ``coherent_noise_run<NNNN>.parquet`` in ``derived_dir``.

        Args:
            coherent_noise_data: the data to save; its ``data_sum`` column is not written
            run_number: the run number, zero-padded to four digits in the file name
        """
        if self.derived_dir is None:
            log.error("Derived directory must be set to save coherent noise data")
            sys.exit(1)

        # TODO data_sum could be added to a pytest and this moved into a less obscure place
        # Don't save data_sum, a column with type List[Float64] and length n_samples
        coherent_noise_data.select(pl.exclude("data_sum")).write_parquet(
            self.derived_dir / f"coherent_noise_run{run_number:04}.parquet", compression="zstd"
        )

    def check_run_exists(self, run_number: int) -> bool:
        """
        Check if a run exists in data_dir.

        Args:
            run_number: the run number, eg 770

        Returns:
            True if the run exists, False otherwise (including when the table cannot be read)
        """
        try:
            # One row is enough to prove existence; sys.exit from the helper is not
            # an Exception subclass, so an unset raw_dir still terminates the process.
            found = self._scan_raw().filter(pl.col("run_number") == run_number).head(1).collect()
            return not found.is_empty()
        except Exception as e:
            log.error(f"check_run_exists error for {run_number=} and raw_dir={self.raw_dir}")
            log.error("You might have an incomplete data dir, eg missing the _delta_log directory")
            log.error(type(e))
            log.error(e)
            return False

    def check_measurement_exists(self, run_number: int, measurement: int) -> bool:
        """
        Check if a measurement exists in a run in data_dir.

        Args:
            run_number: the run number, eg 770
            measurement: the measurement number, eg 1

        Returns:
            True if the measurement exists, False otherwise
        """
        found = (
            self._scan_raw()
            .filter(pl.col("run_number") == run_number, pl.col("measurement") == measurement)
            .head(1)
            .collect()
        )
        return not found.is_empty()

    def get_runs_list(self) -> pl.DataFrame:
        """
        Returns:
            The distinct (run_number, meas_type) pairs, sorted by run_number
        """
        return (
            self._scan_raw()
            .select([pl.col("run_number"), pl.col("meas_type")])
            .unique()
            .sort("run_number")
            .collect()
        )

    def get_runs_summary(self) -> pl.DataFrame:
        """
        Summarise every run.

        Returns:
            One row per (run_number, meas_type, pas_mode) with the sorted unique
            att_val, board_id and channel lists and the first timestamp,
            sorted by run_number descending
        """
        df = (
            self._scan_raw()
            .sort("measurement")
            .unique(["run_number", "meas_type", "att_val", "board_id", "channel"], keep="first")
        )

        # Add Null hps_ps_gain to handle old data without this column
        if "hps_ps_gain" not in df.collect_schema().names():
            df = df.with_columns(pl.lit(None).alias("hps_ps_gain"))

        # Merge alfe_mode and hps gain into pas_mode, preferring alfe_mode
        return (
            df.with_columns(
                pl.coalesce(cs.by_name("alfe_mode") | cs.by_name("hps_ps_gain")).alias("pas_mode"),
            )
            .group_by(["run_number", "meas_type", "pas_mode"])
            .agg(
                pl.col("att_val").unique().sort(),
                pl.col("board_id").unique().sort(),
                pl.col("timestamp").first(),
                pl.col("channel").unique().sort(),
            )
            .select(["run_number", "board_id", "meas_type", "att_val", "pas_mode", "timestamp", "channel"])
            .sort("run_number", descending=True)
            .collect()
        )

    def get_channels_list(
        self,
        run_number: int,
        meas_type: str,
        require_pulsed: bool = False,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
    ) -> pl.DataFrame:
        """
        List the distinct channels of a run and measurement type.

        Args:
            run_number: the run number
            meas_type: the measurement type
            require_pulsed: keep only rows where ``is_pulsed`` is true
            require_positive: keep only rows whose minimum sample is > 0
            require_unsaturated: keep only rows whose maximum sample is < 2**15
            require_nonempty: keep only rows with a non-empty samples list

        Returns:
            Single-column DataFrame with the unique ``channel`` values
        """
        samples = pl.col("samples")
        predicates = [pl.col("meas_type") == meas_type]
        if require_nonempty:
            predicates.append(samples.list.len() != 0)
        if require_positive:
            predicates.append(samples.list.min() > 0)
        if require_unsaturated:
            predicates.append(samples.list.max() < 2**15)
        if require_pulsed:
            predicates.append(pl.col("is_pulsed"))

        run_data = self._scan_raw().filter(pl.col("run_number") == run_number).collect()
        return run_data.filter(*predicates).select(pl.col("channel").unique())

    def get_amplitudes_list(self, run_number: int, meas_type: str) -> pl.DataFrame:
        """
        Returns:
            Single-column DataFrame with the unique ``amp`` values of the derived data
        """
        return self.load_derived_data(run_number, meas_type).select(pl.col("amp").unique())

    def get_boards_summary(self) -> pl.DataFrame:
        """
        Returns:
            One row per board_id with the timestamp taken from its highest run_number
        """
        return (
            self._scan_raw()
            .sort("run_number", descending=True)
            .unique(["board_id"], keep="first")
            .select(["board_id", "timestamp"])
            .collect()
        )

    def get_boards_list(self, run_number: Optional[int] = None, meas_type: Optional[str] = None) -> pl.DataFrame:
        """
        List the distinct boards.

        Args:
            run_number: restrict to this run if given
            meas_type: restrict to this measurement type if given

        Returns:
            Unique (board_id, board_version, board_type) rows, or only board_id
            when the data has no board_version column
        """
        filters = []
        if run_number is not None:
            filters.append(pl.col("run_number") == run_number)
        if meas_type is not None:
            filters.append(pl.col("meas_type") == meas_type)

        df_lazy = self._scan_raw()
        if filters:
            df_lazy = df_lazy.filter(filters)

        if "board_version" in df_lazy.collect_schema().names():
            return (
                df_lazy.select("board_id", "board_version", pl.col("board_variant").alias("board_type"))
                .unique()
                .collect()
            )
        # BNL data doesn't have board_version
        return df_lazy.select("board_id").unique().collect()

    def get_measurements_list(self, run_number: int) -> List[int]:
        """
        Returns:
            The distinct measurement numbers of the run
        """
        df = (
            self._scan_raw()
            .filter(pl.col("run_number") == run_number)
            .select(pl.col("measurement").unique())
            .collect()
        )
        return df.to_series().to_list()

    def get_measurement_types(self, run_number: int, measurement: Optional[int] = None) -> List[str]:
        """
        Args:
            run_number: the run number
            measurement: restrict to this measurement if given

        Returns:
            The distinct measurement types found
        """
        lazy = self._scan_raw().filter(pl.col("run_number") == run_number).select("meas_type", "measurement")
        if measurement is not None:
            lazy = lazy.filter(pl.col("measurement") == measurement)
        return lazy.select(pl.col("meas_type").unique()).collect().to_series().to_list()

    def get_middle_attenuation_run(self, run_numbers: List[int]) -> int:
        """
        Find the first run, in the order given, that has an att_val in MIDDLE_ATTENUATIONS.

        Args:
            run_numbers: candidate runs

        Returns:
            The matching run number; the process exits if there is none
        """
        raw = self._scan_raw()
        for run_number in run_numbers:
            # Stop at the first matching run instead of loading every candidate
            match = (
                raw.filter(pl.col("run_number") == run_number, pl.col("att_val").is_in(MIDDLE_ATTENUATIONS))
                .select("run_number")
                .head(1)
                .collect()
            )
            if not match.is_empty():
                return match["run_number"][0]

        log.error(f"Did not find a run with att_val in {MIDDLE_ATTENUATIONS} in runs {run_numbers}")
        sys.exit(1)

    def get_bad_samples_check(self, run_number: int) -> pl.DataFrame:
        """
        Returns a Polars DataFrame with columns
        [channel, gain, board_id, samples_min, samples_max, samples_len]
        For use when checking if any samples arrays contain values outside the expected range
        """
        samples = pl.col("samples")
        return (
            self._scan_raw()
            .filter(pl.col("run_number") == run_number)
            .select(
                "channel",
                "gain",
                "board_id",
                samples.list.min().alias("samples_min"),
                samples.list.max().alias("samples_max"),
                samples.list.len().alias("samples_len"),
            )
            .collect()
        )

549

550

class ParquetSource(DeltaSource):
    """
    For loading raw data from specific parquet files

    ``raw_dir`` (inherited from DeltaSource) must point at a single parquet
    file rather than a directory, and ``derived_path`` at a single derived
    parquet file.
    """

    def __init__(
        self,
        raw_data_path: Optional[Path],
        derived_path: Optional[Path],
        frame_path: Optional[Path] = None,
        monitoring_dir: Optional[Path] = None,
        crate_lab_path: Path = Path("/data/feb2/lab-env/env_monitoring_crate_lab.csv"),
        test_stand_path: Path = Path("/data/feb2/lab-env/env_monitoring_test_stand.csv"),
    ) -> None:
        """
        Args:
            raw_data_path: parquet file with raw samples, or None
            derived_path: parquet file with derived data; its parent directory
                becomes ``derived_dir`` (``derived/`` when None)
            frame_path: forwarded to DeltaSource as ``frame_dir``
            monitoring_dir: Delta table directory or a ``.parquet`` file with monitoring data
            crate_lab_path: CSV (or ``.parquet``) file with crate lab environment data
            test_stand_path: CSV file with test stand environment data
        """
        self.derived_path = derived_path
        derived_dir = derived_path.parent if derived_path is not None else Path("derived/")
        super().__init__(raw_data_path, derived_dir, frame_path, monitoring_dir, crate_lab_path, test_stand_path)

    def load_raw_data(
        self,
        *run_numbers: int,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
        ignore_boards: List[str] = [],
    ) -> pl.DataFrame:
        """
        Load raw sample rows from the parquet file and label them with the given run number.

        Args:
            run_numbers: exactly one run number; it overwrites/creates the ``run_number`` column
            require_positive: keep only rows whose minimum sample is > 0
            require_unsaturated: keep only rows whose maximum sample is < 2**15
            require_nonempty: keep only rows with a non-empty samples list
            ignore_boards: board ids to drop; the default list is only read, never mutated

        Returns:
            The raw data with ``pas_mode`` added and ``timestamp`` converted from
            America/New_York to UTC; the process exits if nothing is left
        """
        # This is named raw_dir because it's inherited from DeltaSource, but it needs to be a path to a parquet file
        log.info(f"Loading raw data from ParquetSource {self.raw_dir}")
        if self.raw_dir is None:
            log.error("Raw data directory must be set to load raw data")
            sys.exit(1)

        if Path(self.raw_dir).is_dir():
            log.error(f"{self.raw_dir=} must be a parquet filepath")
            sys.exit(1)

        if len(run_numbers) != 1:
            log.error("Only one run number supported for direct parquet files.")
            sys.exit(1)

        run_number = run_numbers[0]
        # Possible later extension: glob several parquet files per run number.

        # Empty samples are only dropped further down when require_nonempty is set,
        # so that require_nonempty=False is honoured like in DeltaSource.
        raw_data = pl.scan_parquet(self.raw_dir).collect()

        # Add Null hps_ps_gain to handle old data without this column
        if "hps_ps_gain" not in raw_data.columns:
            raw_data = raw_data.with_columns(pl.lit(None).alias("hps_ps_gain"))

        # Merge alfe_mode and hps gain into pas_mode, preferring alfe_mode
        raw_data = raw_data.with_columns(
            pl.lit(run_number).alias("run_number"),
            pl.coalesce(cs.by_name("alfe_mode") | cs.by_name("hps_ps_gain")).alias("pas_mode"),
        )

        samples = pl.col("samples")
        if require_positive:
            raw_data = raw_data.filter(samples.list.min() > 0)
        if require_unsaturated:
            raw_data = raw_data.filter(samples.list.max() < 2**15)
        if require_nonempty:
            raw_data = raw_data.filter(samples.list.len() != 0)

        if "board_version" not in raw_data.columns:
            raw_data = raw_data.with_columns(pl.lit("").alias("board_version"))

        if raw_data.is_empty():
            log.error(f"No raw data for {run_numbers=} found in {self.raw_dir}, exiting...")
            sys.exit(1)

        raw_data = raw_data.with_columns(
            pl.col("timestamp").dt.replace_time_zone("America/New_York").dt.convert_time_zone("UTC")
        )
        if ignore_boards:
            log.warning(f"Will skip loading boards {ignore_boards}")
            return raw_data.filter(~pl.col("board_id").is_in(ignore_boards))
        return raw_data

    def load_derived_data(
        self,
        run_number: int,
        meas_type: str,
    ) -> pl.DataFrame:
        """
        Load the rows of ``derived_path`` belonging to a run.

        Args:
            run_number: the run number to select
            meas_type: only used in the error message; the file is not filtered on it

        Returns:
            The derived data; the process exits if no rows exist for the run
        """
        log.debug(f"Loading derived data from {self.derived_path}")
        if self.derived_path is None:
            log.error("Derived data path must be set to load derived data")
            sys.exit(1)

        derived_df = pl.read_parquet(self.derived_path).filter(pl.col("run_number") == run_number)

        if derived_df.is_empty():
            log.error(f"No derived {meas_type} data for run {run_number} in {self.derived_path}, exiting...")
            log.error("Have you run [yellow]calc-save-runs[/yellow]?")
            sys.exit(1)

        return derived_df

    def get_boards_list(self, run_number: Optional[int] = None, meas_type: Optional[str] = None) -> pl.DataFrame:
        """
        List the distinct boards in the raw parquet file.

        Args:
            run_number: restrict to this run if given
            meas_type: restrict to this measurement type if given

        Returns:
            Unique (board_id, board_version, board_type) rows, or only board_id
            when the data has no board_version column
        """
        # This is named raw_dir because it's inherited from DeltaSource, but it needs to be a path to a parquet file
        if self.raw_dir is None:
            log.error("Raw data directory must be set")
            sys.exit(1)

        filters = []
        if run_number is not None:
            # NOTE(review): assumes the parquet file itself has a run_number column — confirm
            filters.append(pl.col("run_number") == run_number)
        if meas_type is not None:
            filters.append(pl.col("meas_type") == meas_type)

        df_lazy = pl.scan_parquet(str(self.raw_dir))
        if filters:
            df_lazy = df_lazy.filter(filters)

        if "board_version" in df_lazy.collect_schema().names():
            return (
                df_lazy.select("board_id", "board_version", pl.col("board_variant").alias("board_type"))
                .unique()
                .collect()
            )
        # BNL data doesn't have board_version
        return df_lazy.select("board_id").unique().collect()

    def check_run_exists(self, run_number: int) -> bool:
        """
        Does nothing for direct parquet files
        """
        _ = run_number
        return True

    def load_lab_environment_data(self) -> pl.DataFrame:
        """
        Load lab environment data, reading ``crate_lab_path`` directly when it is a parquet file.

        Returns:
            The lab environment data; the process exits if it is empty
        """
        # Allow for reading parquet file direct to help with tests
        if str(self.crate_lab_path).endswith(".parquet"):
            lab_env_data = pl.read_parquet(self.crate_lab_path)
        else:
            lab_env_data = super().load_lab_environment_data()

        if lab_env_data.is_empty():
            log.error(f"No lab environment data in {self.crate_lab_path} or {self.test_stand_path}, exiting...")
            sys.exit(1)

        return lab_env_data

    def load_monitoring_data(self, *run_numbers: int) -> pl.DataFrame:
        """
        Load monitoring data for the given runs, reading ``monitoring_dir``
        directly when it is a parquet file and as a Delta table otherwise.

        Args:
            run_numbers: runs to load

        Returns:
            The monitoring data; the process exits if nothing was found
        """
        if not self.monitoring_dir:
            log.error("monitoring_dir must be set to load monitoring data")
            sys.exit(1)

        if str(self.monitoring_dir).endswith(".parquet"):
            # NOTE(review): unlike the Delta path, timestamps are not converted to UTC here — confirm intended
            monitoring_df = pl.read_parquet(self.monitoring_dir).filter(pl.col("run_number").is_in(run_numbers))
        else:
            monitoring_df = super().load_monitoring_data(*run_numbers)

        if monitoring_df.is_empty():
            log.error(f"No monitoring data for runs {run_numbers} in {self.monitoring_dir}, exiting...")
            sys.exit(1)

        return monitoring_df

717

718

719class HDF5Source(DataSource):

720 def __init__(

721 self,

722 raw_data_dir: Path,

723 derived_dir: Optional[Path],

724 crate_lab_path: Path = Path("/data/feb2/lab-env/env_monitoring_crate_lab.csv"),

725 test_stand_path: Path = Path("/data/feb2/lab-env/env_monitoring_test_stand.csv"),

726 ) -> None:

727 self.raw_dir: Path = raw_data_dir

728 self.derived_dir = derived_dir

729 self.crate_lab_path = crate_lab_path

730 self.test_stand_path = test_stand_path

731 self.delta_source = DeltaSource(

732 raw_data_dir / "samples",

733 derived_dir,

734 crate_lab_path=crate_lab_path,

735 test_stand_path=test_stand_path,

736 )

737

738 if self.derived_dir is not None and not self.derived_dir.exists():

739 self.derived_dir.mkdir(exist_ok=True)

740 os.chmod(self.derived_dir, 0o775)

741

742 @staticmethod

743 def get_dataset_keys(f):

744 import h5py # type: ignore

745

746 keys = []

747 f.visit(lambda key: keys.append(key) if isinstance(f[key], h5py.Dataset) else None)

748 return keys

749

750 def load_raw_data(

751 self,

752 *run_numbers: int,

753 require_positive: bool = False,

754 require_unsaturated: bool = False,

755 require_nonempty: bool = True,

756 ignore_boards: List[str] = [],

757 ) -> pl.DataFrame:

758 import h5py

759

760 log.info("Loading raw data from HDF5Source")

761

762 if len(run_numbers) != 1:

763 log.critical("Only single run numbers implemented for HDF5, exiting...")

764 sys.exit(1)

765 run_number = run_numbers[0]

766 filename = str(self.raw_dir / f"run{run_number:04d}.hdf5")

767 try:

768 _ = h5py.File(filename)

769 except FileNotFoundError:

770 log.critical(f"Could not find input file {filename}")

771 sys.exit(1)

772

773 samples_dict: Dict[str, List[Any]] = defaultdict(list)

774 with h5py.File(filename) as f:

775 dataset_keys = self.get_dataset_keys(f)

776 for dataset_key in dataset_keys:

777 key_list = dataset_key.split("/")

778 if len(key_list) != 4:

779 continue

780 measurement, channel, gain, samples = key_list

781 for key, value in f[measurement].attrs.items():

782 if value == "":

783 samples_dict[key].append(None)

784 elif key == "alfe_mode" and value == -99:

785 samples_dict[key].append(None)

786 elif key == "awg_amp":

787 samples_dict[key].append(float(value))

788 elif key == "board_id":

789 samples_dict[key].append(str(value))

790 elif key == "timestamp":

791 samples_dict[key].append(

792 datetime.strptime(value, "%Y-%m-%d %H:%M:%S.%f").replace(microsecond=0)

793 )

794 elif key == "meas_chan":

795 samples_dict[key].append(int(value[-3:]))

796 samples_dict["is_pulsed"].append(int(channel[-3:]) == int(value[-3:]))

797 else:

798 samples_dict[key].append(value)

799 meas_group = cast(h5py.Group, f[measurement])

800 for key, value in meas_group[channel].attrs.items():

801 # is_pulsed is filled based on meas_chan

802 if key == "is_pulsed":

803 continue

804 samples_dict[key].append(value)

805 samples_dict["measurement"].append(int(measurement.split("_")[1]))

806 samples_dict["channel"].append(int(channel[-3:]))

807 samples_dict["gain"].append(gain)

808 meas_chan_group = cast(h5py.Group, meas_group[channel])

809 meas_chan_gain_group = cast(h5py.Group, meas_chan_group[gain])

810 samples_dataset = cast(h5py.Dataset, meas_chan_gain_group[samples])

811 samples_dict["samples"].append(samples_dataset[()])

812

813 raw_data = pl.DataFrame(samples_dict).filter(pl.col("samples").list.len() != 0)

814 if require_positive:

815 raw_data = raw_data.filter(pl.col("samples").list.min() > 0)

816 if require_unsaturated:

817 raw_data = raw_data.filter(pl.col("samples").list.max() < 2**15)

818 if require_nonempty:

819 raw_data = raw_data.filter(pl.col("samples").list.len() != 0)

820 raw_data = raw_data.with_columns(

821 pl.col("timestamp").dt.replace_time_zone("America/New_York").dt.convert_time_zone("UTC")

822 )

823 print(f"HERE!! raw_data timestamp: {raw_data.select('timestamp').head(1)}")

824 if len(ignore_boards) > 0:

825 log.warning(f"Will skip loading boards {ignore_boards}")

826 return raw_data.filter(~pl.col("board_id").is_in(ignore_boards))

827 else:

828 return raw_data

829

830 def load_derived_data(self, run_number: int, meas_type: str) -> pl.DataFrame:

831 return self.delta_source.load_derived_data(run_number, meas_type)

832

833 def load_coherent_noise_data(self, run_number: int) -> pl.DataFrame:

834 return self.delta_source.load_coherent_noise_data(run_number)

835

836 def load_frame_data(

837 self, *run_numbers: int, reject_single_adc: bool = False, non_empty: bool = True

838 ) -> pl.DataFrame:

839 return self.delta_source.load_frame_data(*run_numbers, reject_single_adc=reject_single_adc, non_empty=non_empty)

840

841 def save_derived_data(self, derived_data: pl.DataFrame, run_number: int, meas_type: str) -> None:

842 self.delta_source.save_derived_data(derived_data, run_number, meas_type)

843

844 def save_coherent_noise_data(self, coherent_noise_data: pl.DataFrame, run_number: int) -> None:

845 self.delta_source.save_coherent_noise_data(coherent_noise_data, run_number)

846

847 def check_run_exists(self, run_number: int) -> bool:

848 import h5py

849

850 filename = str(self.raw_dir / f"run{run_number:04d}.hdf5")

851 try:

852 _ = h5py.File(filename)

853 except FileNotFoundError:

854 log.error(f"Could not find HDF5 input file {filename} in {self.raw_dir}")

855 return False

856 return True

857

858 def check_measurement_exists(self, run_number: int, measurement: int) -> bool:

859 import h5py

860

861 filename = str(self.raw_dir / f"run{run_number:04d}.hdf5")

862 if not self.check_run_exists(run_number):

863 return False

864 with h5py.File(filename) as f:

865 measurements = f.keys()

866 if f"Measurement_{measurement:03d}" in measurements:

867 return True

868 else:

869 return False

870

871 def get_runs_summary(self) -> pl.DataFrame:

872 raise NotImplementedError

873

874 def get_boards_summary(self) -> pl.DataFrame:

875 raise NotImplementedError

876

877 def get_boards_list(self, run_number: Optional[int] = None, meas_type: Optional[str] = None) -> pl.DataFrame:

878 raise NotImplementedError

879

880 def get_runs_list(self) -> pl.DataFrame:

881 raise NotImplementedError

882

def get_channels_list(
    self,
    run_number: int,
    meas_type: str,
    require_pulsed: bool = False,
    require_positive: bool = False,
    require_unsaturated: bool = False,
    require_nonempty: bool = True,
) -> pl.DataFrame:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

893

def get_amplitudes_list(self, run_number: int, meas_type: str) -> pl.DataFrame:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

896

def load_lab_environment_data(self) -> pl.DataFrame:
    """Return the lab environment data as provided by ``self.delta_source``."""
    backend = self.delta_source
    return backend.load_lab_environment_data()

899

def load_monitoring_data(self, *run_numbers: int) -> pl.DataFrame:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

902

def get_measurements_list(self, run_number: int) -> List[int]:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

905

def get_measurement_types(self, run_number: int, measurement: Optional[int] = None) -> List[str]:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

908

def get_bad_samples_check(self, run_number: int) -> pl.DataFrame:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

911

def get_middle_attenuation_run(self, run_numbers: List[int]) -> int:
    """Not supported by this data source; always raises ``NotImplementedError``."""
    raise NotImplementedError

914

915

class SQLSource(DataSource):
    """Data source that reads raw, frame and monitoring data from a SQL database.

    Every query is executed through ``pl.read_database_uri`` against ``uri``.
    Loading and saving of derived and coherent-noise data is delegated to a
    ``DeltaSource`` built from ``derived_dir``.

    NOTE(review): the queries use ``array_agg(DISTINCT ... ORDER BY ...)`` and
    ``smallint[]`` casts, which look PostgreSQL-specific -- confirm before
    pointing this at another backend.
    """

    def __init__(self, uri: str, derived_dir: Optional[Path] = None) -> None:
        """Store the connection URI and set up the derived-data backend.

        Args:
            uri: Database connection URI passed to ``pl.read_database_uri``.
            derived_dir: Directory for derived data. When given and missing,
                it is created (parents must already exist) with mode 0o775.
        """
        self.uri = uri
        self.derived_dir = derived_dir

        self.delta_source = DeltaSource(None, derived_dir)

        if derived_dir is not None and not derived_dir.exists():
            derived_dir.mkdir(exist_ok=True)
            # Explicit chmod because the mode given to mkdir is subject to the umask.
            os.chmod(derived_dir, 0o775)

    @staticmethod
    def _sql_str(value: object) -> str:
        """Return ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so that a value such as ``a'b``
        cannot terminate the literal early and change the query.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def load_raw_data(
        self,
        *run_numbers: int,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
        ignore_boards: List[str] = [],
    ) -> pl.DataFrame:
        """Load per-channel sample data joined with run/measurement/board metadata.

        Args:
            *run_numbers: Runs to load.
            require_positive: Adds the filter ``samples_min > 0``.
            require_unsaturated: Adds the filter ``samples_max < 2**15``.
            require_nonempty: Adds the filter ``samples_len > 0``.
            ignore_boards: Board ids to exclude (never mutated here).

        Returns:
            DataFrame sorted by ``run_number``, ``channel``, ``gain`` with
            ``measurement_number``/``measurement_timestamp`` renamed to
            ``measurement``/``timestamp`` and the ``id`` column dropped.
        """
        log.info("Loading raw data from SQLSource")

        run_number_string = ",".join(f"'{r}'" for r in run_numbers)
        ignore_boards_string = ",".join(self._sql_str(b) for b in ignore_boards)

        # Merge alfe_mode and hps gain into pas_mode, preferring alfe_mode
        query = f"""
            SELECT
                m.*,
                s.channel,
                s.gain,
                s.is_pulsed,
                s.samples,
                b.board_type as board_variant,
                b.board_version as board_version,
                r.githash,
                COALESCE(m.alfe_mode, m.hps_ps_gain) AS pas_mode
            FROM runs r
            JOIN measurements m
                ON r.run_number = m.run_number
            JOIN samples s
                ON m.id = s.measurement_id
            JOIN boards b
                ON m.board_id = b.board_id
            WHERE m.run_number in ({run_number_string})
        """
        if len(ignore_boards) > 0:
            log.warning(f"Will skip loading boards {ignore_boards}")
            query += f"""
            AND b.board_id NOT IN ({ignore_boards_string})
            """

        if require_positive:
            query += "\nAND samples_min > 0"
        if require_unsaturated:
            query += f"\nAND samples_max < {2**15}"
        if require_nonempty:
            query += "\nAND samples_len > 0"

        df = pl.read_database_uri(query, self.uri, partition_on="channel", partition_num=16)
        df = df.rename({"measurement_number": "measurement", "measurement_timestamp": "timestamp"}).drop("id")
        # Because the data is being loaded from multiple threads, sorting must be done outside of the SQL query
        df = df.sort(by=["run_number", "channel", "gain"])
        return df

    def load_derived_data(self, run_number: int, meas_type: str) -> pl.DataFrame:
        """Return derived data for ``run_number``/``meas_type`` from ``self.delta_source``."""
        return self.delta_source.load_derived_data(run_number, meas_type)

    def load_coherent_noise_data(self, run_number: int) -> pl.DataFrame:
        """Return coherent-noise data for ``run_number`` from ``self.delta_source``."""
        return self.delta_source.load_coherent_noise_data(run_number)

    def load_frame_data(
        self, *run_numbers: int, reject_single_adc: bool = False, non_empty: bool = True
    ) -> pl.DataFrame:
        """Load frame data joined with measurement metadata and FELIX headers.

        Args:
            *run_numbers: Runs to load.
            reject_single_adc: Adds the filter ``frame_max < 1e5``.
            non_empty: Adds the filter ``frame_len != 0``.

        Returns:
            DataFrame pivoted on ``channel`` (channels ``1`` and ``8`` become
            columns ``frame1`` and ``frame8``), sorted by ``run_number``,
            ``measurement``, ``adc``.
        """
        run_number_string = ",".join(str(r) for r in run_numbers)
        query = f"""
            SELECT
                m.*,
                f.adc,
                f.channel,
                f.frame,
                h.felix_event_count,
                CAST(h.felix_bcid AS smallint[]) AS felix_bcid
            FROM runs r
            JOIN measurements m
                ON r.run_number = m.run_number
            JOIN frames f
                ON m.id = f.measurement_id
            JOIN felix_headers h
                ON m.id = h.measurement_id
            WHERE r.run_number in ({run_number_string})
        """
        if reject_single_adc:
            query += f"\nAND frame_max < {1e5}"
        if non_empty:
            query += "\nAND frame_len != 0"
        df = pl.read_database_uri(query, self.uri, partition_on="adc", partition_num=16)
        df = (
            df.drop("id")
            .pivot(on="channel", values="frame")
            .rename(
                {
                    "1": "frame1",
                    "8": "frame8",
                    "measurement_number": "measurement",
                    "measurement_timestamp": "timestamp",
                }
            )
        )
        # Partitioned loading gives no ordering guarantee, so sort on the client side.
        return df.sort(by=["run_number", "measurement", "adc"])

    def save_derived_data(self, derived_data: pl.DataFrame, run_number: int, meas_type: str) -> None:
        """Persist ``derived_data`` through ``self.delta_source``."""
        self.delta_source.save_derived_data(derived_data, run_number, meas_type)

    def save_coherent_noise_data(self, coherent_noise_data: pl.DataFrame, run_number: int) -> None:
        """Persist ``coherent_noise_data`` through ``self.delta_source``."""
        self.delta_source.save_coherent_noise_data(coherent_noise_data, run_number)

    def check_run_exists(self, run_number: int) -> bool:
        """Return ``True`` if the ``runs`` table has a row for ``run_number``."""
        query = f"SELECT run_number FROM runs WHERE run_number = {run_number}"
        df = pl.read_database_uri(query, self.uri)
        return not df.is_empty()

    def check_measurement_exists(self, run_number: int, measurement: int) -> bool:
        """Return ``True`` if ``measurements`` has a row for this run and measurement number."""
        query = f"""
            SELECT run_number
            FROM measurements
            WHERE run_number = {run_number}
            AND measurement_number = {measurement}
        """
        df = pl.read_database_uri(query, self.uri)
        return not df.is_empty()

    def get_runs_summary(self) -> pl.DataFrame:
        """Return one row per (run, timestamp, meas_type, pas_mode), newest run first.

        ``board_id``, ``att_val`` and ``channel`` are aggregated into sorted
        arrays of distinct values.
        """
        # Merge alfe_mode and hps gain into pas_mode, preferring alfe_mode
        query = """
            SELECT DISTINCT
                r.run_number,
                array_agg(DISTINCT m.board_id ORDER BY m.board_id) AS board_id,
                r.run_timestamp AS timestamp,
                m.meas_type,
                array_agg(DISTINCT m.att_val ORDER BY m.att_val) AS att_val,
                COALESCE(m.alfe_mode, m.hps_ps_gain) AS pas_mode,
                array_agg(DISTINCT s.channel ORDER BY s.channel) AS channel
            FROM runs r
            JOIN measurements m
                ON r.run_number = m.run_number
            JOIN samples s
                ON m.id = s.measurement_id
            GROUP BY r.run_number, r.run_timestamp, m.meas_type, COALESCE(m.alfe_mode, m.hps_ps_gain)
            ORDER BY r.run_number DESC
        """
        return pl.read_database_uri(query, self.uri)

    def get_boards_summary(self) -> pl.DataFrame:
        """Return the distinct (run_number, board_id, timestamp) rows, newest run first."""
        query = """
            SELECT DISTINCT
                r.run_number,
                m.board_id,
                r.run_timestamp as timestamp
            FROM runs r
            JOIN measurements m
                USING (run_number)
            ORDER BY run_number DESC
        """
        return pl.read_database_uri(query, self.uri)

    def get_boards_list(self, run_number: Optional[int] = None, meas_type: Optional[str] = None) -> pl.DataFrame:
        """Return distinct boards (id, version, type) that have measurements.

        Args:
            run_number: When given, restrict to boards measured in this run.
            meas_type: When given, restrict to this measurement type.
        """
        query = """
            SELECT DISTINCT
                boards.board_id,
                board_version,
                board_type
            FROM boards
            JOIN measurements m
                ON boards.board_id = m.board_id
        """

        # Collect the optional filters first so WHERE/AND are placed correctly
        # whichever combination of arguments is supplied.
        conditions = []
        if run_number is not None:
            conditions.append(f"m.run_number = {run_number}")
        if meas_type is not None:
            conditions.append(f"meas_type = {self._sql_str(meas_type)}")
        if conditions:
            query += "\nWHERE " + "\nAND ".join(conditions)

        query += """
            ORDER BY boards.board_id
        """

        return pl.read_database_uri(query, self.uri)

    def get_runs_list(self) -> pl.DataFrame:
        """Return the distinct (run_number, meas_type) pairs ordered by run number."""
        query = """
            SELECT DISTINCT
                run_number,
                meas_type
            FROM measurements
            ORDER BY run_number
        """
        return pl.read_database_uri(query, self.uri)

    def get_channels_list(
        self,
        run_number: int,
        meas_type: str,
        require_pulsed: bool = False,
        require_positive: bool = False,
        require_unsaturated: bool = False,
        require_nonempty: bool = True,
    ) -> pl.DataFrame:
        """Return the distinct channels with samples for a run and measurement type.

        Args:
            run_number: Run to inspect.
            meas_type: Measurement type to match.
            require_pulsed: Adds the filter ``s.is_pulsed``.
            require_positive: Adds the filter ``samples_min > 0``.
            require_unsaturated: Adds the filter ``samples_max < 2**15``.
            require_nonempty: Adds the filter ``samples_len > 0``.
        """
        query = f"""
            SELECT DISTINCT s.channel
            FROM measurements m
            JOIN samples s
                ON m.id = s.measurement_id
            WHERE m.run_number = {run_number}
            AND m.meas_type = {self._sql_str(meas_type)}
        """

        if require_pulsed:
            query += "\nAND s.is_pulsed"
        if require_positive:
            query += "\nAND samples_min > 0"
        if require_unsaturated:
            query += f"\nAND samples_max < {2**15}"
        if require_nonempty:
            query += "\nAND samples_len > 0"

        query += "\nORDER BY channel"

        return pl.read_database_uri(query, self.uri)

    def get_run_boards_list(
        self,
        run_number: int,
        meas_type: str,
    ) -> pl.DataFrame:
        """Return the distinct board ids measured in ``run_number`` with ``meas_type``."""
        query = f"""
            SELECT DISTINCT board_id
            FROM measurements
            WHERE run_number = {run_number}
            AND meas_type = {self._sql_str(meas_type)}
            ORDER BY board_id
        """

        return pl.read_database_uri(query, self.uri)

    def get_amplitudes_list(self, run_number: int, meas_type: str) -> pl.DataFrame:
        """Return the unique values of the ``amp`` column of the derived data."""
        return self.load_derived_data(run_number, meas_type).select(pl.col("amp").unique())

    def load_monitoring_data(self, *run_numbers: int) -> pl.DataFrame:
        """Load monitor values with their monitor metadata for the given runs.

        The ``timestamp`` column returned by the database is tagged as
        ``America/New_York`` and then converted to UTC.
        """
        run_number_string = ",".join(str(r) for r in run_numbers)
        query = f"""
            SELECT
                mon.monitor,
                mon.unit,
                mon.monitor_type,
                mon.group_name,
                mon.ideal_value,
                v.value,
                v.run_number,
                v.monitoring_timestamp AS timestamp,
                v.board_id,
                meas.measurement_number AS measurement
            FROM monitors mon
            JOIN monitor_values v
                ON mon.id = v.monitor_id
            LEFT OUTER JOIN measurements meas
                ON v.measurement_id = meas.id
            WHERE v.run_number IN ({run_number_string})
        """
        data = pl.read_database_uri(query, self.uri)
        data_with_timezone = data.with_columns(
            pl.col("timestamp").dt.replace_time_zone("America/New_York").dt.convert_time_zone("UTC")
        )
        return data_with_timezone

    def get_measurements_list(self, run_number: int) -> List[int]:
        """Return the sorted distinct measurement numbers of ``run_number``."""
        query = f"""
            SELECT DISTINCT measurement_number AS measurement
            FROM measurements
            WHERE run_number = {run_number}
            ORDER BY measurement
        """
        return pl.read_database_uri(query, self.uri).to_series().to_list()

    def get_measurement_types(self, run_number: int, measurement: Optional[int] = None) -> List[str]:
        """Return the distinct measurement types of a run.

        Args:
            run_number: Run to inspect.
            measurement: When given, restrict to this measurement number.
        """
        query = f"""
            SELECT DISTINCT meas_type
            FROM measurements
            WHERE run_number = {run_number}
        """

        if measurement is not None:
            query += f"\nAND measurement_number = {measurement}"

        measurement_types = pl.read_database_uri(query, self.uri).to_series().to_list()

        return measurement_types

    def get_bad_samples_check(self, run_number: int) -> pl.DataFrame:
        """Return board, channel, gain and the min/max/len sample statistics for a run."""
        query = f"""
            SELECT
                m.board_id,
                s.channel,
                s.gain,
                s.samples_min,
                s.samples_max,
                s.samples_len
            FROM measurements m
            JOIN samples s
                ON m.id = s.measurement_id
            WHERE m.run_number = {run_number}
        """
        return pl.read_database_uri(query, self.uri)

    def load_lab_environment_data(self) -> pl.DataFrame:
        """Return all rows of ``lab_environment`` with ``datetime_utc`` renamed to ``timestamp``."""
        query = """
            SELECT datetime_utc AS timestamp, lab_name, humidity, pressure, lab_temp, crate_temp
            FROM lab_environment
        """
        return pl.read_database_uri(query, self.uri)

    def get_middle_attenuation_run(self, run_numbers: List[int]) -> int:
        """Return a run from ``run_numbers`` whose ``att_val`` is in ``MIDDLE_ATTENUATIONS``.

        When several runs qualify, the lowest run number is returned so the
        result is reproducible. If none qualifies, an error is logged and the
        process exits with status 1.
        """
        run_number_string = ",".join(str(r) for r in run_numbers)
        attenuations_string = ", ".join(str(a) for a in MIDDLE_ATTENUATIONS)
        # ORDER BY makes LIMIT 1 deterministic; without it the database may
        # return any of the matching runs.
        query = f"""
            SELECT DISTINCT
                run_number
            FROM measurements
            WHERE att_val IN ({attenuations_string})
            AND run_number IN ({run_number_string})
            ORDER BY run_number
            LIMIT 1
        """
        df = pl.read_database_uri(query, self.uri)
        if df.is_empty():
            log.error(f"Did not find a run with att_val in {MIDDLE_ATTENUATIONS} in runs {run_numbers}")
            sys.exit(1)
        middle_att_run_number = df["run_number"][0]
        return middle_att_run_number

Coverage for polars_analysis / data_sources.py: 67%

533 statements