Coverage for polars_analysis / analysis / pulse_analysis.py: 91%

309 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-13 13:37 -0400

1import logging 

2from contextlib import suppress 

3from pathlib import Path 

4from typing import List, Optional, Tuple 

5 

6import numpy as np 

7import polars as pl 

8from scipy import linalg 

9from scipy.optimize import curve_fit 

10from scipy.signal import find_peaks 

11 

12from polars_analysis.analysis import constants 

13from polars_analysis.plotting import helper 

14 

15# Instantiate logger 

16log = logging.getLogger(__name__) 

17 

18""" 

19Functions to calculate derived values for pulse runs. 

20""" 

21 

22 

def expr_awg_amp_to_amp(col1: str = "awg_amp", col2: str = "att_val") -> pl.Expr:
    """
    Convert AWG amplitude to amplitude in mA.

    The attenuation column is treated as dB, so the linear scale factor
    is 10^(-att / 20).

    Args:
        col1: Column name of AWG amplitude.
        col2: Column name of attenuation value.

    Returns:
        Expression to calculate amplitude in mA.
    """
    attenuation_factor = 10 ** (pl.col(col2) / -20.0)
    return 4.0 * pl.col(col1) * attenuation_factor

35 

36 

def expr_max_pulse_amp(col: str = "samples") -> pl.Expr:
    """
    Get maximum pulse amplitude.

    The amplitude is measured relative to the median of the samples,
    which acts as the baseline level.

    Args:
        col: Column name of pulse samples.

    Returns:
        Expression to calculate maximum pulse amplitude.
    """
    pulse = pl.col(col)
    return pulse.list.max() - pulse.list.median()

48 

49 

def expr_max_phase_indices(mean_interleaved_pulse: str = "mean_interleaved_pulse", phase_shift: int = 0) -> pl.Expr:
    """
    Get indices of samples around the pulse peak, shifted by a phase.
    Uses the maximum amplitude pulse per channel and gain, to mirror how the OFCs are derived.

    Args:
        mean_interleaved_pulse: Column name of mean interleaved pulse.
        phase_shift: Phase shift in samples.

    Returns:
        Expression to calculate indices of samples around the maximum pulse amplitude.
    """
    # Index of the pulse peak, taken from the max-awg_amp row of each (channel, gain) group.
    peak_index = (
        pl.col(mean_interleaved_pulse)
        .list.arg_max()
        .filter(pl.col("awg_amp") == pl.col("awg_amp").max())
        .first()
        .over(["channel", "gain"])
    )
    # Wrap indices around the interleaved train length.
    train_length = constants.PULSES_PER_TRAIN * constants.SAMPLES_PER_PULSE
    offsets = [-2, -1, 0, 1, 2]
    return pl.concat_list(
        [(peak_index + phase_shift + (constants.N_PHASES * offset)) % train_length for offset in offsets]
    )

78 

79 

def per_ch_interleaving_util(
    all_pulses: np.ndarray,
    trimmed_width: int,
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Utility function for pipe_samples_interleaved below.

    Locates the pulse peaks in the mean pulse train and derives the (start, end)
    sample indices used to trigger the raw samples for interleaving. Up to three
    detection passes are attempted: 3-sigma threshold, 2-sigma threshold, and
    finally 3-sigma with one extra pulse appended (to catch a first peak cut by
    the sampling window).

    Args:
        all_pulses: 1D array of raw samples; length must be a multiple of
            constants.PULSE_TRAIN_PERIOD.
        trimmed_width: Number of samples kept (a multiple of PULSE_TRAIN_PERIOD).

    Returns:
        (start, end) trigger indices, or (None, None) if no peaks were found.
    """
    # Require peaks be at least SAMPLES_PER_PULSE away from each other, with 5 samples of wiggle room
    # Based on some testing with large pulses 1 or 2 samples should be fine, but I wanted to be safe
    PEAK_SPACING_PADDING = 5

    def _find_peaks_above(train: np.ndarray, threshold: float) -> np.ndarray:
        # Peak detection with the shared minimum-spacing constraint.
        return find_peaks(
            train,
            height=threshold,
            distance=constants.SAMPLES_PER_PULSE - PEAK_SPACING_PADDING,
        )[0]

    def _keep_highest(train: np.ndarray, peaks: np.ndarray) -> np.ndarray:
        # Keep (up to) the PULSES_PER_TRAIN highest peaks, returned in time order.
        return np.sort(peaks[np.argsort(train[peaks])][-constants.PULSES_PER_TRAIN :])

    mean_pulse_train = all_pulses.reshape([-1, constants.PULSE_TRAIN_PERIOD]).mean(axis=0)

    # Pass 1: 3-sigma threshold.
    threshold = np.median(mean_pulse_train) + 3 * np.std(mean_pulse_train)
    peaks: np.ndarray = _find_peaks_above(mean_pulse_train, threshold)
    if len(peaks) > constants.PULSES_PER_TRAIN:
        log.warning(
            (
                f"Found more than {constants.PULSES_PER_TRAIN} possible "
                f"pulses, will use {constants.PULSES_PER_TRAIN} highest peaks"
            )
        )
        peaks = _keep_highest(mean_pulse_train, peaks)
    if len(peaks) < constants.PULSES_PER_TRAIN:
        # Pass 2: relax the threshold to 2 sigma.
        log.warning(f"Found {len(peaks)} peaks, retrying with lower threshold")
        threshold = np.median(mean_pulse_train) + 2 * np.std(mean_pulse_train)
        peaks = _keep_highest(mean_pulse_train, _find_peaks_above(mean_pulse_train, threshold))
    if len(peaks) < constants.PULSES_PER_TRAIN:
        # Sometimes we get unlucky and sample on the falling edge of the first peak in a train.
        # We can add one pulse to the end of the train to let us find the first peak
        mean_pulse_train = np.concatenate([mean_pulse_train, mean_pulse_train[: constants.SAMPLES_PER_PULSE]])
        log.warning(f"Found {len(peaks)} peaks after lowering threshold, retrying with extra first pulse")
        threshold = np.median(mean_pulse_train) + 3 * np.std(mean_pulse_train)
        peaks = _keep_highest(mean_pulse_train, _find_peaks_above(mean_pulse_train, threshold))

    start, end = None, None
    if len(peaks) > 0:
        # Wrap the first peak to the end so np.diff can see the inter-train gap.
        peaks = np.append(peaks, peaks[0] + constants.PULSE_TRAIN_PERIOD)
        # Offset by PEAK_INDEX to make sure pulse peak is in the right spot
        start = (peaks[np.diff(peaks).argmax() + 1] - constants.PEAK_INDEX) % constants.PULSE_TRAIN_PERIOD
        # # This lines up with the previous version, but I'm not sure why that timing was chosen
        # # The new timing also fixes the OFC samples plot, so maybe the old was a mistake
        # start = (peaks[np.diff(peaks).argmax() + 1] - 8) % constants.PULSE_TRAIN_PERIOD
        end = start + constants.PULSE_TRAIN_PERIOD * ((trimmed_width // constants.PULSE_TRAIN_PERIOD) - 1)
    else:
        log.error("No peaks found for interleaving!")

    return (start, end)

141 

142 

def pipe_samples_interleaved(
    df: pl.DataFrame, samples: str = "samples", is_reference_pulse: Optional[str] = None
) -> pl.DataFrame:
    """
    Trigger and interleave pulse samples, producing series of pulses that are SAMPLES_PER_PULSE * N_PHASES long.
    Adds "samples_interleaved", "mean_interleaved_pulse" and "samples_baseline"
    columns to the DataFrame.

    Args:
        df: DataFrame.
        samples: Column name of pulse samples.
        is_reference_pulse: Optional name of a boolean column marking the reference
            pulse (cross-talk runs). When given, trigger indices are derived once
            from the reference pulse and reused for every row; otherwise they are
            derived per row.

    Returns:
        DataFrame with 2D array of interleaved pulse samples in "samples_interleaved" column,
        the mean of all interleaved pulses in "mean_interleaved_pulse" column,
        and 2D array of baseline samples (in between pulse trains) in "samples_baseline" column.
    """
    # Use the shortest sample list so every row can be trimmed to the same width.
    width = df.select(pl.col(samples).list.len().min()).item()
    num_trains = width // constants.PULSE_TRAIN_PERIOD
    # num_trains - 1 because triggering discards (up to) one train's worth of samples.
    samples_interleaved = np.zeros([len(df), num_trains - 1, constants.SAMPLES_PER_PULSE * constants.PULSES_PER_TRAIN])
    samples_baseline = np.zeros([len(df), num_trains - 1, constants.TRIGGER_OFFSET])
    # If you want to save the interleaved samples without the baseline subtracted
    # samples_interleaved_raw = np.zeros(
    #     [len(df), num_trains - 1, constants.SAMPLES_PER_PULSE * constants.PULSES_PER_TRAIN]
    # )

    trimmed_width = constants.PULSE_TRAIN_PERIOD * num_trains
    samples_array = df[samples].list.slice(0, trimmed_width).list.to_array(trimmed_width).to_numpy()

    ref_start, ref_end = None, None
    if is_reference_pulse:
        # The reference pulse for cross talk runs
        reference_samples = (  # noqa F841
            df.filter(pl.col(is_reference_pulse))
            .sort(pl.col(samples).list.max())  # Need to pick high gain
            .select(pl.last(samples))[samples]
            .list.slice(0, trimmed_width)
            .list.to_array(trimmed_width)
            .to_numpy()
        )
        ref_start, ref_end = per_ch_interleaving_util(reference_samples, trimmed_width)

    for i, all_pulses in enumerate(samples_array):
        if is_reference_pulse:
            start, end = ref_start, ref_end
        else:
            start, end = per_ch_interleaving_util(all_pulses, trimmed_width)

        if start is None and end is None:
            # NOTE: with start/end both None the slice below degenerates to the
            # whole array; a shape mismatch on assignment is swallowed by the
            # suppress(ValueError) block, leaving zeros for this row.
            log.warning(f"No start and end found for index {i}")
            log.warning(f"{df['channel', 'gain', 'awg_amp'][i]}")

        # Samples triggered starts right at the peak, mainly used for autocorrelation
        triggered_pulses = all_pulses[start:end].reshape([-1, constants.PULSE_TRAIN_PERIOD])
        # Tail of each train (past the pulses) is the between-train baseline.
        baseline_samples = triggered_pulses[:, constants.SAMPLES_PER_PULSE * constants.PULSES_PER_TRAIN :]
        interleaved_pulses = triggered_pulses[:, constants.TIME_INDICES_SORTED] - baseline_samples.mean()
        with suppress(ValueError):
            samples_interleaved[i] = interleaved_pulses
            samples_baseline[i] = baseline_samples

        # samples_interleaved_raw[i] = triggered_pulses[:, constants.TIME_INDICES_SORTED]

    # Average over trains to get one mean interleaved pulse per row.
    mean_interleaved_pulse = np.mean(samples_interleaved, axis=1)

    df = df.with_columns(
        pl.Series(name="samples_interleaved", values=samples_interleaved, dtype=pl.List(pl.List(pl.Float64))),
        pl.Series(name="mean_interleaved_pulse", values=mean_interleaved_pulse, dtype=pl.List(pl.Float64)),
        pl.Series(name="samples_baseline", values=samples_baseline),
        # pl.Series(name="samples_interleaved_raw", values=samples_interleaved_raw),
    )

    # # Replace samples_interleaved with one where the baseline is determined once per channel, not per measurement
    # df = df.join(
    #     df.filter(pl.col("awg_amp") == pl.col("awg_amp").max().over("gain")).with_columns(
    #         pl.col("samples_baseline")
    #         .arr.to_list()
    #         .list.eval(pl.element().arr.to_list().list.mean())
    #         .list.mean()
    #         .alias("max_amp_baseline_mean")
    #     )["run_number", "channel", "gain", "max_amp_baseline_mean"],
    #     on=["run_number", "channel", "gain"],
    #     how="left",
    # )
    # df = df.with_columns(
    #     (pl.col("samples_interleaved_raw") - pl.col("max_amp_baseline_mean")).alias("samples_interleaved")
    # ).drop("max_amp_baseline_mean")

    return df

232 

233 

def calc_autocorr_along_axis(x: np.ndarray) -> np.ndarray:
    """
    Helper function to calculate autocorrelation along an axis.
    TODO: Incorporate this into calc_autocorr.

    Args:
        x: Array of samples.

    Returns:
        Autocorrelation of the samples (non-negative lags only).
    """
    # Full correlation is symmetric; keep the second half (lags >= 0).
    full_corr = np.correlate(x, x, mode="full")
    midpoint = full_corr.size // 2
    return full_corr[midpoint:]

247 

248 

def baseline_calc_autocorr(trains: np.ndarray) -> np.ndarray:
    """
    Helper function to calculate autocorrelation of baseline samples.
    Only uses the last 500 samples of each baseline, where the ADC response has returned to baseline.

    Args:
        trains: Array of baseline samples.

    Returns:
        Mean of the autocorrelations of the baseline samples.
    """
    # astype(float) copies, so the caller's array is never modified.
    tail = trains[:, -500:].astype(float)
    tail -= np.nanmean(tail)

    def _autocorr(row: np.ndarray) -> np.ndarray:
        # Full autocorrelation of one baseline; keep non-negative lags only.
        full = np.correlate(row, row, mode="full")
        return full[full.size // 2 :]

    # Autocorrelate the first 60 samples of each train, then average across trains.
    per_train = np.apply_along_axis(_autocorr, 1, tail[:, :60])
    return np.nanmean(per_train, axis=0)[:60]

265 

266 

def pipe_OFCs(
    df: pl.DataFrame,
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
    max_phase_indices: str = "max_phase_indices",
    amp: str = "amp",
    is_crosstalk: bool = False,
    quantile: float = 1.0,
) -> pl.DataFrame:
    """
    Calculate Optimal Filter Coefficients (OFCs) for pulse samples.
    Returns DataFrame with "autocorr", "OFCs_a", "OFCs_b", and "OFC_amp" columns.

    Args:
        df: DataFrame.
        mean_interleaved_pulse: Column name of mean interleaved pulse.
        max_phase_indices: Column name of indices around the maximum pulse amplitude.
        amp: Column name of pulse amplitude.
        is_crosstalk: If True, derive OFCs from the reference pulse only and join
            the result back on gain alone (one reference per gain).
        quantile: Amplitude quantile (per channel and gain) used to pick the row
            the OFCs are derived from; 1.0 means the maximum amplitude.

    Returns:
        DataFrame with OFCs and autocorrelation.
    """
    if is_crosstalk:
        # Only one row per channel and gain with max amplitude, and must be a reference pulse
        ofc_df = df.filter(pl.col(amp) == pl.col(amp).max().over(["channel", "gain"]), pl.col("is_reference_pulse"))
        join_columns = ["gain"]
        if len(ofc_df) > 2:
            log.warning(f"Found {len(ofc_df)} references pulses, only 2 (1 per gain) are supported for crosstalk runs")
    else:
        # One row per channel and gain with max amplitude
        # Code to use Nth quantile amplitude
        ofc_df = df.filter(pl.col(amp) == pl.quantile(amp, quantile).over(["channel", "gain"]))
        log.debug(f"Amp used for OFCs = {ofc_df['channel', 'gain', 'amp', 'awg_amp']}")
        # Code to use max amplitude
        # ofc_df = df.filter(pl.col(amp) == pl.col(amp).max().over(["channel", "gain"]))

        join_columns = ["channel", "gain"]

    # Guard against quantile/filter ties producing more than one row per (channel, gain).
    ofc_df = ofc_df.unique(subset=["channel", "gain"], keep="first", maintain_order=True)
    updated_rows = []

    for channel in ofc_df.select("channel").unique().to_series():
        channel_df = ofc_df.filter(pl.col("channel") == channel)

        # One list of coefficients per phase, per gain; empty list means "not computed yet".
        a_coeffs: dict[str, List[List[float]]] = {
            "hi": [[] for _ in range(constants.N_PHASES)],
            "lo": [[] for _ in range(constants.N_PHASES)],
        }
        b_coeffs: dict[str, List[List[float]]] = {
            "hi": [[] for _ in range(constants.N_PHASES)],
            "lo": [[] for _ in range(constants.N_PHASES)],
        }
        autocorr: dict[str, List[float]] = {"hi": [], "lo": []}

        for row in channel_df.iter_rows(named=True):
            # Noise autocorrelation from the between-train baseline samples.
            autocorrelation = baseline_calc_autocorr(np.array(row["samples_baseline"]))
            autocorr[row["gain"]] = autocorrelation.tolist()
            max_peak = np.max(np.array(row[mean_interleaved_pulse])[row[max_phase_indices]])

            for i in range(constants.N_PHASES):
                phase_shift = i + constants.PHASE_SHIFT0
                indices = np.asarray(row[max_phase_indices]) + phase_shift
                # Only the first 5 autocorrelation lags are used (5 samples per OFC).
                ofcs = calc_OFCs(
                    autocorrelation[:5],
                    np.array(row[mean_interleaved_pulse])[indices],
                    np.gradient(row[mean_interleaved_pulse])[indices],
                    max_peak,
                )
                a_coeffs[row["gain"]][i] = ofcs[0].tolist()
                b_coeffs[row["gain"]][i] = ofcs[1].tolist()

            updated_rows.append(
                {
                    "channel": row["channel"],
                    "gain": row["gain"],
                    "autocorr": autocorr[row["gain"]],
                    "OFCs_a": a_coeffs[row["gain"]],
                    "OFCs_b": b_coeffs[row["gain"]],
                    "OFC_amp": row["amp"],
                }
            )

        # Sanity check: both gains should have coefficients for every phase.
        for i in range(constants.N_PHASES):
            phase_shift = i + constants.PHASE_SHIFT0
            if a_coeffs["lo"][i] == []:
                log.error(f"Failed to compute lo gain OFCs for channel {channel} phase shift {phase_shift}!")
                # Uncomment these to get around error
                # a_coeffs["lo"][i] = a_coeffs["hi"][i]
                # b_coeffs["lo"][i] = b_coeffs["hi"][i]

            if a_coeffs["hi"][i] == []:
                log.error(f"Failed to compute hi gain OFCs for channel {channel} phase shift {phase_shift}!")
                # Uncomment these to get around error
                # a_coeffs["hi"][i] = a_coeffs["lo"][i]
                # b_coeffs["hi"][i] = b_coeffs["lo"][i]

    return df.join(
        pl.DataFrame(updated_rows, schema_overrides={"OFC_amp": pl.Float32}), on=join_columns, how="left"
    ).drop("channel_right", strict=False)

365 

366 

367def calc_OFCs( 

368 autocorrelation: np.ndarray, samples: np.ndarray, gradient: np.ndarray, max_peak: float 

369) -> Tuple[np.ndarray, np.ndarray]: 

370 """ 

371 Helper function to calculate Optimal Filter Coefficients (OFCs) for pulse samples. 

372 Uses method and notation described in the Cleland & Stern paper, 

373 "Signal processing considerations for liquid ionization calorimeters in a high rate environment". 

374 

375 Args: 

376 autocorrelation: Autocorrelation of the pulse samples. 

377 samples: Pulse samples. 

378 

379 Returns: 

380 Tuple of OFCs a and b for the pulse samples. 

381 """ 

382 autocorrelation = np.ravel(autocorrelation) / autocorrelation[0] 

383 samples = np.ravel(samples) 

384 gradient = np.ravel(gradient) 

385 # scale = max(samples) 

386 samples /= max_peak 

387 gradient /= max_peak 

388 # calculate V = R^{-1}. 

389 inv_ac = linalg.inv(linalg.toeplitz(autocorrelation)) 

390 # calculate V*g and V*dg only once. 

391 vg = np.dot(inv_ac, samples) 

392 vdg = np.dot(inv_ac, gradient) 

393 # calculate helper variables 

394 q1 = np.dot(samples, vg) 

395 q2 = np.dot(gradient, vdg) 

396 q3 = np.dot(gradient, vg) 

397 delta = q1 * q2 - q3 * q3 

398 # calculate Lagrange multipliers 

399 lm_lambda = q2 / delta 

400 lm_kappa = -q3 / delta 

401 lm_mu = q3 / delta 

402 lm_rho = -q1 / delta 

403 # calculate optimal filter coefficients 

404 a_coeffs = lm_lambda * vg + lm_kappa * vdg 

405 b_coeffs = lm_mu * vg + lm_rho * vdg 

406 return a_coeffs, b_coeffs 

407 

408 

def pipe_apply_OFCs(
    df: pl.DataFrame,
    samples_interleaved: str = "samples_interleaved",
    max_phase_indices: str = "max_phase_indices",
    OFCs_a: str = "OFCs_a",
    OFCs_b: str = "OFCs_b",
    all_phases: bool = False,
) -> pl.DataFrame:
    """
    Applies Optimal Filter Coefficients (OFCs) to pulse samples.
    Returns DataFrame with "energies", "times", "energy_mean", "energy_std", "time_mean", and "time_std" columns.

    Args:
        df: DataFrame.
        samples_interleaved: Column name of interleaved pulse samples.
        max_phase_indices: Column name of indices around the maximum pulse amplitude.
        OFCs_a: Column name of a OFCs.
        OFCs_b: Column name of b OFCs.
        all_phases: If False, statistics are computed only for the peak-phase
            OFCs (constants.PHASE_SHIFT_PEAK); if True, across all phases.

    Returns:
        DataFrame with energies, times, and statistics.
    """

    log.debug("Applying OFCs")
    # Calculate energies for all 30 phases of OFCs
    # The nested explode/group_by pairs walk down (row -> train -> phase) and
    # then re-aggregate back up, so "energies" ends up nested per row/train/phase.
    energies = (
        df.lazy()
        .with_row_index("index0")  # per dataframe row
        .with_columns(pl.arange(0, constants.N_PHASES).implode().alias("phase"))
        .explode(samples_interleaved)
        .with_row_index("index1")  # per pulse train
        .explode(OFCs_a, "phase")
        .select(
            "index0",
            "index1",
            OFCs_a,
            pl.col(samples_interleaved).list.gather(
                (pl.col(max_phase_indices) + pl.col("phase") + constants.PHASE_SHIFT0)
                % (constants.SAMPLES_PER_PULSE * constants.PULSES_PER_TRAIN)
            ),
            "run_number",
            "measurement",
            "channel",
            "gain",
        )
        .with_row_index("index2")  # per phase per pulse train
        .explode(OFCs_a, samples_interleaved)
        .group_by("index2", maintain_order=True)
        .agg(
            pl.col("index0").first(),
            pl.col("index1").first(),
            # Energy = dot product of the gathered samples with the a coefficients.
            pl.col(samples_interleaved).dot(OFCs_a).alias("energies"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index2")
        .group_by("index1", maintain_order=True)
        .agg(
            pl.col("index0").first(),
            pl.col("energies"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index1")
        .group_by("index0")
        .agg(
            pl.col("energies"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index0")
        .collect()
    )
    df = df.join(energies, on=["run_number", "measurement", "channel", "gain"])

    # Calculate times for all 30 phases of OFCs
    # Same traversal as above, with the b coefficients; time = (s·b) / energy.
    times = (
        df.lazy()
        .with_row_index("index0")  # per dataframe row
        .with_columns(pl.arange(0, constants.N_PHASES).implode().alias("phase"))
        .explode("energies", samples_interleaved)
        .with_row_index("index1")  # per pulse train
        .explode(OFCs_b, "energies", "phase")
        .select(
            "index0",
            "index1",
            "energies",
            OFCs_b,
            pl.col(samples_interleaved).list.gather(
                (pl.col(max_phase_indices) + pl.col("phase") + constants.PHASE_SHIFT0)
                % (constants.SAMPLES_PER_PULSE * constants.PULSES_PER_TRAIN)
            ),
            "run_number",
            "measurement",
            "channel",
            "gain",
        )
        .with_row_index("index2")  # per phase
        .explode(OFCs_b, samples_interleaved)
        .group_by("index2", maintain_order=True)
        .agg(
            pl.col("index0").first(),
            pl.col("index1").first(),
            pl.col(samples_interleaved).dot(OFCs_b).truediv(pl.col("energies").first()).alias("times"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index2")
        .group_by("index1", maintain_order=True)
        .agg(
            pl.col("index0").first(),
            pl.col("times"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index1")
        .group_by("index0")
        .agg(
            pl.col("times"),
            pl.col("run_number").first(),
            pl.col("measurement").first(),
            pl.col("channel").first(),
            pl.col("gain").first(),
        )
        .drop("index0")
        .collect()
    )
    df = df.join(times, on=["run_number", "measurement", "channel", "gain"])

    # Calculate statistics just for peak pulse
    if not all_phases:
        df = (
            df.lazy()
            .with_columns(
                # Keep only the peak-phase entry of each per-train phase list.
                pl.col("energies").list.eval(pl.element().list.get(constants.PHASE_SHIFT_PEAK)).alias("energies"),
                pl.col("times").list.eval(pl.element().list.get(constants.PHASE_SHIFT_PEAK)).alias("times"),
            )
            .with_columns(
                pl.col(OFCs_a).list.get(constants.PHASE_SHIFT_PEAK).alias(OFCs_a),
                pl.col(OFCs_b).list.get(constants.PHASE_SHIFT_PEAK).alias(OFCs_b),
                pl.col("energies").list.mean().alias("energy_mean"),
                pl.col("energies").list.std().alias("energy_std"),
                pl.col("energies").list.max().alias("energy_max"),
                pl.col("energies").list.min().alias("energy_min"),
            )
            .collect()
        )
    # Calculate statistics across all 30 pulses in a train, rather than just the peak pulse
    else:
        df = (
            df.lazy()
            .with_columns(
                pl.col("energies").list.eval(pl.element().explode()).list.mean().alias("energy_mean"),
                pl.col("energies").list.eval(pl.element().explode()).list.max().alias("energy_max"),
                pl.col("energies").list.eval(pl.element().explode()).list.min().alias("energy_min"),
                pl.col("energies").list.eval(pl.element().explode()).list.std().alias("energy_std"),
            )
            .collect()
        )

    # This paper about Run 1 topoclustering says only times from cells with energy significance > 2
    # are used to calculate cluster timing
    # Refer to Section 4.2.3, "Signal timing": https://arxiv.org/abs/1603.02934
    # NOTE(review): "TRESHOLD" is a typo for "THRESHOLD" (local name only).
    CELL_SIGMA_TRESHOLD = 2
    if not all_phases:
        df = (
            df.drop("times")
            .join(
                df.lazy()
                .select("run_number", "measurement", "channel", "gain", "energies", "times", "energy_std")
                .with_row_index()
                .explode("energies", "times")
                # Keep only times whose energy is significant (> 2 sigma).
                .filter(pl.col("energies").abs() > CELL_SIGMA_TRESHOLD * pl.col("energy_std"))
                .group_by("index")
                .agg(pl.col("run_number", "measurement", "channel", "gain").first(), "times")
                .drop("index")
                .collect(),
                on=["run_number", "measurement", "channel", "gain"],
            )
            .with_columns(
                pl.col("times").list.mean().alias("time_mean"),
                pl.col("times").list.std().alias("time_std"),
            )
        )
    else:
        df = (
            df.drop("times")
            .join(
                df.lazy()
                .select("run_number", "measurement", "channel", "gain", "energies", "times", "energy_std")
                .with_row_index("index0")
                .explode("energies", "times")
                .with_row_index("index1")
                .explode("energies", "times")
                # .filter(pl.col("energies").abs() > CELL_SIGMA_TRESHOLD * pl.col("energy_std"))
                # Null out (instead of filtering) insignificant times so list shapes survive re-aggregation.
                .with_columns(
                    pl.when(pl.col("energies").abs() > CELL_SIGMA_TRESHOLD * pl.col("energy_std"))
                    .then(pl.col("times"))
                    .otherwise(None)
                    .alias("times")
                )
                .group_by("index1", maintain_order=True)
                .agg(pl.col("run_number", "measurement", "channel", "gain", "index0").first(), "times")
                .drop("index1")
                .group_by("index0", maintain_order=True)
                .agg(pl.col("run_number", "measurement", "channel", "gain").first(), "times")
                .drop("index0")
                .collect(),
                on=["run_number", "measurement", "channel", "gain"],
            )
            .lazy()
            .with_columns(
                pl.col("times").list.eval(pl.element().explode()).list.mean().alias("time_mean"),
                pl.col("times").list.eval(pl.element().explode()).list.std().alias("time_std"),
            )
            .collect()
        )

    ### Ignoring times from low energy cells is designed to replicate what ATLAS does,
    ### but some less aggressive ways to discard outlier times are included below.
    # # Drop outlier times
    # df = df.with_columns(
    #     pl.col("times")
    #     .list.eval(
    #         pl.element().filter(
    #             pl.element()
    #             < pl.max_horizontal(
    #                 pl.element().quantile(0.75) + 1.5 * (pl.element().quantile(0.75)-pl.element().quantile(0.25)), 25
    #             ),
    #             pl.element()
    #             > pl.min_horizontal(
    #                 pl.element().quantile(0.25) - 1.5 * (pl.element().quantile(0.75)-pl.element().quantile(0.25)), -25
    #             ),
    #         )
    #     )
    #     .alias("times")
    # )
    # # Drop outlier times and their corresponding energies
    # df = df.select(pl.exclude("energies", "times")).hstack(
    #     df["measurement", "channel", "gain", "energies", "times"]
    #     .explode(["energies", "times"])
    #     .with_columns(
    #         pl.col("times").quantile(0.75).over("measurement", "channel", "gain").alias("Q3"),
    #         pl.col("times").quantile(0.25).over("measurement", "channel", "gain").alias("Q1"),
    #     )
    #     .with_columns((pl.col("Q3") - pl.col("Q1")).alias("IQR"))
    #     .filter(
    #         pl.col("times") < pl.max_horizontal(pl.col("Q3") + 1.5 * pl.col("IQR"), 25),
    #         pl.col("times") > pl.min_horizontal(pl.col("Q1") - 1.5 * pl.col("IQR"), -25),
    #     )
    #     .group_by("measurement", "channel", "gain", maintain_order=True)
    #     .agg("energies", "times")
    #     .select("energies", "times")
    # )

    return df

675 

676 

def pipe_rise_time(
    df: pl.DataFrame,
    samples_interleaved: str = "samples_interleaved",
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
) -> pl.DataFrame:
    """
    Calculate both rise time (from mean pulse) and rise time error (Gaussian sigma from individual pulses).

    Args:
        df: Polars DataFrame.
        samples_interleaved: Column name with list of pulses.
        mean_interleaved_pulse: Column name with mean pulse.

    Returns:
        DataFrame with 'rise_time' and 'rise_time_error' columns.
    """
    log.debug("Calculating rise times")
    FIVE_PERCENT = 0.05
    rise_times: List[float] = []
    rise_time_errors: List[float] = []

    for row in df.iter_rows(named=True):
        # --- Rise Time from Mean Pulse ---
        # Work on the time-reversed pulse so the 5% point can be searched
        # forward from the peak index.
        flipped_pulse = np.flip(row[mean_interleaved_pulse])
        flipped_times = np.flip(constants.INTERLEAVED_TIMES)
        max_i = np.argmax(flipped_pulse)
        pulse_peak_time = flipped_times[max_i]
        flipped_pulse = flipped_pulse[max_i:]

        # Zero out everything above 5% of the peak; the first nonzero entry
        # left is the 5% crossing point.
        low_points = np.copy(flipped_pulse)
        low_points[flipped_pulse > FIVE_PERCENT * flipped_pulse[0]] = 0

        if np.nonzero(low_points)[0].shape[0] == 0:
            # Pulse never drops below 5% of its peak within the window.
            rise_times.append(0.0)
        else:
            five_percent_point = np.nonzero(low_points)[0][0]
            five_percent_time = flipped_times[max_i + five_percent_point]
            rise_times.append(pulse_peak_time - five_percent_time)

        # Rise time error from all pulses
        rise_time_vals = []
        for sample in row[samples_interleaved]:
            flipped = np.flip(sample)
            flipped_times = np.flip(constants.INTERLEAVED_TIMES)
            max_i = np.argmax(flipped)
            peak_time = flipped_times[max_i]
            flipped = flipped[max_i:]

            low_points = np.copy(flipped)
            low_points[flipped > FIVE_PERCENT * flipped[0]] = 0

            if np.nonzero(low_points)[0].shape[0] == 0:
                continue
            else:
                five_percent_point = np.nonzero(low_points)[0][0]
                five_percent_time = flipped_times[max_i + five_percent_point]
                rise_time_vals.append(peak_time - five_percent_time)

        if len(rise_time_vals) == 0:
            # Dummy value so the column stays aligned with rise_times.
            # BUGFIX: this used to do `rise_time_errors = [0 for _ in rise_times]`,
            # which replaced the whole accumulator and zeroed out errors already
            # computed for earlier rows; append one placeholder instead.
            rise_time_errors.append(0.0)
        else:
            hist_bins = np.linspace(min(rise_time_vals), max(rise_time_vals), 25)
            fit_pars = helper.calc_gaussian(rise_time_vals, hist_bins)
            # NOTE(review): assumes fit_pars[1] is the width/error parameter of
            # helper.calc_gaussian — confirm the parameter ordering.
            d_mu = fit_pars[1]
            rise_time_errors.append(d_mu)

    return df.with_columns(
        [
            pl.Series(name="rise_time", values=rise_times),
            pl.Series(name="rise_time_error", values=rise_time_errors, strict=False),
        ]
    )

750 

751 

def pipe_zero_crossing(
    df: pl.DataFrame,
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
    samples_interleaved: str = "samples_interleaved",
) -> pl.DataFrame:
    """
    Calculate falling zero crossing time (after peak) and zero crossing error
    from pulse samples. Adds 'zero_crossing_time' and 'zero_crossing_error'
    columns to the DataFrame.

    Args:
        df: Polars DataFrame.
        mean_interleaved_pulse: Column with mean interleaved pulse.
        samples_interleaved: Column with list of interleaved pulses.

    Returns:
        DataFrame with 'zero_crossing_time' and 'zero_crossing_error' columns.
    """
    log.debug("Calculating zero crossing times")
    zero_crossing_times: list[float] = []
    zero_crossing_errors: list[float] = []
    times = constants.INTERLEAVED_TIMES

    for row in df.iter_rows(named=True):
        # zero crossing time (from mean pulse)
        pulse = row[mean_interleaved_pulse]
        max_i = np.argmax(pulse)
        peak_time = times[max_i]

        for i in range(max_i, len(pulse) - 1):
            if pulse[i] == 0:
                # BUGFIX: report the crossing relative to the peak time, matching
                # the sign-change branch below and the per-sample loop; previously
                # this branch appended the absolute time `round(times[i], 1)`.
                zero_crossing_times.append(round(times[i] - peak_time, 1))
                break
            elif pulse[i] > 0 and pulse[i + 1] < 0:
                # Sign change: pick whichever of the two bracketing samples is
                # closest to zero.
                y1, y2 = pulse[i], pulse[i + 1]
                closest_to_zero = min([y1, y2], key=lambda y: abs(y))
                crossing_idx = i if closest_to_zero == y1 else i + 1
                zero_crossing_time = round(times[crossing_idx] - peak_time, 1)
                zero_crossing_times.append(zero_crossing_time)
                break
        else:
            # No falling zero crossing found after the peak.
            zero_crossing_times.append(np.nan)

        # zero crossing error (from samples)
        zero_crossing_time_row = []
        for sample in row[samples_interleaved]:
            max_i = np.argmax(sample)
            peak_time = times[max_i]

            for i in range(max_i, len(sample) - 1):
                if sample[i] == 0:
                    zero_crossing_time_row.append(round(times[i] - peak_time, 1))
                    break
                elif sample[i] > 0 and sample[i + 1] < 0:
                    y1, y2 = sample[i], sample[i + 1]
                    closest_to_zero = min([y1, y2], key=lambda y: abs(y))
                    crossing_idx = i if closest_to_zero == y1 else i + 1
                    zero_crossing_time_row.append(round(times[crossing_idx] - peak_time, 1))
                    break

        if zero_crossing_time_row:
            # Standard error of the mean of the per-pulse crossing times.
            d_mu = np.std(zero_crossing_time_row) / np.sqrt(len(zero_crossing_time_row))
            zero_crossing_errors.append(d_mu)
        else:
            zero_crossing_errors.append(np.nan)

    return df.with_columns(
        [
            pl.Series(name="zero_crossing_time", values=zero_crossing_times, dtype=pl.Float64),
            pl.Series(name="zero_crossing_error", values=zero_crossing_errors, dtype=pl.Float64),
        ]
    )

824 

825 

def pipe_inl(
    df: pl.DataFrame,
    skip_last_n_hi: Optional[int] = None,
    skip_last_n_lo: Optional[int] = None,
) -> pl.DataFrame:
    """
    Compute the integral non-linearity (INL) per channel and gain.

    For each (gain, channel) group, the per-amplitude mean energies are fit
    with a straight line (weighted by the standard error of the mean), and the
    INL is the residual from that line expressed as a percentage of the
    maximum mean energy.

    Args:
        df: Dataframe. Needs columns 'energies' (list, possibly nested),
            'amp', 'gain', and 'channel'.
        skip_last_n_hi: skip last n points of the linear fit for hi gain.
        skip_last_n_lo: skip last n points of the linear fit for lo gain.
            If None, it's set to skip_last_n_hi.

    Returns:
        Updated dataframe with an 'INL' column (percent), joined back on
        ['gain', 'channel', 'amp']. If any group has fewer than two points to
        fit, the input dataframe is returned unchanged.
    """
    # Convert "skip last n" counts into negative slice end-points.
    if skip_last_n_hi is not None:
        skip_last_n_hi = -skip_last_n_hi

    if skip_last_n_lo is not None:
        skip_last_n_lo = -skip_last_n_lo
    elif skip_last_n_hi is not None:
        skip_last_n_lo = skip_last_n_hi

    INL_df: pl.DataFrame = pl.DataFrame()

    for (gain, channel), frame in df.group_by(["gain", "channel"]):
        skip_last_n = skip_last_n_hi if gain == "hi" else skip_last_n_lo

        # Flatten the (possibly doubly nested) energy lists, then aggregate
        # mean/std/count per amplitude.
        energies_lf = frame.lazy().select("energies", "amp").explode("energies")
        if frame["energies"][0].dtype.is_nested():
            energies_lf = energies_lf.explode("energies")
        energies_df = (
            energies_lf.group_by("amp")
            .agg(
                energy_mean=pl.col("energies").mean(),
                energy_std=pl.col("energies").std(),
                n_energies=pl.col("energies").len(),
            )
            .sort(by="amp")
            .collect()
        )
        amps_arr: np.ndarray = energies_df["amp"].to_numpy()
        n_energies: np.ndarray = energies_df["n_energies"].to_numpy()
        e_arr: np.ndarray = energies_df["energy_mean"].to_numpy()
        # Standard error of the mean per amplitude, used as fit weights.
        dE_arr: np.ndarray = energies_df["energy_std"].to_numpy() / np.sqrt(n_energies)

        if len(e_arr[:skip_last_n]) <= 1 or len(dE_arr[:skip_last_n]) <= 1 or len(amps_arr[:skip_last_n]) <= 1:
            # You can't fit a _unique_ line to a single point
            log.error("pipe_inl has only one energy")
            return df

        # Weighted linear fit; initial slope guessed from the second point
        # (index 1 is safe: the guard above ensures at least two points).
        popt, _ = curve_fit(
            helper.lin,
            amps_arr[:skip_last_n],
            e_arr[:skip_last_n],
            p0=[e_arr[1] / amps_arr[1], 0],
            sigma=dE_arr[:skip_last_n],
            absolute_sigma=True,
        )

        y_pred: np.ndarray = helper.lin(amps_arr, *popt)
        # Residual from the fit line as a percentage of the largest mean energy.
        INL: np.ndarray = 100 * (e_arr - y_pred) / max(e_arr)

        INL_df = pl.concat(
            [
                INL_df,
                pl.DataFrame(
                    {
                        "channel": channel,
                        "gain": gain,
                        "amp": pl.Series(amps_arr, dtype=pl.Float32),
                        "INL": pl.Series(INL, dtype=pl.Float32),
                    },
                ),
            ]
        )

    return df.join(INL_df, on=["gain", "channel", "amp"])

909 

910 

def pipe_energy_sigma(df: pl.DataFrame, all_phases: bool = False) -> pl.DataFrame:
    """
    Calculate statistical uncertainties of the energy histogram mean and std.

    Adds 'd_mu' (standard error of the mean, std / sqrt(N)) and 'd_sigma'
    (uncertainty of the std, std / sqrt(2N - 2)), where N is the number of
    energies contributing to each row.

    Args:
        df: Polars DataFrame. Needs columns 'energy_std' and 'energies'.
        all_phases: If True, 'energies' is a list of lists (one inner list
            per phase) and N is the total count across all phases; otherwise
            'energies' is a flat list and N is its length.

    Returns:
        DataFrame with 'd_mu' and 'd_sigma' columns added.
    """
    log.debug("Calculating energy sigma")
    if all_phases:
        return df.with_columns(
            d_mu=pl.col("energy_std") / pl.col("energies").list.eval(pl.element().list.len()).list.sum().sqrt(),
            d_sigma=pl.col("energy_std")
            / (2 * pl.col("energies").list.eval(pl.element().list.len()).list.sum() - 2).sqrt(),
        )
    else:
        return df.with_columns(
            d_mu=pl.col("energy_std") / pl.col("energies").list.len().sqrt(),
            d_sigma=pl.col("energy_std") / (2 * pl.col("energies").list.len() - 2).sqrt(),
        )

938 

939 

def find_max_sum_index(arr: np.ndarray, window_radius: int) -> int:
    """
    Return the center index of the sliding window with the maximal sum.

    The window has width 2 * window_radius + 1 and slides over all fully
    contained positions.

    Args:
        arr: 1-D array to scan.
        window_radius: Half-width of the sliding window.

    Returns:
        Index into ``arr`` at the center of the maximum-sum window.
    """
    # Box-filter via convolution: each 'valid' output is one window's sum.
    kernel = np.ones(2 * window_radius + 1)
    window_sums = np.convolve(arr, kernel, mode="valid")
    # argmax gives the window's start position; shift by the radius to land
    # on the window's center element.
    return int(np.argmax(window_sums) + window_radius)

948 

949 

def get_std_diff(
    df: pl.DataFrame,
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
    ref_pulse_path: Path = constants.ROOTDIR / "polars_analysis/analysis/reference_pulse.txt",
) -> pl.DataFrame:
    """
    Compare each mean interleaved pulse against a reference pulse.

    The reference and every pulse are windowed to 200 samples before and 1000
    samples after their maximum-sum region, and each pulse is normalized to
    its own peak. Rows whose window cannot be aligned with the reference
    (shape mismatch raises ValueError) receive sentinel values of -100.0 for
    the scalars and all-zero arrays for the vectors.

    Args:
        df: Polars DataFrame with the pulse column.
        mean_interleaved_pulse: Column with the mean interleaved pulse.
        ref_pulse_path: Path to the reference pulse text file.

    Returns:
        DataFrame with 'pulse_std', 'pulse_highest_diff', 'pulse_ref_diff',
        and 'pulse_normalized' columns added.
    """
    ref_pulse: np.ndarray = np.loadtxt(ref_pulse_path)

    pulse_std: list[float] = []
    pulse_highest_diff: list[float] = []
    pulse_ref_diff: list[np.ndarray] = []
    pulse_normalized: list[np.ndarray] = []

    # Window the reference around its maximum-sum region.
    max_ind = find_max_sum_index(ref_pulse, 20)
    ref_pulse = ref_pulse[max_ind - 200 : max_ind + 1000]

    for row in df.select(mean_interleaved_pulse).iter_rows(named=True):
        pulse = np.array(row[mean_interleaved_pulse])
        # Normalize to the pulse's own peak.
        pulse = pulse / np.max(pulse)
        try:
            # NOTE(review): if the peak lies within 200 samples of the start,
            # `max_ind - 200` goes negative and Python's negative indexing
            # produces an unintended (usually empty) slice, which then lands
            # in the ValueError branch below — confirm this is acceptable.
            max_ind = find_max_sum_index(pulse, 20)
            pulse = pulse[max_ind - 200 : max_ind + 1000]

            pulse_std.append(np.std(pulse - ref_pulse, dtype=float))
            pulse_highest_diff.append(np.max(np.abs(pulse - ref_pulse)))
            pulse_ref_diff.append(pulse - ref_pulse)
            pulse_normalized.append(pulse)
        except ValueError:
            # Sentinels keep the output columns row-aligned with the input.
            pulse_std.append(-100.0)
            pulse_highest_diff.append(-100.0)
            pulse_ref_diff.append(ref_pulse * 0)
            pulse_normalized.append(ref_pulse * 0)

    return df.with_columns(
        pl.Series("pulse_std", pulse_std),
        pl.Series("pulse_highest_diff", pulse_highest_diff),
        pl.Series("pulse_ref_diff", values=pulse_ref_diff).cast(pl.List(pl.Float32)),
        pl.Series("pulse_normalized", values=pulse_normalized).cast(pl.List(pl.Float32)),
    )

990 

991 

def pipe_ref_pulse_correlation(
    df: pl.DataFrame,
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
    ref_pulse_path: Path = constants.ROOTDIR / "polars_analysis/analysis/reference_pulse.txt",
) -> pl.DataFrame:
    """
    Add a 'ref_pulse_corr' column: the Pearson correlation between each
    group's mean interleaved pulse and the reference pulse (NaN becomes 0).

    Args:
        df: Polars DataFrame with the pulse column and the group-key columns
            'run_number', 'measurement', 'channel', 'gain'.
        mean_interleaved_pulse: Column with the mean interleaved pulse.
        ref_pulse_path: Path to the reference pulse text file.

    Returns:
        Input DataFrame joined with the per-group correlations.
    """
    keys = ["run_number", "measurement", "channel", "gain"]

    reference = np.loadtxt(ref_pulse_path)

    # Broadcast the reference pulse onto every row, unroll both pulses
    # sample-by-sample, and correlate them within each group.
    corr_df = (
        df.with_columns(ref_pulse=pl.Series([reference]).first())
        .lazy()
        .explode(mean_interleaved_pulse, "ref_pulse")
        .with_columns(ref_pulse_corr=pl.corr(mean_interleaved_pulse, "ref_pulse").over(keys))
        .group_by(keys)
        .agg(pl.col("ref_pulse_corr").first().fill_nan(0))
        .collect()
    )

    return df.join(corr_df, on=keys)

1012 

1013 

def pipe_ref_pulse_rmse(
    df: pl.DataFrame,
    mean_interleaved_pulse: str = "mean_interleaved_pulse",
    ref_pulse_path: Path = constants.ROOTDIR / "polars_analysis/analysis/reference_pulse.txt",
) -> pl.DataFrame:
    """
    Add a 'ref_pulse_rmse' column: the RMS difference between each group's
    z-score-normalized mean interleaved pulse and the z-score-normalized
    reference pulse (NaN becomes 0).

    Args:
        df: Polars DataFrame with the pulse column and the group-key columns
            'run_number', 'measurement', 'channel', 'gain'.
        mean_interleaved_pulse: Column with the mean interleaved pulse.
        ref_pulse_path: Path to the reference pulse text file.

    Returns:
        Input DataFrame joined with the per-group RMSE values.
    """
    keys = ["run_number", "measurement", "channel", "gain"]

    reference = np.loadtxt(ref_pulse_path)

    # Normalize both pulses to zero mean / unit std inside their lists so the
    # RMSE measures differences in shape rather than scale.
    zscore = (pl.element() - pl.element().mean()) / pl.element().std()

    rmse_df = (
        df.with_columns(ref_pulse=pl.Series([reference]).first())
        .lazy()
        .with_columns(
            pl.col(mean_interleaved_pulse).list.eval(zscore),
            pl.col("ref_pulse").list.eval(zscore),
        )
        .explode(mean_interleaved_pulse, "ref_pulse")
        .with_columns(
            ref_pulse_rmse=(pl.col(mean_interleaved_pulse) - pl.col("ref_pulse"))
            .pow(2)
            .mean()
            .sqrt()
            .over(keys)
        )
        .group_by(keys)
        .agg(pl.col("ref_pulse_rmse").first().fill_nan(0))
        .collect()
    )

    return df.join(rmse_df, on=keys)

1044 

1045 

def pipe_gain_ratio(df: pl.DataFrame) -> pl.DataFrame:
    """
    Add a 'gain_ratio' column: hi-gain mean energy divided by lo-gain mean
    energy at the same amplitude and channel.

    Args:
        df: Polars DataFrame with 'gain', 'amp', 'channel', 'measurement',
            and 'energy_mean' columns.

    Returns:
        Input DataFrame left-joined with the gain ratios on
        ['measurement', 'channel'].
    """
    join_columns = ["measurement", "channel"]
    # NOTE(review): the hi/lo match is keyed on ['amp', 'channel'] only — if
    # the frame holds several measurements, hi rows of one measurement can
    # pair with lo rows of another; confirm whether 'measurement' should be
    # part of the join key.
    gain_ratios = (
        df.lazy()
        .filter(pl.col("gain") == "lo")
        .join(
            # Expression-form filter for consistency with the "lo" filter
            # above (was the keyword-constraint form filter(gain="hi")).
            df.lazy().filter(pl.col("gain") == "hi").select("amp", "energy_mean", "channel"),
            on=["amp", "channel"],
            suffix="_hi",
        )
        .select(*join_columns, (pl.col("energy_mean_hi") / pl.col("energy_mean")).alias("gain_ratio"))
        .collect()
    )
    return df.join(gain_ratios, on=join_columns, how="left")