Coverage for polars_analysis / analysis / pedestal_analysis.py: 82%

139 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-13 13:37 -0400

1import logging 

2from typing import List, Literal, Optional, Union 

3 

4import numpy as np 

5import polars as pl 

6import scipy.signal as sps # type: ignore 

7from scipy.stats import gamma 

8 

9from polars_analysis.analysis import constants 

10from polars_analysis.plotting.helper import calc_gaussian, gauss 

11 

12# Instantiate logger 

13log = logging.getLogger(__name__) 

14 

15""" 

16Functions to calculate derived values for pedestal runs. 

17""" 

18 

19 

def next_power_of_2(x: int) -> int:
    """
    Utility function for determining if we should calculate coherent noise.

    Return the smallest power of two greater than or equal to x,
    e.g. 3->4, 4->4, 5->8, ..., 100->128, etc.
    """
    # (x - 1).bit_length() is the number of bits needed to hold x - 1,
    # so a left shift of 1 by that amount lands exactly on the next power of two.
    return 1 << (x - 1).bit_length()

27 

28 

def calc_coherent_noise(
    df: pl.DataFrame,
    min_channel: int,
    n_channels: int,
    run_number: int,
    board_id: str,
    measurement: int,
    pas_mode: int,
    gain: Literal["hi", "lo"],
    skip_channels: Optional[List[int]] = None,
    col: str = "samples",
) -> pl.DataFrame:
    """
    Calculate the coherent noise for a given measurement, gain, and channel range.

    Refer to Section 5 of this paper for a description of coherent noise:
    https://cds.cern.ch/record/683745/files/tilecal-98-168.pdf
    In short, the total noise in n channels can be broken down as

    .. math:: σ_n^2 = n * σ_{rnd}^2 + n^2 * σ_{coh}^2

    where σ_n is the total noise, σ_rnd is the random per channel noise,
    and sigma_coh is the coherent noise.
    In terms of the output DataFrame column names:

    * σ_n => tot_noise
    * σ_rnd => avg_noise
    * σ_coh => coh_noise

    :param df: The DataFrame to calculate the coherent noise from.
    :type df: pl.DataFrame
    :param min_channel: The minimum channel to calculate the coherent noise from.
    :type min_channel: int
    :param n_channels: The number of channels to calculate the coherent noise from.
    :type n_channels: int
    :param run_number: The run number to calculate the coherent noise from.
    :type run_number: int
    :param board_id: The board ID.
    :type board_id: str
    :param measurement: The measurement number to calculate the coherent noise from.
    :type measurement: int
    :param pas_mode: The PAS mode; carried through to the output row unchanged.
    :type pas_mode: int
    :param gain: The gain to calculate the coherent noise from.
    :type gain: Literal["hi", "lo"]
    :param skip_channels: Channels to exclude; only honoured for full-board
        calculations (min_channel == 0 and n_channels a multiple of 128).
    :type skip_channels: Optional[List[int]], optional
    :param col: The column to calculate the coherent noise from. Defaults to "samples".
    :type col: str, optional
    :return: A DataFrame with one row containing the coherent noise results. It has the columns:

    * ch_noise: The square root of the sum of squares of the channel noise
    * d_ch_noise: ch_noise but each error is divided by the number of samples
    * avg_noise: ch_noise divided by the square root of the number of channels
    * d_avg: d_ch_noise divided by the square root of the number of channels
    * data_sum: The sum of the baseline subtracted data, per sample rather than per channel
    * tot_noise: The standard deviation of data_sum
    * coh_noise: The coherent noise
    * pct_coh: The coherent noise expressed as a percentage of the average noise
    * d_coh: The coherent noise error
    * d_pct: The percent coherent noise error
    * gain: The measurement gain
    * min_channel: The minimum channel used for the calculation
    * n_channels: The number of channels used for the calculation

    :rtype: pl.DataFrame
    """
    # Keep only this run/measurement/gain, the requested channel window, and
    # rows that actually contain samples.
    filtered_df = df.filter(
        pl.col("run_number") == run_number,
        pl.col("measurement") == measurement,
        pl.col("gain") == gain,
        pl.col("channel").is_in(range(min_channel, min_channel + n_channels)),
        pl.col("samples").list.len() != 0,
    )

    # If we could do the same calculation with a smaller n_channels, return early
    # We do every other power of 2 (besides 64 -> 128)
    if next_power_of_2(filtered_df["channel"].unique().shape[0]) < n_channels // 2:
        log.debug("Returning early from coh_noise")
        return pl.DataFrame()

    # Channel exclusion is only applied for full-board calculations.
    if min_channel == 0 and n_channels % 128 == 0:
        if skip_channels is None:
            skip_channels = []
        present_channels = filtered_df["channel"].to_list()
        for i in skip_channels:
            if i not in present_channels:
                log.warning(f"Error, channel {i} already not present in df")

        if skip_channels:
            filtered_df = filtered_df.filter(~pl.col("channel").is_in(skip_channels))

    if filtered_df.is_empty():
        return pl.DataFrame()

    # Shortest sample list defines the common length so channels can be summed.
    n_samples = filtered_df.select(pl.col(col).list.len().min()).item()
    filtered_channels = filtered_df["channel"].unique().to_list()
    n_filtered_channels = len(filtered_channels)

    # Baseline-subtract each channel, truncate to the common length, and sum
    # across channels to get one per-sample trace.
    data_sum: np.ndarray = (
        filtered_df.select(
            pl.col(col).list.eval(pl.element() - pl.element().mean()).list.head(n_samples).list.to_array(n_samples)
        )
        .to_series()
        .to_numpy()
        .sum(axis=0)
    )
    # Even bin width targeting ~100 bins across the data range (minimum width 2).
    bin_width = 2 * max(1, round((max(data_sum) - min(data_sum)) / 100))
    data_sum_bins = np.arange(min(data_sum), max(data_sum) + bin_width / 2, bin_width)
    data_sum_hist, _ = np.histogram(data_sum, bins=data_sum_bins)

    results_df = (
        filtered_df.lazy()
        .select(
            ch_noise=pl.col(col).list.std().pow(2).sum().sqrt(),
            d_ch_noise=pl.col(col).list.eval(pl.element().std().pow(2) / pl.element().len()).list.first().sum().sqrt(),
            avg_noise=pl.col(col).list.std().pow(2).sum().sqrt() / np.sqrt(n_filtered_channels),
            d_avg=(pl.col(col).list.eval(pl.element().std().pow(2) / pl.element().len()).list.first().sum().sqrt())
            / np.sqrt(n_filtered_channels),
        )
        .with_columns(
            data_sum=pl.Series(name="data_sum", values=data_sum).implode(),
            data_sum_hist=pl.Series(name="data_sum_hist", values=data_sum_hist).implode(),
            data_sum_bins=pl.Series(name="data_sum_bins", values=data_sum_bins).implode(),
        )
        .with_columns(tot_noise=pl.col("data_sum").list.std())
        # σ_coh = sqrt(σ_n² - Σσ_ch²) / n, per the formula in the docstring.
        .with_columns(coh_noise=(pl.col("tot_noise") ** 2 - pl.col("ch_noise") ** 2).sqrt() / n_filtered_channels)
        .with_columns(
            pct_coh=100 * pl.col("coh_noise") / pl.col("avg_noise"),
            # Standard error propagation through the coh_noise expression.
            d_coh=(
                (pl.col("tot_noise") ** 2 / (np.sqrt(n_samples) * pl.col("coh_noise") * n_filtered_channels**2)) ** 2
                + (pl.col("d_ch_noise") * pl.col("ch_noise") / (pl.col("coh_noise") * n_filtered_channels**2)) ** 2
            ).sqrt(),
        )
        .with_columns(
            d_pct=pl.col("pct_coh")
            * ((pl.col("d_coh") / pl.col("coh_noise")) ** 2 + (pl.col("d_avg") / pl.col("avg_noise")) ** 2).sqrt()
        )
        .with_columns(
            pl.lit(run_number).alias("run_number"),
            pl.lit(board_id).alias("board_id"),
            pl.lit(pas_mode).alias("pas_mode"),
            pl.lit(measurement).alias("measurement"),
            pl.lit(gain).alias("gain"),
            min_channel=min_channel,  # this is a choice that only controls the plot file name
            n_channels=n_channels,  # this is a choice that only controls the plot file name
            channel_list=filtered_channels,
        )
        # The raw per-sample sum is large and already summarised by the histogram.
        .select(pl.exclude("data_sum"))
    )

    return results_df.collect()

177 

178 

def calc_correlation_matrix(
    df: pl.DataFrame,
    measurements: List[int],
    gain: Literal["hi", "lo"],
    col: str = "samples",
    multiple_boards: Optional[List[str]] = None,
) -> np.ndarray:
    """
    Calculate the correlation matrix for the given measurements, gain, and channel range.

    :param df: The DataFrame to calculate the correlation matrix from
    :type df: pl.DataFrame
    :param measurements: The measurement numbers to calculate the correlation matrix from;
        later measurements overwrite earlier ones for channels present in both
    :type measurements: List[int]
    :param gain: The gain to calculate the correlation matrix from
    :type gain: Literal["hi", "lo"]
    :param col: The column to calculate the correlation matrix from. Defaults to "samples".
    :type col: str, optional
    :param multiple_boards: List of board IDs only for multiple board runs
    :type multiple_boards: Optional[List[str]], optional
    :return: A correlation matrix (128 channels per board), filled for input channels,
        and padded with zeros.
    :rtype: npt.NDArray[np.float64]
    """
    log.debug("Computing correlation matrix")

    # Make a (128 * n_boards) channel matrix of zeros here and update with output below.
    nchan = 128 * len(multiple_boards) if multiple_boards is not None else 128
    tmp_matrix = np.zeros([nchan, nchan])

    for measurement in measurements:
        measurement_df = df.filter(
            pl.col("measurement") == measurement,
            pl.col("gain") == gain,
            pl.col("samples").list.len() != 0,
        )

        # Nothing to correlate for this measurement; leave its rows zeroed.
        if measurement_df.is_empty():
            continue

        # BUGFIX: width must come from the *filtered* frame. The unfiltered df can
        # still contain zero-length sample lists (excluded above), which would make
        # the minimum length 0 and break the slice/to_array below.
        width = measurement_df.select(pl.col(col).list.len().min()).item()
        matrix = (
            measurement_df.select(pl.col(col).list.slice(0, width).list.to_array(width).arr.to_struct())
            .unnest(col)
            .transpose()
            .corr()
            .to_numpy()
        )

        # An array of the actual channels available in the dataframe
        channel_mask = np.unique(measurement_df.select(pl.col("channel")).transpose().to_numpy().flatten())

        # Update return matrix
        tmp_matrix[np.ix_(channel_mask, channel_mask)] = matrix

    return tmp_matrix

231 

232 

def pipe_psd(df: pl.DataFrame, col: str = "samples") -> pl.DataFrame:
    """
    Calculate the PSD of the samples column as well as the frequency axis for the PSD.

    Also detects spectral peaks on the dBFS-scaled PSD, measured relative to a
    locally-smoothed baseline rather than an absolute threshold.

    :param df: The DataFrame to calculate the PSD from
    :type df: pl.DataFrame
    :param col: The column to calculate the PSD from. Defaults to "samples".
    :type col: str, optional
    :return: A DataFrame with the psd, freq, peaks, and peak_heights columns added
    :rtype: pl.DataFrame
    """
    log.debug("Calculating PSD")
    # Truncate every waveform to the shortest common length so rows stack into a 2D array.
    width = df.select(pl.col(col).list.len().min()).item()
    samples = df.select(pl.col(col).list.slice(0, width).list.to_array(width)).to_series().to_numpy()
    freq, psd = sps.welch(samples, fs=constants.FLX_FRQ_40MHZ, nperseg=2**10, axis=1, average="mean")

    # Remove DC component and last bin, which often shows peak or dip which is likely an artifact
    psd = psd[:, 1:-1]
    freq = freq[1:-1]

    # One identical frequency row per waveform, to match the psd column shape.
    freq = np.tile(freq, (psd.shape[0], 1))

    ## Find peaks in distribution

    # Convert to dBFS (full scale taken as the squared ADC code range)
    psd_dbfs = 10 * np.log10(np.array(psd) / (2**constants.ADC_BITS) ** 2)

    # Peak detection strategy for dB scale:
    # 1. Calculate a rolling mean to establish the local baseline
    # 2. Find peaks relative to the local baseline

    # Boxcar convolution: a uniform kernel computes a moving *average*
    # (the original comment said "median", which the code does not do)
    window_size = 7  # Should be odd; adjust based on your needs
    window: np.ndarray = np.ones(window_size) / window_size

    baseline = np.apply_along_axis(np.convolve, 1, psd_dbfs, window, mode="same")

    # At edges, use the first/last valid baseline value
    baseline[:, : window_size // 2] = np.repeat(baseline[:, window_size // 2, None], window_size // 2, axis=1)
    baseline[:, -window_size // 2 :] = np.repeat(
        baseline[:, -window_size // 2 - 1, None], np.abs(-window_size // 2), axis=1
    )

    # Calculate deviation from local baseline
    deviation = psd_dbfs - baseline

    # Define peak parameters
    min_peak_height = 0.5  # Minimum dB above local baseline
    min_peak_distance = int(constants.FFT_SIZE / 100)  # Minimum distance between peaks

    # numpy can't handle jagged return array
    # Find peaks relative to baseline
    peaks = []
    peak_heights = []
    for row in deviation:
        found_peaks, pkh_dict = sps.find_peaks(row, height=min_peak_height, distance=min_peak_distance)
        peaks.append(found_peaks)

        # Peak height in PSD
        # peak_heights.append([psd_dbfs[i][p] for p in found_peaks])
        # Maybe we want the height above baseline
        # "peak_heights" is in pkh_dict as long as height kwarg is passed
        peak_heights.append(pkh_dict["peak_heights"])  # type: ignore

    return df.with_columns(
        pl.Series(name="psd", values=psd, dtype=pl.List(pl.Float64)),
        pl.Series(name="freq", values=freq, dtype=pl.List(pl.Float64)),
        pl.Series(name="peaks", values=peaks, dtype=pl.List(pl.Float64)),
        pl.Series(name="peak_heights", values=peak_heights, dtype=pl.List(pl.Float64)),
    )

303 

304 

def calc_coherence(c1: int, c2: int, df: pl.DataFrame) -> Union[tuple, None]:
    """
    Compute the magnitude-squared coherence between the samples of two channels.

    :param c1: First channel number
    :param c2: Second channel number
    :param df: DataFrame containing a "channel" column and a "samples" list column
    :return: A (frequencies, coherence) tuple from scipy.signal.coherence, or
        None if either channel is missing from the DataFrame
    """
    # Bail out early if either channel is absent from the frame.
    for channel in (c1, c2):
        if channel not in df["channel"]:
            log.warning(f"Channel {channel} not found in dataframe")
            return None

    def _waveform(channel: int) -> np.ndarray:
        # First samples list for the requested channel, as a numpy array.
        return df.filter(pl.col("channel") == channel).select(pl.col("samples")).to_series()[0].to_numpy()

    return sps.coherence(
        _waveform(c1),
        _waveform(c2),
        fs=constants.FLX_FRQ_40MHZ,
        nperseg=2**10,
    )

322 

323 

def pipe_fft(df: pl.DataFrame, col: str = "samples") -> pl.DataFrame:
    """
    Calculate the FFT of the samples column as well as the frequency axis for the FFT

    :param df: The DataFrame to calculate the FFT from
    :type df: pl.DataFrame
    :param col: The column to calculate the FFT from. Defaults to "samples".
    :type col: str, optional
    :return: A DataFrame with the fft and freq columns added
    :rtype: pl.DataFrame
    """
    log.debug("Calculating FFT")
    # Truncate every waveform to the shortest common length so rows stack into a 2D array.
    width = df.select(pl.col(col).list.len().min()).item()
    fourier = np.fft.fft(
        df.select(pl.col(col).list.slice(0, width).list.to_array(width)).to_series().to_numpy(), axis=1
    )
    # Magnitude spectrum with the DC bin dropped.
    fourier = np.abs(fourier)[:, 1:]

    # NOTE(review): the frequency axis is built from the post-DC-removal length
    # (width - 1), not the original FFT length — confirm the intended bin alignment.
    freq = np.fft.fftfreq(fourier.shape[1], d=1 / constants.FLX_FRQ_40MHZ)
    freq = freq[: int(freq.shape[0] / 2)]
    # One identical frequency row per waveform, to match the fft column shape.
    freq = np.tile(freq, (fourier.shape[0], 1))

    # Keep the positive-frequency half and normalise each row to its own peak.
    fourier = fourier[:, : int(fourier.shape[1] / 2)]
    fourier = fourier / np.max(fourier, axis=1)[:, None]

    return df.with_columns(
        pl.Series(name="fft", values=fourier, dtype=pl.List(pl.Float64)),
        pl.Series(name="freq", values=freq, dtype=pl.List(pl.Float64)),
    )

353 

354 

def pipe_chi2(df: pl.DataFrame, col: str = "samples") -> pl.DataFrame:
    """
    Calculate the chi² per degree of freedom for each entry in the samples column,
    with all logic in-line.

    Steps:
    1. Convert the samples to a numpy array.
    2. Define histogram bins with a fixed bin width of 1.
    3. If there aren't enough bins, assign NaN.
    4. Compute the histogram and bin centers.
    5. Fit a Gaussian to the histogram using calc_gaussian and gauss.
    6. Calculate degrees of freedom (number of bins minus 3).
    7. Compute asymmetric Poisson errors using the gamma distribution.
    8. Calculate the chi² per degree of freedom.

    :param df: The input DataFrame.
    :param col: The column containing the sample data (default "samples").
    :return: A DataFrame with an added "chi2_dof" column.
    """
    log.debug("Calculating chi²/dof for the samples column")
    chi2_values: List[float] = []

    for samples in df[col]:
        s = np.asarray(samples)

        # Unit-width bins spanning the observed sample range (ADC codes are integers).
        bins = np.arange(s.min(), s.max() + 1, 1)
        if bins.size <= 1:
            # Degenerate case: all samples identical, nothing to fit.
            chi2_values.append(np.nan)
            continue

        hist, _ = np.histogram(s, bins=bins)
        centers = 0.5 * (bins[1:] + bins[:-1])

        # NOTE(review): assumes calc_gaussian returns a parameter vector with
        # mu at index 0, sigma at index 2, and N at index 4 — confirm against
        # polars_analysis.plotting.helper.
        fp = calc_gaussian(s, bins)
        gauss_fit = gauss(centers, mu=fp[0], sigma=fp[2], N=fp[4])
        # Three fitted parameters (mu, sigma, N) are subtracted from the bin count.
        dof = centers.size - 3

        # Compute asymmetric Poisson errors using the gamma distribution
        a = 0.32  # approximately 1 sigma
        err_up = gamma.ppf(1 - a / 2, gauss_fit + 1, scale=1) - gauss_fit
        err_dw = gamma_fit_err = gauss_fit - gamma.ppf(a / 2, gauss_fit, scale=1)
        residuals = hist - gauss_fit
        # Use the upper error where data overshoots the fit, lower error otherwise.
        err = np.where(residuals > 0, err_up, err_dw)

        chi2 = np.sum((residuals) ** 2 / (err**2)) / dof
        chi2_values.append(chi2)

    return df.with_columns(pl.Series("chi2_dof", chi2_values))

403 

404 

def pipe_autocorr(df: pl.DataFrame, col: str = "samples") -> pl.DataFrame:
    """
    Calculate the autocorrelation of the samples column.

    Each waveform is mean-subtracted, truncated to the shortest common length,
    auto-correlated, normalised to its maximum, and only the non-negative-lag
    half of the symmetric result is kept.

    :param df: The DataFrame to calculate the autocorrelation from
    :type df: pl.DataFrame
    :param col: The column to calculate the autocorrelation from. Defaults to "samples".
    :type col: str, optional
    :return: A DataFrame with the autocorr column added
    :rtype: pl.DataFrame
    """
    log.debug("Calculating autocorrelation")
    # Shortest sample list defines the common width so rows stack into a 2D array.
    width = df.select(pl.col(col).list.len().min()).item()
    data = (
        df.select(
            pl.col(col)
            .list.eval(pl.element() - pl.element().mean())
            .list.slice(0, width)
            .list.to_array(width)
            .arr.to_struct()
        )
        .unnest(col)
        .fill_null(0)
        .to_numpy()
    )
    # Vectorise full cross-correlation of each row with itself;
    # each output row has 2*width - 1 lag bins.
    correlate = np.vectorize(sps.correlate, signature="(n),(n)->(k)")
    result = correlate(data, data)
    # Normalise so the zero-lag peak is 1.
    result /= np.max(result, axis=1)[:, None]
    # Keep only lags >= 0 (the second half of the symmetric output).
    result = result[:, result.shape[1] // 2 :]

    return df.with_columns(pl.Series(name="autocorr", values=result, dtype=pl.List(pl.Float64)))

436 

437 

def expr_mean(col: str = "samples") -> pl.Expr:
    """
    Build an expression for the per-row mean of the samples column.

    :param col: The column name to calculate the mean of
    :type col: str, optional
    :return: The mean of the samples column
    :rtype: pl.Expr
    """
    samples = pl.col(col)
    return samples.list.mean()

448 

449 

def expr_rms(col: str = "samples") -> pl.Expr:
    """
    Build an expression for the root mean square (standard deviation) of the samples column.

    :param col: The column name to calculate the rms of
    :type col: str, optional
    :return: The rms of the samples column
    :rtype: pl.Expr
    """
    samples = pl.col(col)
    return samples.list.std()

460 

461 

def expr_max_min(col: str = "samples") -> pl.Expr:
    """
    Build an expression for the peak-to-peak range (max minus min) of the samples column.

    :param col: The column name to calculate the max-min of
    :type col: str, optional
    :return: The max-min of the samples column
    :rtype: pl.Expr
    """
    peak = pl.col(col).list.max()
    trough = pl.col(col).list.min()
    return peak - trough

472 

473 

def expr_psd(col: str = "fft") -> pl.Expr:
    """
    Build an expression for the power spectral density of the FFT column.

    :param col: The column to calculate the PSD from. Defaults to "fft".
    :type col: str, optional
    :return: The power spectral density of the FFT column
    :rtype: pl.Expr
    """
    # 20*log10(|X|) converts linear FFT magnitude to decibels.
    to_decibels = 20 * np.log10(pl.element())
    return pl.col(col).list.eval(to_decibels)

484 

485 

def expr_sinad(col: str = "fft") -> pl.Expr:
    """
    Build an expression for the signal-to-noise and distortion ratio of the FFT column.

    :param col: The column to calculate the SINAD from. Defaults to "fft".
    :type col: str, optional
    :return: The signal-to-noise and distortion ratio of the FFT column
    :rtype: pl.Expr
    """
    # Power of everything below full scale (the normalised signal bin equals 1).
    sub_unity_power = pl.element().filter(pl.element() < 1) ** 2
    total_noise = pl.col(col).list.eval(sub_unity_power).list.sum()
    return -10 * total_noise.log10()

496 

497 

def expr_enob(col: str = "sinad") -> pl.Expr:
    """
    Build an expression for the effective number of bits from the SINAD column.

    Uses the standard conversion ENOB = (SINAD - 1.76) / 6.02.

    :param col: The column to calculate the ENOB from. Defaults to "sinad".
    :type col: str, optional
    :return: The effective number of bits of the SINAD column
    :rtype: pl.Expr
    """
    sinad_db = pl.col(col)
    return (sinad_db - 1.76) / 6.02

508 

509 

def expr_snr(col: str = "fft") -> pl.Expr:
    """
    Build an expression for the signal-to-noise ratio of the FFT column.

    The three largest bins (taken as signal) are excluded by sorting and
    dropping the tail before summing the remaining power.

    :param col: The column to calculate the SNR from. Defaults to "fft".
    :type col: str, optional
    :return: The signal-to-noise ratio of the FFT column
    :rtype: pl.Expr
    """
    ordered = pl.col(col).list.sort()
    noise_power = ordered.list.eval(pl.element().slice(0, pl.element().len() - 3) ** 2)
    return -10 * noise_power.list.sum().log10()

522 

523 

def expr_sfdr(col: str = "fft") -> pl.Expr:
    """
    Build an expression for the spurious-free dynamic range of the FFT column.

    Removes the carrier (the maximum bin) and measures the largest remaining
    spur relative to it.

    :param col: The column to calculate the SFDR from. Defaults to "fft".
    :type col: str, optional
    :return: The spurious-free dynamic range of the FFT column
    :rtype: pl.Expr
    """
    spurs = pl.col(col).list.set_difference(pl.col(col).list.max())
    return -20 * spurs.list.max().log10()