Full Day RFI Flagging¶

by Josh Dillon, last updated August 1, 2025

This notebook is designed to figure out a single full-day RFI mask using the best autocorelations, taking individual file_calibration notebook results as a prior but then potentially undoing flags.

Here's a set of links to skip to particular figures and tables:

• Figure 1: Show All DPSS Residual z-Scores¶

• Figure 2: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Initial Flags¶

• Figure 3: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Expanded Flags¶

• Figure 4: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Final, Re-Computed Flags¶

• Figure 5: Summary of Flags Before and After Recomputing Them¶

In [1]:
import time
tstart = time.time()
In [2]:
import os
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
import h5py
import hdf5plugin  # REQUIRED to have the compression plugins available
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import matplotlib
import copy
import warnings
import textwrap
from pyuvdata import UVFlag, UVData, UVCal
from hera_cal import io, utils, abscal
from hera_cal.smooth_cal import CalibrationSmoother, dpss_filters, solve_2D_DPSS
from hera_qm import ant_class, xrfi, metrics_io
from hera_filters import dspec

from IPython.display import display, HTML
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
_ = np.seterr(all='ignore')  # get rid of red warnings
%config InlineBackend.figure_format = 'retina'

Parse inputs¶

In [3]:
# get filenames
SUM_FILE = os.environ.get("SUM_FILE", None)
# SUM_FILE = '/lustre/aoc/projects/hera/h6c-analysis/IDR2/2459866/zen.2459866.25282.sum.uvh5'  # If sum_file is not defined in the environment variables, define it here.
SUM_SUFFIX = os.environ.get("SUM_SUFFIX", 'sum.uvh5')
SUM_AUTOS_SUFFIX = os.environ.get("SUM_AUTOS_SUFFIX", 'sum.autos.uvh5')
DIFF_AUTOS_SUFFIX = os.environ.get("DIFF_AUTOS_SUFFIX", 'diff.autos.uvh5')
CAL_SUFFIX = os.environ.get("CAL_SUFFIX", 'sum.omni.calfits')
ANT_CLASS_SUFFIX = os.environ.get("ANT_CLASS_SUFFIX", 'sum.ant_class.csv')
APRIORI_YAML_PATH = os.environ.get("APRIORI_YAML_PATH", None)
OUT_FLAG_SUFFIX = os.environ.get("OUT_FLAG_SUFFIX", 'sum.flag_waterfall.h5')

sum_glob = '.'.join(SUM_FILE.split('.')[:-3]) + '.*.' + SUM_SUFFIX
auto_sums_glob = sum_glob.replace(SUM_SUFFIX, SUM_AUTOS_SUFFIX)
auto_diffs_glob = sum_glob.replace(SUM_SUFFIX, DIFF_AUTOS_SUFFIX)
cal_files_glob = sum_glob.replace(SUM_SUFFIX, CAL_SUFFIX)
ant_class_csvs_glob = sum_glob.replace(SUM_SUFFIX, ANT_CLASS_SUFFIX)
In [4]:
# A priori flag settings
FM_LOW_FREQ = float(os.environ.get("FM_LOW_FREQ", 87.5)) # in MHz
FM_HIGH_FREQ = float(os.environ.get("FM_HIGH_FREQ", 108.0)) # in MHz
FM_freq_range = [FM_LOW_FREQ * 1e6, FM_HIGH_FREQ * 1e6]
MAX_SOLAR_ALT = float(os.environ.get("MAX_SOLAR_ALT", 0.0)) # in degrees
PER_POL_FILE_FLAG_THRESH = float(os.environ.get("PER_POL_FILE_FLAG_THRESH", .75)) 

# DPSS settings
FREQ_FILTER_SCALE = float(os.environ.get("FREQ_FILTER_SCALE", 5.0)) # in MHz
TIME_FILTER_SCALE = float(os.environ.get("TIME_FILTER_SCALE", 450.0))# in s
EIGENVAL_CUTOFF = float(os.environ.get("EIGENVAL_CUTOFF", 1e-12))

# Outlier flagging settings
MIN_FRAC_OF_AUTOS = float(os.environ.get("MIN_FRAC_OF_AUTOS", .25))
MAX_AUTO_L2 = float(os.environ.get("MAX_AUTRO_L2", 1.2))
Z_THRESH = float(os.environ.get("Z_THRESH", 5.0))
WS_Z_THRESH = float(os.environ.get("WS_Z_THRESH", 4.0))
AVG_Z_THRESH = float(os.environ.get("AVG_Z_THRESH", 1.5))
REPEAT_FLAG_Z_THRESH = float(os.environ.get("REPEAT_FLAG_Z_THESH", 0.0))
MAX_FREQ_FLAG_FRAC = float(os.environ.get("MAX_FREQ_FLAG_FRAC", .25))
MAX_TIME_FLAG_FRAC = float(os.environ.get("MAX_TIME_FLAG_FRAC", .1))

for setting in ['FM_LOW_FREQ', 'FM_HIGH_FREQ', 'MAX_SOLAR_ALT', 'PER_POL_FILE_FLAG_THRESH', 
                'FREQ_FILTER_SCALE', 'TIME_FILTER_SCALE', 'EIGENVAL_CUTOFF', 'MIN_FRAC_OF_AUTOS', 
                'MAX_AUTO_L2', 'Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'REPEAT_FLAG_Z_THRESH', 
                'MAX_FREQ_FLAG_FRAC ', 'MAX_TIME_FLAG_FRAC ']:
        print(f'{setting} = {eval(setting)}')
FM_LOW_FREQ = 87.5
FM_HIGH_FREQ = 108.0
MAX_SOLAR_ALT = 0.0
PER_POL_FILE_FLAG_THRESH = 0.75
FREQ_FILTER_SCALE = 5.0
TIME_FILTER_SCALE = 450.0
EIGENVAL_CUTOFF = 1e-12
MIN_FRAC_OF_AUTOS = 0.25
MAX_AUTO_L2 = 1.2
Z_THRESH = 5.0
WS_Z_THRESH = 4.0
AVG_Z_THRESH = 1.5
REPEAT_FLAG_Z_THRESH = 0.0
MAX_FREQ_FLAG_FRAC  = 0.25
MAX_TIME_FLAG_FRAC  = 0.1

Load Data¶

In [5]:
auto_sums = sorted(glob.glob(auto_sums_glob))
print(f'Found {len(auto_sums)} *.{SUM_AUTOS_SUFFIX} files starting with {auto_sums[0]}.')

auto_diffs = sorted(glob.glob(auto_diffs_glob))
print(f'Found {len(auto_diffs)} *.{DIFF_AUTOS_SUFFIX} files starting with {auto_diffs[0]}.')

cal_files = sorted(glob.glob(cal_files_glob))
print(f'Found {len(cal_files)} *.{CAL_SUFFIX} files starting with {cal_files[0]}.')

ant_class_csvs = sorted(glob.glob(ant_class_csvs_glob))
print(f'Found {len(ant_class_csvs)} *.{ANT_CLASS_SUFFIX} files starting with {ant_class_csvs[0]}.')
Found 1848 *.sum.autos.uvh5 files starting with /lustre/aoc/projects/hera/h6c-analysis/IDR3/2459934/zen.2459934.21347.sum.autos.uvh5.
Found 1848 *.diff.autos.uvh5 files starting with /lustre/aoc/projects/hera/h6c-analysis/IDR3/2459934/zen.2459934.21347.diff.autos.uvh5.
Found 1848 *.sum.omni.calfits files starting with /lustre/aoc/projects/hera/h6c-analysis/IDR3/2459934/zen.2459934.21347.sum.omni.calfits.
Found 1848 *.sum.ant_class.csv files starting with /lustre/aoc/projects/hera/h6c-analysis/IDR3/2459934/zen.2459934.21347.sum.ant_class.csv.
In [6]:
# Load ant_class csvs
tables = [pd.read_csv(f).dropna(axis=0, how='all') for f in ant_class_csvs]
table_cols = tables[0].columns[1::2]
class_cols = tables[0].columns[2::2]
In [7]:
# set up for for figuring out candidate antennas
ap_strs = np.array(tables[0]['Antenna'])
ant_flags = np.array([t[class_cols] for t in tables]) == 'bad'
sun_low_enough = np.array([t['Solar Alt'] < MAX_SOLAR_ALT for t in tables])
ants = sorted(set(int(a[:-1]) for a in ap_strs))

# get relevant indices (exclude antennas only flagged for Even/Odd Zeros or Redcal chi^2 or Bad X-Engine Diffs)
e_pols = [i for i, ap_str in enumerate(ap_strs) if 'e' in ap_str]
n_pols = [i for i, ap_str in enumerate(ap_strs) if 'n' in ap_str]
cols_to_use = [cc for cc, colname in enumerate(class_cols) if colname not in 
               ['Antenna Class', 'Even/Odd Zeros Class','Redcal chi^2 Class', 'Bad Diff X-Engines Class']]

# perfrom any over flagging rationales, excluding times where the sun is too high
passes_checks_grid = np.any(ant_flags[:, :, cols_to_use] & sun_low_enough[:, :, None], axis=2)

# also exclude nearly fully-flagged files
files_to_flag = np.mean(passes_checks_grid[:, e_pols], axis=1) > PER_POL_FILE_FLAG_THRESH
files_to_flag |= np.mean(passes_checks_grid[:, n_pols], axis=1) > PER_POL_FILE_FLAG_THRESH
print(f'Found {int(np.sum(files_to_flag))} files to fully flag based on one pol exceeding {PER_POL_FILE_FLAG_THRESH:.2%}')
is_candidate_auto = ~np.any(passes_checks_grid[~files_to_flag, :], axis=0)

# get set of candidate autocorrelation keys
candidate_autos = set()
for ap_str in ap_strs[is_candidate_auto]:
    ap = int(ap_str[:-1]), utils.comply_pol(ap_str[-1])
    candidate_autos.add(utils.join_bl(ap, ap))
print(f'{len(candidate_autos)} candidate autocorrelations identified for RFI flagging.')
Found 5 files to fully flag based on one pol exceeding 75.00%
35 candidate autocorrelations identified for RFI flagging.
In [8]:
# Load sum and diff autos, checking to see whether any of them show packet loss
good_data = {}
info_dicts = {}
new_fully_flagged_files = 0
for i, (sf, df, f2f) in enumerate(zip(auto_sums, auto_diffs, files_to_flag)):
    if f2f:
        continue
    rv = io.read_hera_hdf5(sf, bls=candidate_autos)
    good_data[sf] = rv['data']
    info_dicts[sf] = rv['info']
    diff = io.read_hera_hdf5(df, bls=candidate_autos)['data']
    zeros_class = ant_class.even_odd_zeros_checker(good_data[sf], diff)
    
    # if this file is fully flagged, don't let that affect the candidates
    if len(zeros_class.bad_ants) == len(zeros_class.ants):
        new_fully_flagged_files += 1
        files_to_flag[i] = True
        continue    
    
    for ant in zeros_class.bad_ants:
        candidate_autos.remove(utils.join_bl(ant, ant))
        print(f'Removing {utils.join_bl(ant, ant)} on {sf}, {len(candidate_autos)} remain.')
    
if new_fully_flagged_files > 0:
    print(f'{new_fully_flagged_files} additional files were flagged because they were 100% flagged for Even/Odd zeros.')
print(f'{len(candidate_autos)} candidate autocorrelations remain after looking for packet loss effects in autos.')
35 candidate autocorrelations remain after looking for packet loss effects in autos.
In [9]:
# load calibration solutions
cs = CalibrationSmoother(cal_files, load_cspa=False, load_chisq=False, pick_refant=False)
In [10]:
# load a priori flagged times
if APRIORI_YAML_PATH is not None:
    print(f'Loading a priori flagged times from {APRIORI_YAML_PATH}')
    apriori_flags = np.zeros(len(cs.time_grid), dtype=bool)
    apriori_flags[metrics_io.read_a_priori_int_flags(APRIORI_YAML_PATH, times=cs.time_grid).astype(int)] = True
Loading a priori flagged times from /lustre/aoc/projects/hera/h6c-analysis/IDR3/src/hera_pipelines/pipelines/h6c/idr3/v1/analysis/apriori_flags/2459934_apriori_flags.yaml
In [11]:
# completely flag times that had too many antennas flagged
for f2f in np.array(sorted(cs.time_indices.keys()))[files_to_flag]:
    for ant in cs.flag_grids:
        cs.flag_grids[ant][cs.time_indices[f2f], :] = True

Figure out a subset of most-stable antennas to filter and flag on¶

In [12]:
initial_cal_flags = np.all([f for f in cs.flag_grids.values()], axis=0)
In [13]:
def average_autos(per_file_autos, files_to_flag, bls_to_use, auto_sums, cs):
    '''Averages autos over baselines, matching the time_grid in CalibrationSmoother cs.'''
    avg_per_file_autos = {sf: None if f2f else np.mean([per_file_autos[sf][bl] for bl in bls_to_use], axis=0)
                          for sf, f2f in zip(auto_sums, files_to_flag)}
    avg_autos = np.zeros((len(cs.time_grid), len(cs.freqs)), dtype=float)
    for sf, cf in zip(auto_sums, cs.cals):
        if avg_per_file_autos[sf] is not None:  # because the file was flagged
            avg_autos[cs.time_indices[cf], :] = np.abs(avg_per_file_autos[sf])
    return avg_autos
In [14]:
avg_candidate_auto = average_autos(good_data, files_to_flag, candidate_autos, auto_sums, cs)
In [15]:
def flag_FM(flags, freqs, freq_range=[87.5e6, 108e6]):
    '''Apply flags to all frequencies within freq_range (in Hz).'''
    flags[:, np.logical_and(freqs >= freq_range[0], freqs <= freq_range[1])] = True 
In [16]:
flag_FM(initial_cal_flags, cs.freqs, freq_range=FM_freq_range)
In [17]:
def flag_sun(flags, times, max_solar_alt=0):
    '''Apply flags to all times where the solar altitude is greater than max_solar_alt (in degrees).'''
    solar_altitudes_degrees = utils.get_sun_alt(times)
    flags[solar_altitudes_degrees >= max_solar_alt, :] = True
In [18]:
flag_sun(initial_cal_flags, cs.time_grid, max_solar_alt=MAX_SOLAR_ALT)
In [19]:
if APRIORI_YAML_PATH is not None:
    initial_cal_flags[apriori_flags, :] = True
In [20]:
def predict_auto_noise(auto, dt, df, nsamples=1):
    '''Predict noise on an (antenna-averaged) autocorrelation. The product of Delta t and Delta f
    must be unitless. For N autocorrelations averaged together, use nsamples=N.'''
    int_count = int(dt * df) * nsamples
    return np.abs(auto) / np.sqrt(int_count / 2)
In [21]:
# Figure out noise and weights
int_time = 24 * 3600 * np.median(np.diff(cs.time_grid))
chan_res = np.median(np.diff(cs.freqs))
noise = predict_auto_noise(avg_candidate_auto, int_time, chan_res, nsamples=1)
wgts = np.where(initial_cal_flags, 0, noise**-2)
In [22]:
# get slices to index into region of waterfall outwide of which it's 100% flagged
unflagged_ints = np.squeeze(np.argwhere(~np.all(initial_cal_flags, axis=1)))
ints_to_filt = slice(unflagged_ints[0], unflagged_ints[-1] + 1)
unflagged_chans = np.squeeze(np.argwhere(~np.all(initial_cal_flags, axis=0)))
chans_to_filt = slice(unflagged_chans[0], unflagged_chans[-1] + 1)
In [23]:
# Filter every autocorrelation individually
cached_output = {}
models = {}
sqrt_mean_sqs = {}
time_filters, freq_filters = dpss_filters(freqs=cs.freqs[chans_to_filt], # Hz
                                          times=cs.time_grid[ints_to_filt], # JD
                                          freq_scale=FREQ_FILTER_SCALE,
                                          time_scale=TIME_FILTER_SCALE,
                                          eigenval_cutoff=EIGENVAL_CUTOFF)

for bl in candidate_autos:
    auto_here = average_autos(good_data, files_to_flag, [bl], auto_sums, cs)

    models[bl] = np.array(auto_here)
    model, cached_output = solve_2D_DPSS(auto_here[ints_to_filt, chans_to_filt], wgts[ints_to_filt, chans_to_filt], 
                                         time_filters, freq_filters, method='lu_solve', cached_input=cached_output)
    models[bl][ints_to_filt, chans_to_filt] = model
    
    noise_model = predict_auto_noise(models[bl], int_time, chan_res, nsamples=1)   
    sqrt_mean_sqs[bl] = np.nanmean(np.where(initial_cal_flags, np.nan, (auto_here - models[bl]) / noise_model)**2)**.5
In [24]:
# Pick best autocorrelations to filter on
L2_bound = max(np.quantile(list(sqrt_mean_sqs.values()), MIN_FRAC_OF_AUTOS), MAX_AUTO_L2)
good_auto_bls = [bl for bl in candidate_autos if sqrt_mean_sqs[bl] <= L2_bound]
print(f'Using {len(good_auto_bls)} out of {len(candidate_autos)} candidate autocorrelations ({len(good_auto_bls) / len(candidate_autos):.2%}).') 
Using 9 out of 35 candidate autocorrelations (25.71%).
In [25]:
extent = [cs.freqs[0]/1e6, cs.freqs[-1]/1e6, cs.time_grid[-1] - int(cs.time_grid[0]), cs.time_grid[0] - int(cs.time_grid[0])]
In [26]:
def plot_all_filtered_bls(N_per_row=8):
    N_rows = int(np.ceil(len(candidate_autos) / N_per_row))
    fig, axes = plt.subplots(N_rows, N_per_row, figsize=(14, 3 * N_rows), dpi=100,
                             sharex=True, sharey=True, gridspec_kw={'wspace': 0, 'hspace': .18})

    for i, (ax, bl) in enumerate(zip(axes.flatten(), sorted(sqrt_mean_sqs.keys(), key=lambda bl: sqrt_mean_sqs[bl]))):
        auto_here = average_autos(good_data, files_to_flag, [bl], auto_sums, cs)
        noise_model = predict_auto_noise(models[bl], int_time, chan_res, nsamples=1)

        im = ax.imshow(np.where(initial_cal_flags, np.nan, (auto_here - models[bl]) / noise_model).real, 
                       aspect='auto', interpolation='none', cmap='bwr', vmin=-10, vmax=10, extent=extent)
        ax.set_title(f'{bl[0]}{bl[2][0]}: {sqrt_mean_sqs[bl]:.3}', color=('k' if sqrt_mean_sqs[bl] <= L2_bound else 'r'), fontsize=10)

        if i == 0:
            plt.colorbar(im, ax=axes, location='top', label=r'Autocorrelation z-score after DPSS filtering (with $\langle z^2 \rangle^{1/2}$)', extend='both', aspect=40, pad=.015)
        if i % N_per_row == 0:
            ax.set_ylabel(f'JD - {int(cs.time_grid[0])}')       
    for ax in axes[-1, :]:
        ax.set_xlabel('Frequency (MHz)')
    plt.tight_layout()
    plt.show()
    
    antpols = [(int(ap[:-1]), utils.comply_pol(ap[-1])) for ap in ap_strs]
    other_autos = [f'{ap[0]}{ap[-1][-1]}' for ap in antpols if utils.join_bl(ap, ap) not in candidate_autos]
    print('Not plotted here due to prior antenna flagging:')
    print('\t' + '\n\t'.join(textwrap.wrap(', '.join(other_autos), 80, break_long_words=False)))

Figure 1: Show All DPSS Residual z-Scores¶

This figure shows the z-score waterfall of each antenna. Also shown is the square root of the mean of the square of each waterfall, as a metric of its instability. Antennas in red are excluded from the average of most stable antennas that are used for subsequent flagging.

In [27]:
plot_all_filtered_bls()
This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
No description has been provided for this image
Not plotted here due to prior antenna flagging:
	3e, 3n, 4e, 4n, 5e, 5n, 7e, 7n, 8e, 8n, 9e, 9n, 10e, 10n, 15e, 15n, 16e, 16n,
	17e, 17n, 18e, 18n, 19e, 19n, 20e, 20n, 21e, 21n, 22e, 22n, 27e, 27n, 28e, 28n,
	29e, 29n, 30e, 31e, 31n, 32e, 32n, 34e, 34n, 35e, 35n, 36e, 36n, 38e, 38n, 40e,
	40n, 41e, 41n, 42e, 42n, 43e, 43n, 44e, 44n, 45e, 45n, 46e, 46n, 47e, 47n, 48e,
	48n, 49e, 49n, 50e, 50n, 51e, 51n, 52e, 52n, 53e, 53n, 54e, 54n, 55e, 55n, 56e,
	56n, 57e, 57n, 58e, 58n, 59e, 59n, 60e, 60n, 61e, 61n, 62e, 62n, 63e, 63n, 64e,
	64n, 65e, 65n, 67e, 67n, 68e, 68n, 69e, 69n, 70e, 70n, 71e, 71n, 72e, 72n, 73e,
	73n, 74e, 74n, 77e, 77n, 78e, 78n, 79e, 79n, 80e, 80n, 81n, 82e, 82n, 83e, 83n,
	84e, 84n, 85e, 85n, 86e, 86n, 87e, 90e, 90n, 91e, 91n, 92e, 92n, 93e, 93n, 94e,
	94n, 95e, 95n, 96e, 96n, 97e, 97n, 98e, 98n, 99e, 99n, 100e, 100n, 101n, 103e,
	103n, 104n, 105e, 105n, 106e, 106n, 107e, 107n, 108e, 108n, 109e, 109n, 110e,
	110n, 111e, 111n, 112e, 112n, 113e, 113n, 114e, 114n, 115e, 115n, 116e, 116n,
	117e, 117n, 118n, 119e, 119n, 120e, 120n, 121e, 121n, 125e, 125n, 126e, 127e,
	127n, 128e, 128n, 129e, 129n, 130e, 130n, 131e, 131n, 132e, 132n, 133e, 133n,
	135e, 135n, 136e, 137e, 137n, 138e, 138n, 139e, 139n, 140e, 140n, 141e, 141n,
	142e, 142n, 143e, 143n, 145e, 145n, 146e, 147e, 147n, 148e, 148n, 149n, 150e,
	150n, 155e, 155n, 156e, 156n, 157e, 157n, 158e, 158n, 159e, 159n, 160e, 160n,
	161e, 161n, 162e, 162n, 164e, 164n, 165e, 167e, 167n, 168n, 169e, 169n, 170e,
	170n, 179e, 179n, 180e, 180n, 181e, 181n, 182e, 182n, 183e, 183n, 184e, 184n,
	185e, 186e, 186n, 187e, 187n, 189e, 189n, 190e, 190n, 191e, 191n, 200e, 200n,
	201e, 201n, 202e, 202n, 203e, 203n, 205e, 205n, 206e, 206n, 207e, 207n, 208e,
	208n, 209e, 209n, 210e, 210n, 211e, 211n, 219e, 219n, 220e, 220n, 221e, 221n,
	222e, 222n, 223e, 223n, 224e, 224n, 225e, 225n, 226e, 226n, 227e, 227n, 228e,
	228n, 229e, 229n, 237e, 237n, 238e, 238n, 239e, 239n, 240e, 240n, 241e, 241n,
	242e, 242n, 243e, 243n, 244e, 244n, 245e, 245n, 246e, 246n, 261e, 261n, 262e,
	262n, 320e, 320n, 324e, 324n, 325e, 325n, 329e, 329n, 333e, 333n
In [28]:
# Compute average autocorrelation and DPSS filter it
avg_auto = average_autos(good_data, files_to_flag, good_auto_bls, auto_sums, cs)
model = np.array(avg_auto)
submodel, _ = solve_2D_DPSS(avg_auto[ints_to_filt, chans_to_filt], wgts[ints_to_filt, chans_to_filt], 
                                time_filters, freq_filters, method='lu_solve', cached_input=cached_output)
model[ints_to_filt, chans_to_filt] = submodel
noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls))
zscore = ((avg_auto - model) / noise_model).real
In [29]:
def plot_z_score(flags, zscore):
    plt.figure(figsize=(14,10), dpi=100)
    plt.imshow(np.where(flags, np.nan, zscore.real), aspect='auto', cmap='bwr', interpolation='none', vmin=-10, vmax=10, extent=extent)
    plt.colorbar(location='top', label='z score', extend='both')
    plt.xlabel('Frequency (MHz)')
    plt.ylabel(f'JD - {int(cs.time_grid[0])}')
    plt.tight_layout()

Figure 2: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Initial Flags¶

This plot shows the z-score of a DPSS-filtered, deeply averaged autocorrelation, where the noise is inferred from the integration time, channel width, and DPSS model.

In [30]:
plot_z_score(initial_cal_flags, zscore)
No description has been provided for this image

Expand original flags to include potential RFI missed by the file_calibration notebook¶

In [31]:
# flag outliers and perform watershed for lesser outliers neighboring flags
round_1_flags = copy.deepcopy(initial_cal_flags)
round_1_flags[zscore > Z_THRESH] = True
ws_flags = xrfi._ws_flag_waterfall(zscore, round_1_flags, WS_Z_THRESH)
round_1_flags |= ws_flags
In [32]:
def iteratively_flag_on_averaged_zscore(flags, zscore, avg_z_thresh=1.5, verbose=True):
    '''Flag whole integrations or channels based on average z-score. This is done
    iteratively to prevent bad times affecting channel averages or vice versa.'''
    flagged_chan_count = 0
    flagged_int_count = 0
    while True:
        zspec = np.nanmean(np.where(flags, np.nan, zscore), axis=0)
        ztseries = np.nanmean(np.where(flags, np.nan, zscore), axis=1)

        if (np.nanmax(zspec) < avg_z_thresh) and (np.nanmax(ztseries) < avg_z_thresh):
            break

        if np.nanmax(zspec) >= np.nanmax(ztseries):
            flagged_chan_count += np.sum((zspec >= np.nanmax(ztseries)) & (zspec >= avg_z_thresh))
            flags[:, (zspec >= np.nanmax(ztseries)) & (zspec >= avg_z_thresh)] = True
        else:
            flagged_int_count += np.sum((ztseries >= np.nanmax(zspec)) & (ztseries >= avg_z_thresh))
            flags[(ztseries >= np.nanmax(zspec)) & (ztseries >= avg_z_thresh), :] = True

    if verbose:
        print(f'Flagging an additional {flagged_int_count} integrations and {flagged_chan_count} channels.')

def impose_max_chan_flag_frac(flags, max_flag_frac=.25, verbose=True):
    '''Flag channels already flagged more than max_flag_frac (excluding completely flagged times).'''
    unflagged_times = ~np.all(flags, axis=1)
    frequently_flagged_chans =  np.mean(flags[unflagged_times, :], axis=0) >= max_flag_frac
    if verbose:
        print(f'Flagging {np.sum(frequently_flagged_chans) - np.sum(np.all(flags, axis=0))} channels previously flagged {max_flag_frac:.2%} or more.')        
    flags[:, frequently_flagged_chans] = True 
        
def impose_max_time_flag_frac(flags, max_flag_frac=.25, verbose=True):
    '''Flag times already flagged more than max_flag_frac (excluding completely flagged channels).'''
    unflagged_chans = ~np.all(flags, axis=0)
    frequently_flagged_times =  np.mean(flags[:, unflagged_chans], axis=1) >= max_flag_frac
    if verbose:
        print(f'Flagging {np.sum(frequently_flagged_times) - np.sum(np.all(flags, axis=1))} times previously flagged {max_flag_frac:.2%} or more.')
    flags[frequently_flagged_times, :] = True             
In [33]:
# Flag whole integrations or channels
iteratively_flag_on_averaged_zscore(round_1_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True)
impose_max_chan_flag_frac(round_1_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)
impose_max_time_flag_frac(round_1_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)
Mean of empty slice
Mean of empty slice
Flagging an additional 5 integrations and 1 channels.
Flagging 69 channels previously flagged 25.00% or more.
Flagging 25 times previously flagged 10.00% or more.

Figure 3: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Expanded Flags¶

This is the same as Figure 2, but includes additional flags identified based on a full 2D DPSS filter of this waterfall.

In [34]:
plot_z_score(round_1_flags, zscore)
No description has been provided for this image

Make new flags from scratch¶

In [35]:
noise = predict_auto_noise(avg_auto, int_time, chan_res, nsamples=len(good_auto_bls))
wgts = wgts = np.where(round_1_flags, 0, noise**-2)
In [36]:
time_filters, freq_filters = dpss_filters(freqs=cs.freqs[chans_to_filt], # Hz
                                          times=cs.time_grid[ints_to_filt], # JD
                                          freq_scale=FREQ_FILTER_SCALE,
                                          time_scale=TIME_FILTER_SCALE,
                                          eigenval_cutoff=EIGENVAL_CUTOFF)
model = np.array(avg_auto)
submodel, _ = solve_2D_DPSS(avg_auto[ints_to_filt, chans_to_filt], wgts[ints_to_filt, chans_to_filt],
                                   time_filters, freq_filters, method='lu_solve')
model[ints_to_filt, chans_to_filt] = submodel
In [37]:
noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls))
zscore = ((avg_auto - model) / noise_model).real
In [38]:
round_2_flags = np.zeros_like(round_1_flags)

# flag any integrations fully-flagged by the notebooks (also accounts for missing data)
round_2_flags[np.all(initial_cal_flags, axis=1), :] = True

# flag on FM, sun-up data, and a priori flags
flag_FM(round_2_flags, cs.freqs, freq_range=FM_freq_range)
flag_sun(round_2_flags, cs.time_grid, max_solar_alt=MAX_SOLAR_ALT)
if APRIORI_YAML_PATH is not None:
    round_2_flags[apriori_flags, :] = True

# flag any round 1 flags that are still moderately high z-score
round_2_flags[round_1_flags & (zscore > REPEAT_FLAG_Z_THRESH)] = True
In [39]:
# Flag outliers and then perform watershed flagging
round_2_flags[zscore.real > Z_THRESH] = True
ws_flags = xrfi._ws_flag_waterfall(zscore.real, round_2_flags, WS_Z_THRESH)
round_2_flags |= ws_flags
In [40]:
# Flag whole integrations or channels
iteratively_flag_on_averaged_zscore(round_2_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True)
impose_max_chan_flag_frac(round_2_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)
impose_max_time_flag_frac(round_2_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)
Mean of empty slice
Mean of empty slice
Flagging an additional 1 integrations and 0 channels.
Flagging 64 channels previously flagged 25.00% or more.
Flagging 30 times previously flagged 10.00% or more.

Figure 4: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Final, Re-Computed Flags¶

This is the same as Figures 2 and 3, but now includes only the final set of flags.

In [41]:
plot_z_score(round_2_flags, zscore)
No description has been provided for this image
In [42]:
def summarize_flagging():
    plt.figure(figsize=(14,10), dpi=100)
    cmap = matplotlib.colors.ListedColormap(((0, 0, 0),) + matplotlib.cm.get_cmap("Set2").colors[:3])
    plt.imshow(np.where(initial_cal_flags & round_2_flags, 1, np.where(initial_cal_flags, 2, np.where(round_2_flags, 3, 0))), 
               aspect='auto', cmap=cmap, interpolation='none', extent=extent)
    plt.clim([-.5, 3.5])
    cbar = plt.colorbar(location='top', aspect=40, pad=.02)
    cbar.set_ticks([0, 1, 2, 3])
    cbar.set_ticklabels(['Unflagged', 'Flagged by both file_calibration and here', 'Flagged by file_calibration only', 'Flagged here only'])
    plt.xlabel('Frequency (MHz)')
    plt.ylabel(f'JD - {int(cs.time_grid[0])}')
    plt.tight_layout()

Figure 5: Summary of Flags Before and After Recomputing Them¶

This plot shows which times and frequencies were flagged by either the file_calibration notebook (which also includes a priori flags imposed here like FM), which ones were flagged only in this notebook, and which ones were flagged consistently (and often independently) in both.

In [43]:
summarize_flagging()
The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
No description has been provided for this image

Save Results¶

In [44]:
add_to_history = 'Produced by full_day_rfi notebook with the following environment:\n' + '=' * 65 + '\n' + os.popen('conda env export').read() + '=' * 65
In [45]:
out_flag_files = [auto_sum.replace(SUM_AUTOS_SUFFIX, OUT_FLAG_SUFFIX) for auto_sum in auto_sums]
for auto_sum, cal, out_flag_file in zip(auto_sums, cs.cals, out_flag_files):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # create UVFlag object based on UVData
        uvd = UVData()
        uvd.read(auto_sum)
        uvf = UVFlag(uvd, waterfall=True, mode='flag')

        # fill out flags
        for p in range(uvf.Npols):
            uvf.flag_array[:, :, p] = round_2_flags[cs.time_indices[cal], :]

        # write to disk
        uvf.history += add_to_history
    uvf.write(out_flag_file, clobber=True)
    
print(f'Saved {len(out_flag_files)} *.{OUT_FLAG_SUFFIX} files starting with {out_flag_files[0]}.')    
Saved 1848 *.sum.flag_waterfall.h5 files starting with /lustre/aoc/projects/hera/h6c-analysis/IDR3/2459934/zen.2459934.21347.sum.flag_waterfall.h5.

TODO: Explore per-antenna flagging using DPSS filters¶

Metadata¶

In [46]:
for repo in ['hera_cal', 'hera_qm', 'hera_filters', 'hera_notebook_templates', 'pyuvdata']:
    exec(f'from {repo} import __version__')
    print(f'{repo}: {__version__}')
hera_cal: 3.7.7.dev67+g3ed1b8028
hera_qm: 2.2.1.dev4+gf6d0211
hera_filters: 0.1.7
hera_notebook_templates: 0.0.1.dev1269+g4fd50fce5
pyuvdata: 3.2.3.dev10+g11d3f658
In [47]:
print(f'Finished execution in {(time.time() - tstart) / 60:.2f} minutes.')
Finished execution in 104.56 minutes.