In [1]:
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image, display
from pathlib import Path
from sklearn.decomposition import PCA

from pea_met_network.qa_qc import coverage_summary
from pea_met_network.redundancy import (
    benchmark_to_stanhope,
    build_station_matrix,
    build_station_recommendations,
    cluster_station_order,
    pairwise_station_correlation,
    pca_station_loadings,
)

# ---------------------------------------------------------------------------
# FWI mode selector — switch between "hourly" and "compliant"
# ---------------------------------------------------------------------------
FWI_MODE = "hourly"

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(".").resolve()
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "notebooks" / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Date range for analysis — must match cleaning-config.json
ANALYSIS_START = pd.Timestamp("2023-04-01", tz="UTC")

sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 120

print(f"FWI mode: {FWI_MODE}")
print(f"Processed dir: {PROCESSED_DIR}")
mkdir -p failed for path /home/moltis/.config/matplotlib: [Errno 13] Permission denied: '/home/moltis/.config/matplotlib'
Matplotlib created a temporary cache directory at /tmp/matplotlib-ep8gx_14 because there was an issue with the default path (/home/moltis/.config/matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
FWI mode: hourly
Processed dir: /mnt/fast_data/workspaces/pea-met-network/data/processed

1. Data Loading¶

Load hourly processed data for all stations. Station list is derived from disk.

In [2]:
# Discover stations from disk — no hardcoded list
stations = sorted([
    d.name for d in PROCESSED_DIR.iterdir()
    if d.is_dir() and (d / "station_hourly.csv").exists()
])
print(f"Stations found: {stations}")

hourly_frames = []
daily_frames = []
for station in stations:
    h_path = PROCESSED_DIR / station / "station_hourly.csv"
    d_path = PROCESSED_DIR / station / "station_daily.csv"
    h_df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
    h_df["timestamp_utc"] = pd.to_datetime(h_df["timestamp_utc"], utc=True)
    h_df = h_df[h_df["timestamp_utc"] >= ANALYSIS_START]
    hourly_frames.append(h_df)
    if d_path.exists():
        d_df = pd.read_csv(d_path, parse_dates=["timestamp_utc"])
        d_df["timestamp_utc"] = pd.to_datetime(d_df["timestamp_utc"], utc=True)
        daily_frames.append(d_df)
    tmin = h_df["timestamp_utc"].min()
    tmax = h_df["timestamp_utc"].max()
    print(f"  {station}: {len(h_df)} hourly rows  ({tmin} to {tmax})")

hourly_all = pd.concat(hourly_frames, ignore_index=True)
if daily_frames:
    daily_all = pd.concat(daily_frames, ignore_index=True)
else:
    daily_all = pd.DataFrame()
n = hourly_all["station"].nunique()
print(f"\nCombined: {len(hourly_all)} hourly rows, {n} stations")
Stations found: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
  cavendish: 26092 hourly rows  (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  greenwich: 26092 hourly rows  (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  north_rustico: 26113 hourly rows  (2023-04-01 00:00:00+00:00 to 2026-03-24 00:00:00+00:00)
  stanhope: 24144 hourly rows  (2023-04-01 00:00:00+00:00 to 2025-12-31 23:00:00+00:00)
  stanley_bridge: 26092 hourly rows  (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  tracadie: 23956 hourly rows  (2023-06-29 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)

Combined: 152489 hourly rows, 6 stations

2. Exploratory Data Analysis¶

2.1 Station Coverage Table¶

Summary of temporal coverage per station.

In [3]:
coverage_frames = []
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
    df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
    df = df[df["timestamp_utc"] >= ANALYSIS_START]
    coverage_frames.append(df[["station", "timestamp_utc"]])

coverage_df = pd.concat(coverage_frames, ignore_index=True)
cov_summary = coverage_summary(coverage_df)
print(cov_summary.to_string(index=False))
       station  total_records
     cavendish          26092
     greenwich          26092
 north_rustico          26113
      stanhope          24144
stanley_bridge          26092
      tracadie          23956

2.2 Temporal Coverage Summary¶

Visual overview of data availability by station over time.

In [4]:
fig, ax = plt.subplots(figsize=(12, 4))

for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
    df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
    df = df[df["timestamp_utc"] >= ANALYSIS_START]
    start = df["timestamp_utc"].min().floor("D")
    end = df["timestamp_utc"].max().floor("D")
    date_range = pd.date_range(start, end, freq="D")
    daily_counts = df.set_index("timestamp_utc").resample("D").size()
    daily_counts = daily_counts.reindex(date_range, fill_value=0)
    ax.fill_between(daily_counts.index, 0, daily_counts.values, alpha=0.4, label=station)
    ax.plot(daily_counts.index, daily_counts.values, linewidth=0.8)

ax.set_xlabel("Date")
ax.set_ylabel("Hourly Records per Day")
ax.set_title("Temporal Coverage by Station")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "temporal_coverage.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "temporal_coverage.png")))
print("Saved: notebooks/figures/temporal_coverage.png")
No description has been provided for this image
Saved: notebooks/figures/temporal_coverage.png
No description has been provided for this image

2.3 Missingness Heatmaps¶

Each heatmap is split into two panels:

  • Left panel — Core Sensors / FWI: Variables directly used as FWI inputs (temp, RH, wind, rain) plus FWI output indices (FFMC, DMC, DC, ISI, BUI, FWI on daily). These are the columns that matter for the fire weather pipeline.

  • Right panel — Extended Sensors: Auxiliary measurements (wind direction, solar radiation, barometric pressure, dew point, gust speed) that are not used in FWI computation. Many stations lack some or all of these sensors, so 100% missing values here indicate a missing instrument, not a data problem.

  • Hourly heatmap — raw observations from station_hourly.csv.

  • Daily heatmap — daily aggregates with FWI components from <station>_daily_compliant.csv (falls back to station_daily.csv).

In [5]:
# --- Hourly missingness heatmap (core vs extended sensors) ---
FWI_CORE_VARS = [
    "air_temperature_c", "relative_humidity_pct",
    "wind_speed_kmh", "rain_mm",
]
HOURLY_EXTENDED_VARS = [
    "wind_direction_deg", "solar_radiation_w_m2",
    "barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]

hourly_missing_frames = []
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath)
    df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
    row = {"station": station}
    for var in FWI_CORE_VARS + HOURLY_EXTENDED_VARS:
        if var in df.columns:
            row[var] = df[var].isna().mean() * 100
        else:
            row[var] = 100.0
    hourly_missing_frames.append(row)

hourly_missing_df = pd.DataFrame(hourly_missing_frames).set_index("station")

# Split into core and extended panels
core_cols = [v for v in FWI_CORE_VARS if v in hourly_missing_df.columns]
ext_cols = [v for v in HOURLY_EXTENDED_VARS if v in hourly_missing_df.columns]

fig, (ax_core, ax_ext) = plt.subplots(
    1, 2, figsize=(max(12, (len(core_cols) + len(ext_cols)) * 1.1), max(3, len(stations) * 0.45)),
    gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.4},
)

sns.heatmap(
    hourly_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
    ax=ax_core, cbar_kws={"label": "% Missing"},
)
ax_core.set_title("Core Sensors (FWI Inputs)", fontweight="bold", fontsize=11)
ax_core.set_xlabel("Variable")
ax_core.set_ylabel("Station")

if ext_cols:
    sns.heatmap(
        hourly_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
        ax=ax_ext, cbar_kws={"label": "% Missing"},
    )
else:
    ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
                ha="center", va="center", fontsize=12, color="gray")
ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
ax_ext.set_xlabel("Variable")
ax_ext.set_ylabel("")

fig.suptitle("Hourly Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
             fontsize=12, y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_heatmap_hourly.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_hourly.png")))
print("Saved: notebooks/figures/missingness_heatmap_hourly.png")
print()
print("Hourly missingness summary (core):")
print(hourly_missing_df[core_cols].round(1).to_string())
print()
print("Hourly missingness summary (extended):")
print(hourly_missing_df[ext_cols].round(1).to_string())
No description has been provided for this image
Saved: notebooks/figures/missingness_heatmap_hourly.png

Hourly missingness summary (core):
                air_temperature_c  relative_humidity_pct  wind_speed_kmh  rain_mm
station                                                                          
cavendish                    14.5                   14.5             0.0      0.0
greenwich                     4.1                   16.2             4.0      0.1
north_rustico                 4.1                    4.1             0.0      0.0
stanhope                      0.3                    0.3             0.3      0.2
stanley_bridge                0.0                   17.2             0.0      0.0
tracadie                      0.0                   12.8             0.0      0.0

Hourly missingness summary (extended):
                wind_direction_deg  solar_radiation_w_m2  barometric_pressure_kpa  dew_point_c  wind_gust_speed_kmh
station                                                                                                            
cavendish                     14.5                  14.5                    100.0         14.5                 14.9
greenwich                     12.1                  12.1                    100.0         16.2                 12.5
north_rustico                  2.1                   1.3                      1.3          1.3                  4.3
stanhope                       2.1                 100.0                    100.0          0.3                100.0
stanley_bridge                18.4                  18.4                     34.7        100.0                 32.1
tracadie                      13.0                  13.0                     18.8        100.0                 15.5
No description has been provided for this image
In [6]:
# --- Daily missingness heatmap (core vs extended, includes FWI) ---
FWI_OUTPUT_VARS = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
FWI_CORE_DAILY = [
    "air_temperature_c", "relative_humidity_pct",
    "wind_speed_kmh", "rain_mm",
] + FWI_OUTPUT_VARS
DAILY_EXTENDED = [
    "wind_direction_deg", "solar_radiation_w_m2",
    "barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]

daily_missing_frames = []
for station in stations:
    c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"
    d_path = PROCESSED_DIR / station / "station_daily.csv"
    fpath = c_path if c_path.exists() else d_path
    if not fpath.exists():
        continue
    df = pd.read_csv(fpath)
    if "timestamp_utc" in df.columns:
        df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
    data_cols = sorted([
        c for c in df.columns
        if not c.endswith("_qf") and not c.endswith("_src")
        and not c.endswith("_method") and not c.startswith("_quality")
        and c not in ("station", "timestamp_utc", "carry_forward_used")
    ])
    row = {"station": station}
    for var in data_cols:
        row[var] = df[var].isna().mean() * 100
    daily_missing_frames.append(row)

if daily_missing_frames:
    daily_missing_df = pd.DataFrame(daily_missing_frames).set_index("station")
    all_cols = sorted(daily_missing_df.columns)

    core_cols = [v for v in FWI_CORE_DAILY if v in all_cols]
    ext_cols = [v for v in all_cols if v not in core_cols]

    fig, (ax_core, ax_ext) = plt.subplots(
        1, 2,
        figsize=(max(14, (len(core_cols) + len(ext_cols)) * 1.0), max(3, len(daily_missing_df) * 0.45)),
        gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.5},
    )

    sns.heatmap(
        daily_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
        ax=ax_core, cbar_kws={"label": "% Missing"},
    )
    ax_core.set_title("Core + FWI Outputs", fontweight="bold", fontsize=11)
    ax_core.set_xlabel("Variable")
    ax_core.set_ylabel("Station")

    if ext_cols:
        sns.heatmap(
            daily_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
            ax=ax_ext, cbar_kws={"label": "% Missing"},
        )
    else:
        ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
                    ha="center", va="center", fontsize=12, color="gray")
    ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
    ax_ext.set_xlabel("Variable")
    ax_ext.set_ylabel("")

    fig.suptitle("Daily Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
                 fontsize=12, y=1.02)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "missingness_heatmap_daily.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_daily.png")))
    print("Saved: notebooks/figures/missingness_heatmap_daily.png")
    print()
    print("Daily missingness summary (core + FWI):")
    print(daily_missing_df[core_cols].round(1).to_string())
    print()
    print("Daily missingness summary (extended):")
    print(daily_missing_df[ext_cols].round(1).to_string())
else:
    print("No daily CSVs found \u2014 daily missingness heatmap skipped.")
No description has been provided for this image
Saved: notebooks/figures/missingness_heatmap_daily.png

Daily missingness summary (core + FWI):
                air_temperature_c  relative_humidity_pct  wind_speed_kmh  rain_mm  ffmc  dmc   dc  isi  bui  fwi
station                                                                                                         
cavendish                    14.6                   14.6             0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0
greenwich                     4.1                   16.2             4.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0
north_rustico                 4.0                    4.0             0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0
stanhope                      0.6                    0.7             0.6      0.0   0.0  0.0  0.0  0.0  0.0  0.0
stanley_bridge                0.0                   17.2             0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0
tracadie                      0.0                   12.8             0.0      0.0   0.0  0.0  0.0  0.0  0.0  0.0

Daily missingness summary (extended):
                barometric_pressure_kpa  dew_point_c  solar_radiation_w_m2  wind_direction_deg  wind_gust_speed_kmh
station                                                                                                            
cavendish                         100.0         14.6                  14.6                14.6                 15.0
greenwich                         100.0         16.2                  12.1                12.1                 12.5
north_rustico                       1.2          1.2                   1.3                 1.9                  4.1
stanhope                          100.0          0.6                   NaN                 1.2                  NaN
stanley_bridge                     34.6        100.0                  18.4                18.4                 32.0
tracadie                           18.7        100.0                  12.9                12.9                 15.5
No description has been provided for this image

2.4 Imputation Summary¶

Gaps filled per station per variable from the pipeline imputation report.

In [7]:
imp_path = PROCESSED_DIR / "imputation_report.csv"
if imp_path.exists():
    imp_df = pd.read_csv(imp_path)
    print(f"Imputation report: {len(imp_df)} entries")
    print("\nGaps filled per station per variable:")
    pivot = imp_df.groupby(["station", "variable"])["count_affected"].sum().reset_index()
    pivot_table = pivot.pivot(index="station", columns="variable", values="count_affected").fillna(0).astype(int)
    print(pivot_table.to_string())

    print("\nImputation method breakdown:")
    method_counts = imp_df["method"].value_counts()
    print(method_counts.to_string())

    # Bar chart: total gaps filled by station
    station_totals = imp_df.groupby("station")["count_affected"].sum().sort_values(ascending=True)
    fig, ax = plt.subplots(figsize=(8, 4))
    station_totals.plot(kind="barh", ax=ax, color="steelblue", alpha=0.8)
    ax.set_xlabel("Total Gaps Filled")
    ax.set_title("Imputation Summary by Station")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "imputation_summary.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "imputation_summary.png")))
    print("Saved: notebooks/figures/imputation_summary.png")
else:
    print("imputation_report.csv not found — run pipeline first.")
Imputation report: 717 entries

Gaps filled per station per variable:
variable        air_temperature_c  barometric_pressure_kpa  dew_point_c  relative_humidity_pct  solar_radiation_w_m2  wind_direction_deg  wind_speed_kmh
station                                                                                                                                                 
cavendish                    3789                    26092         3787                   3787                  3786                3786            3890
greenwich                   10158                    26092         4216                   4229                  3147                3145            3249
north_rustico                1076                      330          331                   1075                   331                 547            1188
stanhope                       77                    24144           68                     79                     0                 516              68
stanley_bridge               4486                     9043        26092                  26092                  4800                4800            8371
tracadie                     3076                     4508        23956                  23956                  3107                3107            3809

Imputation method breakdown:
method
linear_interpolation    575
preserve                142
No description has been provided for this image
Saved: notebooks/figures/imputation_summary.png
No description has been provided for this image

3. Fire Weather Index Analysis¶

Time series of FWI system components for all stations with computed indices.

3.1 FWI Component Time Series¶

Daily-mean FWI components for all stations. FFMC uses hourly values; DMC, DC, BUI, FWI use daily means.

In [8]:
fwi_vars = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]

fwi_stations = []
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
    df = df[df["timestamp_utc"] >= ANALYSIS_START]
    if "fwi" in df.columns and df["fwi"].notna().sum() > 100:
        fwi_stations.append(station)

print(f"Stations with FWI data: {fwi_stations}")

if fwi_stations:
    fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
    if len(fwi_stations) == 1:
        axes = [axes]

    for idx, station in enumerate(fwi_stations):
        fpath = PROCESSED_DIR / station / "station_hourly.csv"
        df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
        df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
        df = df[df["timestamp_utc"] >= ANALYSIS_START]
        df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()
        ax = axes[idx]
        for var in fwi_vars:
            if var in df_daily.columns:
                ax.plot(df_daily.index, df_daily[var], label=var, alpha=0.8, linewidth=0.9)
        ax.set_ylabel("Value")
        ax.set_title(f"FWI Components - {station}")
        ax.legend(fontsize=7, ncol=3)

    axes[-1].set_xlabel("Date")
    fig.suptitle("Fire Weather Index Time Series (Daily Means)", y=1.02)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "fwi_timeseries.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "fwi_timeseries.png")))
    print(f"Plotted FWI for: {fwi_stations}")
    print("Saved: notebooks/figures/fwi_timeseries.png")
else:
    print("FWI analysis skipped: no stations with non-missing FWI data")
Stations with FWI data: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
No description has been provided for this image
Plotted FWI for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_timeseries.png
No description has been provided for this image

3.1b QF-Aware FWI Time Series¶

FWI values plotted with background shading indicating data quality.

  • Yellow band: at least one FWI input was synthetically imputed (qf == 1)
  • Red band: at least one FWI input was chain-break gap-filled (qf == 9)
  • No shading: all FWI inputs from observed data (qf == 0)

Stations without per-variable _qf columns (stanhope) are skipped.

In [9]:
# --- QF-Aware FWI Timeseries ---
if fwi_stations:
    fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
    if len(fwi_stations) == 1:
        axes = [axes]

    fwi_input_qf = ["air_temperature_c_qf", "relative_humidity_pct_qf", "wind_speed_kmh_qf", "rain_mm_qf"]

    for idx, station in enumerate(fwi_stations):
        fpath = PROCESSED_DIR / station / "station_hourly.csv"
        df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
        df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
        df = df[df["timestamp_utc"] >= ANALYSIS_START]
        ax = axes[idx]

        # Daily-mean FWI
        df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()

        # Daily quality: worst qf per day across all available input qf cols
        available_qf = [c for c in fwi_input_qf if c in df.columns]
        if available_qf:
            qf_daily = df.set_index("timestamp_utc")[available_qf].resample("D").max()
            worst_qf = qf_daily.max(axis=1)
        else:
            worst_qf = pd.Series(0, index=df_daily.index)

        # Background shading
        for i in range(len(worst_qf)):
            if worst_qf.iloc[i] >= 9:
                ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
                          worst_qf.index[i] + pd.Timedelta(hours=12),
                          color='red', alpha=0.15)
            elif worst_qf.iloc[i] >= 1:
                ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
                          worst_qf.index[i] + pd.Timedelta(hours=12),
                          color='gold', alpha=0.20)

        # FWI line
        ax.plot(df_daily.index, df_daily["fwi"], color="black", linewidth=0.8, label="FWI")
        ax.set_ylabel("FWI")
        ax.set_title(f"QF-Aware FWI - {station}")
        ax.legend(fontsize=7)

    axes[-1].set_xlabel("Date")
    fig.suptitle("FWI with Quality Flag Shading (yellow=synthetic, red=chain-break)", y=1.02)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "fwi_qf_aware.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "fwi_qf_aware.png")))
    print("Saved: notebooks/figures/fwi_qf_aware.png")
else:
    print("QF-Aware FWI skipped: no stations with FWI data.")
No description has been provided for this image
Saved: notebooks/figures/fwi_qf_aware.png
No description has been provided for this image

3.2 FWI Mode Comparison¶

Overlay FWI values from hourly and compliant modes to show divergence (e.g., during chain breaks).

In [10]:
# Try to load both modes for comparison
hourly_fwi_data = {}
compliant_fwi_data = {}

for station in stations:
    h_path = PROCESSED_DIR / station / "station_hourly.csv"
    d_path = PROCESSED_DIR / station / "station_daily.csv"
    c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"

    if h_path.exists():
        df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
        df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
        df = df[df["timestamp_utc"] >= ANALYSIS_START]
        if "fwi" in df.columns and df["fwi"].notna().sum() > 0:
            hourly_fwi_data[station] = df.set_index("timestamp_utc")["fwi"].resample("D").mean()

    if c_path.exists():
        df_c = pd.read_csv(c_path, parse_dates=["timestamp_utc"])
        df_c["timestamp_utc"] = pd.to_datetime(df_c["timestamp_utc"], utc=True)
        df_c = df_c[df_c["timestamp_utc"] >= ANALYSIS_START]
        if "fwi" in df_c.columns and df_c["fwi"].notna().sum() > 0:
            compliant_fwi_data[station] = df_c.set_index("timestamp_utc")["fwi"]

compare_stations = sorted(set(hourly_fwi_data.keys()) & set(compliant_fwi_data.keys()))

if compare_stations:
    fig, axes = plt.subplots(len(compare_stations), 1, figsize=(14, 3 * len(compare_stations)), sharex=True)
    if len(compare_stations) == 1:
        axes = [axes]

    for idx, station in enumerate(compare_stations):
        ax = axes[idx]
        ax.plot(hourly_fwi_data[station].index, hourly_fwi_data[station].values, label="hourly", alpha=0.7, linewidth=0.9)
        ax.plot(compliant_fwi_data[station].index, compliant_fwi_data[station].values, label="compliant", alpha=0.7, linewidth=0.9)
        ax.set_ylabel("FWI")
        ax.set_title(f"FWI Mode Comparison — {station}")
        ax.legend(fontsize=8)

    axes[-1].set_xlabel("Date")
    fig.suptitle("Hourly vs Compliant FWI", y=1.02)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "fwi_mode_comparison.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "fwi_mode_comparison.png")))
    print(f"Mode comparison for: {compare_stations}")
    print("Saved: notebooks/figures/fwi_mode_comparison.png")
else:
    print("FWI mode comparison skipped: compliant-mode daily files not found.")
    print("Run the pipeline with --fwi-mode compliant to generate them.")
No description has been provided for this image
Mode comparison for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_mode_comparison.png
No description has been provided for this image

3.3 FWI Value Statistics¶

Descriptive statistics (min, max, mean, std) for FWI codes per station, sourced from the QA/QC report.

In [11]:
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
    qa_df = pd.read_csv(qa_qc_path)
    fwi_stat_cols = [c for c in qa_df.columns if c.startswith("fwi_stat_")]
    if fwi_stat_cols:
        stat_display = qa_df[["station"] + fwi_stat_cols].copy()
        print("FWI Value Statistics (from QA/QC report):")
        print(stat_display.to_string(index=False))
    else:
        print("QA/QC report exists but has no fwi_stat_ columns.")
        print("Columns available:", list(qa_df.columns))
else:
    print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
QA/QC report exists but has no fwi_stat_ columns.
Columns available: ['station', 'hourly_rows', 'daily_rows', 'date_range_start', 'date_range_end', 'completeness', 'fwi_mode', 'pre_imp_missing_pct_air_temperature_c', 'pre_imp_missing_pct_relative_humidity_pct', 'pre_imp_missing_pct_wind_speed_kmh', 'pre_imp_missing_pct_rain_mm', 'post_imp_missing_pct_air_temperature_c', 'post_imp_missing_pct_air_temperature_c_qf', 'post_imp_missing_pct_barometric_pressure_kpa', 'post_imp_missing_pct_bui', 'post_imp_missing_pct_bui_qf', 'post_imp_missing_pct_dc', 'post_imp_missing_pct_dc_qf', 'post_imp_missing_pct_dew_point_c', 'post_imp_missing_pct_dmc', 'post_imp_missing_pct_dmc_qf', 'post_imp_missing_pct_ffmc', 'post_imp_missing_pct_ffmc_qf', 'post_imp_missing_pct_fwi', 'post_imp_missing_pct_fwi_qf', 'post_imp_missing_pct_isi', 'post_imp_missing_pct_isi_qf', 'post_imp_missing_pct_rain_mm', 'post_imp_missing_pct_rain_mm_qf', 'post_imp_missing_pct_relative_humidity_pct', 'post_imp_missing_pct_relative_humidity_pct_qf', 'post_imp_missing_pct_solar_radiation_w_m2', 'post_imp_missing_pct_wind_direction_deg', 'post_imp_missing_pct_wind_gust_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh_qf', 'duplicate_count', 'out_of_range_temp_count', 'out_of_range_rh_count', 'out_of_range_wind_count', 'quality_enforced_count', 'quality_flagged_count', 'out_of_range_pre_enforcement', 'out_of_range_post_enforcement', 'fwi_chain_breaks', 'carry_forward_days', 'carry_forward_pct', 'ffmc_min', 'ffmc_max', 'ffmc_mean', 'ffmc_std', 'dmc_min', 'dmc_max', 'dmc_mean', 'dmc_std', 'dc_min', 'dc_max', 'dc_mean', 'dc_std', 'isi_min', 'isi_max', 'isi_mean', 'isi_std', 'bui_min', 'bui_max', 'bui_mean', 'bui_std', 'fwi_min', 'fwi_max', 'fwi_mean', 'fwi_std']

3.4 Chain Break Analysis¶

Analysis of FWI chain breaks by station and cause, sourced from the chain break diagnostic report.

In [12]:
cb_path = PROCESSED_DIR / f"fwi_missingness_report_{FWI_MODE}.csv"
if cb_path.exists():
    cb_df = pd.read_csv(cb_path)
    print(f"Chain break report: {len(cb_df)} entries")

    # Summary table: breaks per station, grouped by cause
    if "cause" in cb_df.columns:
        print("\nBreaks per station by cause:")
        cause_pivot = cb_df.groupby(["station", "cause"]).size().reset_index(name="count")
        cause_table = cause_pivot.pivot(index="station", columns="cause", values="count").fillna(0).astype(int)
        print(cause_table.to_string())

        # Breakdown by missing input
        if "missing_input" in cb_df.columns:
            print("\nBreakdown by missing input:")
            input_counts = cb_df["missing_input"].value_counts()
            print(input_counts.to_string())

        # Bar chart: chain breaks by station, colored by cause
        fig, ax = plt.subplots(figsize=(10, 5))
        causes = cb_df["cause"].unique()
        station_breaks = cb_df.groupby(["station", "cause"]).size().unstack(fill_value=0)
        station_breaks.plot(kind="bar", stacked=True, ax=ax, colormap="Set2")
        ax.set_xlabel("Station")
        ax.set_ylabel("Chain Break Count")
        ax.set_title(f"FWI Chain Breaks by Station and Cause ({FWI_MODE} mode)")
        ax.legend(title="Cause")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        fig.savefig(FIGURES_DIR / "chain_breaks.png", bbox_inches="tight")
        display(Image(filename=str(FIGURES_DIR / "chain_breaks.png")))
        print("Saved: notebooks/figures/chain_breaks.png")

        # Cascade origins table
        cascade = cb_df[cb_df["cause"] == "cascade"]
        if len(cascade) > 0 and "cascade_origin" in cascade.columns:
            print("\nCascade origins:")
            print(cascade[["station", "cascade_origin"]].to_string(index=False))
    else:
        print("Chain break report has no 'cause' column. Columns:", list(cb_df.columns))
else:
    print(f"fwi_missingness_report_{FWI_MODE}.csv not found — no chain breaks or pipeline not run.")
Chain break report: 143 entries

Breaks per station by cause:
cause           input_missing  startup
station                               
cavendish                  19        2
greenwich                  42        0
north_rustico              10        0
stanhope                   44        2
stanley_bridge             11        0
tracadie                   11        2

Breakdown by missing input:
missing_input
relative_humidity_pct                                       62
air_temperature_c, relative_humidity_pct, wind_speed_kmh    29
air_temperature_c                                           28
air_temperature_c, relative_humidity_pct                    18
No description has been provided for this image
Saved: notebooks/figures/chain_breaks.png
No description has been provided for this image

4. Data Quality Report¶

Quality metrics from the QA/QC report, quality enforcement actions, and cross-station imputation audit.

4.1 Pre/Post Imputation Missingness¶

Side-by-side heatmap comparing missingness before and after imputation for the four core FWI input variables (temperature, relative humidity, wind speed, precipitation). Both panels share the same color scale so improvements are visually obvious.

Note: Pre-imputation data is only available for the four core variables — those are the ones the pipeline imputes. Extended sensor variables (solar radiation, barometric pressure, etc.) are not imputed by the pipeline.

In [13]:
# --- Pre/Post Imputation Missingness Heatmaps ---
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
    qa_df = pd.read_csv(qa_qc_path)

    # Core variables have pre-imputation data (these are what the pipeline imputes)
    core_vars = ["air_temperature_c", "relative_humidity_pct", "wind_speed_kmh", "rain_mm"]

    pre_rows, post_rows = [], []
    for _, row in qa_df.iterrows():
        station = row["station"]
        pre_r = {"station": station}
        post_r = {"station": station}
        for var in core_vars:
            pre_key = f"pre_imp_missing_pct_{var}"
            post_key = f"post_imp_missing_pct_{var}"
            pre_r[var] = row.get(pre_key, None)
            post_r[var] = row.get(post_key, None)
        pre_rows.append(pre_r)
        post_rows.append(post_r)

    pre_df = pd.DataFrame(pre_rows).set_index("station").dropna(axis=1, how="all")
    post_df = pd.DataFrame(post_rows).set_index("station").dropna(axis=1, how="all")

    common_cols = sorted(set(pre_df.columns) & set(post_df.columns))
    pre_df = pre_df[common_cols]
    post_df = post_df[common_cols]

    fig, (ax_pre, ax_post) = plt.subplots(
        1, 2,
        figsize=(max(10, len(common_cols) * 2.2), max(3, len(pre_df) * 0.45)),
    )

    vmin = 0
    vmax = max(pre_df.values.max(), post_df.values.max(), 1)

    sns.heatmap(
        pre_df, annot=True, fmt=".1f", cmap="YlOrRd",
        ax=ax_pre, vmin=vmin, vmax=vmax,
        cbar_kws={"label": "% Missing"},
    )
    ax_pre.set_title("Before Imputation", fontweight="bold", fontsize=11)
    ax_pre.set_xlabel("Variable")
    ax_pre.set_ylabel("Station")

    sns.heatmap(
        post_df, annot=True, fmt=".1f", cmap="YlOrRd",
        ax=ax_post, vmin=vmin, vmax=vmax,
        cbar_kws={"label": "% Missing"},
    )
    ax_post.set_title("After Imputation", fontweight="bold", fontsize=11)
    ax_post.set_xlabel("Variable")
    ax_post.set_ylabel("")

    fig.suptitle(
        "Pre vs Post Imputation Missingness — Core FWI Variables (% Missing)",
        fontsize=12, y=1.02,
    )
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "missingness_pre_post_imputation.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "missingness_pre_post_imputation.png")))
    print("Saved: notebooks/figures/missingness_pre_post_imputation.png")

    # Also print the text comparison
    print()
    print("Pre/Post Imputation Missingness Comparison:")
    comparison_rows = []
    for station in pre_df.index:
        for var in common_cols:
            comparison_rows.append({
                "station": station,
                "variable": var,
                "pre_%": pre_df.loc[station, var],
                "post_%": post_df.loc[station, var],
                "improved_%": round(pre_df.loc[station, var] - post_df.loc[station, var], 2),
            })
    comp_df = pd.DataFrame(comparison_rows)
    print(comp_df.to_string(index=False))
else:
    print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
No description has been provided for this image
Saved: notebooks/figures/missingness_pre_post_imputation.png

Pre/Post Imputation Missingness Comparison:
       station              variable  pre_%  post_%  improved_%
     greenwich     air_temperature_c  38.93    4.06       34.87
     greenwich               rain_mm   0.13    0.13        0.00
     greenwich relative_humidity_pct  16.21   16.19        0.02
     greenwich        wind_speed_kmh  12.45    3.96        8.49
     cavendish     air_temperature_c  14.52   14.52        0.00
     cavendish               rain_mm   0.03    0.03        0.00
     cavendish relative_humidity_pct  14.51   14.51        0.00
     cavendish        wind_speed_kmh  14.91    0.00       14.91
 north_rustico     air_temperature_c   4.12    4.12        0.00
 north_rustico               rain_mm   0.04    0.04        0.00
 north_rustico relative_humidity_pct   4.12    4.12        0.00
 north_rustico        wind_speed_kmh   4.55    0.00        4.55
stanley_bridge     air_temperature_c  17.19    0.00       17.19
stanley_bridge               rain_mm   0.00    0.00        0.00
stanley_bridge relative_humidity_pct 100.00   17.19       82.81
stanley_bridge        wind_speed_kmh  32.08    0.00       32.08
      tracadie     air_temperature_c  12.84    0.00       12.84
      tracadie               rain_mm   0.00    0.00        0.00
      tracadie relative_humidity_pct 100.00   12.84       87.16
      tracadie        wind_speed_kmh  15.90    0.00       15.90
      stanhope     air_temperature_c   0.32    0.32        0.00
      stanhope               rain_mm   0.19    0.19        0.00
      stanhope relative_humidity_pct   0.33    0.33        0.00
      stanhope        wind_speed_kmh   0.28    0.28        0.00
No description has been provided for this image

4.2 Quality Enforcement Actions¶

Count of quality enforcement actions by type and station.

In [14]:
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
    qe_df = pd.read_csv(qe_path)
    print(f"Quality enforcement report: {len(qe_df)} actions")

    if "action" in qe_df.columns and "station" in qe_df.columns:
        print("\nActions by type and station:")
        action_pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
        action_table = action_pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)
        print(action_table.to_string())
    else:
        print("Columns:", list(qe_df.columns))
else:
    print("quality_enforcement_report.csv not found.")
Quality enforcement report: 5215 actions

Actions by type and station:
action          flag_only  set_nan
station                           
cavendish             126       12
greenwich            1161      607
north_rustico        2559       87
stanhope              315       66
stanley_bridge         56        1
tracadie              136       89

4.3 Out-of-Range Summary¶

Out-of-range counts for temperature, RH, and wind per station from the QA/QC report.

In [15]:
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
    qa_df = pd.read_csv(qa_qc_path)
    oor_cols = [c for c in qa_df.columns if "out_of_range" in c.lower()]
    if oor_cols:
        print("Out-of-Range Summary:")
        print(qa_df[["station"] + oor_cols].to_string(index=False))
    else:
        print("No out-of-range columns in QA/QC report.")
else:
    print(f"qa_qc_report_{FWI_MODE}.csv not found.")
Out-of-Range Summary:
       station  out_of_range_temp_count  out_of_range_rh_count  out_of_range_wind_count  out_of_range_pre_enforcement  out_of_range_post_enforcement
     greenwich                        0                      0                        0                             0                              0
     cavendish                        0                      0                        0                             0                              0
 north_rustico                        0                      0                        0                             0                              0
stanley_bridge                        0                      0                        0                             0                              0
      tracadie                        0                      0                        0                             0                              0
      stanhope                        0                      0                        0                             0                              0

4.2b Quality Enforcement Summary (Visual)¶

Stacked bar chart of quality enforcement actions by station. Each bar segment represents an action type (set_nan, clip, etc.).

In [16]:
# --- Quality Enforcement Stacked Bar ---
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
    qe_df = pd.read_csv(qe_path)
    pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
    pivot_table = pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)

    fig, ax = plt.subplots(figsize=(12, 5))
    pivot_table.plot(kind="bar", stacked=True, ax=ax, colormap="tab10", edgecolor="white", linewidth=0.5)
    ax.set_title("Quality Enforcement Actions by Station")
    ax.set_xlabel("Station")
    ax.set_ylabel("Action Count")
    ax.legend(title="Action", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "quality_enforcement_stacked.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "quality_enforcement_stacked.png")))
    print("Saved: notebooks/figures/quality_enforcement_stacked.png")
else:
    print("quality_enforcement_report.csv not found.")
No description has been provided for this image
Saved: notebooks/figures/quality_enforcement_stacked.png
No description has been provided for this image

4.2c Synthetic Data Proportion Heatmap¶

Percentage of rows with qf > 0 (synthetic or chain-break) per station per variable. Higher values indicate greater reliance on imputed/synthetic data. Stations without _qf columns for a variable are shown as N/A.

In [17]:
# --- Synthetic Data Proportion Heatmap ---
qf_vars = [
    "air_temperature_c_qf", "relative_humidity_pct_qf",
    "wind_speed_kmh_qf", "rain_mm_qf",
    "ffmc_qf", "dmc_qf", "dc_qf", "isi_qf", "bui_qf", "fwi_qf"
]
prop_matrix = {}
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, low_memory=False)
    row = {}
    for qv in qf_vars:
        if qv in df.columns:
            row[qv.replace("_qf", "")] = (df[qv] > 0).mean() * 100
        else:
            row[qv.replace("_qf", "")] = float("nan")
    prop_matrix[station] = row

prop_df = pd.DataFrame(prop_matrix).T

fig, ax = plt.subplots(figsize=(14, 5))
mask = prop_df.isna()
cmap = plt.cm.YlOrRd.copy()
cmap.set_bad(color="#d0d0d0")
im = ax.imshow(prop_df.values, cmap=cmap, aspect="auto", vmin=0, vmax=100)
ax.set_xticks(range(len(prop_df.columns)))
ax.set_xticklabels(prop_df.columns, rotation=45, ha="right", fontsize=8)
ax.set_yticks(range(len(prop_df.index)))
ax.set_yticklabels(prop_df.index, fontsize=8)

# Annotate cells
for i in range(len(prop_df.index)):
    for j in range(len(prop_df.columns)):
        v = prop_df.iloc[i, j]
        if pd.isna(v):
            ax.text(j, i, "N/A", ha="center", va="center", fontsize=7, color="gray")
        else:
            ax.text(j, i, f"{v:.1f}%", ha="center", va="center", fontsize=7,
                    color="white" if v > 50 else "black")

fig.colorbar(im, ax=ax, label="% rows with qf > 0")
ax.set_title("Synthetic Data Proportion by Station and Variable")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "synthetic_proportion_heatmap.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "synthetic_proportion_heatmap.png")))
print("Saved: notebooks/figures/synthetic_proportion_heatmap.png")
No description has been provided for this image
Saved: notebooks/figures/synthetic_proportion_heatmap.png
No description has been provided for this image

4.2d Quality Flag Timeline¶

Colored bands showing when quality flags were raised per station. Each row is a station; colors indicate the type of quality issue detected. Stations without _quality_flags data are shown as empty.

In [18]:
# --- Quality Flag Timeline ---
import ast

flag_color_map = {
    "value_range": "#e6194b",     # red
    "rate_of_change": "#f58231",  # orange
    "flatline": "#3cb44b",        # green
    "cross_variable": "#4363d8",  # blue
    "stuck_sensor": "#911eb4",    # purple
}

flag_data = {}
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
    if "_quality_flags" not in df.columns:
        continue
    df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
    df = df[df["timestamp_utc"] >= ANALYSIS_START]
    # Parse flag lists and extract types
    flag_types = []
    for val in df["_quality_flags"].dropna():
        try:
            flags = ast.literal_eval(val) if isinstance(val, str) else val
            for f in flags:
                ftype = f.split(":")[0] if ":" in str(f) else str(f)
                flag_types.append(ftype)
        except Exception:
            pass
    if flag_types:
        flag_data[station] = pd.Series(flag_types).value_counts()

if flag_data:
    flag_df = pd.DataFrame(flag_data).fillna(0).astype(int)
    fig, ax = plt.subplots(figsize=(12, 4 + 0.3 * len(flag_df.columns)))
    flag_df.plot(kind="barh", stacked=True, ax=ax,
                 color=[flag_color_map.get(t, "#aaa") for t in flag_df.index])
    ax.set_title("Quality Flag Types by Station")
    ax.set_xlabel("Flag Count")
    ax.set_ylabel("")
    ax.legend(title="Flag Type", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "quality_flag_timeline.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "quality_flag_timeline.png")))
    print("Saved: notebooks/figures/quality_flag_timeline.png")
else:
    print("No quality flag data found for any station.")
No description has been provided for this image
Saved: notebooks/figures/quality_flag_timeline.png
No description has been provided for this image

4.4 Cross-Station Imputation Audit¶

Donor-to-recipient imputation pairs and counts.

In [19]:
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
    cs_df = pd.read_csv(cs_path)
    print(f"Cross-station imputation audit: {len(cs_df)} entries")

    if "donor" in cs_df.columns and "target" in cs_df.columns:
        print("\nDonor -> Recipient pairs:")
        pairs = cs_df.groupby(["donor", "target"]).size().reset_index(name="count")
        print(pairs.to_string(index=False))
    else:
        print("Columns:", list(cs_df.columns))
else:
    print("cross_station_imputation_audit.csv not found.")
Cross-station imputation audit: 78623 entries
Columns: ['station', 'timestamp_utc', 'variable', 'imputed_value', 'quality_flag', 'source', 'method', 'donor_priority']

4.4b Cross-Station Donor Audit (Visual)¶

Dual-panel chart showing:

  • Left: Donor contribution bars (how many values each donor provided)
  • Right: Imputation method breakdown by station
In [20]:
# --- Cross-Station Donor Audit Visual ---
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
    cs_df = pd.read_csv(cs_path)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left: donor contributions
    if "source" in cs_df.columns:
        donor_counts = cs_df["source"].value_counts()
        donor_counts.plot(kind="barh", ax=ax1, color="#4363d8", edgecolor="white")
        ax1.set_title("Donor Contribution Count")
        ax1.set_xlabel("Number of Imputed Values")
        ax1.set_ylabel("")
    else:
        ax1.text(0.5, 0.5, "No source column", ha="center", va="center", transform=ax1.transAxes)

    # Right: method breakdown by station
    if "station" in cs_df.columns and "method" in cs_df.columns:
        method_pivot = cs_df.groupby(["station", "method"]).size().reset_index(name="count")
        method_table = method_pivot.pivot(index="station", columns="method", values="count").fillna(0).astype(int)
        method_table.plot(kind="bar", stacked=True, ax=ax2, colormap="Set2", edgecolor="white", linewidth=0.5)
        ax2.set_title("Imputation Method by Station")
        ax2.set_xlabel("Station")
        ax2.set_ylabel("Count")
        ax2.legend(title="Method", fontsize=7)
        plt.xticks(rotation=45, ha="right")
    else:
        ax2.text(0.5, 0.5, "No method/station columns", ha="center", va="center", transform=ax2.transAxes)

    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "cross_station_donor_audit.png", bbox_inches="tight")
    display(Image(filename=str(FIGURES_DIR / "cross_station_donor_audit.png")))
    print("Saved: notebooks/figures/cross_station_donor_audit.png")
else:
    print("cross_station_imputation_audit.csv not found.")
No description has been provided for this image
Saved: notebooks/figures/cross_station_donor_audit.png
No description has been provided for this image

5. Principal Component Analysis¶

PCA on hourly temperature to identify station similarity patterns.

In [21]:
temp_frames = []
for station in stations:
    fpath = PROCESSED_DIR / station / "station_hourly.csv"
    df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
    df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
    df = df[df["timestamp_utc"] >= ANALYSIS_START]
    if "air_temperature_c" in df.columns:
        temp_frames.append(df[["station", "timestamp_utc", "air_temperature_c"]])

temp_all = pd.concat(temp_frames, ignore_index=True)
matrix = build_station_matrix(temp_all, value_column="air_temperature_c")
print(f"Station matrix shape: {matrix.shape}")
print(f"Stations: {list(matrix.columns)}")
Station matrix shape: (26113, 6)
Stations: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
In [22]:
loadings = pca_station_loadings(matrix)
print("PCA Loadings:")
print(loadings.to_string(index=False))
PCA Loadings:
       station component   loading  explained_variance_ratio
     cavendish       PC1  0.413668                  0.963466
     greenwich       PC1  0.396266                  0.963466
 north_rustico       PC1  0.413277                  0.963466
      stanhope       PC1  0.406986                  0.963466
stanley_bridge       PC1  0.405674                  0.963466
      tracadie       PC1  0.413334                  0.963466
     cavendish       PC2 -0.167839                  0.018571
     greenwich       PC2  0.900793                  0.018571
 north_rustico       PC2 -0.171177                  0.018571
      stanhope       PC2 -0.054382                  0.018571
stanley_bridge       PC2 -0.323366                  0.018571
      tracadie       PC2 -0.153547                  0.018571
In [23]:
normalized = matrix.dropna(axis="index", how="any")
normalized = (normalized - normalized.mean()) / normalized.std()

n_comp = min(len(matrix.columns), normalized.shape[0])
pca_full = PCA(n_components=n_comp)
pca_full.fit(normalized)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, n_comp + 1), pca_full.explained_variance_ratio_, "o-")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_title("Scree Plot")
ax.set_xticks(range(1, n_comp + 1))
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_scree.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_scree.png")))
print("Saved: notebooks/figures/pca_scree.png")
ratios = pca_full.explained_variance_ratio_.round(4).tolist()
print(f"Explained variance ratios: {ratios}")
No description has been provided for this image
Saved: notebooks/figures/pca_scree.png
Explained variance ratios: [0.9635, 0.0186, 0.0098, 0.0062, 0.0011, 0.0008]
No description has been provided for this image
In [24]:
scores = pca_full.transform(normalized)

# Distinct color per station
station_colors = {
    "greenwich": "#e6194b",
    "cavendish": "#3cb44b",
    "north_rustico": "#4363d8",
    "stanhope": "#f58231",
    "stanley_bridge": "#911eb4",
    "tracadie": "#42d4f4",
}

fig, ax = plt.subplots(figsize=(8, 6))
for i, station in enumerate(matrix.columns):
    color = station_colors.get(station, None)
    ax.scatter(scores[i, 0], scores[i, 1], s=120, zorder=5, color=color, label=station, edgecolors="white", linewidth=0.5)
    ax.annotate(station, (scores[i, 0], scores[i, 1]), textcoords="offset points", xytext=(5, 5), fontsize=9, color=color)

ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
ax.set_xlabel(f"PC1 ({pca_full.explained_variance_ratio_[0]:.1%})")
ax.set_ylabel(f"PC2 ({pca_full.explained_variance_ratio_[1]:.1%})")
ax.set_title("PCA Biplot - Station Scores")
ax.axhline(0, color="gray", linewidth=0.5)
ax.axvline(0, color="gray", linewidth=0.5)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_biplot.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_biplot.png")))
print("Saved: notebooks/figures/pca_biplot.png")
No description has been provided for this image
Saved: notebooks/figures/pca_biplot.png
No description has been provided for this image

6. Hierarchical Clustering¶

Stations clustered by temperature correlation to identify redundancy groups.

In [25]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

corr = matrix.corr()
distance = 1 - corr
dist_array = squareform(distance.values, checks=False)

link = linkage(dist_array, method="average")

fig, ax = plt.subplots(figsize=(8, 5))
dendrogram(link, labels=list(matrix.columns), ax=ax, leaf_rotation=45)
ax.set_ylabel("Distance (1 - correlation)")
ax.set_title("Hierarchical Clustering Dendrogram")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "clustering_dendrogram.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "clustering_dendrogram.png")))
print("Saved: notebooks/figures/clustering_dendrogram.png")
No description has been provided for this image
Saved: notebooks/figures/clustering_dendrogram.png
No description has been provided for this image
In [26]:
cluster_order = cluster_station_order(matrix)
print("Cluster assignments:")
for entry in cluster_order:
    print(f"  {entry}")

corr = pairwise_station_correlation(matrix).fillna(0.0)
dist = 1 - corr
print("\nPairwise distance matrix:")
print(dist.round(3).to_string())
Cluster assignments:
  stanhope
  tracadie
  cavendish
  north_rustico
  stanley_bridge
  greenwich

Pairwise distance matrix:
station         cavendish  greenwich  north_rustico  stanhope  stanley_bridge  tracadie
station                                                                                
cavendish           0.000      0.093          0.006     0.033           0.028     0.006
greenwich           0.093      0.000          0.086     0.095           0.102     0.058
north_rustico       0.006      0.086          0.000     0.035           0.026     0.005
stanhope            0.033      0.095          0.035     0.000           0.057     0.031
stanley_bridge      0.028      0.102          0.026     0.057           0.000     0.026
tracadie            0.006      0.058          0.005     0.031           0.026     0.000

7. Redundancy Analysis¶

Combining PCA, clustering, and benchmarking against the Stanhope reference to identify redundant stations.

In [27]:
try:
    benchmark = benchmark_to_stanhope(matrix, reference_station="stanhope")
    print("Benchmark results:")
    print(benchmark.to_string(index=False))
except ValueError as e:
    print(f"Stanhope benchmark skipped: {e}")
    print("Using inter-station metrics only.")
    benchmark = None
Benchmark results:
       station reference_station  overlap_count  mean_abs_diff  correlation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              observations
     cavendish          stanhope          21858       1.815121     0.966669                                                                                                                                           [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
     greenwich          stanhope          24067       1.540169     0.905163                                                                                                                                                                                               [0.131, 1.272, 1.776, 1.6753333333333327, 1.3593333333333337, 1.0616666666666668, 0.4973333333333335, 0.8983333333333332, 1.579333333333333, 2.796333333333333, 2.403, 1.596, 0.30799999999999983, 2.551, 2.433666666666667, 2.436, 1.4793333333333332, 0.9983333333333333, 0.7026666666666668, 0.5543333333333336, 0.9340000000000002, 2.0716666666666668, 2.3259999999999996, 2.262666666666667, 1.614, 0.8383333333333329, 1.033666666666667, 0.3340000000000001, 0.5190000000000001, 0.6160000000000001, 0.07799999999999985, 0.04800000000000004, 0.9796666666666667, 0.3923333333333332, 0.9243333333333337, 0.301333333333333, 0.07833333333333314, 0.44599999999999973, 0.16566666666666663, 0.3240000000000003, 1.0553333333333335, 1.745333333333333, 2.252333333333333, 2.981333333333333, 2.879333333333333, 2.4543333333333335, 1.6183333333333334, 1.6399999999999997, 2.2116666666666664, 2.530333333333333, 2.5700000000000003, 2.155, 1.5509999999999997, 1.5199999999999996, 1.189, 0.06699999999999928, 1.2399999999999998, 2.4606666666666666, 3.372, 4.061, 4.348333333333334, 4.505333333333333, 4.098333333333334, 4.56, 3.0966666666666667, 1.7093333333333338, 0.17433333333333367, 1.8640000000000003, 1.8049999999999997, 1.5246666666666657, 0.48566666666666647, 1.4106666666666663, 0.5083333333333331, 0.8073333333333332, 1.334, 0.44133333333333336, 0.852, 0.40733333333333344, 0.41466666666666663, 2.330333333333333, 4.1450000000000005, 5.676333333333333, 6.049666666666667, 5.577999999999999, 0.5589999999999993, 1.0030000000000001, 2.7380000000000004, 3.351, 2.309, 3.887333333333334, 4.211666666666666, 4.692333333333334, 5.078666666666667, 4.711333333333333, 4.510333333333333, 3.167666666666667, 0.07866666666666666, 0.3703333333333334, 0.5583333333333333, 2.6606666666666667, ...]
 north_rustico          stanhope          22994       1.872865     0.964902                                                                                                                                                                                                                                                                                  [3.2779999999999996, 4.0600000000000005, 3.0125, 3.3594999999999997, 5.177999999999999, 5.859999999999999, 5.2125, 5.2595, 4.9555, 0.5805000000000002, 0.3480000000000001, 0.6470000000000001, 1.1544999999999999, 0.42549999999999977, 0.607, 1.1465, 0.9923333333333335, 1.2554999999999996, 2.831, 5.4375, 5.569500000000001, 0.42600000000000016, 0.7905000000000006, 2.645999999999999, 3.653500000000001, 2.3629999999999995, 1.8560000000000008, 1.9699999999999998, 1.1725000000000003, 1.1084999999999994, 0.3155000000000001, 1.5074999999999998, 0.47550000000000026, 0.44649999999999945, 1.4965000000000002, 1.5819999999999999, 1.9894999999999996, 0.16349999999999998, 0.488, 0.23050000000000015, 0.956, 2.5140000000000002, 4.861000000000001, 3.7095000000000002, 5.258, 7.7545, 3.6179999999999986, 0.7385000000000002, 2.0774999999999997, 0.3949999999999996, 5.0165, 4.219999999999999, 4.367999999999999, 2.4349999999999996, 0.4775000000000009, 0.07399999999999984, 0.6455000000000002, 0.06300000000000061, 1.0415, 0.6745000000000001, 1.8719999999999999, 0.9215, 1.787, 2.569, 2.8975, 2.7225000000000006, 1.4784999999999995, 0.5105, 0.25849999999999973, 0.9654999999999996, 0.9155000000000002, 0.9110000000000005, 0.39700000000000024, 0.39449999999999985, 6.627, 5.5705, 2.4085, 0.44650000000000034, 1.5984999999999996, 0.8835000000000006, 0.7670000000000003, 1.9605000000000015, 1.8565000000000005, 1.9209999999999985, 1.0989999999999993, 0.7234999999999996, 0.7985000000000007, 1.2915, 1.1304999999999996, 0.573500000000001, 0.29800000000000093, 0.22299999999999986, 0.2054999999999998, 0.3130000000000006, 0.702, 0.5025000000000004, 0.49350000000000005, 2.8790000000000013, 4.922, 2.186, ...]
stanley_bridge          stanhope          24066       2.099542     0.942855                                                                                                                                           [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
      tracadie          stanhope          21940       1.822442     0.968903 [1.1550000000000011, 0.9824999999999982, 0.8283333333333331, 0.6266666666666687, 0.33333333333333215, 0.39416666666667055, 0.7650000000000006, 1.5183333333333309, 0.5924999999999976, 1.1966666666666725, 0.043333333333333, 1.0366666666666688, 1.1408333333333331, 1.1508333333333347, 1.8183333333333387, 1.5458333333333307, 0.769166666666667, 0.086666666666666, 2.557500000000001, 3.0933333333333337, 2.8599999999999994, 2.2641666666666644, 1.975833333333334, 1.7858333333333292, 1.6616666666666688, 1.5225000000000009, 1.6858333333333313, 1.803333333333331, 1.4700000000000024, 0.8125, 0.33500000000000085, 0.16999999999999815, 1.4050000000000011, 2.2225, 2.2025000000000006, 1.3141666666666687, 1.0933333333333337, 0.19583333333333286, 0.7155555555555608, 0.17499999999999716, 0.398333333333337, 0.21083333333333343, 0.8533333333333317, 1.8975000000000009, 1.3825000000000003, 2.1125000000000007, 2.233333333333338, 2.0383333333333304, 1.8616666666666646, 1.1433333333333344, 1.134999999999998, 0.8524999999999991, 0.8716666666666661, 0.9283333333333275, 0.09416666666666984, 0.6916666666666664, 2.231666666666669, 2.1633333333333304, 1.4450000000000003, 2.8283333333333367, 1.7749999999999986, 2.0249999999999986, 1.4258333333333333, 1.2699999999999996, 0.3758333333333326, 0.0591666666666697, 1.2349999999999994, 2.7025000000000006, 3.9775000000000027, 4.57833333333333, 4.006666666666664, 3.7441666666666684, 2.7775, 2.576666666666661, 2.0799999999999983, 0.6533333333333324, 0.4833333333333343, 0.03999999999999915, 0.1700000000000017, 1.5308333333333373, 2.6466666666666683, 2.288333333333334, 1.7424999999999997, 1.1666666666666679, 2.575833333333332, 2.3833333333333364, 1.8758333333333361, 0.9899999999999984, 0.036666666666668846, 1.1208333333333336, 3.6866666666666674, 3.418333333333333, 3.5199999999999996, 2.973333333333329, 1.4858333333333285, 1.4608333333333299, 1.1708333333333272, 1.047500000000003, 1.4683333333333373, 1.6616666666666653, ...]
In [28]:
if benchmark is not None and len(benchmark) > 0:
    recommendations = build_station_recommendations(
        benchmark,
        pca_loadings=loadings,
        cluster_order=cluster_order,
    )
    print("Station Recommendations:")
    print(recommendations.to_string(index=False))
else:
    print("Cannot build recommendations without benchmark.")
    recommendations = None
Station Recommendations:
       station reference_station recommendation  risk_probability  ci_lower  ci_upper risk_band                                                                                                    evidence                                                                                                                                                                                                                                                                                  assumptions                                                limitations
     cavendish          stanhope         remove          0.121140  0.012887  0.288408       low benchmark correlation=0.967; uncertainty=low (0.01-0.29); pca=[PC1=0.414, PC2=-0.168]; cluster=position 3/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
     greenwich          stanhope          defer          0.028022  0.000000  0.060345       low  benchmark correlation=0.905; uncertainty=low (0.00-0.06); pca=[PC1=0.396, PC2=0.901]; cluster=position 6/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
 north_rustico          stanhope         remove          0.110791  0.013123  0.265487       low benchmark correlation=0.965; uncertainty=low (0.01-0.27); pca=[PC1=0.413, PC2=-0.171]; cluster=position 4/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge          stanhope          defer          0.063608  0.010394  0.144060       low benchmark correlation=0.943; uncertainty=low (0.01-0.14); pca=[PC1=0.406, PC2=-0.323]; cluster=position 5/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
      tracadie          stanhope         remove          0.123004  0.014229  0.279774       low benchmark correlation=0.969; uncertainty=low (0.01-0.28); pca=[PC1=0.413, PC2=-0.154]; cluster=position 2/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.

8. Uncertainty Quantification¶

Risk probabilities and confidence intervals for station removal.

In [29]:
from pea_met_network.uncertainty import quantify_station_removal_risk

if benchmark is not None and len(benchmark) > 0:
    uncertainty = quantify_station_removal_risk(benchmark)
    print("Uncertainty Quantification (KDE-based risk):")
    print(uncertainty.to_string(index=False))

    fig, ax = plt.subplots(figsize=(8, 4))
    stations_plot = uncertainty["station"].tolist()
    risks = uncertainty["risk_probability"].tolist()
    ci_lo = uncertainty["ci_lower"].tolist()
    ci_hi = uncertainty["ci_upper"].tolist()

    x = range(len(stations_plot))
    ax.bar(x, risks, color="steelblue", alpha=0.7)
    yerr_lo = [r - lo for r, lo in zip(risks, ci_lo)]
    yerr_hi = [hi - r for r, hi in zip(risks, ci_hi)]
    ax.errorbar(x, risks, yerr=[yerr_lo, yerr_hi], fmt="none", c="black", capsize=5)
    ax.set_xticks(list(x))
    ax.set_xticklabels(stations_plot, rotation=45, ha="right")
    ax.set_ylabel("Removal Risk Probability")
    ax.set_title("Station Removal Risk with 95% Confidence Intervals")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "uncertainty_risk.png", bbox_inches="tight")
    plt.show()
    display(Image(filename=str(FIGURES_DIR / "uncertainty_risk.png")))
    print("Saved: notebooks/figures/uncertainty_risk.png")
else:
    print("Uncertainty analysis requires benchmark data (skipped).")
Uncertainty Quantification (KDE-based risk):
       station reference_station  risk_probability  ci_lower  ci_upper risk_band                                                                                                                                                                                                                                                                                  assumptions                                                limitations
     cavendish          stanhope          0.121140  0.012887  0.288408       low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
     greenwich          stanhope          0.028022  0.000000  0.060345       low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
 north_rustico          stanhope          0.110791  0.013123  0.265487       low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge          stanhope          0.063608  0.010394  0.144060       low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
      tracadie          stanhope          0.123004  0.014229  0.279774       low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
No description has been provided for this image
No description has been provided for this image
Saved: notebooks/figures/uncertainty_risk.png

Conclusion¶

TBD — fill in after review.