import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image, display
from pathlib import Path
from sklearn.decomposition import PCA
from pea_met_network.qa_qc import coverage_summary
from pea_met_network.redundancy import (
benchmark_to_stanhope,
build_station_matrix,
build_station_recommendations,
cluster_station_order,
pairwise_station_correlation,
pca_station_loadings,
)
# ---------------------------------------------------------------------------
# FWI mode selector — switch between "hourly" and "compliant"
# ---------------------------------------------------------------------------
FWI_MODE = "hourly"
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(".").resolve()
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "notebooks" / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
# Date range for analysis — must match cleaning-config.json
ANALYSIS_START = pd.Timestamp("2023-04-01", tz="UTC")
sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 120
print(f"FWI mode: {FWI_MODE}")
print(f"Processed dir: {PROCESSED_DIR}")
FWI mode: hourly
Processed dir: /mnt/fast_data/workspaces/pea-met-network/data/processed
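Since ANALYSIS_START must stay in sync with cleaning-config.json, here is a hedged sketch of reading the date from that config instead of hardcoding it (the "analysis_start" key name is an assumption; use whatever key the config actually defines).

# Hedged sketch — load the analysis start date from cleaning-config.json so the
# notebook cannot drift from the pipeline configuration. The "analysis_start"
# key is assumed, not confirmed by the config schema.
config_path = PROJECT_ROOT / "cleaning-config.json"
if config_path.exists():
    with open(config_path) as fh:
        cleaning_cfg = json.load(fh)
    if "analysis_start" in cleaning_cfg:
        ANALYSIS_START = pd.Timestamp(cleaning_cfg["analysis_start"], tz="UTC")
print(f"Analysis start: {ANALYSIS_START}")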
1. Data Loading¶
Load hourly processed data for all stations. Station list is derived from disk.
# Discover stations from disk — no hardcoded list
stations = sorted([
d.name for d in PROCESSED_DIR.iterdir()
if d.is_dir() and (d / "station_hourly.csv").exists()
])
print(f"Stations found: {stations}")
hourly_frames = []
daily_frames = []
for station in stations:
h_path = PROCESSED_DIR / station / "station_hourly.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
h_df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
h_df["timestamp_utc"] = pd.to_datetime(h_df["timestamp_utc"], utc=True)
h_df = h_df[h_df["timestamp_utc"] >= ANALYSIS_START]
hourly_frames.append(h_df)
if d_path.exists():
d_df = pd.read_csv(d_path, parse_dates=["timestamp_utc"])
d_df["timestamp_utc"] = pd.to_datetime(d_df["timestamp_utc"], utc=True)
daily_frames.append(d_df)
tmin = h_df["timestamp_utc"].min()
tmax = h_df["timestamp_utc"].max()
print(f" {station}: {len(h_df)} hourly rows ({tmin} to {tmax})")
hourly_all = pd.concat(hourly_frames, ignore_index=True)
if daily_frames:
daily_all = pd.concat(daily_frames, ignore_index=True)
else:
daily_all = pd.DataFrame()
n = hourly_all["station"].nunique()
print(f"\nCombined: {len(hourly_all)} hourly rows, {n} stations")
Stations found: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
  cavendish: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  greenwich: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  north_rustico: 26113 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-24 00:00:00+00:00)
  stanhope: 24144 hourly rows (2023-04-01 00:00:00+00:00 to 2025-12-31 23:00:00+00:00)
  stanley_bridge: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  tracadie: 23956 hourly rows (2023-06-29 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)

Combined: 152489 hourly rows, 6 stations
coverage_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
coverage_frames.append(df[["station", "timestamp_utc"]])
coverage_df = pd.concat(coverage_frames, ignore_index=True)
cov_summary = coverage_summary(coverage_df)
print(cov_summary.to_string(index=False))
station total_records
cavendish 26092
greenwich 26092
north_rustico 26113
stanhope 24144
stanley_bridge 26092
tracadie 23956
2.2 Temporal Coverage Summary¶
Visual overview of data availability by station over time.
fig, ax = plt.subplots(figsize=(12, 4))
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
start = df["timestamp_utc"].min().floor("D")
end = df["timestamp_utc"].max().floor("D")
date_range = pd.date_range(start, end, freq="D")
daily_counts = df.set_index("timestamp_utc").resample("D").size()
daily_counts = daily_counts.reindex(date_range, fill_value=0)
ax.fill_between(daily_counts.index, 0, daily_counts.values, alpha=0.4, label=station)
ax.plot(daily_counts.index, daily_counts.values, linewidth=0.8)
ax.set_xlabel("Date")
ax.set_ylabel("Hourly Records per Day")
ax.set_title("Temporal Coverage by Station")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "temporal_coverage.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "temporal_coverage.png")))
print("Saved: notebooks/figures/temporal_coverage.png")
Saved: notebooks/figures/temporal_coverage.png
2.3 Missingness Heatmaps¶
Each heatmap is split into two panels:
Left panel — Core Sensors / FWI: Variables directly used as FWI inputs (temp, RH, wind, rain) plus FWI output indices (FFMC, DMC, DC, ISI, BUI, FWI on daily). These are the columns that matter for the fire weather pipeline.
Right panel — Extended Sensors: Auxiliary measurements (wind direction, solar radiation, barometric pressure, dew point, gust speed) that are not used in FWI computation. Many stations lack some or all of these sensors, so 100% missing values here indicate a missing instrument, not a data problem.
Hourly heatmap — raw observations from station_hourly.csv.
Daily heatmap — daily aggregates with FWI components from <station>_daily_compliant.csv (falls back to station_daily.csv).
# --- Hourly missingness heatmap (core vs extended sensors) ---
FWI_CORE_VARS = [
"air_temperature_c", "relative_humidity_pct",
"wind_speed_kmh", "rain_mm",
]
HOURLY_EXTENDED_VARS = [
"wind_direction_deg", "solar_radiation_w_m2",
"barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]
hourly_missing_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath)
df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
row = {"station": station}
for var in FWI_CORE_VARS + HOURLY_EXTENDED_VARS:
if var in df.columns:
row[var] = df[var].isna().mean() * 100
else:
row[var] = 100.0
hourly_missing_frames.append(row)
hourly_missing_df = pd.DataFrame(hourly_missing_frames).set_index("station")
# Split into core and extended panels
core_cols = [v for v in FWI_CORE_VARS if v in hourly_missing_df.columns]
ext_cols = [v for v in HOURLY_EXTENDED_VARS if v in hourly_missing_df.columns]
fig, (ax_core, ax_ext) = plt.subplots(
1, 2, figsize=(max(12, (len(core_cols) + len(ext_cols)) * 1.1), max(3, len(stations) * 0.45)),
gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.4},
)
sns.heatmap(
hourly_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_core, cbar_kws={"label": "% Missing"},
)
ax_core.set_title("Core Sensors (FWI Inputs)", fontweight="bold", fontsize=11)
ax_core.set_xlabel("Variable")
ax_core.set_ylabel("Station")
if ext_cols:
sns.heatmap(
hourly_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_ext, cbar_kws={"label": "% Missing"},
)
else:
ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
ha="center", va="center", fontsize=12, color="gray")
ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
ax_ext.set_xlabel("Variable")
ax_ext.set_ylabel("")
fig.suptitle("Hourly Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
fontsize=12, y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_heatmap_hourly.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_hourly.png")))
print("Saved: notebooks/figures/missingness_heatmap_hourly.png")
print()
print("Hourly missingness summary (core):")
print(hourly_missing_df[core_cols].round(1).to_string())
print()
print("Hourly missingness summary (extended):")
print(hourly_missing_df[ext_cols].round(1).to_string())
Saved: notebooks/figures/missingness_heatmap_hourly.png
Hourly missingness summary (core):
air_temperature_c relative_humidity_pct wind_speed_kmh rain_mm
station
cavendish 14.5 14.5 0.0 0.0
greenwich 4.1 16.2 4.0 0.1
north_rustico 4.1 4.1 0.0 0.0
stanhope 0.3 0.3 0.3 0.2
stanley_bridge 0.0 17.2 0.0 0.0
tracadie 0.0 12.8 0.0 0.0
Hourly missingness summary (extended):
wind_direction_deg solar_radiation_w_m2 barometric_pressure_kpa dew_point_c wind_gust_speed_kmh
station
cavendish 14.5 14.5 100.0 14.5 14.9
greenwich 12.1 12.1 100.0 16.2 12.5
north_rustico 2.1 1.3 1.3 1.3 4.3
stanhope 2.1 100.0 100.0 0.3 100.0
stanley_bridge 18.4 18.4 34.7 100.0 32.1
tracadie 13.0 13.0 18.8 100.0 15.5
# --- Daily missingness heatmap (core vs extended, includes FWI) ---
FWI_OUTPUT_VARS = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
FWI_CORE_DAILY = [
"air_temperature_c", "relative_humidity_pct",
"wind_speed_kmh", "rain_mm",
] + FWI_OUTPUT_VARS
DAILY_EXTENDED = [
"wind_direction_deg", "solar_radiation_w_m2",
"barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]
daily_missing_frames = []
for station in stations:
c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
fpath = c_path if c_path.exists() else d_path
if not fpath.exists():
continue
df = pd.read_csv(fpath)
if "timestamp_utc" in df.columns:
df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
data_cols = sorted([
c for c in df.columns
if not c.endswith("_qf") and not c.endswith("_src")
and not c.endswith("_method") and not c.startswith("_quality")
and c not in ("station", "timestamp_utc", "carry_forward_used")
])
row = {"station": station}
for var in data_cols:
row[var] = df[var].isna().mean() * 100
daily_missing_frames.append(row)
if daily_missing_frames:
daily_missing_df = pd.DataFrame(daily_missing_frames).set_index("station")
all_cols = sorted(daily_missing_df.columns)
core_cols = [v for v in FWI_CORE_DAILY if v in all_cols]
ext_cols = [v for v in all_cols if v not in core_cols]
fig, (ax_core, ax_ext) = plt.subplots(
1, 2,
figsize=(max(14, (len(core_cols) + len(ext_cols)) * 1.0), max(3, len(daily_missing_df) * 0.45)),
gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.5},
)
sns.heatmap(
daily_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_core, cbar_kws={"label": "% Missing"},
)
ax_core.set_title("Core + FWI Outputs", fontweight="bold", fontsize=11)
ax_core.set_xlabel("Variable")
ax_core.set_ylabel("Station")
if ext_cols:
sns.heatmap(
daily_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_ext, cbar_kws={"label": "% Missing"},
)
else:
ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
ha="center", va="center", fontsize=12, color="gray")
ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
ax_ext.set_xlabel("Variable")
ax_ext.set_ylabel("")
fig.suptitle("Daily Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
fontsize=12, y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_heatmap_daily.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_daily.png")))
print("Saved: notebooks/figures/missingness_heatmap_daily.png")
print()
print("Daily missingness summary (core + FWI):")
print(daily_missing_df[core_cols].round(1).to_string())
print()
print("Daily missingness summary (extended):")
print(daily_missing_df[ext_cols].round(1).to_string())
else:
print("No daily CSVs found \u2014 daily missingness heatmap skipped.")
Saved: notebooks/figures/missingness_heatmap_daily.png
Daily missingness summary (core + FWI):
air_temperature_c relative_humidity_pct wind_speed_kmh rain_mm ffmc dmc dc isi bui fwi
station
cavendish 14.6 14.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
greenwich 4.1 16.2 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
north_rustico 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
stanhope 0.6 0.7 0.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0
stanley_bridge 0.0 17.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
tracadie 0.0 12.8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Daily missingness summary (extended):
barometric_pressure_kpa dew_point_c solar_radiation_w_m2 wind_direction_deg wind_gust_speed_kmh
station
cavendish 100.0 14.6 14.6 14.6 15.0
greenwich 100.0 16.2 12.1 12.1 12.5
north_rustico 1.2 1.2 1.3 1.9 4.1
stanhope 100.0 0.6 NaN 1.2 NaN
stanley_bridge 34.6 100.0 18.4 18.4 32.0
tracadie 18.7 100.0 12.9 12.9 15.5
2.4 Imputation Summary¶
Gaps filled per station per variable from the pipeline imputation report.
imp_path = PROCESSED_DIR / "imputation_report.csv"
if imp_path.exists():
imp_df = pd.read_csv(imp_path)
print(f"Imputation report: {len(imp_df)} entries")
print("\nGaps filled per station per variable:")
pivot = imp_df.groupby(["station", "variable"])["count_affected"].sum().reset_index()
pivot_table = pivot.pivot(index="station", columns="variable", values="count_affected").fillna(0).astype(int)
print(pivot_table.to_string())
print("\nImputation method breakdown:")
method_counts = imp_df["method"].value_counts()
print(method_counts.to_string())
# Bar chart: total gaps filled by station
station_totals = imp_df.groupby("station")["count_affected"].sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(8, 4))
station_totals.plot(kind="barh", ax=ax, color="steelblue", alpha=0.8)
ax.set_xlabel("Total Gaps Filled")
ax.set_title("Imputation Summary by Station")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "imputation_summary.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "imputation_summary.png")))
print("Saved: notebooks/figures/imputation_summary.png")
else:
print("imputation_report.csv not found — run pipeline first.")
Imputation report: 717 entries

Gaps filled per station per variable:
variable        air_temperature_c  barometric_pressure_kpa  dew_point_c  relative_humidity_pct  solar_radiation_w_m2  wind_direction_deg  wind_speed_kmh
station
cavendish                    3789                    26092         3787                   3787                  3786                3786            3890
greenwich                   10158                    26092         4216                   4229                  3147                3145            3249
north_rustico                1076                      330          331                   1075                   331                 547            1188
stanhope                       77                    24144           68                     79                     0                 516              68
stanley_bridge               4486                     9043        26092                  26092                  4800                4800            8371
tracadie                     3076                     4508        23956                  23956                  3107                3107            3809

Imputation method breakdown:
method
linear_interpolation    575
preserve                142
Saved: notebooks/figures/imputation_summary.png
3. Fire Weather Index Analysis¶
Time series of FWI system components for all stations with computed indices.
3.1 FWI Component Time Series¶
Daily-mean FWI components for all stations. FFMC uses hourly values; DMC, DC, BUI, FWI use daily means.
fwi_vars = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
fwi_stations = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df.columns and df["fwi"].notna().sum() > 100:
fwi_stations.append(station)
print(f"Stations with FWI data: {fwi_stations}")
if fwi_stations:
fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
if len(fwi_stations) == 1:
axes = [axes]
for idx, station in enumerate(fwi_stations):
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()
ax = axes[idx]
for var in fwi_vars:
if var in df_daily.columns:
ax.plot(df_daily.index, df_daily[var], label=var, alpha=0.8, linewidth=0.9)
ax.set_ylabel("Value")
ax.set_title(f"FWI Components - {station}")
ax.legend(fontsize=7, ncol=3)
axes[-1].set_xlabel("Date")
fig.suptitle("Fire Weather Index Time Series (Daily Means)", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_timeseries.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_timeseries.png")))
print(f"Plotted FWI for: {fwi_stations}")
print("Saved: notebooks/figures/fwi_timeseries.png")
else:
print("FWI analysis skipped: no stations with non-missing FWI data")
Stations with FWI data: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Plotted FWI for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_timeseries.png
3.1b QF-Aware FWI Time Series¶
FWI values plotted with background shading indicating data quality.
- Yellow band: at least one FWI input was synthetically imputed (qf == 1)
- Red band: at least one FWI input was chain-break gap-filled (qf == 9)
- No shading: all FWI inputs from observed data (qf == 0)
Stations without per-variable _qf columns (stanhope) are plotted without quality shading.
# --- QF-Aware FWI Timeseries ---
if fwi_stations:
fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
if len(fwi_stations) == 1:
axes = [axes]
fwi_input_qf = ["air_temperature_c_qf", "relative_humidity_pct_qf", "wind_speed_kmh_qf", "rain_mm_qf"]
for idx, station in enumerate(fwi_stations):
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
ax = axes[idx]
# Daily-mean FWI
df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()
# Daily quality: worst qf per day across all available input qf cols
available_qf = [c for c in fwi_input_qf if c in df.columns]
if available_qf:
qf_daily = df.set_index("timestamp_utc")[available_qf].resample("D").max()
worst_qf = qf_daily.max(axis=1)
else:
worst_qf = pd.Series(0, index=df_daily.index)
# Background shading
for i in range(len(worst_qf)):
if worst_qf.iloc[i] >= 9:
ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
worst_qf.index[i] + pd.Timedelta(hours=12),
color='red', alpha=0.15)
elif worst_qf.iloc[i] >= 1:
ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
worst_qf.index[i] + pd.Timedelta(hours=12),
color='gold', alpha=0.20)
# FWI line
ax.plot(df_daily.index, df_daily["fwi"], color="black", linewidth=0.8, label="FWI")
ax.set_ylabel("FWI")
ax.set_title(f"QF-Aware FWI - {station}")
ax.legend(fontsize=7)
axes[-1].set_xlabel("Date")
fig.suptitle("FWI with Quality Flag Shading (yellow=synthetic, red=chain-break)", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_qf_aware.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_qf_aware.png")))
print("Saved: notebooks/figures/fwi_qf_aware.png")
else:
print("QF-Aware FWI skipped: no stations with FWI data.")
Saved: notebooks/figures/fwi_qf_aware.png
3.2 FWI Mode Comparison¶
Overlay FWI values from hourly and compliant modes to show divergence (e.g., during chain breaks).
# Try to load both modes for comparison
hourly_fwi_data = {}
compliant_fwi_data = {}
for station in stations:
h_path = PROCESSED_DIR / station / "station_hourly.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"
if h_path.exists():
df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df.columns and df["fwi"].notna().sum() > 0:
hourly_fwi_data[station] = df.set_index("timestamp_utc")["fwi"].resample("D").mean()
if c_path.exists():
df_c = pd.read_csv(c_path, parse_dates=["timestamp_utc"])
df_c["timestamp_utc"] = pd.to_datetime(df_c["timestamp_utc"], utc=True)
df_c = df_c[df_c["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df_c.columns and df_c["fwi"].notna().sum() > 0:
compliant_fwi_data[station] = df_c.set_index("timestamp_utc")["fwi"]
compare_stations = sorted(set(hourly_fwi_data.keys()) & set(compliant_fwi_data.keys()))
if compare_stations:
fig, axes = plt.subplots(len(compare_stations), 1, figsize=(14, 3 * len(compare_stations)), sharex=True)
if len(compare_stations) == 1:
axes = [axes]
for idx, station in enumerate(compare_stations):
ax = axes[idx]
ax.plot(hourly_fwi_data[station].index, hourly_fwi_data[station].values, label="hourly", alpha=0.7, linewidth=0.9)
ax.plot(compliant_fwi_data[station].index, compliant_fwi_data[station].values, label="compliant", alpha=0.7, linewidth=0.9)
ax.set_ylabel("FWI")
ax.set_title(f"FWI Mode Comparison — {station}")
ax.legend(fontsize=8)
axes[-1].set_xlabel("Date")
fig.suptitle("Hourly vs Compliant FWI", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_mode_comparison.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_mode_comparison.png")))
print(f"Mode comparison for: {compare_stations}")
print("Saved: notebooks/figures/fwi_mode_comparison.png")
else:
print("FWI mode comparison skipped: compliant-mode daily files not found.")
print("Run the pipeline with --fwi-mode compliant to generate them.")
Mode comparison for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_mode_comparison.png
3.3 FWI Value Statistics¶
Descriptive statistics (min, max, mean, std) for FWI codes per station, sourced from the QA/QC report.
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
fwi_stat_cols = [c for c in qa_df.columns if c.startswith("fwi_stat_")]
if fwi_stat_cols:
stat_display = qa_df[["station"] + fwi_stat_cols].copy()
print("FWI Value Statistics (from QA/QC report):")
print(stat_display.to_string(index=False))
else:
print("QA/QC report exists but has no fwi_stat_ columns.")
print("Columns available:", list(qa_df.columns))
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
QA/QC report exists but has no fwi_stat_ columns.
Columns available: ['station', 'hourly_rows', 'daily_rows', 'date_range_start', 'date_range_end', 'completeness', 'fwi_mode', 'pre_imp_missing_pct_air_temperature_c', 'pre_imp_missing_pct_relative_humidity_pct', 'pre_imp_missing_pct_wind_speed_kmh', 'pre_imp_missing_pct_rain_mm', 'post_imp_missing_pct_air_temperature_c', 'post_imp_missing_pct_air_temperature_c_qf', 'post_imp_missing_pct_barometric_pressure_kpa', 'post_imp_missing_pct_bui', 'post_imp_missing_pct_bui_qf', 'post_imp_missing_pct_dc', 'post_imp_missing_pct_dc_qf', 'post_imp_missing_pct_dew_point_c', 'post_imp_missing_pct_dmc', 'post_imp_missing_pct_dmc_qf', 'post_imp_missing_pct_ffmc', 'post_imp_missing_pct_ffmc_qf', 'post_imp_missing_pct_fwi', 'post_imp_missing_pct_fwi_qf', 'post_imp_missing_pct_isi', 'post_imp_missing_pct_isi_qf', 'post_imp_missing_pct_rain_mm', 'post_imp_missing_pct_rain_mm_qf', 'post_imp_missing_pct_relative_humidity_pct', 'post_imp_missing_pct_relative_humidity_pct_qf', 'post_imp_missing_pct_solar_radiation_w_m2', 'post_imp_missing_pct_wind_direction_deg', 'post_imp_missing_pct_wind_gust_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh_qf', 'duplicate_count', 'out_of_range_temp_count', 'out_of_range_rh_count', 'out_of_range_wind_count', 'quality_enforced_count', 'quality_flagged_count', 'out_of_range_pre_enforcement', 'out_of_range_post_enforcement', 'fwi_chain_breaks', 'carry_forward_days', 'carry_forward_pct', 'ffmc_min', 'ffmc_max', 'ffmc_mean', 'ffmc_std', 'dmc_min', 'dmc_max', 'dmc_mean', 'dmc_std', 'dc_min', 'dc_max', 'dc_mean', 'dc_std', 'isi_min', 'isi_max', 'isi_mean', 'isi_std', 'bui_min', 'bui_max', 'bui_mean', 'bui_std', 'fwi_min', 'fwi_max', 'fwi_mean', 'fwi_std']
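The report stores the statistics as flat <index>_<stat> columns (ffmc_min through fwi_std) rather than under the fwi_stat_ prefix the cell above expects. A hedged follow-up sketch that reshapes those flat columns, taken from the column list printed above, into a per-station table:

# Hedged sketch — pivot the flat <index>_<stat> columns of the QA/QC report
# into a station x (index, stat) table. Column names come from the report
# columns printed above.
if qa_qc_path.exists():
    qa_df = pd.read_csv(qa_qc_path)
    indices = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
    stats = ["min", "max", "mean", "std"]
    stat_cols = [f"{i}_{s}" for i in indices for s in stats if f"{i}_{s}" in qa_df.columns]
    if stat_cols:
        stat_table = qa_df.set_index("station")[stat_cols]
        # Split "<index>_<stat>" into a two-level column index for readability
        stat_table.columns = pd.MultiIndex.from_tuples(
            [tuple(c.rsplit("_", 1)) for c in stat_table.columns],
            names=["index", "stat"],
        )
        print(stat_table.round(2).to_string())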
3.4 Chain Break Analysis¶
Analysis of FWI chain breaks by station and cause, sourced from the chain break diagnostic report.
cb_path = PROCESSED_DIR / f"fwi_missingness_report_{FWI_MODE}.csv"
if cb_path.exists():
cb_df = pd.read_csv(cb_path)
print(f"Chain break report: {len(cb_df)} entries")
# Summary table: breaks per station, grouped by cause
if "cause" in cb_df.columns:
print("\nBreaks per station by cause:")
cause_pivot = cb_df.groupby(["station", "cause"]).size().reset_index(name="count")
cause_table = cause_pivot.pivot(index="station", columns="cause", values="count").fillna(0).astype(int)
print(cause_table.to_string())
# Breakdown by missing input
if "missing_input" in cb_df.columns:
print("\nBreakdown by missing input:")
input_counts = cb_df["missing_input"].value_counts()
print(input_counts.to_string())
# Bar chart: chain breaks by station, colored by cause
fig, ax = plt.subplots(figsize=(10, 5))
causes = cb_df["cause"].unique()
station_breaks = cb_df.groupby(["station", "cause"]).size().unstack(fill_value=0)
station_breaks.plot(kind="bar", stacked=True, ax=ax, colormap="Set2")
ax.set_xlabel("Station")
ax.set_ylabel("Chain Break Count")
ax.set_title(f"FWI Chain Breaks by Station and Cause ({FWI_MODE} mode)")
ax.legend(title="Cause")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "chain_breaks.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "chain_breaks.png")))
print("Saved: notebooks/figures/chain_breaks.png")
# Cascade origins table
cascade = cb_df[cb_df["cause"] == "cascade"]
if len(cascade) > 0 and "cascade_origin" in cascade.columns:
print("\nCascade origins:")
print(cascade[["station", "cascade_origin"]].to_string(index=False))
else:
print("Chain break report has no 'cause' column. Columns:", list(cb_df.columns))
else:
print(f"fwi_missingness_report_{FWI_MODE}.csv not found — no chain breaks or pipeline not run.")
Chain break report: 143 entries

Breaks per station by cause:
cause           input_missing  startup
station
cavendish                  19        2
greenwich                  42        0
north_rustico              10        0
stanhope                   44        2
stanley_bridge             11        0
tracadie                   11        2

Breakdown by missing input:
missing_input
relative_humidity_pct                                        62
air_temperature_c, relative_humidity_pct, wind_speed_kmh     29
air_temperature_c                                            28
air_temperature_c, relative_humidity_pct                     18
Saved: notebooks/figures/chain_breaks.png
4. Data Quality Report¶
Quality metrics from the QA/QC report, quality enforcement actions, and cross-station imputation audit.
4.1 Pre/Post Imputation Missingness¶
Side-by-side heatmap comparing missingness before and after imputation for the four core FWI input variables (temperature, relative humidity, wind speed, precipitation). Both panels share the same color scale so improvements are visually obvious.
Note: Pre-imputation data is only available for the four core variables — those are the ones the pipeline imputes. Extended sensor variables (solar radiation, barometric pressure, etc.) are not imputed by the pipeline.
# --- Pre/Post Imputation Missingness Heatmaps ---
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
# Core variables have pre-imputation data (these are what the pipeline imputes)
core_vars = ["air_temperature_c", "relative_humidity_pct", "wind_speed_kmh", "rain_mm"]
pre_rows, post_rows = [], []
for _, row in qa_df.iterrows():
station = row["station"]
pre_r = {"station": station}
post_r = {"station": station}
for var in core_vars:
pre_key = f"pre_imp_missing_pct_{var}"
post_key = f"post_imp_missing_pct_{var}"
pre_r[var] = row.get(pre_key, None)
post_r[var] = row.get(post_key, None)
pre_rows.append(pre_r)
post_rows.append(post_r)
pre_df = pd.DataFrame(pre_rows).set_index("station").dropna(axis=1, how="all")
post_df = pd.DataFrame(post_rows).set_index("station").dropna(axis=1, how="all")
common_cols = sorted(set(pre_df.columns) & set(post_df.columns))
pre_df = pre_df[common_cols]
post_df = post_df[common_cols]
fig, (ax_pre, ax_post) = plt.subplots(
1, 2,
figsize=(max(10, len(common_cols) * 2.2), max(3, len(pre_df) * 0.45)),
)
vmin = 0
vmax = max(pre_df.values.max(), post_df.values.max(), 1)
sns.heatmap(
pre_df, annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_pre, vmin=vmin, vmax=vmax,
cbar_kws={"label": "% Missing"},
)
ax_pre.set_title("Before Imputation", fontweight="bold", fontsize=11)
ax_pre.set_xlabel("Variable")
ax_pre.set_ylabel("Station")
sns.heatmap(
post_df, annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_post, vmin=vmin, vmax=vmax,
cbar_kws={"label": "% Missing"},
)
ax_post.set_title("After Imputation", fontweight="bold", fontsize=11)
ax_post.set_xlabel("Variable")
ax_post.set_ylabel("")
fig.suptitle(
"Pre vs Post Imputation Missingness — Core FWI Variables (% Missing)",
fontsize=12, y=1.02,
)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_pre_post_imputation.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_pre_post_imputation.png")))
print("Saved: notebooks/figures/missingness_pre_post_imputation.png")
# Also print the text comparison
print()
print("Pre/Post Imputation Missingness Comparison:")
comparison_rows = []
for station in pre_df.index:
for var in common_cols:
comparison_rows.append({
"station": station,
"variable": var,
"pre_%": pre_df.loc[station, var],
"post_%": post_df.loc[station, var],
"improved_%": round(pre_df.loc[station, var] - post_df.loc[station, var], 2),
})
comp_df = pd.DataFrame(comparison_rows)
print(comp_df.to_string(index=False))
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
Saved: notebooks/figures/missingness_pre_post_imputation.png
Pre/Post Imputation Missingness Comparison:
station variable pre_% post_% improved_%
greenwich air_temperature_c 38.93 4.06 34.87
greenwich rain_mm 0.13 0.13 0.00
greenwich relative_humidity_pct 16.21 16.19 0.02
greenwich wind_speed_kmh 12.45 3.96 8.49
cavendish air_temperature_c 14.52 14.52 0.00
cavendish rain_mm 0.03 0.03 0.00
cavendish relative_humidity_pct 14.51 14.51 0.00
cavendish wind_speed_kmh 14.91 0.00 14.91
north_rustico air_temperature_c 4.12 4.12 0.00
north_rustico rain_mm 0.04 0.04 0.00
north_rustico relative_humidity_pct 4.12 4.12 0.00
north_rustico wind_speed_kmh 4.55 0.00 4.55
stanley_bridge air_temperature_c 17.19 0.00 17.19
stanley_bridge rain_mm 0.00 0.00 0.00
stanley_bridge relative_humidity_pct 100.00 17.19 82.81
stanley_bridge wind_speed_kmh 32.08 0.00 32.08
tracadie air_temperature_c 12.84 0.00 12.84
tracadie rain_mm 0.00 0.00 0.00
tracadie relative_humidity_pct 100.00 12.84 87.16
tracadie wind_speed_kmh 15.90 0.00 15.90
stanhope air_temperature_c 0.32 0.32 0.00
stanhope rain_mm 0.19 0.19 0.00
stanhope relative_humidity_pct 0.33 0.33 0.00
stanhope wind_speed_kmh 0.28 0.28 0.00
4.2 Quality Enforcement Actions¶
Count of quality enforcement actions by type and station.
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
qe_df = pd.read_csv(qe_path)
print(f"Quality enforcement report: {len(qe_df)} actions")
if "action" in qe_df.columns and "station" in qe_df.columns:
print("\nActions by type and station:")
action_pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
action_table = action_pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)
print(action_table.to_string())
else:
print("Columns:", list(qe_df.columns))
else:
print("quality_enforcement_report.csv not found.")
Quality enforcement report: 5215 actions

Actions by type and station:
action          flag_only  set_nan
station
cavendish             126       12
greenwich            1161      607
north_rustico        2559       87
stanhope              315       66
stanley_bridge         56        1
tracadie              136       89
4.3 Out-of-Range Summary¶
Out-of-range counts for temperature, RH, and wind per station from the QA/QC report.
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
oor_cols = [c for c in qa_df.columns if "out_of_range" in c.lower()]
if oor_cols:
print("Out-of-Range Summary:")
print(qa_df[["station"] + oor_cols].to_string(index=False))
else:
print("No out-of-range columns in QA/QC report.")
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found.")
Out-of-Range Summary:
station out_of_range_temp_count out_of_range_rh_count out_of_range_wind_count out_of_range_pre_enforcement out_of_range_post_enforcement
greenwich 0 0 0 0 0
cavendish 0 0 0 0 0
north_rustico 0 0 0 0 0
stanley_bridge 0 0 0 0 0
tracadie 0 0 0 0 0
stanhope 0 0 0 0 0
4.2b Quality Enforcement Summary (Visual)¶
Stacked bar chart of quality enforcement actions by station. Each bar segment represents an action type (set_nan, clip, etc.).
# --- Quality Enforcement Stacked Bar ---
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
qe_df = pd.read_csv(qe_path)
pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
pivot_table = pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(12, 5))
pivot_table.plot(kind="bar", stacked=True, ax=ax, colormap="tab10", edgecolor="white", linewidth=0.5)
ax.set_title("Quality Enforcement Actions by Station")
ax.set_xlabel("Station")
ax.set_ylabel("Action Count")
ax.legend(title="Action", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "quality_enforcement_stacked.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "quality_enforcement_stacked.png")))
print("Saved: notebooks/figures/quality_enforcement_stacked.png")
else:
print("quality_enforcement_report.csv not found.")
Saved: notebooks/figures/quality_enforcement_stacked.png
4.2c Synthetic Data Proportion Heatmap¶
Percentage of rows with qf > 0 (synthetic or chain-break) per station per variable.
Higher values indicate greater reliance on imputed/synthetic data.
Stations without _qf columns for a variable are shown as N/A.
# --- Synthetic Data Proportion Heatmap ---
qf_vars = [
"air_temperature_c_qf", "relative_humidity_pct_qf",
"wind_speed_kmh_qf", "rain_mm_qf",
"ffmc_qf", "dmc_qf", "dc_qf", "isi_qf", "bui_qf", "fwi_qf"
]
prop_matrix = {}
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, low_memory=False)
row = {}
for qv in qf_vars:
if qv in df.columns:
row[qv.replace("_qf", "")] = (df[qv] > 0).mean() * 100
else:
row[qv.replace("_qf", "")] = float("nan")
prop_matrix[station] = row
prop_df = pd.DataFrame(prop_matrix).T
fig, ax = plt.subplots(figsize=(14, 5))
mask = prop_df.isna()
cmap = plt.cm.YlOrRd.copy()
cmap.set_bad(color="#d0d0d0")
im = ax.imshow(prop_df.values, cmap=cmap, aspect="auto", vmin=0, vmax=100)
ax.set_xticks(range(len(prop_df.columns)))
ax.set_xticklabels(prop_df.columns, rotation=45, ha="right", fontsize=8)
ax.set_yticks(range(len(prop_df.index)))
ax.set_yticklabels(prop_df.index, fontsize=8)
# Annotate cells
for i in range(len(prop_df.index)):
for j in range(len(prop_df.columns)):
v = prop_df.iloc[i, j]
if pd.isna(v):
ax.text(j, i, "N/A", ha="center", va="center", fontsize=7, color="gray")
else:
ax.text(j, i, f"{v:.1f}%", ha="center", va="center", fontsize=7,
color="white" if v > 50 else "black")
fig.colorbar(im, ax=ax, label="% rows with qf > 0")
ax.set_title("Synthetic Data Proportion by Station and Variable")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "synthetic_proportion_heatmap.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "synthetic_proportion_heatmap.png")))
print("Saved: notebooks/figures/synthetic_proportion_heatmap.png")
Saved: notebooks/figures/synthetic_proportion_heatmap.png
4.2d Quality Flag Timeline¶
Stacked bars of quality-flag counts per station, split by flag type.
Each row is a station; colors indicate the type of quality issue detected.
Stations without a _quality_flags column are omitted.
# --- Quality Flag Timeline ---
import ast
flag_color_map = {
"value_range": "#e6194b", # red
"rate_of_change": "#f58231", # orange
"flatline": "#3cb44b", # green
"cross_variable": "#4363d8", # blue
"stuck_sensor": "#911eb4", # purple
}
flag_data = {}
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
if "_quality_flags" not in df.columns:
continue
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
# Parse flag lists and extract types
flag_types = []
for val in df["_quality_flags"].dropna():
try:
flags = ast.literal_eval(val) if isinstance(val, str) else val
for f in flags:
ftype = f.split(":")[0] if ":" in str(f) else str(f)
flag_types.append(ftype)
except Exception:
pass
if flag_types:
flag_data[station] = pd.Series(flag_types).value_counts()
if flag_data:
flag_df = pd.DataFrame(flag_data).fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(12, 4 + 0.3 * len(flag_df.columns)))
flag_df.plot(kind="barh", stacked=True, ax=ax,
color=[flag_color_map.get(t, "#aaa") for t in flag_df.index])
ax.set_title("Quality Flag Types by Station")
ax.set_xlabel("Flag Count")
ax.set_ylabel("")
ax.legend(title="Flag Type", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "quality_flag_timeline.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "quality_flag_timeline.png")))
print("Saved: notebooks/figures/quality_flag_timeline.png")
else:
print("No quality flag data found for any station.")
Saved: notebooks/figures/quality_flag_timeline.png
4.4 Cross-Station Imputation Audit¶
Donor-to-recipient imputation pairs and counts.
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
cs_df = pd.read_csv(cs_path)
print(f"Cross-station imputation audit: {len(cs_df)} entries")
if "donor" in cs_df.columns and "target" in cs_df.columns:
print("\nDonor -> Recipient pairs:")
pairs = cs_df.groupby(["donor", "target"]).size().reset_index(name="count")
print(pairs.to_string(index=False))
else:
print("Columns:", list(cs_df.columns))
else:
print("cross_station_imputation_audit.csv not found.")
Cross-station imputation audit: 78623 entries
Columns: ['station', 'timestamp_utc', 'variable', 'imputed_value', 'quality_flag', 'source', 'method', 'donor_priority']
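The audit uses source and station columns rather than the donor/target names the cell above looks for. A hedged sketch tabulating donor-to-recipient pairs from the columns shown, assuming source holds the donor station (as the donor-contribution chart in section 4.4b below also assumes) and station holds the recipient:

# Hedged sketch — donor -> recipient pair counts from the audit's actual columns.
if cs_path.exists():
    cs_df = pd.read_csv(cs_path)
    if {"source", "station"}.issubset(cs_df.columns):
        pairs = (
            cs_df.groupby(["source", "station"])
            .size()
            .reset_index(name="count")
            .rename(columns={"source": "donor", "station": "recipient"})
            .sort_values("count", ascending=False)
        )
        print(pairs.to_string(index=False))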
4.4b Cross-Station Donor Audit (Visual)¶
Dual-panel chart showing:
- Left: Donor contribution bars (how many values each donor provided)
- Right: Imputation method breakdown by station
# --- Cross-Station Donor Audit Visual ---
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
cs_df = pd.read_csv(cs_path)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Left: donor contributions
if "source" in cs_df.columns:
donor_counts = cs_df["source"].value_counts()
donor_counts.plot(kind="barh", ax=ax1, color="#4363d8", edgecolor="white")
ax1.set_title("Donor Contribution Count")
ax1.set_xlabel("Number of Imputed Values")
ax1.set_ylabel("")
else:
ax1.text(0.5, 0.5, "No source column", ha="center", va="center", transform=ax1.transAxes)
# Right: method breakdown by station
if "station" in cs_df.columns and "method" in cs_df.columns:
method_pivot = cs_df.groupby(["station", "method"]).size().reset_index(name="count")
method_table = method_pivot.pivot(index="station", columns="method", values="count").fillna(0).astype(int)
method_table.plot(kind="bar", stacked=True, ax=ax2, colormap="Set2", edgecolor="white", linewidth=0.5)
ax2.set_title("Imputation Method by Station")
ax2.set_xlabel("Station")
ax2.set_ylabel("Count")
ax2.legend(title="Method", fontsize=7)
plt.xticks(rotation=45, ha="right")
else:
ax2.text(0.5, 0.5, "No method/station columns", ha="center", va="center", transform=ax2.transAxes)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "cross_station_donor_audit.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "cross_station_donor_audit.png")))
print("Saved: notebooks/figures/cross_station_donor_audit.png")
else:
print("cross_station_imputation_audit.csv not found.")
Saved: notebooks/figures/cross_station_donor_audit.png
5. Principal Component Analysis¶
PCA on hourly temperature to identify station similarity patterns.
temp_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "air_temperature_c" in df.columns:
temp_frames.append(df[["station", "timestamp_utc", "air_temperature_c"]])
temp_all = pd.concat(temp_frames, ignore_index=True)
matrix = build_station_matrix(temp_all, value_column="air_temperature_c")
print(f"Station matrix shape: {matrix.shape}")
print(f"Stations: {list(matrix.columns)}")
Station matrix shape: (26113, 6)
Stations: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
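build_station_matrix is a package helper whose internals are not shown here; a minimal equivalent sketch, assuming it simply pivots the long-format observations into a timestamp × station matrix (consistent with the (26113, 6) shape printed above):

# Hedged sketch — an equivalent pivot, NOT the package implementation.
matrix_sketch = temp_all.pivot_table(
    index="timestamp_utc", columns="station", values="air_temperature_c"
)
print(matrix_sketch.shape)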
loadings = pca_station_loadings(matrix)
print("PCA Loadings:")
print(loadings.to_string(index=False))
PCA Loadings:
station component loading explained_variance_ratio
cavendish PC1 0.413668 0.963466
greenwich PC1 0.396266 0.963466
north_rustico PC1 0.413277 0.963466
stanhope PC1 0.406986 0.963466
stanley_bridge PC1 0.405674 0.963466
tracadie PC1 0.413334 0.963466
cavendish PC2 -0.167839 0.018571
greenwich PC2 0.900793 0.018571
north_rustico PC2 -0.171177 0.018571
stanhope PC2 -0.054382 0.018571
stanley_bridge PC2 -0.323366 0.018571
tracadie PC2 -0.153547 0.018571
normalized = matrix.dropna(axis="index", how="any")
normalized = (normalized - normalized.mean()) / normalized.std()
n_comp = min(len(matrix.columns), normalized.shape[0])
pca_full = PCA(n_components=n_comp)
pca_full.fit(normalized)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, n_comp + 1), pca_full.explained_variance_ratio_, "o-")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_title("Scree Plot")
ax.set_xticks(range(1, n_comp + 1))
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_scree.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_scree.png")))
print("Saved: notebooks/figures/pca_scree.png")
ratios = pca_full.explained_variance_ratio_.round(4).tolist()
print(f"Explained variance ratios: {ratios}")
Saved: notebooks/figures/pca_scree.png
Explained variance ratios: [0.9635, 0.0186, 0.0098, 0.0062, 0.0011, 0.0008]
# Station positions in PC space come from the component loadings
# (pca_full.components_ has shape (n_components, n_stations)); transform()
# would return per-timestamp scores, not per-station points.
station_scores = pca_full.components_.T
# Distinct color per station
station_colors = {
"greenwich": "#e6194b",
"cavendish": "#3cb44b",
"north_rustico": "#4363d8",
"stanhope": "#f58231",
"stanley_bridge": "#911eb4",
"tracadie": "#42d4f4",
}
fig, ax = plt.subplots(figsize=(8, 6))
for i, station in enumerate(matrix.columns):
color = station_colors.get(station, None)
ax.scatter(station_scores[i, 0], station_scores[i, 1], s=120, zorder=5, color=color, label=station, edgecolors="white", linewidth=0.5)
ax.annotate(station, (station_scores[i, 0], station_scores[i, 1]), textcoords="offset points", xytext=(5, 5), fontsize=9, color=color)
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
ax.set_xlabel(f"PC1 ({pca_full.explained_variance_ratio_[0]:.1%})")
ax.set_ylabel(f"PC2 ({pca_full.explained_variance_ratio_[1]:.1%})")
ax.set_title("PCA Biplot - Station Scores")
ax.axhline(0, color="gray", linewidth=0.5)
ax.axvline(0, color="gray", linewidth=0.5)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_biplot.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_biplot.png")))
print("Saved: notebooks/figures/pca_biplot.png")
Saved: notebooks/figures/pca_biplot.png
6. Hierarchical Clustering¶
Stations clustered by temperature correlation to identify redundancy groups.
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
corr = matrix.corr()
distance = 1 - corr
dist_array = squareform(distance.values, checks=False)
link = linkage(dist_array, method="average")
fig, ax = plt.subplots(figsize=(8, 5))
dendrogram(link, labels=list(matrix.columns), ax=ax, leaf_rotation=45)
ax.set_ylabel("Distance (1 - correlation)")
ax.set_title("Hierarchical Clustering Dendrogram")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "clustering_dendrogram.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "clustering_dendrogram.png")))
print("Saved: notebooks/figures/clustering_dendrogram.png")
Saved: notebooks/figures/clustering_dendrogram.png
cluster_order = cluster_station_order(matrix)
print("Cluster assignments:")
for entry in cluster_order:
print(f" {entry}")
corr = pairwise_station_correlation(matrix).fillna(0.0)
dist = 1 - corr
print("\nPairwise distance matrix:")
print(dist.round(3).to_string())
Cluster assignments:
  stanhope
  tracadie
  cavendish
  north_rustico
  stanley_bridge
  greenwich

Pairwise distance matrix:
station         cavendish  greenwich  north_rustico  stanhope  stanley_bridge  tracadie
station
cavendish           0.000      0.093          0.006     0.033           0.028     0.006
greenwich           0.093      0.000          0.086     0.095           0.102     0.058
north_rustico       0.006      0.086          0.000     0.035           0.026     0.005
stanhope            0.033      0.095          0.035     0.000           0.057     0.031
stanley_bridge      0.028      0.102          0.026     0.057           0.000     0.026
tracadie            0.006      0.058          0.005     0.031           0.026     0.000
7. Redundancy Analysis¶
Combining PCA, clustering, and benchmarking against the Stanhope reference to identify redundant stations.
try:
benchmark = benchmark_to_stanhope(matrix, reference_station="stanhope")
print("Benchmark results:")
print(benchmark.to_string(index=False))
except ValueError as e:
print(f"Stanhope benchmark skipped: {e}")
print("Using inter-station metrics only.")
benchmark = None
Benchmark results:
station reference_station overlap_count mean_abs_diff correlation observations
cavendish stanhope 21858 1.815121 0.966669 [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
greenwich stanhope 24067 1.540169 0.905163 [0.131, 1.272, 1.776, 1.6753333333333327, 1.3593333333333337, 1.0616666666666668, 0.4973333333333335, 0.8983333333333332, 1.579333333333333, 2.796333333333333, 2.403, 1.596, 0.30799999999999983, 2.551, 2.433666666666667, 2.436, 1.4793333333333332, 0.9983333333333333, 0.7026666666666668, 0.5543333333333336, 0.9340000000000002, 2.0716666666666668, 2.3259999999999996, 2.262666666666667, 1.614, 0.8383333333333329, 1.033666666666667, 0.3340000000000001, 0.5190000000000001, 0.6160000000000001, 0.07799999999999985, 0.04800000000000004, 0.9796666666666667, 0.3923333333333332, 0.9243333333333337, 0.301333333333333, 0.07833333333333314, 0.44599999999999973, 0.16566666666666663, 0.3240000000000003, 1.0553333333333335, 1.745333333333333, 2.252333333333333, 2.981333333333333, 2.879333333333333, 2.4543333333333335, 1.6183333333333334, 1.6399999999999997, 2.2116666666666664, 2.530333333333333, 2.5700000000000003, 2.155, 1.5509999999999997, 1.5199999999999996, 1.189, 0.06699999999999928, 1.2399999999999998, 2.4606666666666666, 3.372, 4.061, 4.348333333333334, 4.505333333333333, 4.098333333333334, 4.56, 3.0966666666666667, 1.7093333333333338, 0.17433333333333367, 1.8640000000000003, 1.8049999999999997, 1.5246666666666657, 0.48566666666666647, 1.4106666666666663, 0.5083333333333331, 0.8073333333333332, 1.334, 0.44133333333333336, 0.852, 0.40733333333333344, 0.41466666666666663, 2.330333333333333, 4.1450000000000005, 5.676333333333333, 6.049666666666667, 5.577999999999999, 0.5589999999999993, 1.0030000000000001, 2.7380000000000004, 3.351, 2.309, 3.887333333333334, 4.211666666666666, 4.692333333333334, 5.078666666666667, 4.711333333333333, 4.510333333333333, 3.167666666666667, 0.07866666666666666, 0.3703333333333334, 0.5583333333333333, 2.6606666666666667, ...]
north_rustico stanhope 22994 1.872865 0.964902 [3.2779999999999996, 4.0600000000000005, 3.0125, 3.3594999999999997, 5.177999999999999, 5.859999999999999, 5.2125, 5.2595, 4.9555, 0.5805000000000002, 0.3480000000000001, 0.6470000000000001, 1.1544999999999999, 0.42549999999999977, 0.607, 1.1465, 0.9923333333333335, 1.2554999999999996, 2.831, 5.4375, 5.569500000000001, 0.42600000000000016, 0.7905000000000006, 2.645999999999999, 3.653500000000001, 2.3629999999999995, 1.8560000000000008, 1.9699999999999998, 1.1725000000000003, 1.1084999999999994, 0.3155000000000001, 1.5074999999999998, 0.47550000000000026, 0.44649999999999945, 1.4965000000000002, 1.5819999999999999, 1.9894999999999996, 0.16349999999999998, 0.488, 0.23050000000000015, 0.956, 2.5140000000000002, 4.861000000000001, 3.7095000000000002, 5.258, 7.7545, 3.6179999999999986, 0.7385000000000002, 2.0774999999999997, 0.3949999999999996, 5.0165, 4.219999999999999, 4.367999999999999, 2.4349999999999996, 0.4775000000000009, 0.07399999999999984, 0.6455000000000002, 0.06300000000000061, 1.0415, 0.6745000000000001, 1.8719999999999999, 0.9215, 1.787, 2.569, 2.8975, 2.7225000000000006, 1.4784999999999995, 0.5105, 0.25849999999999973, 0.9654999999999996, 0.9155000000000002, 0.9110000000000005, 0.39700000000000024, 0.39449999999999985, 6.627, 5.5705, 2.4085, 0.44650000000000034, 1.5984999999999996, 0.8835000000000006, 0.7670000000000003, 1.9605000000000015, 1.8565000000000005, 1.9209999999999985, 1.0989999999999993, 0.7234999999999996, 0.7985000000000007, 1.2915, 1.1304999999999996, 0.573500000000001, 0.29800000000000093, 0.22299999999999986, 0.2054999999999998, 0.3130000000000006, 0.702, 0.5025000000000004, 0.49350000000000005, 2.8790000000000013, 4.922, 2.186, ...]
stanley_bridge stanhope 24066 2.099542 0.942855 [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
tracadie stanhope 21940 1.822442 0.968903 [1.1550000000000011, 0.9824999999999982, 0.8283333333333331, 0.6266666666666687, 0.33333333333333215, 0.39416666666667055, 0.7650000000000006, 1.5183333333333309, 0.5924999999999976, 1.1966666666666725, 0.043333333333333, 1.0366666666666688, 1.1408333333333331, 1.1508333333333347, 1.8183333333333387, 1.5458333333333307, 0.769166666666667, 0.086666666666666, 2.557500000000001, 3.0933333333333337, 2.8599999999999994, 2.2641666666666644, 1.975833333333334, 1.7858333333333292, 1.6616666666666688, 1.5225000000000009, 1.6858333333333313, 1.803333333333331, 1.4700000000000024, 0.8125, 0.33500000000000085, 0.16999999999999815, 1.4050000000000011, 2.2225, 2.2025000000000006, 1.3141666666666687, 1.0933333333333337, 0.19583333333333286, 0.7155555555555608, 0.17499999999999716, 0.398333333333337, 0.21083333333333343, 0.8533333333333317, 1.8975000000000009, 1.3825000000000003, 2.1125000000000007, 2.233333333333338, 2.0383333333333304, 1.8616666666666646, 1.1433333333333344, 1.134999999999998, 0.8524999999999991, 0.8716666666666661, 0.9283333333333275, 0.09416666666666984, 0.6916666666666664, 2.231666666666669, 2.1633333333333304, 1.4450000000000003, 2.8283333333333367, 1.7749999999999986, 2.0249999999999986, 1.4258333333333333, 1.2699999999999996, 0.3758333333333326, 0.0591666666666697, 1.2349999999999994, 2.7025000000000006, 3.9775000000000027, 4.57833333333333, 4.006666666666664, 3.7441666666666684, 2.7775, 2.576666666666661, 2.0799999999999983, 0.6533333333333324, 0.4833333333333343, 0.03999999999999915, 0.1700000000000017, 1.5308333333333373, 2.6466666666666683, 2.288333333333334, 1.7424999999999997, 1.1666666666666679, 2.575833333333332, 2.3833333333333364, 1.8758333333333361, 0.9899999999999984, 0.036666666666668846, 1.1208333333333336, 3.6866666666666674, 3.418333333333333, 3.5199999999999996, 2.973333333333329, 1.4858333333333285, 1.4608333333333299, 1.1708333333333272, 1.047500000000003, 1.4683333333333373, 1.6616666666666653, ...]
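benchmark_to_stanhope is a package function, so its internals are not shown here. A minimal sketch of how the overlap_count, mean_abs_diff, and correlation columns in the table above could be derived from the station matrix, assuming they are computed on pairwise-complete hourly temperatures against the reference column:

# Hedged sketch — NOT the pea_met_network implementation. Computes the three
# headline benchmark metrics per station against a reference column of the
# station matrix, using pairwise-complete hourly observations.
def sketch_benchmark(matrix_df, reference_station="stanhope"):
    rows = []
    for station in matrix_df.columns:
        if station == reference_station:
            continue
        pair = matrix_df[[station, reference_station]].dropna()
        rows.append({
            "station": station,
            "reference_station": reference_station,
            "overlap_count": len(pair),
            "mean_abs_diff": (pair[station] - pair[reference_station]).abs().mean(),
            "correlation": pair[station].corr(pair[reference_station]),
        })
    return pd.DataFrame(rows)

print(sketch_benchmark(matrix).round(3).to_string(index=False))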
if benchmark is not None and len(benchmark) > 0:
recommendations = build_station_recommendations(
benchmark,
pca_loadings=loadings,
cluster_order=cluster_order,
)
print("Station Recommendations:")
print(recommendations.to_string(index=False))
else:
print("Cannot build recommendations without benchmark.")
recommendations = None
Station Recommendations:
station reference_station recommendation risk_probability ci_lower ci_upper risk_band evidence assumptions limitations
cavendish stanhope remove 0.121140 0.012887 0.288408 low benchmark correlation=0.967; uncertainty=low (0.01-0.29); pca=[PC1=0.414, PC2=-0.168]; cluster=position 3/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
greenwich stanhope defer 0.028022 0.000000 0.060345 low benchmark correlation=0.905; uncertainty=low (0.00-0.06); pca=[PC1=0.396, PC2=0.901]; cluster=position 6/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
north_rustico stanhope remove 0.110791 0.013123 0.265487 low benchmark correlation=0.965; uncertainty=low (0.01-0.27); pca=[PC1=0.413, PC2=-0.171]; cluster=position 4/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge stanhope defer 0.063608 0.010394 0.144060 low benchmark correlation=0.943; uncertainty=low (0.01-0.14); pca=[PC1=0.406, PC2=-0.323]; cluster=position 5/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
tracadie stanhope remove 0.123004 0.014229 0.279774 low benchmark correlation=0.969; uncertainty=low (0.01-0.28); pca=[PC1=0.413, PC2=-0.154]; cluster=position 2/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
8. Uncertainty Quantification¶
Risk probabilities and confidence intervals for station removal.
from pea_met_network.uncertainty import quantify_station_removal_risk
if benchmark is not None and len(benchmark) > 0:
uncertainty = quantify_station_removal_risk(benchmark)
print("Uncertainty Quantification (KDE-based risk):")
print(uncertainty.to_string(index=False))
fig, ax = plt.subplots(figsize=(8, 4))
stations_plot = uncertainty["station"].tolist()
risks = uncertainty["risk_probability"].tolist()
ci_lo = uncertainty["ci_lower"].tolist()
ci_hi = uncertainty["ci_upper"].tolist()
x = range(len(stations_plot))
ax.bar(x, risks, color="steelblue", alpha=0.7)
yerr_lo = [r - lo for r, lo in zip(risks, ci_lo)]
yerr_hi = [hi - r for r, hi in zip(risks, ci_hi)]
ax.errorbar(x, risks, yerr=[yerr_lo, yerr_hi], fmt="none", c="black", capsize=5)
ax.set_xticks(list(x))
ax.set_xticklabels(stations_plot, rotation=45, ha="right")
ax.set_ylabel("Removal Risk Probability")
ax.set_title("Station Removal Risk with 95% Confidence Intervals")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "uncertainty_risk.png", bbox_inches="tight")
plt.show()
display(Image(filename=str(FIGURES_DIR / "uncertainty_risk.png")))
print("Saved: notebooks/figures/uncertainty_risk.png")
else:
print("Uncertainty analysis requires benchmark data (skipped).")
Uncertainty Quantification (KDE-based risk):
station reference_station risk_probability ci_lower ci_upper risk_band assumptions limitations
cavendish stanhope 0.121140 0.012887 0.288408 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
greenwich stanhope 0.028022 0.000000 0.060345 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
north_rustico stanhope 0.110791 0.013123 0.265487 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge stanhope 0.063608 0.010394 0.144060 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
tracadie stanhope 0.123004 0.014229 0.279774 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
Saved: notebooks/figures/uncertainty_risk.png
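quantify_station_removal_risk is a package function; its assumptions column above says the risk comes from a gaussian_kde over station-reference divergence samples. A minimal sketch of that idea, assuming an illustrative 5 °C divergence tolerance and a simple bootstrap for the interval (both are assumptions, not the package's actual parameters):

# Hedged sketch — NOT the pea_met_network implementation. Fits a Gaussian KDE to
# station-vs-reference divergence samples and integrates the tail beyond a
# tolerance to get a removal-risk probability, with a bootstrap interval.
# The 5.0 degC tolerance and 200 bootstrap draws are illustrative assumptions.
from scipy.stats import gaussian_kde

def sketch_removal_risk(divergences, tolerance=5.0, n_boot=200, seed=0):
    samples = np.asarray(divergences, dtype=float)
    samples = samples[~np.isnan(samples)]
    kde = gaussian_kde(samples)
    risk = kde.integrate_box_1d(tolerance, np.inf)
    rng = np.random.default_rng(seed)
    boot = [
        gaussian_kde(rng.choice(samples, size=samples.size, replace=True))
        .integrate_box_1d(tolerance, np.inf)
        for _ in range(n_boot)
    ]
    ci_lower, ci_upper = np.percentile(boot, [2.5, 97.5])
    return risk, ci_lower, ci_upper

# Example: cavendish-vs-stanhope absolute divergence from the temperature matrix
div = (matrix["cavendish"] - matrix["stanhope"]).abs().dropna()
print(sketch_removal_risk(div.values))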
Conclusion¶
TBD — fill in after review.