import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image, display
from pathlib import Path
from sklearn.decomposition import PCA
from pea_met_network.qa_qc import coverage_summary
from pea_met_network.redundancy import (
benchmark_to_stanhope,
build_station_matrix,
build_station_recommendations,
cluster_station_order,
pairwise_station_correlation,
pca_station_loadings,
)
# ---------------------------------------------------------------------------
# FWI mode selector — switch between "hourly" and "compliant"
# ---------------------------------------------------------------------------
FWI_MODE = "hourly"
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(".").resolve()
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "notebooks" / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
# Date range for analysis — must match cleaning-config.json
ANALYSIS_START = pd.Timestamp("2023-04-01", tz="UTC")
sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 120
print(f"FWI mode: {FWI_MODE}")
print(f"Processed dir: {PROCESSED_DIR}")
FWI mode: hourly
Processed dir: /mnt/fast_data/workspaces/pea-met-network/data/processed
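Since ANALYSIS_START must stay in sync with cleaning-config.json, here is a hedged sketch of reading the date from that config instead of hardcoding it (the "analysis_start" key name is an assumption; use whatever key the config actually defines).

# Hedged sketch — load the analysis start date from cleaning-config.json so the
# notebook cannot drift from the pipeline configuration. The "analysis_start"
# key is assumed, not confirmed by the config schema.
config_path = PROJECT_ROOT / "cleaning-config.json"
if config_path.exists():
    with open(config_path) as fh:
        cleaning_cfg = json.load(fh)
    if "analysis_start" in cleaning_cfg:
        ANALYSIS_START = pd.Timestamp(cleaning_cfg["analysis_start"], tz="UTC")
print(f"Analysis start: {ANALYSIS_START}")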
1. Data Loading¶
Load hourly processed data for all stations. Station list is derived from disk.
# Discover stations from disk — no hardcoded list
stations = sorted([
d.name for d in PROCESSED_DIR.iterdir()
if d.is_dir() and (d / "station_hourly.csv").exists()
])
print(f"Stations found: {stations}")
hourly_frames = []
daily_frames = []
for station in stations:
h_path = PROCESSED_DIR / station / "station_hourly.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
h_df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
h_df["timestamp_utc"] = pd.to_datetime(h_df["timestamp_utc"], utc=True)
h_df = h_df[h_df["timestamp_utc"] >= ANALYSIS_START]
hourly_frames.append(h_df)
if d_path.exists():
d_df = pd.read_csv(d_path, parse_dates=["timestamp_utc"])
d_df["timestamp_utc"] = pd.to_datetime(d_df["timestamp_utc"], utc=True)
daily_frames.append(d_df)
tmin = h_df["timestamp_utc"].min()
tmax = h_df["timestamp_utc"].max()
print(f" {station}: {len(h_df)} hourly rows ({tmin} to {tmax})")
hourly_all = pd.concat(hourly_frames, ignore_index=True)
if daily_frames:
daily_all = pd.concat(daily_frames, ignore_index=True)
else:
daily_all = pd.DataFrame()
n = hourly_all["station"].nunique()
print(f"\nCombined: {len(hourly_all)} hourly rows, {n} stations")
Stations found: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
  cavendish: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  greenwich: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  north_rustico: 26113 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-24 00:00:00+00:00)
  stanhope: 24144 hourly rows (2023-04-01 00:00:00+00:00 to 2025-12-31 23:00:00+00:00)
  stanley_bridge: 26092 hourly rows (2023-04-01 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)
  tracadie: 23956 hourly rows (2023-06-29 00:00:00+00:00 to 2026-03-23 03:00:00+00:00)

Combined: 152489 hourly rows, 6 stations
coverage_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
coverage_frames.append(df[["station", "timestamp_utc"]])
coverage_df = pd.concat(coverage_frames, ignore_index=True)
cov_summary = coverage_summary(coverage_df)
print(cov_summary.to_string(index=False))
station total_records
cavendish 26092
greenwich 26092
north_rustico 26113
stanhope 24144
stanley_bridge 26092
tracadie 23956
2.2 Temporal Coverage Summary¶
Visual overview of data availability by station over time.
fig, ax = plt.subplots(figsize=(12, 4))
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
start = df["timestamp_utc"].min().floor("D")
end = df["timestamp_utc"].max().floor("D")
date_range = pd.date_range(start, end, freq="D")
daily_counts = df.set_index("timestamp_utc").resample("D").size()
daily_counts = daily_counts.reindex(date_range, fill_value=0)
ax.fill_between(daily_counts.index, 0, daily_counts.values, alpha=0.4, label=station)
ax.plot(daily_counts.index, daily_counts.values, linewidth=0.8)
ax.set_xlabel("Date")
ax.set_ylabel("Hourly Records per Day")
ax.set_title("Temporal Coverage by Station")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "temporal_coverage.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "temporal_coverage.png")))
print("Saved: notebooks/figures/temporal_coverage.png")
Saved: notebooks/figures/temporal_coverage.png
2.3 Missingness Heatmaps¶
Each heatmap is split into two panels:
Left panel — Core Sensors / FWI: Variables directly used as FWI inputs (temp, RH, wind, rain) plus FWI output indices (FFMC, DMC, DC, ISI, BUI, FWI on daily). These are the columns that matter for the fire weather pipeline.
Right panel — Extended Sensors: Auxiliary measurements (wind direction, solar radiation, barometric pressure, dew point, gust speed) that are not used in FWI computation. Many stations lack some or all of these sensors, so 100% missing values here indicate a missing instrument, not a data problem.
Hourly heatmap — raw observations from station_hourly.csv.
Daily heatmap — daily aggregates with FWI components from <station>_daily_compliant.csv (falls back to station_daily.csv).
# --- Hourly missingness heatmap (core vs extended sensors) ---
FWI_CORE_VARS = [
"air_temperature_c", "relative_humidity_pct",
"wind_speed_kmh", "rain_mm",
]
HOURLY_EXTENDED_VARS = [
"wind_direction_deg", "solar_radiation_w_m2",
"barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]
hourly_missing_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath)
df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
row = {"station": station}
for var in FWI_CORE_VARS + HOURLY_EXTENDED_VARS:
if var in df.columns:
row[var] = df[var].isna().mean() * 100
else:
row[var] = 100.0
hourly_missing_frames.append(row)
hourly_missing_df = pd.DataFrame(hourly_missing_frames).set_index("station")
# Split into core and extended panels
core_cols = [v for v in FWI_CORE_VARS if v in hourly_missing_df.columns]
ext_cols = [v for v in HOURLY_EXTENDED_VARS if v in hourly_missing_df.columns]
fig, (ax_core, ax_ext) = plt.subplots(
1, 2, figsize=(max(12, (len(core_cols) + len(ext_cols)) * 1.1), max(3, len(stations) * 0.45)),
gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.4},
)
sns.heatmap(
hourly_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_core, cbar_kws={"label": "% Missing"},
)
ax_core.set_title("Core Sensors (FWI Inputs)", fontweight="bold", fontsize=11)
ax_core.set_xlabel("Variable")
ax_core.set_ylabel("Station")
if ext_cols:
sns.heatmap(
hourly_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_ext, cbar_kws={"label": "% Missing"},
)
else:
ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
ha="center", va="center", fontsize=12, color="gray")
ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
ax_ext.set_xlabel("Variable")
ax_ext.set_ylabel("")
fig.suptitle("Hourly Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
fontsize=12, y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_heatmap_hourly.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_hourly.png")))
print("Saved: notebooks/figures/missingness_heatmap_hourly.png")
print()
print("Hourly missingness summary (core):")
print(hourly_missing_df[core_cols].round(1).to_string())
print()
print("Hourly missingness summary (extended):")
print(hourly_missing_df[ext_cols].round(1).to_string())
Saved: notebooks/figures/missingness_heatmap_hourly.png
Hourly missingness summary (core):
air_temperature_c relative_humidity_pct wind_speed_kmh rain_mm
station
cavendish 14.5 14.5 0.0 0.0
greenwich 4.1 16.2 4.0 0.1
north_rustico 4.1 4.1 0.0 0.0
stanhope 0.3 0.3 0.3 0.2
stanley_bridge 0.0 17.2 0.0 0.0
tracadie 0.0 12.8 0.0 0.0
Hourly missingness summary (extended):
wind_direction_deg solar_radiation_w_m2 barometric_pressure_kpa dew_point_c wind_gust_speed_kmh
station
cavendish 14.5 14.5 100.0 14.5 14.9
greenwich 12.1 12.1 100.0 16.2 12.5
north_rustico 2.1 1.3 1.3 1.3 4.3
stanhope 2.1 100.0 100.0 0.3 100.0
stanley_bridge 18.4 18.4 34.7 100.0 32.1
tracadie 13.0 13.0 18.8 100.0 15.5
# --- Daily missingness heatmap (core vs extended, includes FWI) ---
FWI_OUTPUT_VARS = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
FWI_CORE_DAILY = [
"air_temperature_c", "relative_humidity_pct",
"wind_speed_kmh", "rain_mm",
] + FWI_OUTPUT_VARS
DAILY_EXTENDED = [
"wind_direction_deg", "solar_radiation_w_m2",
"barometric_pressure_kpa", "dew_point_c", "wind_gust_speed_kmh",
]
daily_missing_frames = []
for station in stations:
c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
fpath = c_path if c_path.exists() else d_path
if not fpath.exists():
continue
df = pd.read_csv(fpath)
if "timestamp_utc" in df.columns:
df = df[pd.to_datetime(df["timestamp_utc"], utc=True) >= ANALYSIS_START]
data_cols = sorted([
c for c in df.columns
if not c.endswith("_qf") and not c.endswith("_src")
and not c.endswith("_method") and not c.startswith("_quality")
and c not in ("station", "timestamp_utc", "carry_forward_used")
])
row = {"station": station}
for var in data_cols:
row[var] = df[var].isna().mean() * 100
daily_missing_frames.append(row)
if daily_missing_frames:
daily_missing_df = pd.DataFrame(daily_missing_frames).set_index("station")
all_cols = sorted(daily_missing_df.columns)
core_cols = [v for v in FWI_CORE_DAILY if v in all_cols]
ext_cols = [v for v in all_cols if v not in core_cols]
fig, (ax_core, ax_ext) = plt.subplots(
1, 2,
figsize=(max(14, (len(core_cols) + len(ext_cols)) * 1.0), max(3, len(daily_missing_df) * 0.45)),
gridspec_kw={"width_ratios": [len(core_cols), max(len(ext_cols), 1)], "wspace": 0.5},
)
sns.heatmap(
daily_missing_df[core_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_core, cbar_kws={"label": "% Missing"},
)
ax_core.set_title("Core + FWI Outputs", fontweight="bold", fontsize=11)
ax_core.set_xlabel("Variable")
ax_core.set_ylabel("Station")
if ext_cols:
sns.heatmap(
daily_missing_df[ext_cols], annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_ext, cbar_kws={"label": "% Missing"},
)
else:
ax_ext.text(0.5, 0.5, "No extended\nvariables", transform=ax_ext.transAxes,
ha="center", va="center", fontsize=12, color="gray")
ax_ext.set_title("Extended Sensors (non-FWI)", fontweight="bold", fontsize=11)
ax_ext.set_xlabel("Variable")
ax_ext.set_ylabel("")
fig.suptitle("Daily Data \u2014 Missingness Heatmap (% Missing per Variable per Station)",
fontsize=12, y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_heatmap_daily.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_heatmap_daily.png")))
print("Saved: notebooks/figures/missingness_heatmap_daily.png")
print()
print("Daily missingness summary (core + FWI):")
print(daily_missing_df[core_cols].round(1).to_string())
print()
print("Daily missingness summary (extended):")
print(daily_missing_df[ext_cols].round(1).to_string())
else:
print("No daily CSVs found \u2014 daily missingness heatmap skipped.")
Saved: notebooks/figures/missingness_heatmap_daily.png
Daily missingness summary (core + FWI):
air_temperature_c relative_humidity_pct wind_speed_kmh rain_mm ffmc dmc dc isi bui fwi
station
cavendish 14.6 14.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
greenwich 4.1 16.2 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
north_rustico 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
stanhope 0.6 0.7 0.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0
stanley_bridge 0.0 17.2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
tracadie 0.0 12.8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Daily missingness summary (extended):
barometric_pressure_kpa dew_point_c solar_radiation_w_m2 wind_direction_deg wind_gust_speed_kmh
station
cavendish 100.0 14.6 14.6 14.6 15.0
greenwich 100.0 16.2 12.1 12.1 12.5
north_rustico 1.2 1.2 1.3 1.9 4.1
stanhope 100.0 0.6 NaN 1.2 NaN
stanley_bridge 34.6 100.0 18.4 18.4 32.0
tracadie 18.7 100.0 12.9 12.9 15.5
2.4 Imputation Summary¶
Gaps filled per station per variable from the pipeline imputation report.
imp_path = PROCESSED_DIR / "imputation_report.csv"
if imp_path.exists():
imp_df = pd.read_csv(imp_path)
print(f"Imputation report: {len(imp_df)} entries")
print("\nGaps filled per station per variable:")
pivot = imp_df.groupby(["station", "variable"])["count_affected"].sum().reset_index()
pivot_table = pivot.pivot(index="station", columns="variable", values="count_affected").fillna(0).astype(int)
print(pivot_table.to_string())
print("\nImputation method breakdown:")
method_counts = imp_df["method"].value_counts()
print(method_counts.to_string())
# Bar chart: total gaps filled by station
station_totals = imp_df.groupby("station")["count_affected"].sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(8, 4))
station_totals.plot(kind="barh", ax=ax, color="steelblue", alpha=0.8)
ax.set_xlabel("Total Gaps Filled")
ax.set_title("Imputation Summary by Station")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "imputation_summary.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "imputation_summary.png")))
print("Saved: notebooks/figures/imputation_summary.png")
else:
print("imputation_report.csv not found — run pipeline first.")
Imputation report: 717 entries

Gaps filled per station per variable:
variable        air_temperature_c  barometric_pressure_kpa  dew_point_c  relative_humidity_pct  solar_radiation_w_m2  wind_direction_deg  wind_speed_kmh
station
cavendish                    3789                    26092         3787                   3787                  3786                3786            3890
greenwich                   10158                    26092         4216                   4229                  3147                3145            3249
north_rustico                1076                      330          331                   1075                   331                 547            1188
stanhope                       77                    24144           68                     79                     0                 516              68
stanley_bridge               4486                     9043        26092                  26092                  4800                4800            8371
tracadie                     3076                     4508        23956                  23956                  3107                3107            3809

Imputation method breakdown:
method
linear_interpolation    575
preserve                142
Saved: notebooks/figures/imputation_summary.png
3. Fire Weather Index Analysis¶
Time series of FWI system components for all stations with computed indices.
3.1 FWI Component Time Series¶
Daily-mean FWI components for all stations. FFMC uses hourly values; DMC, DC, BUI, FWI use daily means.
fwi_vars = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
fwi_stations = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df.columns and df["fwi"].notna().sum() > 100:
fwi_stations.append(station)
print(f"Stations with FWI data: {fwi_stations}")
if fwi_stations:
fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
if len(fwi_stations) == 1:
axes = [axes]
for idx, station in enumerate(fwi_stations):
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()
ax = axes[idx]
for var in fwi_vars:
if var in df_daily.columns:
ax.plot(df_daily.index, df_daily[var], label=var, alpha=0.8, linewidth=0.9)
ax.set_ylabel("Value")
ax.set_title(f"FWI Components - {station}")
ax.legend(fontsize=7, ncol=3)
axes[-1].set_xlabel("Date")
fig.suptitle("Fire Weather Index Time Series (Daily Means)", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_timeseries.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_timeseries.png")))
print(f"Plotted FWI for: {fwi_stations}")
print("Saved: notebooks/figures/fwi_timeseries.png")
else:
print("FWI analysis skipped: no stations with non-missing FWI data")
Stations with FWI data: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Plotted FWI for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_timeseries.png
3.1b QF-Aware FWI Time Series¶
FWI values plotted with background shading indicating data quality.
- Yellow band: at least one FWI input was synthetically imputed (qf == 1)
- Red band: at least one FWI input was chain-break gap-filled (qf == 9)
- No shading: all FWI inputs from observed data (qf == 0)
Stations without per-variable _qf columns (stanhope) are plotted without quality shading.
# --- QF-Aware FWI Timeseries ---
if fwi_stations:
fig, axes = plt.subplots(len(fwi_stations), 1, figsize=(14, 3 * len(fwi_stations)), sharex=True)
if len(fwi_stations) == 1:
axes = [axes]
fwi_input_qf = ["air_temperature_c_qf", "relative_humidity_pct_qf", "wind_speed_kmh_qf", "rain_mm_qf"]
for idx, station in enumerate(fwi_stations):
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
ax = axes[idx]
# Daily-mean FWI
df_daily = df.set_index("timestamp_utc")[fwi_vars].resample("D").mean()
# Daily quality: worst qf per day across all available input qf cols
available_qf = [c for c in fwi_input_qf if c in df.columns]
if available_qf:
qf_daily = df.set_index("timestamp_utc")[available_qf].resample("D").max()
worst_qf = qf_daily.max(axis=1)
else:
worst_qf = pd.Series(0, index=df_daily.index)
# Background shading
for i in range(len(worst_qf)):
if worst_qf.iloc[i] >= 9:
ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
worst_qf.index[i] + pd.Timedelta(hours=12),
color='red', alpha=0.15)
elif worst_qf.iloc[i] >= 1:
ax.axvspan(worst_qf.index[i] - pd.Timedelta(hours=12),
worst_qf.index[i] + pd.Timedelta(hours=12),
color='gold', alpha=0.20)
# FWI line
ax.plot(df_daily.index, df_daily["fwi"], color="black", linewidth=0.8, label="FWI")
ax.set_ylabel("FWI")
ax.set_title(f"QF-Aware FWI - {station}")
ax.legend(fontsize=7)
axes[-1].set_xlabel("Date")
fig.suptitle("FWI with Quality Flag Shading (yellow=synthetic, red=chain-break)", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_qf_aware.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_qf_aware.png")))
print("Saved: notebooks/figures/fwi_qf_aware.png")
else:
print("QF-Aware FWI skipped: no stations with FWI data.")
Saved: notebooks/figures/fwi_qf_aware.png
3.2 FWI Mode Comparison¶
Overlay FWI values from hourly and compliant modes to show divergence (e.g., during chain breaks).
# Try to load both modes for comparison
hourly_fwi_data = {}
compliant_fwi_data = {}
for station in stations:
h_path = PROCESSED_DIR / station / "station_hourly.csv"
d_path = PROCESSED_DIR / station / "station_daily.csv"
c_path = PROCESSED_DIR / station / f"{station}_daily_compliant.csv"
if h_path.exists():
df = pd.read_csv(h_path, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df.columns and df["fwi"].notna().sum() > 0:
hourly_fwi_data[station] = df.set_index("timestamp_utc")["fwi"].resample("D").mean()
if c_path.exists():
df_c = pd.read_csv(c_path, parse_dates=["timestamp_utc"])
df_c["timestamp_utc"] = pd.to_datetime(df_c["timestamp_utc"], utc=True)
df_c = df_c[df_c["timestamp_utc"] >= ANALYSIS_START]
if "fwi" in df_c.columns and df_c["fwi"].notna().sum() > 0:
compliant_fwi_data[station] = df_c.set_index("timestamp_utc")["fwi"]
compare_stations = sorted(set(hourly_fwi_data.keys()) & set(compliant_fwi_data.keys()))
if compare_stations:
fig, axes = plt.subplots(len(compare_stations), 1, figsize=(14, 3 * len(compare_stations)), sharex=True)
if len(compare_stations) == 1:
axes = [axes]
for idx, station in enumerate(compare_stations):
ax = axes[idx]
ax.plot(hourly_fwi_data[station].index, hourly_fwi_data[station].values, label="hourly", alpha=0.7, linewidth=0.9)
ax.plot(compliant_fwi_data[station].index, compliant_fwi_data[station].values, label="compliant", alpha=0.7, linewidth=0.9)
ax.set_ylabel("FWI")
ax.set_title(f"FWI Mode Comparison — {station}")
ax.legend(fontsize=8)
axes[-1].set_xlabel("Date")
fig.suptitle("Hourly vs Compliant FWI", y=1.02)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "fwi_mode_comparison.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "fwi_mode_comparison.png")))
print(f"Mode comparison for: {compare_stations}")
print("Saved: notebooks/figures/fwi_mode_comparison.png")
else:
print("FWI mode comparison skipped: compliant-mode daily files not found.")
print("Run the pipeline with --fwi-mode compliant to generate them.")
Mode comparison for: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
Saved: notebooks/figures/fwi_mode_comparison.png
3.3 FWI Value Statistics¶
Descriptive statistics (min, max, mean, std) for FWI codes per station, sourced from the QA/QC report.
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
fwi_stat_cols = [c for c in qa_df.columns if c.startswith("fwi_stat_")]
if fwi_stat_cols:
stat_display = qa_df[["station"] + fwi_stat_cols].copy()
print("FWI Value Statistics (from QA/QC report):")
print(stat_display.to_string(index=False))
else:
print("QA/QC report exists but has no fwi_stat_ columns.")
print("Columns available:", list(qa_df.columns))
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
QA/QC report exists but has no fwi_stat_ columns.
Columns available: ['station', 'hourly_rows', 'daily_rows', 'date_range_start', 'date_range_end', 'completeness', 'fwi_mode', 'pre_imp_missing_pct_air_temperature_c', 'pre_imp_missing_pct_relative_humidity_pct', 'pre_imp_missing_pct_wind_speed_kmh', 'pre_imp_missing_pct_rain_mm', 'post_imp_missing_pct_air_temperature_c', 'post_imp_missing_pct_air_temperature_c_qf', 'post_imp_missing_pct_barometric_pressure_kpa', 'post_imp_missing_pct_bui', 'post_imp_missing_pct_bui_qf', 'post_imp_missing_pct_dc', 'post_imp_missing_pct_dc_qf', 'post_imp_missing_pct_dew_point_c', 'post_imp_missing_pct_dmc', 'post_imp_missing_pct_dmc_qf', 'post_imp_missing_pct_ffmc', 'post_imp_missing_pct_ffmc_qf', 'post_imp_missing_pct_fwi', 'post_imp_missing_pct_fwi_qf', 'post_imp_missing_pct_isi', 'post_imp_missing_pct_isi_qf', 'post_imp_missing_pct_rain_mm', 'post_imp_missing_pct_rain_mm_qf', 'post_imp_missing_pct_relative_humidity_pct', 'post_imp_missing_pct_relative_humidity_pct_qf', 'post_imp_missing_pct_solar_radiation_w_m2', 'post_imp_missing_pct_wind_direction_deg', 'post_imp_missing_pct_wind_gust_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh', 'post_imp_missing_pct_wind_speed_kmh_qf', 'duplicate_count', 'out_of_range_temp_count', 'out_of_range_rh_count', 'out_of_range_wind_count', 'quality_enforced_count', 'quality_flagged_count', 'out_of_range_pre_enforcement', 'out_of_range_post_enforcement', 'fwi_chain_breaks', 'carry_forward_days', 'carry_forward_pct', 'ffmc_min', 'ffmc_max', 'ffmc_mean', 'ffmc_std', 'dmc_min', 'dmc_max', 'dmc_mean', 'dmc_std', 'dc_min', 'dc_max', 'dc_mean', 'dc_std', 'isi_min', 'isi_max', 'isi_mean', 'isi_std', 'bui_min', 'bui_max', 'bui_mean', 'bui_std', 'fwi_min', 'fwi_max', 'fwi_mean', 'fwi_std']
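The report stores the statistics as flat <index>_<stat> columns (ffmc_min through fwi_std) rather than under the fwi_stat_ prefix the cell above expects. A hedged follow-up sketch that reshapes those flat columns, taken from the column list printed above, into a per-station table:

# Hedged sketch — pivot the flat <index>_<stat> columns of the QA/QC report
# into a station x (index, stat) table. Column names come from the report
# columns printed above.
if qa_qc_path.exists():
    qa_df = pd.read_csv(qa_qc_path)
    indices = ["ffmc", "dmc", "dc", "isi", "bui", "fwi"]
    stats = ["min", "max", "mean", "std"]
    stat_cols = [f"{i}_{s}" for i in indices for s in stats if f"{i}_{s}" in qa_df.columns]
    if stat_cols:
        stat_table = qa_df.set_index("station")[stat_cols]
        # Split "<index>_<stat>" into a two-level column index for readability
        stat_table.columns = pd.MultiIndex.from_tuples(
            [tuple(c.rsplit("_", 1)) for c in stat_table.columns],
            names=["index", "stat"],
        )
        print(stat_table.round(2).to_string())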
3.4 Chain Break Analysis¶
Analysis of FWI chain breaks by station and cause, sourced from the chain break diagnostic report.
cb_path = PROCESSED_DIR / f"fwi_missingness_report_{FWI_MODE}.csv"
if cb_path.exists():
cb_df = pd.read_csv(cb_path)
print(f"Chain break report: {len(cb_df)} entries")
# Summary table: breaks per station, grouped by cause
if "cause" in cb_df.columns:
print("\nBreaks per station by cause:")
cause_pivot = cb_df.groupby(["station", "cause"]).size().reset_index(name="count")
cause_table = cause_pivot.pivot(index="station", columns="cause", values="count").fillna(0).astype(int)
print(cause_table.to_string())
# Breakdown by missing input
if "missing_input" in cb_df.columns:
print("\nBreakdown by missing input:")
input_counts = cb_df["missing_input"].value_counts()
print(input_counts.to_string())
# Bar chart: chain breaks by station, colored by cause
fig, ax = plt.subplots(figsize=(10, 5))
causes = cb_df["cause"].unique()
station_breaks = cb_df.groupby(["station", "cause"]).size().unstack(fill_value=0)
station_breaks.plot(kind="bar", stacked=True, ax=ax, colormap="Set2")
ax.set_xlabel("Station")
ax.set_ylabel("Chain Break Count")
ax.set_title(f"FWI Chain Breaks by Station and Cause ({FWI_MODE} mode)")
ax.legend(title="Cause")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "chain_breaks.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "chain_breaks.png")))
print("Saved: notebooks/figures/chain_breaks.png")
# Cascade origins table
cascade = cb_df[cb_df["cause"] == "cascade"]
if len(cascade) > 0 and "cascade_origin" in cascade.columns:
print("\nCascade origins:")
print(cascade[["station", "cascade_origin"]].to_string(index=False))
else:
print("Chain break report has no 'cause' column. Columns:", list(cb_df.columns))
else:
print(f"fwi_missingness_report_{FWI_MODE}.csv not found — no chain breaks or pipeline not run.")
Chain break report: 143 entries

Breaks per station by cause:
cause           input_missing  startup
station
cavendish                  19        2
greenwich                  42        0
north_rustico              10        0
stanhope                   44        2
stanley_bridge             11        0
tracadie                   11        2

Breakdown by missing input:
missing_input
relative_humidity_pct                                        62
air_temperature_c, relative_humidity_pct, wind_speed_kmh     29
air_temperature_c                                            28
air_temperature_c, relative_humidity_pct                     18
Saved: notebooks/figures/chain_breaks.png
4. Data Quality Report¶
Quality metrics from the QA/QC report, quality enforcement actions, and cross-station imputation audit.
4.1 Pre/Post Imputation Missingness¶
Side-by-side heatmap comparing missingness before and after imputation for the four core FWI input variables (temperature, relative humidity, wind speed, precipitation). Both panels share the same color scale so improvements are visually obvious.
Note: Pre-imputation data is only available for the four core variables — those are the ones the pipeline imputes. Extended sensor variables (solar radiation, barometric pressure, etc.) are not imputed by the pipeline.
# --- Pre/Post Imputation Missingness Heatmaps ---
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
# Core variables have pre-imputation data (these are what the pipeline imputes)
core_vars = ["air_temperature_c", "relative_humidity_pct", "wind_speed_kmh", "rain_mm"]
pre_rows, post_rows = [], []
for _, row in qa_df.iterrows():
station = row["station"]
pre_r = {"station": station}
post_r = {"station": station}
for var in core_vars:
pre_key = f"pre_imp_missing_pct_{var}"
post_key = f"post_imp_missing_pct_{var}"
pre_r[var] = row.get(pre_key, None)
post_r[var] = row.get(post_key, None)
pre_rows.append(pre_r)
post_rows.append(post_r)
pre_df = pd.DataFrame(pre_rows).set_index("station").dropna(axis=1, how="all")
post_df = pd.DataFrame(post_rows).set_index("station").dropna(axis=1, how="all")
common_cols = sorted(set(pre_df.columns) & set(post_df.columns))
pre_df = pre_df[common_cols]
post_df = post_df[common_cols]
fig, (ax_pre, ax_post) = plt.subplots(
1, 2,
figsize=(max(10, len(common_cols) * 2.2), max(3, len(pre_df) * 0.45)),
)
vmin = 0
vmax = max(pre_df.values.max(), post_df.values.max(), 1)
sns.heatmap(
pre_df, annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_pre, vmin=vmin, vmax=vmax,
cbar_kws={"label": "% Missing"},
)
ax_pre.set_title("Before Imputation", fontweight="bold", fontsize=11)
ax_pre.set_xlabel("Variable")
ax_pre.set_ylabel("Station")
sns.heatmap(
post_df, annot=True, fmt=".1f", cmap="YlOrRd",
ax=ax_post, vmin=vmin, vmax=vmax,
cbar_kws={"label": "% Missing"},
)
ax_post.set_title("After Imputation", fontweight="bold", fontsize=11)
ax_post.set_xlabel("Variable")
ax_post.set_ylabel("")
fig.suptitle(
"Pre vs Post Imputation Missingness — Core FWI Variables (% Missing)",
fontsize=12, y=1.02,
)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "missingness_pre_post_imputation.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "missingness_pre_post_imputation.png")))
print("Saved: notebooks/figures/missingness_pre_post_imputation.png")
# Also print the text comparison
print()
print("Pre/Post Imputation Missingness Comparison:")
comparison_rows = []
for station in pre_df.index:
for var in common_cols:
comparison_rows.append({
"station": station,
"variable": var,
"pre_%": pre_df.loc[station, var],
"post_%": post_df.loc[station, var],
"improved_%": round(pre_df.loc[station, var] - post_df.loc[station, var], 2),
})
comp_df = pd.DataFrame(comparison_rows)
print(comp_df.to_string(index=False))
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found — run pipeline first.")
Saved: notebooks/figures/missingness_pre_post_imputation.png
Pre/Post Imputation Missingness Comparison:
station variable pre_% post_% improved_%
greenwich air_temperature_c 38.93 4.06 34.87
greenwich rain_mm 0.13 0.13 0.00
greenwich relative_humidity_pct 16.21 16.19 0.02
greenwich wind_speed_kmh 12.45 3.96 8.49
cavendish air_temperature_c 14.52 14.52 0.00
cavendish rain_mm 0.03 0.03 0.00
cavendish relative_humidity_pct 14.51 14.51 0.00
cavendish wind_speed_kmh 14.91 0.00 14.91
north_rustico air_temperature_c 4.12 4.12 0.00
north_rustico rain_mm 0.04 0.04 0.00
north_rustico relative_humidity_pct 4.12 4.12 0.00
north_rustico wind_speed_kmh 4.55 0.00 4.55
stanley_bridge air_temperature_c 17.19 0.00 17.19
stanley_bridge rain_mm 0.00 0.00 0.00
stanley_bridge relative_humidity_pct 100.00 17.19 82.81
stanley_bridge wind_speed_kmh 32.08 0.00 32.08
tracadie air_temperature_c 12.84 0.00 12.84
tracadie rain_mm 0.00 0.00 0.00
tracadie relative_humidity_pct 100.00 12.84 87.16
tracadie wind_speed_kmh 15.90 0.00 15.90
stanhope air_temperature_c 0.32 0.32 0.00
stanhope rain_mm 0.19 0.19 0.00
stanhope relative_humidity_pct 0.33 0.33 0.00
stanhope wind_speed_kmh 0.28 0.28 0.00
4.2 Quality Enforcement Actions¶
Count of quality enforcement actions by type and station.
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
qe_df = pd.read_csv(qe_path)
print(f"Quality enforcement report: {len(qe_df)} actions")
if "action" in qe_df.columns and "station" in qe_df.columns:
print("\nActions by type and station:")
action_pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
action_table = action_pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)
print(action_table.to_string())
else:
print("Columns:", list(qe_df.columns))
else:
print("quality_enforcement_report.csv not found.")
Quality enforcement report: 5215 actions

Actions by type and station:
action          flag_only  set_nan
station
cavendish             126       12
greenwich            1161      607
north_rustico        2559       87
stanhope              315       66
stanley_bridge         56        1
tracadie              136       89
4.3 Out-of-Range Summary¶
Out-of-range counts for temperature, RH, and wind per station from the QA/QC report.
qa_qc_path = PROCESSED_DIR / f"qa_qc_report_{FWI_MODE}.csv"
if qa_qc_path.exists():
qa_df = pd.read_csv(qa_qc_path)
oor_cols = [c for c in qa_df.columns if "out_of_range" in c.lower()]
if oor_cols:
print("Out-of-Range Summary:")
print(qa_df[["station"] + oor_cols].to_string(index=False))
else:
print("No out-of-range columns in QA/QC report.")
else:
print(f"qa_qc_report_{FWI_MODE}.csv not found.")
Out-of-Range Summary:
station out_of_range_temp_count out_of_range_rh_count out_of_range_wind_count out_of_range_pre_enforcement out_of_range_post_enforcement
greenwich 0 0 0 0 0
cavendish 0 0 0 0 0
north_rustico 0 0 0 0 0
stanley_bridge 0 0 0 0 0
tracadie 0 0 0 0 0
stanhope 0 0 0 0 0
4.2b Quality Enforcement Summary (Visual)¶
Stacked bar chart of quality enforcement actions by station. Each bar segment represents an action type (set_nan, clip, etc.).
# --- Quality Enforcement Stacked Bar ---
qe_path = PROCESSED_DIR / "quality_enforcement_report.csv"
if qe_path.exists():
qe_df = pd.read_csv(qe_path)
pivot = qe_df.groupby(["station", "action"]).size().reset_index(name="count")
pivot_table = pivot.pivot(index="station", columns="action", values="count").fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(12, 5))
pivot_table.plot(kind="bar", stacked=True, ax=ax, colormap="tab10", edgecolor="white", linewidth=0.5)
ax.set_title("Quality Enforcement Actions by Station")
ax.set_xlabel("Station")
ax.set_ylabel("Action Count")
ax.legend(title="Action", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "quality_enforcement_stacked.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "quality_enforcement_stacked.png")))
print("Saved: notebooks/figures/quality_enforcement_stacked.png")
else:
print("quality_enforcement_report.csv not found.")
Saved: notebooks/figures/quality_enforcement_stacked.png
4.2c Synthetic Data Proportion Heatmap¶
Percentage of rows with qf > 0 (synthetic or chain-break) per station per variable.
Higher values indicate greater reliance on imputed/synthetic data.
Stations without _qf columns for a variable are shown as N/A.
# --- Synthetic Data Proportion Heatmap ---
qf_vars = [
"air_temperature_c_qf", "relative_humidity_pct_qf",
"wind_speed_kmh_qf", "rain_mm_qf",
"ffmc_qf", "dmc_qf", "dc_qf", "isi_qf", "bui_qf", "fwi_qf"
]
prop_matrix = {}
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, low_memory=False)
row = {}
for qv in qf_vars:
if qv in df.columns:
row[qv.replace("_qf", "")] = (df[qv] > 0).mean() * 100
else:
row[qv.replace("_qf", "")] = float("nan")
prop_matrix[station] = row
prop_df = pd.DataFrame(prop_matrix).T
fig, ax = plt.subplots(figsize=(14, 5))
mask = prop_df.isna()
cmap = plt.cm.YlOrRd.copy()
cmap.set_bad(color="#d0d0d0")
im = ax.imshow(prop_df.values, cmap=cmap, aspect="auto", vmin=0, vmax=100)
ax.set_xticks(range(len(prop_df.columns)))
ax.set_xticklabels(prop_df.columns, rotation=45, ha="right", fontsize=8)
ax.set_yticks(range(len(prop_df.index)))
ax.set_yticklabels(prop_df.index, fontsize=8)
# Annotate cells
for i in range(len(prop_df.index)):
for j in range(len(prop_df.columns)):
v = prop_df.iloc[i, j]
if pd.isna(v):
ax.text(j, i, "N/A", ha="center", va="center", fontsize=7, color="gray")
else:
ax.text(j, i, f"{v:.1f}%", ha="center", va="center", fontsize=7,
color="white" if v > 50 else "black")
fig.colorbar(im, ax=ax, label="% rows with qf > 0")
ax.set_title("Synthetic Data Proportion by Station and Variable")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "synthetic_proportion_heatmap.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "synthetic_proportion_heatmap.png")))
print("Saved: notebooks/figures/synthetic_proportion_heatmap.png")
Saved: notebooks/figures/synthetic_proportion_heatmap.png
4.2d Quality Flag Timeline¶
Stacked bars of quality-flag counts per station, split by flag type.
Each row is a station; colors indicate the type of quality issue detected.
Stations without a _quality_flags column are omitted.
# --- Quality Flag Timeline ---
import ast
flag_color_map = {
"value_range": "#e6194b", # red
"rate_of_change": "#f58231", # orange
"flatline": "#3cb44b", # green
"cross_variable": "#4363d8", # blue
"stuck_sensor": "#911eb4", # purple
}
flag_data = {}
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"], low_memory=False)
if "_quality_flags" not in df.columns:
continue
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
# Parse flag lists and extract types
flag_types = []
for val in df["_quality_flags"].dropna():
try:
flags = ast.literal_eval(val) if isinstance(val, str) else val
for f in flags:
ftype = f.split(":")[0] if ":" in str(f) else str(f)
flag_types.append(ftype)
except Exception:
pass
if flag_types:
flag_data[station] = pd.Series(flag_types).value_counts()
if flag_data:
flag_df = pd.DataFrame(flag_data).fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(12, 4 + 0.3 * len(flag_df.columns)))
flag_df.plot(kind="barh", stacked=True, ax=ax,
color=[flag_color_map.get(t, "#aaa") for t in flag_df.index])
ax.set_title("Quality Flag Types by Station")
ax.set_xlabel("Flag Count")
ax.set_ylabel("")
ax.legend(title="Flag Type", fontsize=7, bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "quality_flag_timeline.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "quality_flag_timeline.png")))
print("Saved: notebooks/figures/quality_flag_timeline.png")
else:
print("No quality flag data found for any station.")
Saved: notebooks/figures/quality_flag_timeline.png
4.4 Cross-Station Imputation Audit¶
Donor-to-recipient imputation pairs and counts.
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
cs_df = pd.read_csv(cs_path)
print(f"Cross-station imputation audit: {len(cs_df)} entries")
if "donor" in cs_df.columns and "target" in cs_df.columns:
print("\nDonor -> Recipient pairs:")
pairs = cs_df.groupby(["donor", "target"]).size().reset_index(name="count")
print(pairs.to_string(index=False))
else:
print("Columns:", list(cs_df.columns))
else:
print("cross_station_imputation_audit.csv not found.")
Cross-station imputation audit: 78623 entries
Columns: ['station', 'timestamp_utc', 'variable', 'imputed_value', 'quality_flag', 'source', 'method', 'donor_priority']
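The audit uses source and station columns rather than the donor/target names the cell above looks for. A hedged sketch tabulating donor-to-recipient pairs from the columns shown, assuming source holds the donor station (as the donor-contribution chart in section 4.4b below also assumes) and station holds the recipient:

# Hedged sketch — donor -> recipient pair counts from the audit's actual columns.
if cs_path.exists():
    cs_df = pd.read_csv(cs_path)
    if {"source", "station"}.issubset(cs_df.columns):
        pairs = (
            cs_df.groupby(["source", "station"])
            .size()
            .reset_index(name="count")
            .rename(columns={"source": "donor", "station": "recipient"})
            .sort_values("count", ascending=False)
        )
        print(pairs.to_string(index=False))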
4.4b Cross-Station Donor Audit (Visual)¶
Dual-panel chart showing:
- Left: Donor contribution bars (how many values each donor provided)
- Right: Imputation method breakdown by station
# --- Cross-Station Donor Audit Visual ---
cs_path = PROCESSED_DIR / "cross_station_imputation_audit.csv"
if cs_path.exists():
cs_df = pd.read_csv(cs_path)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Left: donor contributions
if "source" in cs_df.columns:
donor_counts = cs_df["source"].value_counts()
donor_counts.plot(kind="barh", ax=ax1, color="#4363d8", edgecolor="white")
ax1.set_title("Donor Contribution Count")
ax1.set_xlabel("Number of Imputed Values")
ax1.set_ylabel("")
else:
ax1.text(0.5, 0.5, "No source column", ha="center", va="center", transform=ax1.transAxes)
# Right: method breakdown by station
if "station" in cs_df.columns and "method" in cs_df.columns:
method_pivot = cs_df.groupby(["station", "method"]).size().reset_index(name="count")
method_table = method_pivot.pivot(index="station", columns="method", values="count").fillna(0).astype(int)
method_table.plot(kind="bar", stacked=True, ax=ax2, colormap="Set2", edgecolor="white", linewidth=0.5)
ax2.set_title("Imputation Method by Station")
ax2.set_xlabel("Station")
ax2.set_ylabel("Count")
ax2.legend(title="Method", fontsize=7)
plt.xticks(rotation=45, ha="right")
else:
ax2.text(0.5, 0.5, "No method/station columns", ha="center", va="center", transform=ax2.transAxes)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "cross_station_donor_audit.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "cross_station_donor_audit.png")))
print("Saved: notebooks/figures/cross_station_donor_audit.png")
else:
print("cross_station_imputation_audit.csv not found.")
Saved: notebooks/figures/cross_station_donor_audit.png
5. Principal Component Analysis¶
PCA on hourly temperature to identify station similarity patterns.
temp_frames = []
for station in stations:
fpath = PROCESSED_DIR / station / "station_hourly.csv"
df = pd.read_csv(fpath, parse_dates=["timestamp_utc"])
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], utc=True)
df = df[df["timestamp_utc"] >= ANALYSIS_START]
if "air_temperature_c" in df.columns:
temp_frames.append(df[["station", "timestamp_utc", "air_temperature_c"]])
temp_all = pd.concat(temp_frames, ignore_index=True)
matrix = build_station_matrix(temp_all, value_column="air_temperature_c")
print(f"Station matrix shape: {matrix.shape}")
print(f"Stations: {list(matrix.columns)}")
Station matrix shape: (26113, 6)
Stations: ['cavendish', 'greenwich', 'north_rustico', 'stanhope', 'stanley_bridge', 'tracadie']
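build_station_matrix is a package helper whose internals are not shown here; a minimal equivalent sketch, assuming it simply pivots the long-format observations into a timestamp × station matrix (consistent with the (26113, 6) shape printed above):

# Hedged sketch — an equivalent pivot, NOT the package implementation.
matrix_sketch = temp_all.pivot_table(
    index="timestamp_utc", columns="station", values="air_temperature_c"
)
print(matrix_sketch.shape)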
loadings = pca_station_loadings(matrix)
print("PCA Loadings:")
print(loadings.to_string(index=False))
PCA Loadings:
station component loading explained_variance_ratio
cavendish PC1 0.413668 0.963466
greenwich PC1 0.396266 0.963466
north_rustico PC1 0.413277 0.963466
stanhope PC1 0.406986 0.963466
stanley_bridge PC1 0.405674 0.963466
tracadie PC1 0.413334 0.963466
cavendish PC2 -0.167839 0.018571
greenwich PC2 0.900793 0.018571
north_rustico PC2 -0.171177 0.018571
stanhope PC2 -0.054382 0.018571
stanley_bridge PC2 -0.323366 0.018571
tracadie PC2 -0.153547 0.018571
normalized = matrix.dropna(axis="index", how="any")
normalized = (normalized - normalized.mean()) / normalized.std()
n_comp = min(len(matrix.columns), normalized.shape[0])
pca_full = PCA(n_components=n_comp)
pca_full.fit(normalized)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, n_comp + 1), pca_full.explained_variance_ratio_, "o-")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_title("Scree Plot")
ax.set_xticks(range(1, n_comp + 1))
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_scree.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_scree.png")))
print("Saved: notebooks/figures/pca_scree.png")
ratios = pca_full.explained_variance_ratio_.round(4).tolist()
print(f"Explained variance ratios: {ratios}")
Saved: notebooks/figures/pca_scree.png
Explained variance ratios: [0.9635, 0.0186, 0.0098, 0.0062, 0.0011, 0.0008]
# Station positions in PC space come from the component loadings
# (pca_full.components_ has shape (n_components, n_stations)); transform()
# would return per-timestamp scores, not per-station points.
station_scores = pca_full.components_.T
# Distinct color per station
station_colors = {
"greenwich": "#e6194b",
"cavendish": "#3cb44b",
"north_rustico": "#4363d8",
"stanhope": "#f58231",
"stanley_bridge": "#911eb4",
"tracadie": "#42d4f4",
}
fig, ax = plt.subplots(figsize=(8, 6))
for i, station in enumerate(matrix.columns):
color = station_colors.get(station, None)
ax.scatter(station_scores[i, 0], station_scores[i, 1], s=120, zorder=5, color=color, label=station, edgecolors="white", linewidth=0.5)
ax.annotate(station, (station_scores[i, 0], station_scores[i, 1]), textcoords="offset points", xytext=(5, 5), fontsize=9, color=color)
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)
ax.set_xlabel(f"PC1 ({pca_full.explained_variance_ratio_[0]:.1%})")
ax.set_ylabel(f"PC2 ({pca_full.explained_variance_ratio_[1]:.1%})")
ax.set_title("PCA Biplot - Station Scores")
ax.axhline(0, color="gray", linewidth=0.5)
ax.axvline(0, color="gray", linewidth=0.5)
plt.tight_layout()
fig.savefig(FIGURES_DIR / "pca_biplot.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "pca_biplot.png")))
print("Saved: notebooks/figures/pca_biplot.png")
Saved: notebooks/figures/pca_biplot.png
6. Hierarchical Clustering¶
Stations clustered by temperature correlation to identify redundancy groups.
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
corr = matrix.corr()
distance = 1 - corr
dist_array = squareform(distance.values, checks=False)
link = linkage(dist_array, method="average")
fig, ax = plt.subplots(figsize=(8, 5))
dendrogram(link, labels=list(matrix.columns), ax=ax, leaf_rotation=45)
ax.set_ylabel("Distance (1 - correlation)")
ax.set_title("Hierarchical Clustering Dendrogram")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "clustering_dendrogram.png", bbox_inches="tight")
display(Image(filename=str(FIGURES_DIR / "clustering_dendrogram.png")))
print("Saved: notebooks/figures/clustering_dendrogram.png")
Saved: notebooks/figures/clustering_dendrogram.png
cluster_order = cluster_station_order(matrix)
print("Cluster assignments:")
for entry in cluster_order:
print(f" {entry}")
corr = pairwise_station_correlation(matrix).fillna(0.0)
dist = 1 - corr
print("\nPairwise distance matrix:")
print(dist.round(3).to_string())
Cluster assignments:
  stanhope
  tracadie
  cavendish
  north_rustico
  stanley_bridge
  greenwich

Pairwise distance matrix:
station         cavendish  greenwich  north_rustico  stanhope  stanley_bridge  tracadie
station
cavendish           0.000      0.093          0.006     0.033           0.028     0.006
greenwich           0.093      0.000          0.086     0.095           0.102     0.058
north_rustico       0.006      0.086          0.000     0.035           0.026     0.005
stanhope            0.033      0.095          0.035     0.000           0.057     0.031
stanley_bridge      0.028      0.102          0.026     0.057           0.000     0.026
tracadie            0.006      0.058          0.005     0.031           0.026     0.000
7. Redundancy Analysis¶
Combining PCA, clustering, and benchmarking against the Stanhope reference to identify redundant stations.
try:
benchmark = benchmark_to_stanhope(matrix, reference_station="stanhope")
print("Benchmark results:")
print(benchmark.to_string(index=False))
except ValueError as e:
print(f"Stanhope benchmark skipped: {e}")
print("Using inter-station metrics only.")
benchmark = None
Benchmark results:
station reference_station overlap_count mean_abs_diff correlation observations
cavendish stanhope 21858 1.815121 0.966669 [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
greenwich stanhope 24067 1.540169 0.905163 [0.131, 1.272, 1.776, 1.6753333333333327, 1.3593333333333337, 1.0616666666666668, 0.4973333333333335, 0.8983333333333332, 1.579333333333333, 2.796333333333333, 2.403, 1.596, 0.30799999999999983, 2.551, 2.433666666666667, 2.436, 1.4793333333333332, 0.9983333333333333, 0.7026666666666668, 0.5543333333333336, 0.9340000000000002, 2.0716666666666668, 2.3259999999999996, 2.262666666666667, 1.614, 0.8383333333333329, 1.033666666666667, 0.3340000000000001, 0.5190000000000001, 0.6160000000000001, 0.07799999999999985, 0.04800000000000004, 0.9796666666666667, 0.3923333333333332, 0.9243333333333337, 0.301333333333333, 0.07833333333333314, 0.44599999999999973, 0.16566666666666663, 0.3240000000000003, 1.0553333333333335, 1.745333333333333, 2.252333333333333, 2.981333333333333, 2.879333333333333, 2.4543333333333335, 1.6183333333333334, 1.6399999999999997, 2.2116666666666664, 2.530333333333333, 2.5700000000000003, 2.155, 1.5509999999999997, 1.5199999999999996, 1.189, 0.06699999999999928, 1.2399999999999998, 2.4606666666666666, 3.372, 4.061, 4.348333333333334, 4.505333333333333, 4.098333333333334, 4.56, 3.0966666666666667, 1.7093333333333338, 0.17433333333333367, 1.8640000000000003, 1.8049999999999997, 1.5246666666666657, 0.48566666666666647, 1.4106666666666663, 0.5083333333333331, 0.8073333333333332, 1.334, 0.44133333333333336, 0.852, 0.40733333333333344, 0.41466666666666663, 2.330333333333333, 4.1450000000000005, 5.676333333333333, 6.049666666666667, 5.577999999999999, 0.5589999999999993, 1.0030000000000001, 2.7380000000000004, 3.351, 2.309, 3.887333333333334, 4.211666666666666, 4.692333333333334, 5.078666666666667, 4.711333333333333, 4.510333333333333, 3.167666666666667, 0.07866666666666666, 0.3703333333333334, 0.5583333333333333, 2.6606666666666667, ...]
north_rustico stanhope 22994 1.872865 0.964902 [3.2779999999999996, 4.0600000000000005, 3.0125, 3.3594999999999997, 5.177999999999999, 5.859999999999999, 5.2125, 5.2595, 4.9555, 0.5805000000000002, 0.3480000000000001, 0.6470000000000001, 1.1544999999999999, 0.42549999999999977, 0.607, 1.1465, 0.9923333333333335, 1.2554999999999996, 2.831, 5.4375, 5.569500000000001, 0.42600000000000016, 0.7905000000000006, 2.645999999999999, 3.653500000000001, 2.3629999999999995, 1.8560000000000008, 1.9699999999999998, 1.1725000000000003, 1.1084999999999994, 0.3155000000000001, 1.5074999999999998, 0.47550000000000026, 0.44649999999999945, 1.4965000000000002, 1.5819999999999999, 1.9894999999999996, 0.16349999999999998, 0.488, 0.23050000000000015, 0.956, 2.5140000000000002, 4.861000000000001, 3.7095000000000002, 5.258, 7.7545, 3.6179999999999986, 0.7385000000000002, 2.0774999999999997, 0.3949999999999996, 5.0165, 4.219999999999999, 4.367999999999999, 2.4349999999999996, 0.4775000000000009, 0.07399999999999984, 0.6455000000000002, 0.06300000000000061, 1.0415, 0.6745000000000001, 1.8719999999999999, 0.9215, 1.787, 2.569, 2.8975, 2.7225000000000006, 1.4784999999999995, 0.5105, 0.25849999999999973, 0.9654999999999996, 0.9155000000000002, 0.9110000000000005, 0.39700000000000024, 0.39449999999999985, 6.627, 5.5705, 2.4085, 0.44650000000000034, 1.5984999999999996, 0.8835000000000006, 0.7670000000000003, 1.9605000000000015, 1.8565000000000005, 1.9209999999999985, 1.0989999999999993, 0.7234999999999996, 0.7985000000000007, 1.2915, 1.1304999999999996, 0.573500000000001, 0.29800000000000093, 0.22299999999999986, 0.2054999999999998, 0.3130000000000006, 0.702, 0.5025000000000004, 0.49350000000000005, 2.8790000000000013, 4.922, 2.186, ...]
stanley_bridge stanhope 24066 2.099542 0.942855 [0.7825000000000001, 0.4033333333333333, 0.20833333333333326, 0.6875, 0.7641666666666669, 0.31, 0.09166666666666656, 0.27416666666666667, 0.9775, 2.0741666666666667, 1.8708333333333333, 1.355, 0.34833333333333316, 2.1275, 1.9224999999999994, 1.0341666666666667, 0.4358333333333333, 0.3258333333333332, 0.061666666666666536, 0.385, 0.6675, 1.3608333333333331, 1.6858333333333333, 1.4425, 0.8008333333333333, 0.5425, 0.4474999999999998, 0.206666666666667, 0.8541666666666665, 0.5783333333333336, 0.047499999999999876, 0.11750000000000016, 0.5533333333333332, 0.39166666666666705, 1.8816666666666668, 1.03, 1.010833333333333, 0.6891666666666669, 0.4716666666666667, 0.8075000000000001, 1.1208333333333331, 1.3058333333333332, 2.419166666666667, 3.455833333333333, 3.476666666666666, 3.01, 2.100833333333333, 1.7983333333333333, 2.1883333333333335, 2.3058333333333327, 2.064166666666667, 1.4574999999999996, 0.8300000000000001, 0.7508333333333335, 0.41166666666666707, 0.746666666666667, 2.2258333333333336, 3.730833333333333, 4.753333333333334, 5.203333333333334, 5.3625, 5.308333333333334, 4.73, 4.906666666666666, 3.4991666666666665, 1.6374074074074074, 0.3166666666666669, 2.6056666666666666, 1.8436666666666666, 1.6303333333333332, 0.7153333333333327, 1.8246666666666664, 1.4136666666666666, 0.5606666666666669, 2.0686666666666667, 0.865, 0.18466666666666665, 0.7433333333333332, 0.028666666666666618, 1.4756666666666665, 3.2643333333333335, 4.341666666666667, 4.652666666666667, 4.5523333333333325, 0.7736666666666672, 0.6036666666666664, 1.8376666666666672, 3.6559999999999997, 2.684333333333333, 3.2223333333333333, 4.569999999999999, 5.542333333333334, 5.037000000000001, 3.2506666666666666, 3.6006666666666667, 2.5900000000000003, 0.006999999999999895, 0.21300000000000002, 1.5613333333333332, 2.111666666666667, ...]
tracadie stanhope 21940 1.822442 0.968903 [1.1550000000000011, 0.9824999999999982, 0.8283333333333331, 0.6266666666666687, 0.33333333333333215, 0.39416666666667055, 0.7650000000000006, 1.5183333333333309, 0.5924999999999976, 1.1966666666666725, 0.043333333333333, 1.0366666666666688, 1.1408333333333331, 1.1508333333333347, 1.8183333333333387, 1.5458333333333307, 0.769166666666667, 0.086666666666666, 2.557500000000001, 3.0933333333333337, 2.8599999999999994, 2.2641666666666644, 1.975833333333334, 1.7858333333333292, 1.6616666666666688, 1.5225000000000009, 1.6858333333333313, 1.803333333333331, 1.4700000000000024, 0.8125, 0.33500000000000085, 0.16999999999999815, 1.4050000000000011, 2.2225, 2.2025000000000006, 1.3141666666666687, 1.0933333333333337, 0.19583333333333286, 0.7155555555555608, 0.17499999999999716, 0.398333333333337, 0.21083333333333343, 0.8533333333333317, 1.8975000000000009, 1.3825000000000003, 2.1125000000000007, 2.233333333333338, 2.0383333333333304, 1.8616666666666646, 1.1433333333333344, 1.134999999999998, 0.8524999999999991, 0.8716666666666661, 0.9283333333333275, 0.09416666666666984, 0.6916666666666664, 2.231666666666669, 2.1633333333333304, 1.4450000000000003, 2.8283333333333367, 1.7749999999999986, 2.0249999999999986, 1.4258333333333333, 1.2699999999999996, 0.3758333333333326, 0.0591666666666697, 1.2349999999999994, 2.7025000000000006, 3.9775000000000027, 4.57833333333333, 4.006666666666664, 3.7441666666666684, 2.7775, 2.576666666666661, 2.0799999999999983, 0.6533333333333324, 0.4833333333333343, 0.03999999999999915, 0.1700000000000017, 1.5308333333333373, 2.6466666666666683, 2.288333333333334, 1.7424999999999997, 1.1666666666666679, 2.575833333333332, 2.3833333333333364, 1.8758333333333361, 0.9899999999999984, 0.036666666666668846, 1.1208333333333336, 3.6866666666666674, 3.418333333333333, 3.5199999999999996, 2.973333333333329, 1.4858333333333285, 1.4608333333333299, 1.1708333333333272, 1.047500000000003, 1.4683333333333373, 1.6616666666666653, ...]
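benchmark_to_stanhope is a package function, so its internals are not shown here. A minimal sketch of how the overlap_count, mean_abs_diff, and correlation columns in the table above could be derived from the station matrix, assuming they are computed on pairwise-complete hourly temperatures against the reference column:

# Hedged sketch — NOT the pea_met_network implementation. Computes the three
# headline benchmark metrics per station against a reference column of the
# station matrix, using pairwise-complete hourly observations.
def sketch_benchmark(matrix_df, reference_station="stanhope"):
    rows = []
    for station in matrix_df.columns:
        if station == reference_station:
            continue
        pair = matrix_df[[station, reference_station]].dropna()
        rows.append({
            "station": station,
            "reference_station": reference_station,
            "overlap_count": len(pair),
            "mean_abs_diff": (pair[station] - pair[reference_station]).abs().mean(),
            "correlation": pair[station].corr(pair[reference_station]),
        })
    return pd.DataFrame(rows)

print(sketch_benchmark(matrix).round(3).to_string(index=False))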
if benchmark is not None and len(benchmark) > 0:
recommendations = build_station_recommendations(
benchmark,
pca_loadings=loadings,
cluster_order=cluster_order,
)
print("Station Recommendations:")
print(recommendations.to_string(index=False))
else:
print("Cannot build recommendations without benchmark.")
recommendations = None
Station Recommendations:
station reference_station recommendation risk_probability ci_lower ci_upper risk_band evidence assumptions limitations
cavendish stanhope remove 0.121140 0.012887 0.288408 low benchmark correlation=0.967; uncertainty=low (0.01-0.29); pca=[PC1=0.414, PC2=-0.168]; cluster=position 3/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
greenwich stanhope defer 0.028022 0.000000 0.060345 low benchmark correlation=0.905; uncertainty=low (0.00-0.06); pca=[PC1=0.396, PC2=0.901]; cluster=position 6/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
north_rustico stanhope remove 0.110791 0.013123 0.265487 low benchmark correlation=0.965; uncertainty=low (0.01-0.27); pca=[PC1=0.413, PC2=-0.171]; cluster=position 4/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge stanhope defer 0.063608 0.010394 0.144060 low benchmark correlation=0.943; uncertainty=low (0.01-0.14); pca=[PC1=0.406, PC2=-0.323]; cluster=position 5/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
tracadie stanhope remove 0.123004 0.014229 0.279774 low benchmark correlation=0.969; uncertainty=low (0.01-0.28); pca=[PC1=0.413, PC2=-0.154]; cluster=position 2/6 Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
8. Uncertainty Quantification¶
Risk probabilities and confidence intervals for station removal.
from pea_met_network.uncertainty import quantify_station_removal_risk
if benchmark is not None and len(benchmark) > 0:
uncertainty = quantify_station_removal_risk(benchmark)
print("Uncertainty Quantification (KDE-based risk):")
print(uncertainty.to_string(index=False))
fig, ax = plt.subplots(figsize=(8, 4))
stations_plot = uncertainty["station"].tolist()
risks = uncertainty["risk_probability"].tolist()
ci_lo = uncertainty["ci_lower"].tolist()
ci_hi = uncertainty["ci_upper"].tolist()
x = range(len(stations_plot))
ax.bar(x, risks, color="steelblue", alpha=0.7)
yerr_lo = [r - lo for r, lo in zip(risks, ci_lo)]
yerr_hi = [hi - r for r, hi in zip(risks, ci_hi)]
ax.errorbar(x, risks, yerr=[yerr_lo, yerr_hi], fmt="none", c="black", capsize=5)
ax.set_xticks(list(x))
ax.set_xticklabels(stations_plot, rotation=45, ha="right")
ax.set_ylabel("Removal Risk Probability")
ax.set_title("Station Removal Risk with 95% Confidence Intervals")
plt.tight_layout()
fig.savefig(FIGURES_DIR / "uncertainty_risk.png", bbox_inches="tight")
plt.show()
display(Image(filename=str(FIGURES_DIR / "uncertainty_risk.png")))
print("Saved: notebooks/figures/uncertainty_risk.png")
else:
print("Uncertainty analysis requires benchmark data (skipped).")
Uncertainty Quantification (KDE-based risk):
station reference_station risk_probability ci_lower ci_upper risk_band assumptions limitations
cavendish stanhope 0.121140 0.012887 0.288408 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
greenwich stanhope 0.028022 0.000000 0.060345 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
north_rustico stanhope 0.110791 0.013123 0.265487 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
stanley_bridge stanhope 0.063608 0.010394 0.144060 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
tracadie stanhope 0.123004 0.014229 0.279774 low Distributional uncertainty is estimated with scipy.stats.gaussian_kde over observation-derived station-reference divergence samples; when benchmark fixtures omit raw observations, a documented synthetic distribution is generated from mean difference, correlation, and overlap support. Sample support is adequate for a coarse uncertainty bound.
Saved: notebooks/figures/uncertainty_risk.png
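quantify_station_removal_risk is a package function; its assumptions column above says the risk comes from a gaussian_kde over station-reference divergence samples. A minimal sketch of that idea, assuming an illustrative 5 °C divergence tolerance and a simple bootstrap for the interval (both are assumptions, not the package's actual parameters):

# Hedged sketch — NOT the pea_met_network implementation. Fits a Gaussian KDE to
# station-vs-reference divergence samples and integrates the tail beyond a
# tolerance to get a removal-risk probability, with a bootstrap interval.
# The 5.0 degC tolerance and 200 bootstrap draws are illustrative assumptions.
from scipy.stats import gaussian_kde

def sketch_removal_risk(divergences, tolerance=5.0, n_boot=200, seed=0):
    samples = np.asarray(divergences, dtype=float)
    samples = samples[~np.isnan(samples)]
    kde = gaussian_kde(samples)
    risk = kde.integrate_box_1d(tolerance, np.inf)
    rng = np.random.default_rng(seed)
    boot = [
        gaussian_kde(rng.choice(samples, size=samples.size, replace=True))
        .integrate_box_1d(tolerance, np.inf)
        for _ in range(n_boot)
    ]
    ci_lower, ci_upper = np.percentile(boot, [2.5, 97.5])
    return risk, ci_lower, ci_upper

# Example: cavendish-vs-stanhope absolute divergence from the temperature matrix
div = (matrix["cavendish"] - matrix["stanhope"]).abs().dropna()
print(sketch_removal_risk(div.values))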
Conclusion¶
TBD — fill in after review.