Zonal Statistics over Cropland Areas

This script calculates zonal statistics (mean values) of continuous raster data (e.g., WaPOR AETI monthly) restricted to cropland areas within administrative polygons (e.g., districts, basins etc). It outputs a table of mean values for each zone and each raster file, but only for pixels that belong to the cropland class in a Land Cover raster.
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
import pandas as pd
import os
import numpy as np
from rasterio.warp import reproject, Resampling

# ---------------- CONFIG ----------------
geojson_boundary = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/Shapefile/Zambia_L2.geojson" 
raster_files_folder = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/AETI_WaPOR_v3_L1_M"

lcc_raster = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/LULC/WaPOR_v2_LULC/WaPOR_v2_LULC_2022_cropland.tif"
cropland_class = 40

output_csv = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/LULC_Stats/WaPOR_Cropland_AETI_M.csv"

# Choose alignment mode: "LULC" (data reprojected to LULC resolution) or 
# "DATA" (LULC reprojected to data raster)

ALIGN_TO = "LULC" # use when coarse lulc or do not take much memory(~100-300m)
# ALIGN_TO = "DATA" # use when lulc raster are too large while loading like (~ ESA LULC 10m)


# -----------------------------------------

# Load shapefile
zones_features = gpd.read_file(geojson_boundary)
zones_attr = zones_features.drop(columns="geometry").reset_index(drop=True)

# Load LULC raster (full resolution, e.g., 10 m)
with rasterio.open(lcc_raster) as lcc_src:
    lcc_data_full = lcc_src.read(1)  # Read the raster band
    lcc_nodata = lcc_src.nodata  # Nodata value
    lcc_affine = lcc_src.transform  # Transform of the LCC raster
    lcc_crs = lcc_src.crs
    lcc_shape = lcc_data_full.shape
    unique_classes = np.unique(lcc_data_full[lcc_data_full != lcc_nodata])  # Get unique land classes
    print("unique_classes",unique_classes)

print("✅ Loaded ESA LULC raster")

# ---------------- FUNCTION ----------------

# Function to reproject and resample a raster
def align_raster_to_lcc(src_raster, lcc_transform, lcc_crs, lcc_shape):
    """Reproject a raster to the LCC grid and return float32 with NaNs for NoData."""
    with rasterio.open(src_raster) as src:
        data = src.read(1).astype("float32")  # <-- ensure float
        data_nodata = src.nodata
        if data_nodata is not None:
            data[data == data_nodata] = np.nan  # safe now (float array)

        # Initialize destination as NaNs (float)
        aligned_data = np.full(lcc_shape, np.nan, dtype="float32")

        # TBP is continuous → bilinear is appropriate. Use nearest for categorical rasters.
        reproject(
            source=data,
            destination=aligned_data,
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=lcc_transform,
            dst_crs=lcc_crs,
            resampling=Resampling.bilinear
        )
        return aligned_data


def align_lcc_to_raster(src_raster, lcc_data_full, lcc_transform, lcc_crs):
    """Reproject LULC raster to match data raster grid"""
    with rasterio.open(src_raster) as src:
        dst_shape = (src.height, src.width)
        dst_transform = src.transform
        dst_crs = src.crs

        # Prepare destination
        lulc_resampled = np.full(dst_shape, 0, dtype=np.int16)

        reproject(
            source=lcc_data_full,
            destination=lulc_resampled,
            src_transform=lcc_transform,
            src_crs=lcc_crs,
            dst_transform=dst_transform,
            dst_crs=dst_crs,
            resampling=Resampling.nearest  # keep categorical
        )

        data = src.read(1).astype("float32")
        if src.nodata is not None:
            data[data == src.nodata] = np.nan

        return data, lulc_resampled, dst_transform
# ------------------------------------------

results = []

# Iterate over AETI rasters
for filename in sorted(os.listdir(raster_files_folder)):
    if filename.endswith(".tif"):
        raster_file = os.path.join(raster_files_folder, filename)
        print(f"Processing raster: {filename}")

        if ALIGN_TO == "LULC":
            aligned_data = align_raster_to_lcc(raster_file, lcc_affine, lcc_crs, lcc_shape)
            current_lcc = lcc_data_full
            current_affine = lcc_affine

        elif ALIGN_TO == "DATA":
            aligned_data, current_lcc, current_affine = align_lcc_to_raster(
                raster_file, lcc_data_full, lcc_affine, lcc_crs
            )

        class_mask = (current_lcc == cropland_class)
        class_data = np.where(class_mask, aligned_data, np.nan)


        # Compute zonal stats
        stats = zonal_stats(
            zones_features.geometry,
            class_data,
            affine=lcc_affine,
            stats=['mean'],
            nodata=np.nan
        )

        # Save results
        for zone_index, stat in enumerate(stats):
            row = zones_attr.iloc[zone_index].to_dict()
            # row["LCC"] = int(cropland_class)
            ##  Keeps None if the zone has no overlap with raster (useful if you want
            # row[filename] = round(stat["mean"], 2) if stat["mean"] is not None else None

            ##  Replaces missing/NaN values with 0. Useful if you want all zones
            row[filename] = round(stat["mean"], 2) if stat["mean"] is not None else 0

            results.append(row)


# Convert to DataFrame
results_df = pd.DataFrame(results)


# attr_cols = list(zones_attr.columns) + ["LCC"]
attr_cols = list(zones_attr.columns)
raster_cols = [c for c in results_df.columns if c not in attr_cols]


pivot_df = results_df.pivot_table(
    index=attr_cols,
    values=raster_cols,
    aggfunc="first"
).reset_index()

# Save to Excel
pivot_df.to_csv(output_csv, index=False)
print(f"✅ Zonal statistics saved to {output_csv}")