Zonal Statistics by Land Cover Class
This script computes zonal statistics (mean values) of a continuous raster dataset (e.g., WaPOR RET annual rasters) over each land cover class defined in a Land Cover Classification (LULC) raster, within polygon boundaries (e.g., districts, basins etc.).
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
import pandas as pd
import os
import numpy as np
from rasterio.warp import reproject, Resampling
geojson_boundary = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/Shapefile/Zambia_L2.geojson"
raster_files_folder = '/Users/amanchaudhary/Desktop/WA_Tool_streamlit/output/Zambia_L0/Data/RET_WaPOR_v3_Annual'
lcc_raster = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/LULC/WaPOR_v2_LULC/WaPOR_v2_LULC_2022_reclass.tif"
output_csv = "/Users/amanchaudhary/Documents/Resources/World_Bank/Zambia/LULC_Stats/RET_annual_LCC_Stats.csv"
# Choose alignment mode: "LULC" (data reprojected to LULC resolution) or
# "DATA" (LULC reprojected to data raster)
ALIGN_TO = "LULC" # use when coarse lulc or do not take much memory(~100-300m)
# ALIGN_TO = "DATA" # use when lulc raster are too large while loading like (~ ESA LULC 10m)
# Load shapefile
zones_features = gpd.read_file(geojson_boundary)
zones_attr = zones_features.drop(columns="geometry").reset_index(drop=True)
# Load the LCC raster
with rasterio.open(lcc_raster) as lcc_src:
lcc_data_full = lcc_src.read(1) # Read the raster band
lcc_nodata = lcc_src.nodata # Nodata value
lcc_affine = lcc_src.transform # Transform of the LCC raster
lcc_crs = lcc_src.crs
lcc_shape = lcc_data_full.shape
unique_classes = np.unique(lcc_data_full[lcc_data_full != lcc_nodata]) # Get unique land classes
print("unique_classes",unique_classes)
# Function to reproject and resample a raster
def align_raster_to_lcc(src_raster, lcc_transform, lcc_crs, lcc_shape):
"""Reproject a raster to the LCC grid and return float32 with NaNs for NoData."""
with rasterio.open(src_raster) as src:
data = src.read(1).astype("float32") # <-- ensure float
data_nodata = src.nodata
if data_nodata is not None:
data[data == data_nodata] = np.nan # safe now (float array)
# Initialize destination as NaNs (float)
aligned_data = np.full(lcc_shape, np.nan, dtype="float32")
# TBP is continuous → bilinear is appropriate. Use nearest for categorical rasters.
reproject(
source=data,
destination=aligned_data,
src_transform=src.transform,
src_crs=src.crs,
dst_transform=lcc_transform,
dst_crs=lcc_crs,
resampling=Resampling.bilinear
)
return aligned_data
def align_lcc_to_raster(src_raster, lcc_data_full, lcc_transform, lcc_crs):
"""Reproject LULC raster to match data raster grid"""
with rasterio.open(src_raster) as src:
dst_shape = (src.height, src.width)
dst_transform = src.transform
dst_crs = src.crs
# Prepare destination
lulc_resampled = np.full(dst_shape, 0, dtype=np.int16)
reproject(
source=lcc_data_full,
destination=lulc_resampled,
src_transform=lcc_transform,
src_crs=lcc_crs,
dst_transform=dst_transform,
dst_crs=dst_crs,
resampling=Resampling.nearest # keep categorical
)
data = src.read(1).astype("float32")
if src.nodata is not None:
data[data == src.nodata] = np.nan
return data, lulc_resampled, dst_transform
# Initialize a list to store results
results = []
# Iterate over raster files in the folder
for filename in os.listdir(raster_files_folder):
if filename.endswith('.tif'):
print(f"Processing raster for: {filename}")
raster_file = os.path.join(raster_files_folder, filename)
if ALIGN_TO == "LULC":
aligned_data = align_raster_to_lcc(raster_file, lcc_affine, lcc_crs, lcc_shape)
current_lcc = lcc_data_full
current_affine = lcc_affine
elif ALIGN_TO == "DATA":
aligned_data, current_lcc, current_affine = align_lcc_to_raster(
raster_file, lcc_data_full, lcc_affine, lcc_crs
)
# Iterate over each land cover class
for lc_class in unique_classes:
# print(f"Processing land cover class: {lc_class}")
# Create a mask for the current land cover class
class_mask = (current_lcc == lc_class)
class_data = np.where(class_mask, aligned_data, np.nan)
# Compute zonal statistics
stats = zonal_stats(
zones_features.geometry,
class_data,
affine=lcc_affine,
stats=['mean'],
nodata=np.nan
)
# Append results to the list
for zone_index, stat in enumerate(stats):
row = zones_attr.iloc[zone_index].to_dict()
row["LCC"] = int(lc_class)
## Keeps None if the zone has no overlap with raster (useful if you want
# row[filename] = round(stat["mean"], 2) if stat["mean"] is not None else None
## Replaces missing/NaN values with 0. Useful if you want all zones
row[filename] = round(stat["mean"], 2) if stat["mean"] is not None else 0
results.append(row)
# Convert results into a DataFrame
results_df = pd.DataFrame(results)
attr_cols = list(zones_attr.columns) + ["LCC"]
raster_cols = [c for c in results_df.columns if c not in attr_cols]
pivot_df = results_df.pivot_table(
index=attr_cols,
values=raster_cols,
aggfunc="first"
).reset_index()
# Export the entie annual result to an csv file
pivot_df.to_csv(output_csv, index=False)
# Drop attr_cols and LCC columns from numeric cols
year_cols = [c for c in pivot_df.columns if c.endswith(".tif")]
# Compute mean across years
pivot_df["mean_val"] = pivot_df[year_cols].mean(axis=1)
# Pivot: rows = attr_cols, columns = LCC
out_df = pivot_df.pivot_table(
index=[c for c in pivot_df.columns if c not in ["LCC", "mean_val"] + year_cols],
columns="LCC",
values="mean_val"
).reset_index()
out_df = out_df.round(2)
out_df.to_csv(output_csv.replace(".csv", "_all_years_mean.csv"), index=False)
print(f"Zonal statistics for all classes saved to {output_csv}")