#Import packages
import rasterio
from rasterio.mask import mask
from rasterio.warp import Resampling, reproject, calculate_default_transform
from rasterio.plot import show
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.wkt import loads
import os
from pyproj import Proj, transform, CRS
from shapely.geometry import MultiPolygon
Part 2
Data Preparation - Intersect Global Forest Watch - Loss Year data with Carbon Offset Projects Geospatial Database
In this tutorial, we will intersect the VRT generated at the end of the previous tutorial with the geodataframe to clip the VRT to each project’s individual raster and save this as a .tif file. Following this, we will compute the area for each loss year within these rasters and integrate this information into our geodataframe.
Import the packages below
Loading the projects geodatabase (accounting area as geometry)
Filtering for AD and IFM Projects
We will conduct this analysis exclusively for Avoided Deforestation and Improved Forest Management projects.
Filtering for Polygons only
This database includes project boundaries in polygon format, while projects with unavailable boundaries are represented by points. To calculate the area of forest loss within project boundaries, we will use only those projects with polygon boundaries and exclude those represented solely by point locations.
# Load the carbon-offset project accounting areas
# (one geometry per project: polygons where boundaries exist, points otherwise)
all_gdfs = gpd.read_file('path/to/your/file/project_accounting_areas.gpkg')
all_gdfs

# Filtering for AD and IFM projects only
ad_ifm_gdf = all_gdfs[(all_gdfs['Project Type'] == "AD") | (all_gdfs["Project Type"] == "IFM")]
ad_ifm_gdf

# Filtering for projects whose boundaries are available:
# keep only (Multi)Polygon geometries — point-only projects cannot be
# used for the area calculations that follow
ad_ifm_gdf_poly = ad_ifm_gdf[ad_ifm_gdf.geom_type.isin(['Polygon', 'MultiPolygon'])]
ad_ifm_gdf_poly
Clipping the VRT with the filtered data frame
After filtering the dataframe, we will intersect it with the VRT to clip the VRT to each individual project raster and save these rasters as .tif files.
The following function iterates through each row of the dataframe, extracts the project geometry, clips the VRT using this geometry, and then saves the clipped raster as a .tif file, naming it according to the project ID and placing it in the specified output directory.
#Clip vrt with the projects database
def clip_vrt_with_geometries(vrt_path, gdf, output_dir):
    """Clip a VRT to each project geometry and save one GeoTIFF per project.

    Parameters
    ----------
    vrt_path : str
        Path to the VRT mosaic to clip.
    gdf : geopandas.GeoDataFrame
        Must contain a 'geometry' column and a 'ProjectID' column; the
        ProjectID is used as the output file name.
    output_dir : str
        Directory where the clipped ``<ProjectID>.tif`` files are written.
        The directory is assumed to exist.
    """
    # Open the VRT once and reuse the handle for every geometry
    with rasterio.open(vrt_path) as src:
        # Iterate over each geometry in the GeoDataFrame
        for idx, row in gdf.iterrows():
            geometry = [row['geometry']]   # mask() expects an iterable of geometries
            project_id = row['ProjectID']  # used as output filename

            # Clip the VRT using the geometry; crop=True trims the raster
            # to the geometry's bounding box
            out_image, out_transform = mask(src, geometry, crop=True)
            out_meta = src.meta.copy()

            # Update metadata to match the clipped extent and GTiff output
            out_meta.update({"driver": "GTiff",
                             "height": out_image.shape[1],
                             "width": out_image.shape[2],
                             "transform": out_transform
                             })

            # Save the clipped raster as <ProjectID>.tif
            output_path = os.path.join(output_dir, f"{project_id}.tif")
            with rasterio.open(output_path, "w", **out_meta) as dest:
                dest.write(out_image)

            print(f"Saved clipped raster to {output_path}")
Loading the GFW VRT and clipping it to individual project rasters
# Loading the vrt and geospatial projects database
vrt_file = 'path/to/your/file/output.vrt'

# Ensure the GeoDataFrame has a 'ProjectID' column with unique IDs for each geometry
# NOTE(review): the original checked one dataframe ('ad_ifm_gdf') and clipped
# another ('all_gdfs'); both now use the filtered polygon-only dataframe, which
# is what the later area-calculation loop reads the .tif files for.
if 'ProjectID' not in ad_ifm_gdf_poly.columns:
    raise ValueError("GeoDataFrame must have a 'ProjectID' column.")

# Output directory for the clipped rasters
output_dir = 'path/to/your/directory'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clip the VRT with the geometries in the GeoDataFrame
clip_vrt_with_geometries(vrt_file, ad_ifm_gdf_poly, output_dir)
Calculate the area for each loss year for each project
To accomplish this, we first determine the UTM CRS for each project. Next, we reproject the clipped raster for each project to its respective UTM zone. Finally, we calculate the area for each loss year category within the project’s boundary.
Determine the UTM CRS for each project
The following functions calculate the UTM CRS based on a project’s latitude and longitude and then add a new column, ‘utm_crs’, to the dataframe.
# Function to determine UTM zone for a given geometry
def get_utm_zone(lon, lat):
    """Return the WGS84/UTM CRS containing the point (lon, lat).

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude in decimal degrees (WGS84).

    Returns
    -------
    pyproj.CRS
        EPSG:326xx (northern hemisphere) or EPSG:327xx (southern hemisphere),
        where xx is the two-digit UTM zone number.
    """
    # UTM zone number: longitude shifted by +180, 6 degrees per zone,
    # mod 60 keeps the result in 1..60
    utm_band = str((int((lon + 180) / 6) % 60) + 1)

    # Ensure the UTM band is two digits (e.g. '01' instead of '1'),
    # as required by the EPSG code format
    if len(utm_band) == 1:
        utm_band = '0' + utm_band

    # EPSG code starts with 326 for the northern hemisphere
    # and 327 for the southern hemisphere
    if lat >= 0:
        epsg_code = f'326{utm_band}'  # Northern Hemisphere
    else:
        epsg_code = f'327{utm_band}'  # Southern Hemisphere

    # Return the CRS object with the appropriate EPSG code
    return CRS(f'EPSG:{epsg_code}')
def get_lat_long_utm(gdf):
    """Add a 'utm_crs' column to *gdf* based on each geometry's centroid.

    Mutates and returns the same GeoDataFrame. Temporary 'latitude' and
    'longitude' columns are dropped; the 'centroid' column is kept
    (matching the original behavior — drop it manually if unneeded).
    """
    # Get centroids of each project geometry
    gdf['centroid'] = gdf.geometry.centroid

    # Extract latitude / longitude from the centroid
    gdf['latitude'] = gdf['centroid'].y
    gdf['longitude'] = gdf['centroid'].x

    # Calculate the UTM CRS for each row
    gdf['utm_crs'] = gdf.apply(lambda row: get_utm_zone(row['longitude'], row['latitude']), axis=1)

    # Drop the helper latitude/longitude columns now that utm_crs is set
    gdf.drop(columns=['longitude', 'latitude'], inplace=True)

    return gdf
# Applying get_lat_long_utm to compute the UTM CRS for each project
ad_ifm_gdf_poly = get_lat_long_utm(ad_ifm_gdf_poly)
ad_ifm_gdf_poly
Reproject the clipped raster and compute the area for each loss year category
First, we define a function to reproject the raster for each project using the UTM zone specified in the utm_crs column.
Next, we create a function to process the raster for each project, which calculates the area of each forest-loss-year category within the project’s geometry. This function masks the raster with the project’s geometry, reprojects it to the project’s UTM CRS, and computes the area for each category by counting pixel occurrences. It then organizes the results, including project details and calculated areas, into a dictionary format for each project and returns a list of these dictionaries.
def reproject_to_utm(src, geometry, utm_zone):
    """Reproject an open raster to the given UTM CRS.

    Parameters
    ----------
    src : rasterio.DatasetReader
        Open source raster.
    geometry : list
        Unused here; kept for interface compatibility with the caller.
    utm_zone : pyproj.CRS
        Target UTM CRS (as stored in the 'utm_crs' column).

    Returns
    -------
    (numpy.ndarray, dict)
        The reprojected pixel array of shape (bands, height, width) and the
        updated metadata kwargs (including the new 'transform').
    """
    dst_crs = utm_zone.to_string()

    # Compute the target transform/size for the full source extent
    # (named dst_transform to avoid shadowing pyproj's imported `transform`)
    dst_transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds)

    kwargs = src.meta.copy()
    kwargs.update({'crs': dst_crs,
                   'transform': dst_transform,
                   'width': width,
                   'height': height
                   })

    # Allocate the output array and reproject band by band;
    # nearest-neighbour resampling preserves the categorical loss-year codes
    destination = np.empty((src.count, height, width), dtype=src.dtypes[0])
    for i in range(1, src.count + 1):
        reproject(
            source=rasterio.band(src, i),
            destination=destination[i - 1],
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=dst_transform,
            dst_crs=dst_crs,
            resampling=Resampling.nearest
        )
    return destination, kwargs
def calculate_area(tif_file, gdf_row):
    """Compute the area (km²) of each loss-year category inside a project.

    Masks the project raster with its geometry, reprojects to the project's
    UTM CRS, counts pixels per category, and converts counts to km² using
    the UTM pixel size.

    Parameters
    ----------
    tif_file : str
        Path to the project's clipped GeoTIFF.
    gdf_row : pandas.Series
        Row of the projects GeoDataFrame; must provide 'geometry', 'utm_crs'
        and the project metadata columns copied into the result.

    Returns
    -------
    list[dict]
        A single-element list with one result dictionary for this project.
    """
    results = []

    with rasterio.open(tif_file) as src:
        geometry = [gdf_row['geometry']]
        # nodata=-1 marks pixels outside the geometry so they do not
        # collide with valid category codes (0..23)
        out_image, out_transform = mask(src, geometry, crop=True, nodata=-1)
        out_image = out_image[0]
        # out_image[out_image == -1] = np.nan

        # Reproject to the project's UTM zone so pixel size is in metres
        utm_zone = gdf_row['utm_crs']
        reprojected_image, kwargs = reproject_to_utm(src, geometry, utm_zone)

        # Count pixels per category code
        unique, counts = np.unique(reprojected_image, return_counts=True)
        total_area = dict(zip(unique, counts))

        # Pixel width in metres (affine 'a' coefficient); area in km²
        pixel_size = kwargs['transform'][0]
        area_per_class = {cls: cnt * pixel_size**2 / 1e6 for cls, cnt in total_area.items()}

        result_row = {
            'ProjectID': gdf_row['ProjectID'],
            'Project Name': gdf_row['Project Name'],
            'Registry Name': gdf_row['Registry Name'],
            'Methodology': gdf_row['Methodology'],
            'Country': gdf_row['Country'],
            'Project Type': gdf_row['Project Type'],
            'Project Start Date': gdf_row['Project Start Date'],
            'Geometry Type': gdf_row['geometry'].geom_type,
            'geometry': gdf_row['geometry'],
            'Raster File': os.path.basename(tif_file)
        }

        # Category 0 = no loss; categories 1..23 map to loss years 2001..2023
        for cls in range(24):
            column_name = 'No loss' if cls == 0 else f'{2000 + cls}(km2)'
            result_row[column_name] = area_per_class.get(cls, 0)

        results.append(result_row)
    return results
Applying the calculate_area function to project geometry and associated raster to get area of forest loss for each year between 2001 and 2023
tif_folder = 'path/to/your/file/vrt_clipped'

# Initialize an empty list to store all results
all_results = []

# Iterate over each row in the GeoDataFrame
for idx, row in ad_ifm_gdf_poly.iterrows():
    # Find the corresponding TIFF file in the folder
    tif_file = os.path.join(tif_folder, f"{row['ProjectID']}.tif")

    # Check if the TIFF file exists
    if os.path.exists(tif_file):
        # Calculate area for the current TIFF file and row
        result_rows = calculate_area(tif_file, row)

        # Extend results list with the calculated rows
        all_results.extend(result_rows)
        print(f'Successfully processed {row["ProjectID"]}.tif')
    else:
        print(f'TIFF file {row["ProjectID"]}.tif not found in folder.')

# Convert the list of dictionaries to a GeoDataFrame
result_gdf = gpd.GeoDataFrame(all_results)
result_gdf.head()

# Save the combined results to a new GeoPackage or CSV file
result_gdf.to_file('path/to/your/file/ad_ifm_forest_loss_area.gpkg', driver='GPKG')
result_gdf.drop(columns='geometry').to_csv('path/to/your/file/ad_ifm_forest_loss_area.csv', index=False)