#Import Packages
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import geopandas as gpd
from pyproj import Proj, transform, CRS
import seaborn as sns
import warnings as wr
'ignore')
wr.filterwarnings(import scipy.stats as stats
Part 3
Data Analysis & Visualization
We will import the geopackage that we created at the end of tutorial 2 and run some basic data exploration functions on it to make sure everything looks good
#Importing the geopackage (accounting area, AD & IFM filtered, Polygons filtered)
= gpd.read_file('path/to/your/file/ad_ifm_forest_loss_area.gpkg') data
##To keep this tutorial clean and concise, I won't include separate code chunks for basic data exploration.
##Feel free to run them as part of your analysis.
# shows the first 5 rows in a dataframe
data.head() # returns the number of rows and columns in a dataframe
data.shape #returns the total number of elements in a column (row*column)
data.size # view all the column names in the dataframe
data.columns # returns the type of data and number of non-null values stored in each column,
data.info() # provides a statistical summary of the dataset, returns data mean, percentiles, min and max values. data.describe()
Data Manipulation
Now that we’ve run some basic data exploration, we want to manipulate the data to be able to analyse it
Calculate the area for each project
We want to calculate the percentage of deforestation that occured each year between 2001-2023 for each project
Using this we will calculate
1) the average deforestation rate before the start of the project (10 years), and average deforestation rate after the start or the project
2) based on the average deforestation rate before and after project start, create a new column, deforestation status,
3) The total percentage deforestation inside the project boundary before and after the project start
4) Total deforestation (\(km^{2}\)) inside the project boundary after the project start
To do this we will write functions, this will help us keep the code clean.
Calculate the area for each project
We need to reproject each project geometry to its UTM zone for accurate area calculation.
For this, we are creating two functions: 1) Determines the UTM Zone for each project using its latitude and longitude.
2) Extracts the latitude and longitude from the centroids, apply the get_utm_zone function, calculate the area (default is to calculate the area in \(m^{2}\)), and finally convert the area to \(km^{2}\).
# Function to determine UTM zone for a given geometry
def get_utm_zone(lon, lat):
= str((int((lon + 180) / 6) % 60) + 1)
utm_band if len(utm_band) == 1:
= '0' + utm_band
utm_band if lat >= 0:
= f'326{utm_band}'
epsg_code else:
= f'327{utm_band}'
epsg_code return CRS(f'EPSG:{epsg_code}')
#Convert the geomtery to desired utm and compute area in km2
def convert_to_utm_and_compute_area(row):
= row.geometry
geom = geom.centroid.coords[0]
lon, lat = get_utm_zone(lon, lat)
utm_crs # Create a GeoDataFrame for the single geometry to use the to_crs method
= gpd.GeoDataFrame(index=[0], crs='EPSG:4326', geometry=[geom])
geom_gdf = geom_gdf.to_crs(utm_crs)
geom_utm = geom_utm.area[0]/1e6
area_km2 return area_km2
#Applying the convert_to_utm_and_compute_area function
= 'EPSG:4326' # Ensure the CRS is set to EPSG:4326
data.crs 'area_km2'] = data.apply(convert_to_utm_and_compute_area, axis=1) #Creating a new column, 'area_km2'. data[
Calculate percentage deforestation each year
Before we do this we will rename the year columns in our dataframe for simplicity
data.columns
#Renaming the year loss columns from the format 2001(km2)...2023(km2) to 2001...2023 format
= {col: col.replace('(km2)', '') for col in data.columns}
new_columns =new_columns, inplace=True) data.rename(columns
Write in text the formula for % deforestation. Mention somehwere the year columns hold the area deforested in that year
#Calculating % deforestation relative to the project areafor each year
for year in range(2001, 2024):
= str(year)
year_col = f'{year}_pc'
year_pc_col if year_col in data.columns:
= (data[year_col]/data['area_km2'])*100 data[year_pc_col]
Calculate the average deforestation rate before and after start of the project
Average deforestation rate before (\(AD_{before}\)) start of the project is calculated over a period of 10 years preceding the project’s start year. If a project started in 2010 or earlier, \(AD_{before}\) is calculated from 2001 to a year before the project started.
Average deforestation rate after (\(AD_{after}\)) start of project was calculated from the year following the project start year up to 2023
The project start year is excluded from both calculations.
# Transforming the 'Project Start Date' column to datetime format and creating a column for start year
'Project Start Date'] = pd.to_datetime(data['Project Start Date'], format='%m/%d/%Y')
data['start_year'] = data['Project Start Date'].dt.year data[
#Function to calculate the average deforestation rate before start of project
def average_deforestation_before(row):
= row['start_year']
start_year if start_year <= 2001:
return 0 # If the project started in or before 2001, there's no data for 10 years before
= max(2001, start_year - 10)
start_range = [f"{year}_pc" for year in range(start_range, start_year)]
years_range = row[years_range]
deforestation_rates = deforestation_rates.mean()
average_deforestation return average_deforestation
#Function to calculate the average deforestation rate after start of project
def average_deforestation_after(row):
= row['start_year']
start_year = start_year + 1
start_year_next = [f"{year}_pc" for year in range(start_year_next, 2024)]
years_range = row[years_range]
deforestation_rates = deforestation_rates.mean() if not deforestation_rates.empty else 0
average_deforestation return average_deforestation
#Applying both the functions
#Calculate the average percentage deforestation from the year following the project start year up to 2023.
'avg_pc_def_after_start'] = data.apply(average_deforestation_after, axis=1)
data[
#Calculate average percent deforestation for 10 years before start of the project.
'avg_pc_def_before_start'] = data.apply(average_deforestation_before, axis=1) data[
Determining the deforestation status of the project
Each project is classified into one of the following categories:
Increase: An increase in the average deforestation rate after the project start.
Decrease: A decrease in the average deforestation rate after the project start.
No Deforestation: Projects that had no deforestation both before and after the project start.
No Change: The average deforestation rate before the project start is equal to the average deforestation rate after.
#Classifying deforestation rates as incerased or decreased after project start
def deforestation_status(row):
if row['avg_pc_def_after_start'] > row['avg_pc_def_before_start']:
return 'Increase'
elif row['avg_pc_def_after_start'] < row['avg_pc_def_before_start']:
return 'Decrease'
elif row['avg_pc_def_after_start'] == 0 and row['avg_pc_def_before_start'] == 0:
return 'No deforestation'
else:
return 'No change'
#Applying the deforestation_status function
'def_status'] = data.apply(deforestation_status, axis=1) data[
Calculate total deforestation percentage (relative to the project area) before and after project started
We will calculate the total deforestation before (\(TD_{before}\)) over a period of 10 years preceding the project’s start year. If a project started in 2010 or earlier, \(TD_{before}\) is calculated from 2001 to a year before the project started.
Total deforestation rate after (\(TD_{after}\)) start of project was calculated from the year following the project start year up to 2023
The project start year is excluded from both calculations.
#Calculates the total percentage of deforestation for the 10 years preceding the start of a project.
def total_deforestation_percent_before(row):
= row['start_year']
start_year if start_year <= 2001:
return 0 # If the project started in or before 2001, there's no data for 10 years before
= max(2001, start_year - 10)
start_range = row[[f'{year}_pc' for year in range(start_range, start_year)]].sum()
total_deforestation_percent_before_start return total_deforestation_percent_before_start
#Calculates the total percentage of deforestation for the years following the project's start year until 2023 and returns this total.
def total_deforestation_percent_after(row):
= row['start_year']
start_year = row[['{year}_pc' for year in range(start_year + 1, 2024)]].sum()
total_deforestation_percent_after_start return total_deforestation_percent_after_start
#Applying the total_deforestation_percent_before function
'total_def_percent_before'] = data.apply(total_deforestation_percent_before, axis=1)
data[
#Applying the total_deforestation_percent_after function
'total_def_percent_after'] = data.apply(total_deforestation_percent_after, axis=1) data[
Calculating the total deforested area inside the project boundary following the project start year up to 2023
# Function to calculate total deforestation (in $km^{2}$) after start of project
def total_deforestation_after(row):
= row['start_year']
start_year = row[[f'{year}' for year in range(start_year + 1, 2024)]].sum()
total_deforestation_after_start return total_deforestation_after_start
'total_def_after_start'] = data.apply(total_deforestation_after, axis=1) data[
Data Visualization
We our now going to visualize and analyze our data based on ‘Project Type’. For this analysis we will subset the data frame to avoid confusion in the future
'Project Type'].unique() #returns an array of unique project types present in the 'Project Type' column data[
= data[(data['Project Type'] == "AD")]
data_ad = data[(data['Project Type'] == "IFM")] data_ifm
Percentage of projects that saw an increase, decrease, and no deforestation after project start
We want to make a visualization for what was the percentage of projects in each deforestation status (column: def_status) category
For this we will first create a new dataframe that sums the number of projects for each category
# Count the number of occurrences of each def_status value
= data_ad['def_status'].value_counts().reset_index() #Using the newly subsetted data_ad here
status_counts_ad = ['def_status', 'count'] #Names the columns in the new dataframe status_counts_ad.columns
Next we will create a new variable, ‘total_projects_ad’ that stores the total number of projects in the ‘status_counts_ad’ dataframe by running the .sum() operation on the ‘counts’ column
We will use this ‘total_projects_ad’ variable to find the percentage of projects in each ‘def_status’ category
= status_counts_ad['count'].sum()
total_projects_ad 'percentage'] = (status_counts_ad['count'] / total_projects_ad) * 100
status_counts_ad[
#View the dataframe status_counts_ad
Next, we will prepare bar graphs for this visualization using the data from the status_counts_ad dataframe
# Custom colors for the bars
= ['#550707', '#d8854f', '#606c38']
custom_colors
# Custom x-axis labels
= ['Increase', 'Decrease', 'No deforestation']
custom_labels
# Create a bar plot using seaborn
=(10, 6))
plt.figure(figsize= sns.barplot(x='def_status', y='percentage', data=status_counts_ad, palette=custom_colors, width=0.6)
bar_plot
# Annotate the bars with the percentage of projects
for p in bar_plot.patches:
format(p.get_height(), '.1f') + '%',
bar_plot.annotate(+ p.get_width() / 2., p.get_height()),
(p.get_x() ='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=16)
ha
'Avoided Deforestation', fontsize=21)
plt.title('Deforestation Trend', fontsize=17.5)
plt.xlabel('Percentage of Projects', fontsize=19)
plt.ylabel(=0, fontsize=16)
plt.xticks(rotation=14)
plt.yticks(fontsize
# Set custom x-axis tick labels
bar_plot.set_xticklabels(custom_labels)
# Remove the x-axis line
'bottom'].set_visible(False)
bar_plot.spines[
# Remove the outer square (spines)
sns.despine()
'def_status_ad.png', dpi=1200)
plt.savefig(
plt.show()
We have successfully created a bar graph for Avoided Deforestation Projects.
Similarly, we can create one for Improved Forest Management projects. Go ahead and create one for yourself. Feel free to experiment with the labels, fontsizes, colours, position of annotations on the bars, and other features.
Projects that had high total percentage deforestation after the start of the project
We will identify the 10 projects with the highest total percentage of deforestation after the project started grouped by country and create a bar graph for them.
To do this, we’ll use the ‘total_def_percent_after’ column created earlier.
First we’ll sort the dataframe ‘data_ad’ in descending order based on values in the ‘total_def_percent_after’ column and store it as a new dataframe
= data_ad.sort_values(by=['total_def_percent_after'], ascending=False) #
data_ad_sorted
= data_ad_soretd.head(10) #Subsetting the dataframe to include only the first 10 rows or the 10 projects with highest deofrestation after project started data_ad_sorted
# Custom color palette
= ['#550707', '#7f461b', '#d8854f', '#e1715b', '#a75d5d', '#634539', '#a5a694']
custom_colors
# Plot
=(12, 6))
plt.figure(figsize
= sns.barplot(y='ProjectID', x='total_def_percent_after', hue='Country', data=data_ad_sorted, dodge=False, palette=custom_colors) #Flipped the axes here.
bar_plot
# Annotate the bars
for p in bar_plot.patches:
if p.get_width() > 0: # Only annotate if the width is greater than 0
format(p.get_width(), '.1f') + '%',
bar_plot.annotate(+ p.get_height() / 2.),
(p.get_width(), p.get_y() ='left', va='center', xytext=(0.9, 0), textcoords='offset points', fontsize =16)
ha
'Avoided Deforestation', fontsize=21)
plt.title('Project area deforested after project implementation', fontsize=17.5)
plt.xlabel('')
plt.ylabel(=14)
plt.yticks(fontsize=False, fontsize=14)
plt.legend(frameon
= True, left=True)
sns.despine(bottom='x', bottom=False, labelbottom=False)
bar_plot.tick_params(axis
'total_def_ad.png', dpi=300)
plt.savefig(
plt.how()
We now have a horizontal bar graph that displays the percentage of deforestation inside an AD project after it started.
Similarly, switch the data frame with data_ifm, to visualize the deforestaion trend for IFM projects
To find which projects reduced total percentage deforestation after project start, you can use the following code. Compare the values in the ‘total_def_percent_after’ column to those in the ‘total_def_percent_before’ column
= data_ad.sort_values(by=['total_def_percent_after'], ascending=True) #
data_ad_sorted
data_ad_sorted
Interactive map visualizing deforestation trends for individual projects
In this map, users can click on a project to view its deforestation status, along with a time series graph showing the yearly percentage of deforestation.
For the interactive map, we will create a new column ‘centroids’ which extracts the centroid for each project.
To create the time series plot, we will use year_pc (year: 2001-2024) (Percentage of deforestation for each year),def_status (deforestation status), and start_year (Project Start Year) to mark start year of a project in the graph.
#Import packages
import folium
import plotly.graph_objects as go
from folium import IFrame
from folium.plugins import Search
Creating a function to generate a timeseries graph of yearly deforestation percentages from 2001 to 2023
# Function to create a Plotly graph and return it as an HTML string
def create_time_series_graph(row):
= [str(year) for year in range(2001, 2024)]
years = [round(row[f'{year}_pc'], 4) for year in years]
deforestation_rates = row['start_year']
start_year
= row['def_status']
def_status = '#77bfa3' if def_status == 'Decrease' else '#c9184a' if def_status == 'Increase' else '#9163cb' #Colour of the line graph based on the deforestation status
line_color
= go.Figure()
fig =years, y=deforestation_rates, mode='lines+markers', name='Deforestation %',
fig.add_trace(go.Scatter(x=dict(color=line_color)))
line
= max(deforestation_rates) if max(deforestation_rates) > 0 else 1
max_deforestation_rate =[str(start_year)] * 2, y=[0, max_deforestation_rate], mode='lines', name='Start Year', line=dict(dash='dash', color='black'))) #Vertical line to mark
fig.add_trace(go.Scatter(x#the project start year
=f"Deforestation Time Series for {row['ProjectID']} | Project Type: {row['Project Type']}", #title each time series plot with its respective project ID
fig.update_layout(title# and project type
='Year',
xaxis_title='Percentage Deforestation',
yaxis_title=[0, max_deforestation_rate], #Custom y-axis for each project depending on its percentage deforestation range
yaxis_range='whitesmoke', # Set the background color to off-white
plot_bgcolor='whitesmoke'),
paper_bgcolor=dict(
xaxis=False # Hide x-axis grid lines
showgrid
)
return fig.to_html(full_html=False, include_plotlyjs='cdn')
Creating a function that sets the marker colour for each project depending on its deforestation status
# Function to determine marker color based on deforestation status
def get_marker_color(def_status):
if def_status == 'Increase':
return '#c9184a'
elif def_status == 'Decrease':
return '#77bfa3'
else:
return '#9163cb'
Build an interactive map using Folium
#Displaying the project boundaries for each project would be time-consuming and significantly increase the map's file size.
#Therefore, we will display only the centroid of each project.
'centroids'] = data.geometry.centroid #Extracting project centroids from the geometry column data[
# Create a Folium map
= folium.Map(location=[0,0], zoom_start=2, tiles='cartodbpositron') #Center the map view
m
# Create a FeatureGroup for the markers
= folium.FeatureGroup(name="Projects")
marker_group
# Add circle markers with pop-ups to the FeatureGroup
for index, row in data.iterrows(): #We are mapping all the projects, so we're using the original datafram, data.
= create_time_series_graph(row) # running the create_time_series_graph function on each row of the dataframe
graph_html = IFrame(graph_html, width=800, height=400)
iframe = folium.Popup(iframe, max_width=800)
popup
folium.CircleMarker(=[row['centroids'].y, row['centroids'].x], #Uses the centroids column to detemine the latitude and longitude of the project centroid
location=6,
radius=get_marker_color(row['def_status']),
color=True,
fill=get_marker_color(row['def_status']),
fill_color=0.6,
fill_opacity=popup,
popup=row['ProjectID'],
tooltip# Adding a unique ID for search to work correctly
id=row['ProjectID']
).add_to(marker_group)
# Add the marker group to the map
marker_group.add_to(m)
# Define legend HTML
= '''
legend_html <div style="position: fixed;
top: 50px; right: 50px; width: 350px; height: 80px;
border:2px solid white; z-index:9999; font-size:14px;
background-color:white; opacity: 0.8;
">
<strong>Deforestation after project implementation</strong> <br>
Increase <i class="fa fa-circle fa-1x" style="color:#c9184a"></i><br>
Decrease <i class="fa fa-circle fa-1x" style="color:#77bfa3"></i><br>
No Deforestation <i class="fa fa-circle fa-1x" style="color:#9163cb"></i><br>
</div>
'''
# Add legend to map
m.get_root().html.add_child(folium.Element(legend_html))
# Save the map to an HTML file
'map_with_deforestation_status.html')
m.save(
# To display the map in a Jupyter notebook, uncomment the line below
m
Other Statistics
Number of projects in deforestation status grouped by country
We aim to examine the number of projects in each category by country to identify any spatial trends in deforestation rates. This will help determine whether certain countries are performing better or worse compared to others.
# Group by 'Country' and count occurrences of each 'def_status' value
= data_ad.groupby('Country')['def_status'].value_counts().unstack(fill_value=0)
project_counts_ad
# Add a 'Total' column for sorting
'Total'] = project_counts_ad.sum(axis=1)
project_counts_ad[
# Sort the DataFrame in descending order based on the 'Total' column
= project_counts_ad.sort_values(by='Total', ascending=False)
project_counts_ad
# Drop the 'Total' column after sorting
= project_counts_ad.drop(columns='Total')
project_counts_ad
# Display the result
project_counts_ad
As with the graphs above, this is for AD projects. Switch the dataframe with ‘data_ifm’ to see the trends for IFM projects
Total area (\(km^{2}\)) deforested after project start grouped by country
= data_ad.groupby('Country')['total_def_after_start'].sum().sort_values(ascending=False)
data_ad_total_def
data_ad_total_def
Replace the ‘data_ad’ dataframe with ‘data_ifm’ to find out the total area deforested for IFM projects