Introduction
Today's blog post analyzes Gapminder data using the Random Forest machine learning algorithm. The aim is to ascertain which economic factors dictate whether a country will be a democracy or a dictatorship.
I will bin the "polityscore" variable into a single binary categorical variable with two levels - 0: Democracy, 1: Dictatorship.
Code
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import sklearn.metrics
# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier
#Build model on training data
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the legacy module only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Render floats in plain fixed-point notation (six decimals).
pd.options.display.float_format = lambda value: '%f' % value
# Never truncate printed DataFrames: show every row and every column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
##############################################################################
# DATA MANAGEMENT
##############################################################################
# define method to load data of interest
def load_data(data_dir, csv_file):
    """Load a CSV file into a DataFrame.

    Args:
        data_dir: Directory holding the file. It is joined onto the current
            working directory; an absolute path is used unchanged.
        csv_file: Name of the CSV file inside ``data_dir``.

    Returns:
        pandas.DataFrame with the contents of the file.
    """
    # Deliberately no ``__name__ == "__main__"`` guard: the loader has to
    # return the data no matter how the enclosing module is executed.
    data_file = os.path.join(os.getcwd(), data_dir, csv_file)
    return pd.read_csv(data_file, low_memory=False)
# Load the Gapminder data set from ./data/gapminder.csv.
gapminder_df = load_data('data', 'gapminder.csv')

# Columns used in the analysis: the candidate predictors plus 'polityscore',
# from which the response variable is derived further down.
feature_columns = [
    'incomeperperson', 'alcconsumption', 'armedforcesrate',
    'breastcancerper100th', 'co2emissions', 'femaleemployrate', 'hivrate',
    'internetuserate', 'lifeexpectancy', 'oilperperson', 'polityscore',
    'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate',
]

# Coerce every selected column to numeric (unparseable values become NaN).
# apply() builds a new frame, so nothing is written into a slice of
# gapminder_df and pandas raises no SettingWithCopyWarning.
features_df = gapminder_df[feature_columns].apply(pd.to_numeric, errors='coerce')

# Keep complete rows only. dropna() returns a new frame, so its result has to
# be assigned back to take effect.
features_df = features_df.dropna()

print(features_df.head(10))
print(features_df.columns)
def map_target(row, democracy_threshold=6):
    """Bin a polity score into a binary political-maturity class.

    Args:
        row: Mapping or pandas Series with a numeric 'polityscore' entry.
        democracy_threshold: Scores strictly greater than this value are
            classed as democracies. Defaults to 6.

    Returns:
        0 (Democracy) if the score exceeds the threshold, otherwise
        1 (Dictatorship). A NaN score compares false and so maps to 1.
    """
    return 0 if row['polityscore'] > democracy_threshold else 1
# Derive the binary response variable from each row's polity score.
# map_target already takes a row, so it is passed to apply() directly.
features_df['political_maturity'] = features_df.apply(map_target, axis=1)

# Show how many countries fall into each class.
pol_mat = features_df['political_maturity'].value_counts(sort=False, dropna=False)
print(pol_mat)
##############################################################################
# END DATA MANAGEMENT
##############################################################################
###############################################################################
# RANDOM FOREST ANALYSIS
###############################################################################
# Response variable the forest learns to predict.
target = features_df['political_maturity']
print(features_df.columns.values)

# Predictors must exclude the response and the polity score it is derived
# from. Leaving either in lets the model read the answer straight off its
# inputs (target leakage) and inflates every metric reported below.
features = features_df.drop(['polityscore', 'political_maturity'], axis=1)

# Hold out 40% of the countries for testing.
pred_train, pred_test, tar_train, tar_test = \
    train_test_split(features, target, test_size=.4)

# Fit a 25-tree random forest on the training split and score the hold-out.
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

report = sklearn.metrics.classification_report(tar_test, predictions)
print(report)
import os
import numpy as np
import matplotlib.pylab as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the legacy module only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import sklearn.metrics
# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier
#Build model on training data
from sklearn.ensemble import RandomForestClassifier
# bug fix for display format to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)
# Set pandas to display all columns and rows in DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
##############################################################################
# DATA MANAGEMENT
##############################################################################
# define method to load data of interest
def load_data(data_dir, csv_file):
    """Load a CSV file into a DataFrame.

    Args:
        data_dir: Directory holding the file. It is joined onto the current
            working directory; an absolute path is used unchanged.
        csv_file: Name of the CSV file inside ``data_dir``.

    Returns:
        pandas.DataFrame with the contents of the file.
    """
    # Deliberately no ``__name__ == "__main__"`` guard: the loader has to
    # return the data no matter how the enclosing module is executed.
    data_file = os.path.join(os.getcwd(), data_dir, csv_file)
    return pd.read_csv(data_file, low_memory=False)
# Load the Gapminder data set from ./data/gapminder.csv.
gapminder_df = load_data('data', 'gapminder.csv')

# Columns used in the analysis: the candidate predictors plus 'polityscore',
# from which the response variable is derived further down.
feature_columns = [
    'incomeperperson', 'alcconsumption', 'armedforcesrate',
    'breastcancerper100th', 'co2emissions', 'femaleemployrate', 'hivrate',
    'internetuserate', 'lifeexpectancy', 'oilperperson', 'polityscore',
    'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate',
]

# Coerce every selected column to numeric (unparseable values become NaN).
# apply() builds a new frame, so nothing is written into a slice of
# gapminder_df and pandas raises no SettingWithCopyWarning.
features_df = gapminder_df[feature_columns].apply(pd.to_numeric, errors='coerce')

# Keep complete rows only. dropna() returns a new frame, so its result has to
# be assigned back to take effect.
features_df = features_df.dropna()

print(features_df.head(10))
print(features_df.columns)
def map_target(row, democracy_threshold=6):
    """Bin a polity score into a binary political-maturity class.

    Args:
        row: Mapping or pandas Series with a numeric 'polityscore' entry.
        democracy_threshold: Scores strictly greater than this value are
            classed as democracies. Defaults to 6.

    Returns:
        0 (Democracy) if the score exceeds the threshold, otherwise
        1 (Dictatorship). A NaN score compares false and so maps to 1.
    """
    return 0 if row['polityscore'] > democracy_threshold else 1
# Derive the binary response variable from each row's polity score.
# map_target already takes a row, so it is passed to apply() directly.
features_df['political_maturity'] = features_df.apply(map_target, axis=1)

# Show how many countries fall into each class.
pol_mat = features_df['political_maturity'].value_counts(sort=False, dropna=False)
print(pol_mat)
##############################################################################
# END DATA MANAGEMENT
##############################################################################
###############################################################################
# RANDOM FOREST ANALYSIS
###############################################################################
# Response variable the forest learns to predict.
target = features_df['political_maturity']
print(features_df.columns.values)

# Predictors must exclude the response and the polity score it is derived
# from. Leaving either in lets the model read the answer straight off its
# inputs (target leakage) and inflates every metric reported below.
features = features_df.drop(['polityscore', 'political_maturity'], axis=1)

# Hold out 40% of the countries for testing.
pred_train, pred_test, tar_train, tar_test = \
    train_test_split(features, target, test_size=.4)

# Fit a 25-tree random forest on the training split and score the hold-out.
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

report = sklearn.metrics.classification_report(tar_test, predictions)
print(report)

# Confusion matrix of actual versus predicted classes on the hold-out set.
confusion_matrix = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(confusion_matrix)

# Overall fraction of hold-out countries classified correctly.
accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(accuracy)
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
# display the relative importance of each attribute
print(model.feature_importances_)

# Measure hold-out accuracy for forests of 1 to 25 trees, to see whether
# growing more trees improves the model.
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

# Plot accuracy against the real tree count (1-25), not the 0-based index.
plt.cla()
plt.plot(trees, accuracy)
print(confusion_matrix)
# Overall fraction of hold-out countries classified correctly.
accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(accuracy)

# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
# display the relative importance of each attribute
print(model.feature_importances_)

# Measure hold-out accuracy for forests of 1 to 25 trees, to see whether
# growing more trees improves the model.
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

# Plot accuracy against the real tree count (1-25), not the 0-based index.
plt.cla()
plt.plot(trees, accuracy)
Summary
Classification Report
| Class | precision | recall | f1-score | support |
| --- | --- | --- | --- | --- |
| Democracy | 0.83 | 0.83 | 0.83 | 18 |
| Dictatorship | 0.40 | 0.40 | 0.40 | 5 |
| avg / total | 0.74 | 0.74 | 0.74 | 23 |
The following explanatory variables - Income Per Person, Alcohol Consumption, Size of Armed Forces, Breast Cancer Per Hundredth, Carbon Dioxide Emissions, Female Employment Rate, HIV Rate, Internet Use Rate, Life Expectancy, Oil Per Person, Electric Consumption Per Person, Suicide Per Hundredth, Employment Rate, and Urbanization Rate - were included as possible contributors to the random forest model evaluating a country's democratic maturity, my response variable (0: Democracy, 1: Dictatorship).
| Features | Relative Importance Score |
| --- | --- |
| Income Per Person | 0.072 |
| Alcohol Consumption | 0.196 |
| Size of Armed Forces | 0.055 |
| Breast Cancer Per Hundredth | 0.15 |
| Carbon Dioxide Emission | 0.047 |
| Female Employment Rate | 0.069 |
| HIV Rate | 0.028 |
| Internet Use Rate | 0.018 |
| Life Expectancy | 0.053 |
| Oil Per Person | 0.088 |
| Electric Consumption Per Person | 0.052 |
| Suicide Per Hundredth | 0.081 |
| Employment Rate | 0.064 |
| Urbanization Rate | 0.024 |
The explanatory variables with the highest relative importance scores are alcohol consumption and breast cancer per hundredth. The accuracy of the random forest model is approximately 74%. Growing multiple trees did not contribute much to the overall accuracy of the model; forests of up to ten trees roughly match its overall accuracy. This suggests that a decision tree model with a single tree may be a better model in this case.

No comments:
Post a Comment