Sunday, February 7, 2016

Decision Tree Analysis with Gapminder Data

Introduction

In this assignment, I will attempt to generate decision trees using the Gapminder data set.
I will continue to investigate my research question, namely that:

There is no association between per person electricity consumption and income per person.

In this assignment, I binned the following variables into two groups:
  • incomeperperson
  • relectricperperson
  • urbanrate
  • polityscore
The remainder of the variables - internetuserate, lifeexpectancy, oilperperson, and employrate - are quantitative.

My target variable is relectricperperson, binned into the two-level variable KWH.

Code

# -*- coding: utf-8 -*-
"""
Created on Thursday January 28 08:27:53 2016

@author: Ernest.Tanson
"""

import pandas
import os

import pydotplus
import sklearn.metrics
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Displaying the decision tree
from sklearn import tree
from io import BytesIO

# Render floats in fixed-point notation (the original author added this as a
# workaround for run time errors with the default display format)
pandas.set_option('display.float_format', lambda value: '%f' % value)

# Lift the row and column limits so DataFrames are displayed in full
for _display_limit in ('display.max_rows', 'display.max_columns'):
    pandas.set_option(_display_limit, None)

##############################################################################
# DATA MANAGEMENT
##############################################################################

# define method to load data of interest


def load_data(data_dir, csv_file):
    """Read a CSV file located in a directory under the working directory.

    data_dir: directory that holds the file; it is joined onto os.getcwd()
        (an absolute path is therefore used as-is).
    csv_file: name of the CSV file inside data_dir.

    Returns the file contents as a pandas DataFrame.
    """
    # No ``__name__`` check in here: the loader has to return the data no
    # matter which module calls it, otherwise callers receive None.
    data_path = os.path.join(os.getcwd(), data_dir)
    data_file = os.path.join(data_path, csv_file)
    return pandas.read_csv(data_file, low_memory=False)

# loading data
# Read data/gapminder.csv (relative to the working directory) and preview
# the first ten rows
data = load_data(data_dir='data', csv_file='gapminder.csv')
first_rows = data.head(10)
print(first_rows)


# Extracting data pertinent to variables of interest

# Variables used in this analysis
variables_of_interest = [
    'incomeperperson', 'relectricperperson', 'urbanrate', 'internetuserate',
    'lifeexpectancy', 'oilperperson', 'polityscore', 'employrate',
]

# Take an explicit copy of the subset so the column assignments below do not
# trigger pandas' SettingWithCopyWarning
data = data[variables_of_interest].copy()

# Set every variable of interest to numeric format; entries that cannot be
# parsed are coerced to NaN
for column in variables_of_interest:
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Summary statistics; printed so they also appear when run as a script
# (the output is reproduced in the comment block that follows)
print(data[variables_of_interest].describe())

#==============================================================================
#
#       incomeperperson  relectricperperson  urbanrate  internetuserate 
# count       190.000000          136.000000 203.000000       192.000000  
# mean       8740.966076         1173.178995  56.769360        35.632716  
# std       14262.809083         1681.440173  23.844933        27.780285  
# min         103.775857            0.000000  10.400000         0.210066  
# 25%         748.245151          203.652109  36.830000         9.999604  
# 50%        2553.496056          597.136436  57.940000        31.810121  
# 75%        9379.891166         1491.145249  74.210000        56.416046  
# max      105147.437700        11154.755030 100.000000        95.638113  
#
#        lifeexpectancy  oilperperson  polityscore  employrate 
# count      191.000000     63.000000   161.000000  178.000000 
# mean        69.753524      1.484085     3.689441   58.635955 
# std          9.708621      1.825090     6.314899   10.519454 
# min         47.794000      0.032281   -10.000000   32.000000 
# 25%         64.447000      0.532541    -2.000000   51.225000 
# 50%         73.131000      1.032470     6.000000   58.699999 
# 75%         76.593000      1.622737     9.000000   64.975000 
# max         83.394000     12.228645    10.000000   83.199997
#==============================================================================

# listwise deletion of missing values
# Drop every row with at least one missing value. The explicit copy makes
# data_clean an independent frame, because new columns are added to it later.
data_clean = data.dropna().copy()

# Show the remaining columns and their types (printed so the information is
# also visible when the file is run as a script)
print(data_clean.columns)
print(data_clean.dtypes)



def percapitakWh(row, threshold=597.136436):
    """Bin per-person electricity consumption into two groups.

    row: record indexable by column name; must contain 'relectricperperson'.
    threshold: cut point. The default is the 50% value that describe()
        reports for relectricperperson.

    Returns 0 when the value is strictly greater than threshold, else 1.
    """
    return 0 if row['relectricperperson'] > threshold else 1

# Build the two-level KWH column by running the binning function on each row
kwh_groups = data_clean.apply(percapitakWh, axis=1)
data_clean['KWH'] = kwh_groups

# Group frequencies, unsorted, with missing values counted as well
kWh = data_clean['KWH'].value_counts(dropna=False, sort=False)
print(kWh)


def percapitaincome(row, threshold=2553.496056):
    """Bin income per person into two groups.

    row: record indexable by column name; must contain 'incomeperperson'.
    threshold: cut point. The default is the 50% value that describe()
        reports for incomeperperson.

    Returns 0 when the value is strictly greater than threshold, else 1.
    """
    return 0 if row['incomeperperson'] > threshold else 1

# Build the two-level PERCAPINC column from the row-wise income binning
income_groups = data_clean.apply(percapitaincome, axis=1)
data_clean['PERCAPINC'] = income_groups

# Group frequencies, unsorted, with missing values counted as well
income = data_clean['PERCAPINC'].value_counts(dropna=False, sort=False)
print(income)

def urban(row, threshold=57.940000):
    """Bin the urban rate into two groups.

    row: record indexable by column name; must contain 'urbanrate'.
    threshold: cut point. The default is the 50% value that describe()
        reports for urbanrate.

    Returns 0 when the value is strictly greater than threshold, else 1.
    """
    return 0 if row['urbanrate'] > threshold else 1

# Build the two-level URBAN column from the row-wise urban rate binning
urban_groups = data_clean.apply(urban, axis=1)
data_clean['URBAN'] = urban_groups

# Group frequencies, unsorted, with missing values counted as well
urban_rate = data_clean['URBAN'].value_counts(dropna=False, sort=False)
print(urban_rate)

def polityscore(row, threshold=6):
    """Bin the polity score into two groups.

    row: record indexable by column name; must contain 'polityscore'.
    threshold: cut point. The default, 6, is the 50% value that describe()
        reports for polityscore.

    Returns 0 when the value is strictly greater than threshold, else 1.
    """
    return 0 if row['polityscore'] > threshold else 1

# Build the two-level POLSCORE column from the row-wise polity score binning
polity_groups = data_clean.apply(polityscore, axis=1)
data_clean['POLSCORE'] = polity_groups

# Group frequencies, unsorted, with missing values counted as well
polscore = data_clean['POLSCORE'].value_counts(dropna=False, sort=False)
print(polscore)

# Kept from the original: displays the frame in an interactive session
data_clean

##############################################################################
# END DATA MANAGEMENT
##############################################################################


###############################################################################
# DECISION TREE ANALYSIS
###############################################################################

# Explanatory variables: the binned columns plus the untouched quantitative ones
predictor_columns = ['PERCAPINC', 'URBAN', 'internetuserate', 'POLSCORE',
                     'lifeexpectancy', 'oilperperson', 'employrate']
predictors = data_clean[predictor_columns]

# Response variable: the binned per-person electricity consumption
targets = data_clean.KWH

# Hold out 40% of the rows for testing. The seed is fixed so the same split,
# and therefore the same reported metrics, is obtained on every run.
pred_train, pred_test, tar_train, tar_test = \
    train_test_split(predictors, targets, test_size=.4, random_state=123)

# Report the partition sizes (printed so they also show when run as a script)
print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)


# Fit the tree on the training partition. The seed is fixed because the
# classifier's fitting is otherwise not deterministic between runs.
classifier = DecisionTreeClassifier(random_state=123)
classifier = classifier.fit(pred_train, tar_train)

# Predict the held-out observations
predictions = classifier.predict(pred_test)

# Compare the true test labels with the predicted ones
confusion_matrix = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(confusion_matrix)

# Share of test observations that were classified correctly
accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(accuracy)

# In-memory buffer that receives the tree in Graphviz dot format
# NOTE(review): on Python 3 export_graphviz may need a text buffer
# (io.StringIO) instead of BytesIO -- confirm against the interpreter in use.
out = BytesIO()

tree.export_graphviz(classifier, out_file=out)

graph = pydotplus.graph_from_dot_data(out.getvalue())

# Write the rendered tree to img/gapminder_DT_2.png under the working
# directory, creating the img directory first so open() does not fail
img_dir = os.path.join(os.getcwd(), 'img')
if not os.path.isdir(img_dir):
    os.makedirs(img_dir)

with open(os.path.join(img_dir, 'gapminder_DT_2.png'), 'wb') as f:
    f.write(graph.create_png())





Decision Tree Diagram

 

Summary

The accuracy of this model is 0.833 and the confusion matrix is
         | No       Yes
=============
No   |  16          1
Yes  |   3           4

A decision tree analysis was performed to test for nonlinear relationships between a series of predictors (per capita income, urban rate, internet use rate, polity score, life expectancy, per person oil consumption, and employment rate), a mix of quantitative and categorical explanatory variables, and the categorical target (response) variable KWH (per person electricity consumption).

For this analysis, a series of simple rules or criteria was applied over and over again to choose the variable constellations that best predict the target variable.



No comments:

Post a Comment