Introduction
In this assignment, I will attempt to generate Decision Trees using the Gapminder data set. I will continue to investigate my research question that:
There is no association between per person electricity consumption and income per person.
In this assignment, I binned the following variables into two groups:
- incomeperperson
- relectricperperson
- urbanrate
- polityscore
My target variable is relectricperperson.
Code
# -*- coding: utf-8 -*-
"""
Created on Thursday January 28 08:27:53 2016
@author: Ernest.Tanson
"""
import pandas
import os
import pydotplus
import sklearn.metrics
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Displaying the decision tree
from sklearn import tree
from io import BytesIO
# Display floats in plain fixed-point form (the original author's fix for
# run-time errors with the display format).
def _fixed_point(value):
    return '%f' % value


pandas.set_option('display.float_format', _fixed_point)
# Lift the row and column display limits so DataFrames are shown in full.
for _display_limit in ('display.max_rows', 'display.max_columns'):
    pandas.set_option(_display_limit, None)
##############################################################################
# DATA MANAGEMENT
##############################################################################
# define method to load data of interest
def load_data(data_dir, csv_file):
    """Load a CSV file located in a directory under the current working dir.

    Parameters
    ----------
    data_dir : str
        Directory containing the file, joined onto ``os.getcwd()``.
    csv_file : str
        Name of the CSV file inside ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, read with ``low_memory=False``.

    The function no longer checks ``__name__``: with that guard inside the
    body, calling it from an imported module skipped the read entirely.
    """
    data_path = os.path.join(os.getcwd(), data_dir)
    data_file = os.path.join(data_path, csv_file)
    return pandas.read_csv(data_file, low_memory=False)
# loading data
data = load_data('data', 'gapminder.csv')
print(data.head(10))

# Variables of interest, listed once so the subset, the numeric conversion
# and the summary below cannot drift apart.
variables_of_interest = [
    'incomeperperson', 'relectricperperson', 'urbanrate', 'internetuserate',
    'lifeexpectancy', 'oilperperson', 'polityscore', 'employrate',
]

# Extracting data pertinent to variables of interest. The explicit .copy()
# gives an independent frame, so the column assignments below do not write
# into a possible slice of the original (avoids SettingWithCopyWarning).
data = data[variables_of_interest].copy()

# Setting variables of interest to numeric format; values that cannot be
# parsed become NaN because of errors='coerce'.
for column in variables_of_interest:
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Summary statistics; as a bare expression the result is only displayed when
# run interactively.
data[variables_of_interest].describe()
#==============================================================================
#
# incomeperperson relectricperperson urbanrate internetuserate
# count 190.000000 136.000000 203.000000 192.000000
# mean 8740.966076 1173.178995 56.769360 35.632716
# std 14262.809083 1681.440173 23.844933 27.780285
# min 103.775857 0.000000 10.400000 0.210066
# 25% 748.245151 203.652109 36.830000 9.999604
# 50% 2553.496056 597.136436 57.940000 31.810121
# 75% 9379.891166 1491.145249 74.210000 56.416046
# max 105147.437700 11154.755030 100.000000 95.638113
#
# lifeexpectancy oilperperson polityscore employrate
# count 191.000000 63.000000 161.000000 178.000000
# mean 69.753524 1.484085 3.689441 58.635955
# std 9.708621 1.825090 6.314899 10.519454
# min 47.794000 0.032281 -10.000000 32.000000
# 25% 64.447000 0.532541 -2.000000 51.225000
# 50% 73.131000 1.032470 6.000000 58.699999
# 75% 76.593000 1.622737 9.000000 64.975000
# max 83.394000 12.228645 10.000000 83.199997
#==============================================================================
# listwise deletion of missing values
# .copy() makes data_clean an independent frame: new columns are assigned to
# it further down, and assigning into the raw dropna() result can trigger
# pandas' SettingWithCopyWarning.
data_clean = data.dropna().copy()
print(data_clean.columns)
# Column dtypes; as a bare expression this is only displayed interactively.
data_clean.dtypes
def percapitakWh(row):
    """Code a row's per-person electricity use as a binary class.

    Returns 0 when ``row['relectricperperson']`` is greater than 597.136436
    and 1 otherwise.
    """
    return 0 if row['relectricperperson'] > 597.136436 else 1
# Add the binned electricity column (one value per row) and print how many
# rows fall into each bin.
data_clean['KWH'] = data_clean.apply(percapitakWh, axis=1)
kWh = data_clean['KWH'].value_counts(sort=False, dropna=False)
print(kWh)
def percapitaincome(row):
    """Code a row's income per person as a binary class.

    Returns 0 when ``row['incomeperperson']`` is greater than 2553.496056
    and 1 otherwise.
    """
    income = row['incomeperperson']
    if income > 2553.496056:
        return 0
    return 1
# Add the binned income column (one value per row) and print how many rows
# fall into each bin.
data_clean['PERCAPINC'] = data_clean.apply(percapitaincome, axis=1)
income = data_clean['PERCAPINC'].value_counts(sort=False, dropna=False)
print(income)
def urban(row):
    """Code a row's urban rate as a binary class.

    Returns 0 when ``row['urbanrate']`` is greater than 57.94 and 1
    otherwise.
    """
    above_cutoff = row['urbanrate'] > 57.940000
    return 0 if above_cutoff else 1
# Add the binned urban-rate column (one value per row) and print how many
# rows fall into each bin.
data_clean['URBAN'] = data_clean.apply(urban, axis=1)
urban_rate = data_clean['URBAN'].value_counts(sort=False, dropna=False)
print(urban_rate)
def polityscore(row):
    """Code a row's polity score as a binary class.

    Returns 0 when ``row['polityscore']`` is greater than 6 and 1 otherwise.
    """
    if not row['polityscore'] > 6:
        return 1
    return 0
# Add the binned polity-score column (one value per row) and print how many
# rows fall into each bin.
data_clean['POLSCORE'] = data_clean.apply(polityscore, axis=1)
polscore = data_clean['POLSCORE'].value_counts(sort=False, dropna=False)
print(polscore)
# Bare expression: shows the managed frame only in an interactive session.
data_clean
##############################################################################
# END DATA MANAGEMENT
##############################################################################
###############################################################################
# DECISION TREE ANALYSIS
###############################################################################
# Explanatory variables: the binned columns plus the remaining quantitative
# columns. The response is the binned electricity column KWH.
predictors = data_clean[['PERCAPINC', 'URBAN', 'internetuserate', 'POLSCORE',
                         'lifeexpectancy', 'oilperperson', 'employrate']]
targets = data_clean.KWH

# Fixed seed so the train/test split and the fitted tree (and therefore the
# printed confusion matrix and accuracy) are the same on every run.
RANDOM_SEED = 42

# Hold out 40% of the rows for testing.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=RANDOM_SEED)

# Shapes of the four partitions; as bare expressions these are only
# displayed when run interactively.
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape

# Fit the tree on the training rows and classify the held-out rows.
classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

# Compare the predictions with the held-out targets.
confusion_matrix = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(confusion_matrix)
accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(accuracy)
# Export the fitted tree in Graphviz dot format into an in-memory buffer.
# NOTE(review): a BytesIO buffer only accepts bytes; if export_graphviz
# writes text on the interpreter in use this call will fail -- confirm the
# Python / scikit-learn versions this script targets.
out = BytesIO()
tree.export_graphviz(classifier, out_file=out)
graph = pydotplus.graph_from_dot_data(out.getvalue())

# Write the rendered PNG to <cwd>/img/gapminder_DT_2.png. The path is built
# with os.path.join, and the directory is created first so the write does not
# fail when 'img' is missing.
img_dir = os.path.join(os.getcwd(), 'img')
if not os.path.isdir(img_dir):
    os.makedirs(img_dir)
with open(os.path.join(img_dir, 'gapminder_DT_2.png'), 'wb') as f:
    f.write(graph.create_png())
Decision Tree Diagram
Summary
The accuracy of this model is 0.833 and the confusion matrix is:

    | No  Yes
=============
No  | 16    1
Yes |  3    4
A Decision Tree analysis was performed to test for nonlinear relationships among a series of predictors - per capita income, urban rate, internet use rate, political score, life expectancy, per person oil consumption, and rate of employment - which are made up of quantitative and categorical explanatory variables. The target, or response, categorical variable is KWH (per person electricity consumption).
For this analysis, a series of simple rules or criteria is applied over and over again to choose the variable constellations that best predict the target variable.

No comments:
Post a Comment