Introduction
In this blog post, I continue using the Gapminder data to perform a k-means cluster analysis. My interest is in ascertaining the contribution of each variable in the dataset to per person electricity consumption (kWh) globally.
Code
from pandas import DataFrame
import pandas as pd
import os
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
# bug fix for display format to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)
# Set pandas to display all columns and rows in DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
##############################################################################
# DATA MANAGEMENT
##############################################################################
# define method to load data of interest
def load_data(data_dir, csv_file):
    DATA_PATH = os.path.join(os.getcwd(), data_dir)
    DATA_FILE = os.path.join(DATA_PATH, csv_file)
    data = pd.read_csv(DATA_FILE, low_memory=False)
    return data
# loading data
gapminder_df = load_data('data', 'gapminder.csv')
#upper-case all DataFrame column names
gapminder_df.columns = map(str.upper, gapminder_df.columns)
# select the clustering variables (skip the first two columns)
cluster = gapminder_df.iloc[:, 2:]
# coerce every column to numeric; unparseable entries become NaN
def convert_to_numeric(mycluster):
    for i in mycluster:
        mycluster.loc[:, i] = pd.to_numeric(mycluster.loc[:, i], errors='coerce')
    return mycluster
numeric_df = convert_to_numeric(cluster)
print(numeric_df)
def map_target(row):
    if row['POLITYSCORE'] > 6:
        return 0  # Democracy
    else:
        return 1  # Dictatorship
cluster.loc[:,'DEMO'] = cluster.apply(lambda row: map_target(row), axis=1)
cluster.drop('POLITYSCORE', axis=1, inplace=True)
data_clean = cluster.fillna(0)
# standardize cluster to have mean=0 and sd=1
def standardize_cluster(mycluster):
    clustervar = mycluster.copy()
    # RELECTRICPERPERSON is the validation variable, so exclude it from clustering
    clustervar.drop('RELECTRICPERPERSON', axis=1, inplace=True)
    for i in clustervar:
        clustervar.loc[:, i] = preprocessing.scale(clustervar.loc[:, i].astype('float64'))
    return clustervar
clustervar = standardize_cluster(data_clean)
print( clustervar)
##############################################################################
# END DATA MANAGEMENT
##############################################################################
###############################################################################
# K-Means CLUSTER ANALYSIS
###############################################################################
# split data into train and test sets
clus_train, clus_test =\
train_test_split(clustervar, test_size=.4, random_state=123)
# k-means cluster analysis for 1-15 clusters
from scipy.spatial.distance import cdist
clusters = range(1, 16)
meandist = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / clus_train.shape[0])
"""
Plot average distance from observations from the cluster centroid
to use the Elbow Method to identify number of clusters to choose
"""
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
# Interpret the 3-cluster solution based on the Elbow Method
model3 = KMeans(n_clusters=3)
model3.fit(clus_train)
clusassign = model3.predict(clus_train)
#labels
labels = model3.labels_
# plot clusters
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=labels,)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
plt.show()
"""
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# create a unique identifier variable from the index for the
# cluster training data to merge with the cluster assignment variable
clus_train.reset_index(level=0, inplace=True)
# create a list that has the new index variable
cluslist = list(clus_train['index'])
# create a list of cluster assignments
# combine index variable list with cluster assignment list into a dictionary
newlist = dict(zip(cluslist, labels))
newlist
# convert newlist dictionary to a dataframe
newclus = DataFrame.from_dict(newlist, orient='index')
newclus
# rename the cluster assignment column
newclus.columns = ['cluster']
# now do the same for the cluster assignment variable
# create a unique identifier variable from the index for the
# cluster assignment dataframe
# to merge with cluster training data
newclus.reset_index(level=0, inplace=True)
# merge the cluster assignment dataframe with the cluster training variable dataframe
# by the index variable
merged_train = pd.merge(clus_train, newclus, on='index')
merged_train.head(n=100)
# cluster frequencies
merged_train.cluster.value_counts()
"""
END multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# FINALLY calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print ("Clustering variable means by cluster")
print(clustergrp)
# validate clusters in training data by examining cluster differences in RELECTRICPERPERSON using ANOVA
# first have to merge RELECTRICPERPERSON with clustering variables and cluster assignment data
relectricperperson_data = data_clean['RELECTRICPERPERSON']
# split RELECTRICPERPERSON data into train and test sets
relectricperperson_train, relectricperperson_test =\
train_test_split(relectricperperson_data, test_size=.4, random_state=123)
relectricperperson_train1 = pd.DataFrame(relectricperperson_train)
relectricperperson_train1.reset_index(level=0, inplace=True)
merged_train_all = pd.merge(relectricperperson_train1, merged_train, on='index')
kWh = merged_train_all[['RELECTRICPERPERSON', 'cluster']].dropna()
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
kWhmod = smf.ols(formula='RELECTRICPERPERSON ~ C(cluster)', data=kWh).fit()
print(kWhmod.summary())
print ('means for RELECTRICPERPERSON by cluster')
kWhMean = kWh.groupby('cluster').mean()
print (kWhMean)
print ('standard deviations for RELECTRICPERPERSON by cluster')
kWhSD= kWh.groupby('cluster').std()
print (kWhSD)
mc1 = multi.MultiComparison(kWh['RELECTRICPERPERSON'], kWh['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
"""
VALIDATION
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster in test data set
"""
# assign the test observations to the clusters learned on the training data
labelsval = model3.predict(clus_test)
# create a variable out of the index for the cluster test dataframe to merge on
clus_test.reset_index(level=0, inplace=True)
# create a list that has the new index variable
cluslistval = list(clus_test['index'])
# combine index variable list with predicted labels list into a dictionary
newlistval = dict(zip(cluslistval, labelsval))
newlistval
# convert newlist dictionary to a dataframe
newclusval=DataFrame.from_dict(newlistval, orient='index')
newclusval
# rename the cluster assignment column
newclusval.columns = ['cluster']
# create a variable out of the index for the cluster assignment dataframe to merge on
newclusval.reset_index(level=0, inplace=True)
# merge the cluster assignment dataframe with the cluster test variable dataframe
# by the index variable
merged_test = pd.merge(clus_test, newclusval, on='index')
# cluster frequencies
merged_test.cluster.value_counts()
"""
END multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# calculate test data clustering variable means by cluster
clustergrpval = merged_test.groupby('cluster').mean()
print ("Test data clustering variable means by cluster")
print(clustergrpval)
# validate clusters in testing data by examining cluster differences in RELECTRICPERPERSON using ANOVA
# first have to merge RELECTRICPERPERSON with clustering variables and cluster assignment data
relectricperperson_test1 = pd.DataFrame(relectricperperson_test)
relectricperperson_test1.reset_index(level=0, inplace=True)
merged_test_all = pd.merge(relectricperperson_test1, merged_test, on='index')
kWh_test = merged_test_all[['RELECTRICPERPERSON', 'cluster']].dropna()
kWh_testmod = smf.ols(formula='RELECTRICPERPERSON ~ C(cluster)', data=kWh_test).fit()
print(kWh_testmod.summary())
print ('Test Data means for RELECTRICPERPERSON by cluster')
kWh_testMean = kWh_test.groupby('cluster').mean()
print (kWh_testMean)
print ('Test Data standard deviations for RELECTRICPERPERSON by cluster')
kWh_testSD= kWh_test.groupby('cluster').std()
print (kWh_testSD)
mc1_test = multi.MultiComparison(kWh_test['RELECTRICPERPERSON'], kWh_test['cluster'])
res1_test = mc1_test.tukeyhsd()
print(res1_test.summary())
Summary
I conducted a k-means cluster analysis to identify subgroups of countries, using per person electricity consumption as a proxy for economic attainment. The analysis is based on 15 variables deemed to influence a country's economic growth. The clustering variables include a categorical measure of a country's political maturity (democracy or dictatorship) and quantitative measures of income per person, alcohol consumption, rate of growth of the armed forces, breast cancer cases per 100,000 people, carbon dioxide emissions, employment rate, female employment rate, HIV infection rate, internet use rate, life expectancy, oil use per person, suicide rate per 100,000 people, and urbanization rate; per person electricity consumption (RELECTRICPERPERSON) was held out as the external validation variable. I standardized all clustering variables to have a mean of 0 and a standard deviation of 1.
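As a quick sanity check on the standardization step, a minimal sketch like the following (assuming the clustervar DataFrame produced by standardize_cluster in the code above) confirms that each clustering variable ends up with a mean of roughly 0 and a standard deviation of roughly 1:
# sanity check: each standardized clustering variable should have
# mean ~ 0 and standard deviation ~ 1 (preprocessing.scale uses the
# population formula, so ddof=0 matches it exactly)
for col in clustervar.columns:
    print('%-22s mean=%7.4f sd=%7.4f'
          % (col, clustervar[col].mean(), clustervar[col].std(ddof=0)))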
I randomly split the data into a training set consisting of 60% of the observations (N=127) and a test set made up of the remaining 40% (N=86).
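A quick check of the shapes of the two frames created by train_test_split in the code above confirms these proportions:
# expected sizes for the 60/40 split of the 213 rows: 127 train, 86 test
print('training observations:', clus_train.shape[0])
print('test observations:', clus_test.shape[0])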
Using the Euclidean distance measure, I conducted a series of k-means cluster analyses on the training dataset, specifying k from 1 through 15. For guidance on the number of clusters to interpret, I plotted the average distance of the observations from their cluster centroids for each of the 15 cluster solutions and looked for an elbow in the curve. See Figure 1 below.
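The average-distance loop in the code above can also be written with scikit-learn's built-in inertia_ attribute (the within-cluster sum of squared distances), which produces the same elbow shape; a minimal sketch, assuming the standardized clustervar DataFrame from the data management step:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import matplotlib.pylab as plt

# re-create the 60/40 split on the standardized clustering variables
elbow_train, _ = train_test_split(clustervar, test_size=.4, random_state=123)
inertias = []
for k in range(1, 16):
    km = KMeans(n_clusters=k, random_state=123).fit(elbow_train)
    inertias.append(km.inertia_)  # within-cluster sum of squared distances
plt.plot(range(1, 16), inertias)
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster sum of squares')
plt.title('Elbow curve from KMeans.inertia_')
plt.show()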
Figure 1: Selecting k with the Elbow Method

The Elbow Method diagram suggested that a 3-cluster solution could be interpreted. Figure 2 below shows the resulting 3-cluster solution.
Figure 2: Scatterplot of Canonical Variables for 3 Clusters
Leveraging principal component analysis (the code's stand-in for canonical discriminant analysis), I reduced the 15 clustering variables to two canonical variables that capture most of the variance among the clustering variables. Figure 2 above is the scatterplot of the first two canonical variables.
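Because the scatterplot keeps only the first two components, it is worth checking how much of the variance in the clustering variables they actually capture; a short sketch using the fitted pca_2 object from the code above:
# share of total variance captured by each of the two retained components
print(pca_2.explained_variance_ratio_)
print('variance explained by the 2-D plot: %.1f%%'
      % (100 * pca_2.explained_variance_ratio_.sum()))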
The means of the clustering variables revealed that, compared to the other clusters, countries in cluster 0 (green) are relatively developed economies, as evidenced by higher levels of income per person, alcohol consumption, growth of the armed forces, breast cancer cases per 100,000 people, carbon dioxide emissions, employment rate, female employment rate, internet use rate, life expectancy, oil use per person, suicide rate per 100,000 people, and urbanization rate. However, they have a low HIV infection rate and are not democracies.
Countries in cluster 1 (red) have both a high HIV infection rate and a high suicide rate per 100,000 people but low life expectancy and a low urbanization rate. They also have high employment in general, with a particularly high female employment rate. These are fledgling democracies.
Countries in cluster 2 (blue) are fully fledged democracies but developing economies nonetheless; this is the least developed of the three clusters. The contributions of the economic variables of interest are close to nonexistent: these countries have relatively low income per person, a low employment rate, a low urbanization rate, and low life expectancy.
To externally validate the clusters, I performed an analysis of variance (ANOVA) to test for significant differences between the three clusters in per person electricity consumption (kWh), followed by Tukey post-hoc comparisons between the clusters. The results indicated statistically significant differences between the clusters on RELECTRICPERPERSON (F(2, 124) = 23.31, p < 0.001). See Figure 3 below.
Figure 3: OLS Regression Analysis(Training Dataset)
                            OLS Regression Results
==============================================================================
Dep. Variable:     RELECTRICPERPERSON   R-squared:                       0.273
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     23.31
Date:                Sat, 27 Feb 2016   Prob (F-statistic):           2.54e-09
Time:                        16:24:12   Log-Likelihood:                -1071.6
No. Observations:                 127   AIC:                             2149.
Df Residuals:                     124   BIC:                             2158.
Df Model:                           2
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|    [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept        1605.9341    164.969      9.735      0.000    1279.415   1932.453
C(cluster)[T.1] -1364.0482    221.964     -6.145      0.000   -1803.377   -924.719
C(cluster)[T.2] -1542.8442    292.156     -5.281      0.000   -2121.103   -964.586
==============================================================================
Omnibus:                      118.652   Durbin-Watson:                   2.178
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1454.624
Skew:                           3.287   Prob(JB):                         0.00
Kurtosis:                      18.221   Cond. No.                         3.83
==============================================================================
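The F-statistic above establishes that the cluster means differ, but not how large the effect is. One way to quantify it is eta-squared, which for a one-way ANOVA fitted by OLS equals the model R-squared; a minimal sketch using the fitted kWhmod object from the code above:
# eta-squared = explained sum of squares / total (centered) sum of squares
eta_squared = kWhmod.ess / kWhmod.centered_tss
print('eta-squared: %.3f' % eta_squared)  # matches the R-squared of 0.273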
The Tukey post-hoc comparison revealed significant differences between the clusters on RELECTRICPERPERSON, with the exception of clusters 1 and 2, which did not differ significantly. See Figure 4 below.
Figure 4: Tukey Multiple Comparison of Means (Training Dataset)
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=============================================================
group1   group2     meandiff        lower        upper  reject
-------------------------------------------------------------
     0        1   -1364.0482   -1890.6291    -837.4672    True
     0        2   -1542.8442   -2235.9471    -849.7413    True
     1        2    -178.7960    -850.6163     493.0242   False
-------------------------------------------------------------
Countries in cluster 0 have the highest per person electricity consumption globally (mean = 1605.934133, s.d. = 1780.616595), while countries in cluster 2 have the lowest (mean = 63.089933, s.d. = 295.918015).