Introduction
In this week's assignment, the task is to use one of the provided datasets to generate and interpret a correlation coefficient between two quantitative variables. I will use the gapminder dataset and calculate the correlation coefficient between "incomeperperson" and "relectricperperson".
Code
import pandas
import numpy
import seaborn
import scipy
import os
import matplotlib.pyplot as plt
# define method to load data of interest
def load_data(data_dir, csv_file):
    """Load a CSV file located in a directory under the working directory.

    Args:
        data_dir: Directory containing the file. It is joined onto
            ``os.getcwd()``; an absolute path is used as-is, because
            ``os.path.join`` discards earlier components in that case.
        csv_file: Name of the CSV file inside ``data_dir``.

    Returns:
        A pandas DataFrame holding the contents of the CSV file.
    """
    # The loading logic is deliberately NOT wrapped in an
    # ``if __name__ == "__main__"`` guard: with such a guard inside the
    # function body, the function silently returned None whenever this
    # module was imported instead of being run as a script.
    data_file = os.path.join(os.getcwd(), data_dir, csv_file)
    return pandas.read_csv(data_file, low_memory=False)
# Print floats in fixed-point notation when DataFrames are displayed
pandas.set_option('display.float_format', lambda x: '%f' % x)
# Set pandas to display all columns and rows in DataFrame
pandas.set_option('display.max_rows', None)
pandas.set_option('display.max_columns', None)

# loading data
data = load_data('data', 'gapminder.csv')
print(data)

# Extract the two variables of interest. The explicit .copy() makes
# reg_data an independent frame, so the column assignments below do not
# write into a slice of `data` (avoids SettingWithCopyWarning).
reg_data = data[['incomeperperson', 'relectricperperson']].copy()
print(reg_data)

# Convert the variables of interest to numeric. errors='coerce' turns every
# non-numeric entry (including blank strings) into NaN, so no separate
# replacement of ' ' with NaN is needed afterwards.
for column in ('incomeperperson', 'relectricperperson'):
    reg_data[column] = pandas.to_numeric(reg_data[column], errors='coerce')

# Scatterplot with a fitted regression line
scat1 = seaborn.regplot(x='incomeperperson', y='relectricperperson',
                        fit_reg=True, data=reg_data)
plt.xlabel('Income Per Person (constant 2008 USD)')
plt.ylabel('Per Person Electric Consumption (kWh)')
# Adjacent string literals are concatenated; unlike a backslash continuation
# inside the literal, this keeps exactly one space between the two halves.
plt.title('Scatterplot for the Association Between income and '
          'electricity consumption globally')

# Remove all NAs from data; pearsonr needs complete pairs of observations
data_clean = reg_data.dropna()
print('association between income and electricity consumption')
# Import the submodule explicitly: a bare `import scipy` is not guaranteed
# to make `scipy.stats` available as an attribute.
import scipy.stats
r, p = scipy.stats.pearsonr(data_clean['incomeperperson'],
                            data_clean['relectricperperson'])
print('(r = %f, p-value = %s)' % (r, p))
print('r2 = %f' % r**2)
import numpy
import seaborn
import scipy
import os
import matplotlib.pyplot as plt
# define method to load data of interest
def load_data(data_dir, csv_file):
    """Load a CSV file located in a directory under the working directory.

    Args:
        data_dir: Directory containing the file. It is joined onto
            ``os.getcwd()``; an absolute path is used as-is, because
            ``os.path.join`` discards earlier components in that case.
        csv_file: Name of the CSV file inside ``data_dir``.

    Returns:
        A pandas DataFrame holding the contents of the CSV file.
    """
    # The loading logic is deliberately NOT wrapped in an
    # ``if __name__ == "__main__"`` guard: with such a guard inside the
    # function body, the function silently returned None whenever this
    # module was imported instead of being run as a script.
    data_file = os.path.join(os.getcwd(), data_dir, csv_file)
    return pandas.read_csv(data_file, low_memory=False)
# Print floats in fixed-point notation when DataFrames are displayed
pandas.set_option('display.float_format', lambda x: '%f' % x)
# Set pandas to display all columns and rows in DataFrame
pandas.set_option('display.max_rows', None)
pandas.set_option('display.max_columns', None)

# loading data
data = load_data('data', 'gapminder.csv')
print(data)

# Extract the two variables of interest. The explicit .copy() makes
# reg_data an independent frame, so the column assignments below do not
# write into a slice of `data` (avoids SettingWithCopyWarning).
reg_data = data[['incomeperperson', 'relectricperperson']].copy()
print(reg_data)

# Convert the variables of interest to numeric. errors='coerce' turns every
# non-numeric entry (including blank strings) into NaN, so no separate
# replacement of ' ' with NaN is needed afterwards.
for column in ('incomeperperson', 'relectricperperson'):
    reg_data[column] = pandas.to_numeric(reg_data[column], errors='coerce')

# Scatterplot with a fitted regression line
scat1 = seaborn.regplot(x='incomeperperson', y='relectricperperson',
                        fit_reg=True, data=reg_data)
plt.xlabel('Income Per Person (constant 2008 USD)')
plt.ylabel('Per Person Electric Consumption (kWh)')
# Adjacent string literals are concatenated; unlike a backslash continuation
# inside the literal, this keeps exactly one space between the two halves.
plt.title('Scatterplot for the Association Between income and '
          'electricity consumption globally')
Scatter Plot
# Remove all NAs from data; pearsonr needs complete pairs of observations
data_clean = reg_data.dropna()
print('association between income and electricity consumption')
# Import the submodule explicitly: a bare `import scipy` is not guaranteed
# to make `scipy.stats` available as an attribute.
import scipy.stats
r, p = scipy.stats.pearsonr(data_clean['incomeperperson'],
                            data_clean['relectricperperson'])
# r is the Pearson correlation coefficient, p the two-sided p-value
print('(r = %f, p-value = %s)' % (r, p))
# r squared: fraction of variance in one variable explained by the other
print('r2 = %f' % r**2)
Correlation Coefficient
(r = 0.651637, p-value = 4.63071717359e-17)
r2 = 0.424631
Conclusion
The correlation coefficient is positive and moderately strong (r = 0.65), and it is statistically significant (p-value < 0.05). This shows that it is highly unlikely that a relationship of this magnitude between income and electricity use globally is due to chance alone.
An r-squared of 0.42 suggests that, if we know the income per person, we can predict about 42% of the variability we observe in per-person electricity consumption globally.


No comments:
Post a Comment