from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Predefined functions; they should look similar to the functions you've coded in assignments!
def standard_units(arr):
    """Convert an array of numbers to standard units (z-scores).

    Each value is expressed as the number of standard deviations it
    lies above (positive) or below (negative) the mean of the array.
    """
    mean = np.average(arr)
    sd = np.std(arr)
    return (arr - mean) / sd
def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the mean of the products of the two columns once both have been
    converted to standard units.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)
def slope(t, x, y):
    """Return the slope of the regression line predicting column y from column x.

    Uses the standard formula: slope = r * (SD of y) / (SD of x).
    """
    return correlation(t, x, y) * np.std(t.column(y)) / np.std(t.column(x))
def intercept(t, x, y):
    """Return the intercept of the regression line predicting column y from column x.

    The regression line passes through the point of averages, so the
    intercept is mean(y) - slope * mean(x).
    """
    return np.mean(t.column(y)) - slope(t, x, y) * np.mean(t.column(x))
def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values."""
    m = slope(t, x, y)
    c = intercept(t, x, y)
    # Evaluate the fitted line y = m*x + c at every x in the column.
    return m * t.column(x) + c
def residuals(t, x, y):
    """Return an array of all the residuals: actual y minus regression estimate."""
    return t.column(y) - fitted_values(t, x, y)
# Ignore this code; it produces plots for demonstrating the regression model
def draw_and_compare(true_slope, true_int, sample_size):
    """Draw a random sample from the linear model and plot four comparison views.

    Generates sample_size points from y = true_slope*x + true_int plus
    normal(0, 6) noise, with x drawn from normal(50, 5), then produces
    four scatter plots comparing the true line with the fitted one.
    """
    sampled_x = np.random.normal(50, 5, sample_size)
    x_range = np.array([np.min(sampled_x), np.max(sampled_x)])
    noise = np.random.normal(0, 6, sample_size)
    sampled_y = (true_slope * sampled_x + true_int) + noise
    sample = Table().with_columns('x', sampled_x, 'y', sampled_y)

    # View 1: the generated points together with the true line.
    sample.scatter('x', 'y')
    plots.plot(x_range, true_slope * x_range + true_int, lw=2, color='green')
    plots.title('True Line, and Points Created')

    # View 2: only the data an observer would actually see.
    sample.scatter('x', 'y')
    plots.title('What We Get to See')

    # View 3: the regression line fitted to the sample.
    sample.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    # View 4: fitted regression line and true line overlaid.
    sample.scatter('x', 'y', fit_line=True)
    plots.plot(x_range, true_slope * x_range + true_int, lw=2, color='green')
    plots.title("Regression Line and True Line")
# Demonstrate the model twice with a small sample, then once with a larger one.
for sample_size in (10, 10, 100):
    draw_and_compare(2, -5, sample_size)