from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better) so I am using the plotly.express library. We won't test you on this but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we will explore the use of optimization to find the "best" model and derive least-squares regression.
In the last lecture we developed the equations of slope and intercept using the correlation coefficient $r$.
def standard_units(x):
"""Converts an array x to standard units"""
return (x - np.mean(x)) / np.std(x)
def correlation(t, x, y):
"""Computes the correlation between columns x and y"""
x_su = standard_units(t.column(x))
y_su = standard_units(t.column(y))
return np.mean(x_su * y_su)
def slope(t, x, y):
"""Computes the slope of the regression line"""
r = correlation(t, x, y)
y_sd = np.std(t.column(y))
x_sd = np.std(t.column(x))
return r * y_sd / x_sd
def intercept(t, x, y):
"""Computes the intercept of the regression line"""
x_mean = np.mean(t.column(x))
y_mean = np.mean(t.column(y))
return y_mean - slope(t, x, y)*x_mean
def predict_linear(t, x, y):
"""Return an array of the regressions estimates at all the x values"""
pred_y = slope(t, x, y) * t.column(x) + intercept(t, x, y)
return pred_y
We can now compute predictions, but how good are they? How do we know that we have a good linear fit? To study this we will consider a new dataset.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)
| State | District | Median Income | Percent voting for Clinton | College% |
|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 |
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 |
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 |
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 |
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 |
... (430 rows omitted)
px.scatter(demographics.to_df(),
x="College%",
y="Median Income",
color="State")
correlation(demographics, 'College%', 'Median Income')
0.81846485171413352
Discussion Question: Any concerns about the correlation computation being done here?
Here we will try to predict the income for each district as a function of the percent of college educated people.
demo_slope = slope(demographics, 'College%', 'Median Income')
demo_intercept = intercept(demographics, 'College%', 'Median Income')
print("Slope:", demo_slope)
print("Intercept:", demo_intercept)
Slope: 1270.70168946 Intercept: 20802.5777667
Make the actual predictions.
demographics = demographics.with_column(
"Linear Prediction", predict_linear(demographics, 'College%', 'Median Income')
)
demographics
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction |
|---|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 |
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 |
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 |
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 |
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 |
| Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 |
| Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 |
| Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 |
| Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 |
| Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 |
... (425 rows omitted)
Visualizing the predictions:
demographics.iscatter('College%', ["Median Income", "Linear Prediction"])
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest,
y=demo_slope * xtest + demo_intercept,
name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig
Is this a good fit? Is it the best fit?
The error is the difference between the actual and predicted value:
$$ \text{error} = y - y_\text{predicted} $$In a future lecture, we will refer to this error as the residual.
y = demographics.column('Median Income')
predicted = predict_linear(demographics, 'College%', 'Median Income')
errors = y - predicted
demographics = demographics.with_column('Error', errors)
demographics
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error |
|---|---|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 | -4216.42 |
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 | -6468.87 |
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 | -3230.58 |
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 | -1294.51 |
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 | -7614.84 |
| Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 | -6024.33 |
| Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 | -10790.2 |
| Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 | 18024.7 |
| Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 | -1397.77 |
| Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 | -14934.4 |
... (425 rows omitted)
What are the districts with the largest error values?
(demographics
.with_column("Abs Error", np.abs(demographics.column("Error")))
.sort("Abs Error", descending=True)
)
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error | Abs Error |
|---|---|---|---|---|---|---|---|
| New York | Congressional District 2 (115th Congress), New York | 93362 | 43.9 | 31.2 | 60448.5 | 32913.5 | 32913.5 |
| Maryland | Congressional District 5 (115th Congress), Maryland | 95442 | 64.1 | 33.7 | 63625.2 | 31816.8 | 31816.8 |
| Virginia | Congressional District 10 (115th Congress), Virginia | 120384 | 52.2 | 54.3 | 89801.7 | 30582.3 | 30582.3 |
| California | Congressional District 15 (115th Congress), California | 106291 | 69.9 | 45 | 77984.2 | 28306.8 | 28306.8 |
| California | Congressional District 17 (115th Congress), California | 121150 | 73.9 | 57 | 93232.6 | 27917.4 | 27917.4 |
| California | Congressional District 19 (115th Congress), California | 93139 | 72.9 | 35.5 | 65912.5 | 27226.5 | 27226.5 |
| California | Congressional District 18 (115th Congress), California | 125790 | 73.4 | 61.5 | 98950.7 | 26839.3 | 26839.3 |
| New York | Congressional District 1 (115th Congress), New York | 90435 | 42.2 | 34.1 | 64133.5 | 26301.5 | 26301.5 |
| Pennsylvania | Congressional District 2 (115th Congress), Pennsylvania | 41320 | 72.9 | 36.7 | 67437.3 | -26117.3 | 26117.3 |
| Georgia | Congressional District 5 (115th Congress), Georgia | 48269 | 85 | 42 | 74172 | -25903 | 25903 |
... (425 rows omitted)
What would a large error suggest?
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", hover_name="District")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest,
y=demo_slope * xtest + demo_intercept,
name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3),
y=np.ravel(np.vstack([y, predicted, np.nan * predicted]).T),
marker_color="gray", line_width=0.75, name="Errors")
fig
demographics.ihist('Error', bins=50, rug=True)
What is the average error?
np.mean(errors)
6.3560089503211536e-13
Mean Absolute Error
np.mean(np.abs(errors))
7235.9076108913441
Mean Squared Error (MSE)
np.mean(errors ** 2)
88332095.268617392
Root Mean Squared Error (RMSE)
np.sqrt(np.mean(errors ** 2))
9398.5155885712811
Assuming $y$ is income in dollars. What are the units of:
def demographics_rmse(slope, intercept):
predicted = slope * demographics.column("College%") + intercept
actual = demographics.column("Median Income")
errors = predicted - actual
rmse = np.sqrt(np.mean(errors ** 2))
return rmse
The value of our error function for the slope and intercept we derived in last lecture is:
demographics_rmse(demo_slope, demo_intercept)
9398.5155885712811
What if we used a different slope and intercept value:
def visualize_demographics_rmse(slope, intercept):
rmse = demographics_rmse(slope, intercept)
predicted = slope * demographics.column("College%") + intercept
actual = demographics.column("Median Income")
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, y=slope * xtest + intercept,
name = f"{np.round(slope, 2)} x + {np.round(intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3),
y=np.ravel(np.vstack([actual, predicted, np.nan * predicted]).T),
marker_color="gray", line_width=0.75, name="Errors")
fig.update_layout(title=f"RMSE = {np.round(rmse, 2)}")
return fig
visualize_demographics_rmse(demo_slope, demo_intercept)
visualize_demographics_rmse(demo_slope+1000, demo_intercept - 50000)
Varying the Slope and Intercept and Plotting the RMSE
alt_slopes = demo_slope + np.arange(-20, 20)
rmses = []
for new_slope in alt_slopes:
rmses = np.append(rmses, demographics_rmse(new_slope, demo_intercept))
variations = Table().with_columns("Slope", alt_slopes, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Slope", y="RMSE")
fig.add_scatter(x=[demo_slope], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10,
name="Best Slope")
What if we tried to change the intercept value while using the best slope so far?
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 100)
rmses = []
for new_intercept in alt_intercepts:
rmses = np.append(rmses, demographics_rmse(demo_slope, new_intercept))
variations = Table().with_columns("Intercept", alt_intercepts, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Intercept", y="RMSE")
fig.add_scatter(x=[demo_intercept], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10,
name="Best Intercept")
What if we tried changing both the slope and the intercept at the same time?
alt_slopes = demo_slope + np.arange(-100, 100, 1)
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 10)
variations = Table(["Slope", "Intercept", "RMSE"])
for new_slope in alt_slopes:
for new_intercept in alt_intercepts:
rmse = demographics_rmse(new_slope, new_intercept)
variations.append([new_slope, new_intercept, rmse])
variations
go.Figure(data=[
go.Contour(x=variations.column("Slope"), y=variations.column("Intercept"), z=variations.column("RMSE")),
go.Scatter(x=[demo_slope], y=[demo_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))