from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better) so I am using the plotly.express library. We won't test you on this but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we will explore the use of optimization to find the "best" model and derive least-squares regression.
In the last lecture we developed the equations of slope and intercept using the correlation coefficient $r$.
def standard_units(x):
"""Converts an array x to standard units"""
return (x - np.mean(x)) / np.std(x)
def correlation(t, x, y):
"""Computes the correlation between columns x and y"""
x_su = standard_units(t.column(x))
y_su = standard_units(t.column(y))
return np.mean(x_su * y_su)
def slope(t, x, y):
"""Computes the slope of the regression line"""
r = correlation(t, x, y)
y_sd = np.std(t.column(y))
x_sd = np.std(t.column(x))
return r * y_sd / x_sd
def intercept(t, x, y):
"""Computes the intercept of the regression line"""
x_mean = np.mean(t.column(x))
y_mean = np.mean(t.column(y))
return y_mean - slope(t, x, y)*x_mean
def predict_linear(t, x, y):
"""Return an array of the regressions estimates at all the x values"""
pred_y = slope(t, x, y) * t.column(x) + intercept(t, x, y)
return pred_y
We can now compute predictions, but how good are they? How do we know that we have a good linear fit? To study this we will consider a new dataset.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)
State | District | Median Income | Percent voting for Clinton | College% |
---|---|---|---|---|
Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 |
Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 |
Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 |
Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 |
Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 |
... (430 rows omitted)
px.scatter(demographics.to_df(),
x="College%",
y="Median Income",
color="State")
correlation(demographics, 'College%', 'Median Income')
0.81846485171413352
Discussion Question: Any concerns about the correlation computation being done here?
Here we will try to predict the income for each district as a function of the percent of college educated people.
demo_slope = slope(demographics, 'College%', 'Median Income')
demo_intercept = intercept(demographics, 'College%', 'Median Income')
print("Slope:", demo_slope)
print("Intercept:", demo_intercept)
Slope: 1270.70168946 Intercept: 20802.5777667
Make the actual predictions.
demographics = demographics.with_column(
"Linear Prediction", predict_linear(demographics, 'College%', 'Median Income')
)
demographics
State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction |
---|---|---|---|---|---|
Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 |
Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 |
Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 |
Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 |
Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 |
Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 |
Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 |
Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 |
Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 |
Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 |
... (425 rows omitted)
Visualizing the predictions:
demographics.iscatter('College%', ["Median Income", "Linear Prediction"])
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest,
y=demo_slope * xtest + demo_intercept,
name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig
Is this a good fit? Is it the best fit?
The error is the difference between the actual and predicted value:
$$ \text{error} = y - y_\text{predicted} $$In a future lecture, we will refer to this error as the residual.
y = demographics.column('Median Income')
predicted = predict_linear(demographics, 'College%', 'Median Income')
errors = y - predicted
demographics = demographics.with_column('Error', errors)
demographics
State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error |
---|---|---|---|---|---|---|
Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 | -4216.42 |
Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 | -6468.87 |
Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 | -3230.58 |
Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 | -1294.51 |
Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 | -7614.84 |
Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 | -6024.33 |
Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 | -10790.2 |
Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 | 18024.7 |
Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 | -1397.77 |
Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 | -14934.4 |
... (425 rows omitted)
What are the districts with the largest error values?
(demographics
.with_column("Abs Error", np.abs(demographics.column("Error")))
.sort("Abs Error", descending=True)
)
State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error | Abs Error |
---|---|---|---|---|---|---|---|
New York | Congressional District 2 (115th Congress), New York | 93362 | 43.9 | 31.2 | 60448.5 | 32913.5 | 32913.5 |
Maryland | Congressional District 5 (115th Congress), Maryland | 95442 | 64.1 | 33.7 | 63625.2 | 31816.8 | 31816.8 |
Virginia | Congressional District 10 (115th Congress), Virginia | 120384 | 52.2 | 54.3 | 89801.7 | 30582.3 | 30582.3 |
California | Congressional District 15 (115th Congress), California | 106291 | 69.9 | 45 | 77984.2 | 28306.8 | 28306.8 |
California | Congressional District 17 (115th Congress), California | 121150 | 73.9 | 57 | 93232.6 | 27917.4 | 27917.4 |
California | Congressional District 19 (115th Congress), California | 93139 | 72.9 | 35.5 | 65912.5 | 27226.5 | 27226.5 |
California | Congressional District 18 (115th Congress), California | 125790 | 73.4 | 61.5 | 98950.7 | 26839.3 | 26839.3 |
New York | Congressional District 1 (115th Congress), New York | 90435 | 42.2 | 34.1 | 64133.5 | 26301.5 | 26301.5 |
Pennsylvania | Congressional District 2 (115th Congress), Pennsylvania | 41320 | 72.9 | 36.7 | 67437.3 | -26117.3 | 26117.3 |
Georgia | Congressional District 5 (115th Congress), Georgia | 48269 | 85 | 42 | 74172 | -25903 | 25903 |
... (425 rows omitted)
What would a large error suggest?
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", hover_name="District")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest,
y=demo_slope * xtest + demo_intercept,
name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3),
y=np.ravel(np.vstack([y, predicted, np.nan * predicted]).T),
marker_color="gray", line_width=0.75, name="Errors")
fig
demographics.ihist('Error', bins=50, rug=True)
What is the average error?
np.mean(errors)
6.3560089503211536e-13
Mean Absolute Error
np.mean(np.abs(errors))
7235.9076108913441
Mean Squared Error (MSE)
np.mean(errors ** 2)
88332095.268617392
Root Mean Squared Error (RMSE)
np.sqrt(np.mean(errors ** 2))
9398.5155885712811
Assuming $y$ is income in dollars. What are the units of:
def demographics_rmse(slope, intercept):
predicted = slope * demographics.column("College%") + intercept
actual = demographics.column("Median Income")
errors = predicted - actual
rmse = np.sqrt(np.mean(errors ** 2))
return rmse
The value of our error function for the slope and intercept we derived in last lecture is:
demographics_rmse(demo_slope, demo_intercept)
9398.5155885712811
What if we used a different slope and intercept value:
def visualize_demographics_rmse(slope, intercept):
rmse = demographics_rmse(slope, intercept)
predicted = slope * demographics.column("College%") + intercept
actual = demographics.column("Median Income")
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, y=slope * xtest + intercept,
name = f"{np.round(slope, 2)} x + {np.round(intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3),
y=np.ravel(np.vstack([actual, predicted, np.nan * predicted]).T),
marker_color="gray", line_width=0.75, name="Errors")
fig.update_layout(title=f"RMSE = {np.round(rmse, 2)}")
return fig
visualize_demographics_rmse(demo_slope, demo_intercept)
visualize_demographics_rmse(demo_slope+1000, demo_intercept - 50000)
Varying the Slope and Intercept and Plotting the RMSE
alt_slopes = demo_slope + np.arange(-20, 20)
rmses = []
for new_slope in alt_slopes:
rmses = np.append(rmses, demographics_rmse(new_slope, demo_intercept))
variations = Table().with_columns("Slope", alt_slopes, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Slope", y="RMSE")
fig.add_scatter(x=[demo_slope], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10,
name="Best Slope")
What if we tried to change the intercept value while using the best slope so far?
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 100)
rmses = []
for new_intercept in alt_intercepts:
rmses = np.append(rmses, demographics_rmse(demo_slope, new_intercept))
variations = Table().with_columns("Intercept", alt_intercepts, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Intercept", y="RMSE")
fig.add_scatter(x=[demo_intercept], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10,
name="Best Intercept")
What if we tried changing both the slope and the intercept at the same time?
alt_slopes = demo_slope + np.arange(-100, 100, 1)
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 10)
variations = Table(["Slope", "Intercept", "RMSE"])
for new_slope in alt_slopes:
for new_intercept in alt_intercepts:
rmse = demographics_rmse(new_slope, new_intercept)
variations.append([new_slope, new_intercept, rmse])
variations
go.Figure(data=[
go.Contour(x=variations.column("Slope"), y=variations.column("Intercept"), z=variations.column("RMSE")),
go.Scatter(x=[demo_slope], y=[demo_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))
In three dimensions:
go.Figure(data=[
go.Surface(x = alt_slopes, y = alt_intercepts,
z=variations.column("RMSE").reshape(len(alt_slopes), len(alt_intercepts)).T),
go.Scatter3d(x=[demo_slope], y=[demo_intercept], z=[demographics_rmse(demo_slope, demo_intercept)])],
layout=dict(height=1000,
scene_xaxis_title="Slope", scene_yaxis_title="Intercept",
scene_zaxis_title="RMSE"))
If our goal is just to find the parameters of our line that minimize some kind of error, we can use numerical optimization tools. Suppose we wanted to minimize the function:
$$ f(x) = \left(x - 2\right)^2 + 3 $$def f(x):
return ((x-2)**2) + 3
x = np.arange(1, 3, 0.1)
y = f(x)
px.line(x=x, y=y)
We could use the minimize function in the datascience
package. The function returns the arguments to the functions that result in the minimum value:
minimize(f)
1.9999999946252267
print("x_min =", minimize(f))
print("f(x_min) =", f(minimize(f)))
x_min = 1.9999999946252267 f(x_min) = 3.0
fig = px.line(x=x, y=y)
fig.add_scatter(x=[minimize(f)], y=[f(minimize(f))],
name="Minimum", marker_color="red", marker_size=10)
Minimize works for even more complex functions.
$$ f(x) = 2 * \sin(\pi x) + x^3 + x^4 + \sin(10x) $$def complicated_function(x):
return 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 + np.sin(x * 10)
x = np.arange(-1.5, 1.5, 0.01)
y2 = complicated_function(x)
px.line(x=x, y=y2)
We can still use minimize to find the minimum:
x_min = minimize(complicated_function)
print("x_min =", x_min)
print("f(x_min) =", complicated_function(x_min))
x_min = -0.740999069950806 f(x_min) = -2.46205656866
fig = px.line(x=x, y=y2)
fig.add_scatter(x=[x_min],
y=[complicated_function(x_min)],
name="Minimum", marker_color="red", marker_size=10)
We can even minimize multidimensional functions:
$$ \texttt{surface_function(a,b)} = -\frac{\cos\left(\pi \sqrt{(a+0.5)^2 + b^2}\right)}{\sqrt{(a+0.5)^2 + b^2} + 1} $$def surface_function(a, b):
d = np.sqrt( (a+0.5)**2 + b**2 )
return -np.cos(np.pi* d) / (d**2 + 1)
a_min, b_min = minimize(surface_function)
[a_min, b_min]
[-0.49999999726226257, 2.7824593980664285e-11]
Visualizing the surface and the minimum:
xs = np.arange(-1.5, 1.5, 0.01)
ys = np.arange(-1.5, 1.5, 0.01)
x, y = np.meshgrid(xs, ys)
zs = surface_function(x.flatten(), y.flatten())
go.Figure(data=[
go.Surface(x = xs, y = ys,
z=zs.reshape(len(xs), len(ys))),
go.Scatter3d(x=[a_min], y=[b_min], z=[surface_function(a_min, b_min)])
],
layout=dict(height=1000,
scene_xaxis_title="a", scene_yaxis_title="b",
scene_zaxis_title="surface"))
We can use minimize to find the slope and intercept that minimize root mean squared error in our predictions:
minimize(demographics_rmse)
array([ 1270.70168805, 20802.57933807])
How does this compare to the slope and intercept we derived earlier?
[demo_slope, demo_intercept]
[1270.70168946388, 20802.577766677925]
What happens if we minimize the mean squared error instead of the root mean squared error?
def demographics_mse(slope, intercept):
x = demographics.column('College%')
y = demographics.column('Median Income')
estimate = slope*x + intercept
return np.mean(((y - estimate) ** 2))
minimize(demographics_mse)
array([ 1270.70168947, 20802.5777658 ])
What if we wanted to use the absolute error instead?
def demographics_mae(any_slope, any_intercept):
x = demographics.column('College%')
y = demographics.column('Median Income')
estimate = any_slope*x + any_intercept
return np.mean(np.abs(y - estimate))
minimize(demographics_mae)
array([ 1264.71948402, 20001.85902309])
That is different!
mae_slope, mae_intercept = minimize(demographics_mae)
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", color="State")
xtest = np.arange(0, 75, 0.1)
fig.add_scatter(x=xtest,
y=demo_slope * xtest + demo_intercept,
name = f"Least Squares: {np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=xtest,
y=mae_slope * xtest + mae_intercept,
name = f"MAE: {np.round(mae_slope, 2)} x + {np.round(mae_intercept)}")
fig
hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)
vehicle | year | msrp | acceleration | mpg | class |
---|---|---|---|---|---|
Prius (1st Gen) | 1997 | 24509.7 | 7.46 | 41.26 | Compact |
Tino | 2000 | 35355 | 8.2 | 54.1 | Compact |
Prius (2nd Gen) | 2000 | 26832.2 | 7.97 | 45.23 | Compact |
Insight | 2000 | 18936.4 | 9.52 | 53 | Two Seater |
Civic (1st Gen) | 2001 | 25833.4 | 7.04 | 47.04 | Compact |
... (148 rows omitted)
px.scatter_3d(
hybrid.to_df(),
x="mpg", y="acceleration", z="msrp",
hover_name="vehicle",
color="class",
height=800
)
Suppose we use the model:
$$ y = a * \text{acc} + b * \text{mpg} + c $$def hybrid_rmse(a, b, c):
actual = hybrid.column("msrp")
acc = hybrid.column("acceleration")
mpg = hybrid.column("mpg")
predicted = a*acc + b*mpg + c
mse = np.sqrt(np.mean((actual - predicted)**2))
return mse
a, b, c = minimize(hybrid_rmse)
[a, b, c]
[4119.6597287965033, -513.26014324389826, 7636.7196373778297]
print(f"Error: {hybrid_rmse(a, b, c):,}")
Error: 14,687.539382720333
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
for acc in acceleration_range:
pred = a * acc + b * mpg + c
predictions.append([mpg, acc, pred])
fig = px.scatter_3d(
hybrid.to_df(),
x="mpg", y="acceleration", z="msrp",
hover_name="vehicle",
color="class",
height=800
)
fig.add_surface(
x = mpg_range, y = acceleration_range,
z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
showscale=False
)
We could try to improve our predictions by defining a more complex equation:
$$ \text{y} = a * \text{acc} + b *\text{mpg} + c * \text{acc}^2 + d * \text{mpg}^2 + e $$def hybrid_better_rmse(a, b, c, d, e):
actual = hybrid.column("msrp")
acc = hybrid.column("acceleration")
mpg = hybrid.column("mpg")
predicted = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
mse = np.sqrt(np.mean((actual - predicted)**2))
return mse
a, b, c, d, e = minimize(hybrid_better_rmse)
[a, b, c, d, e]
[-8278.7149344607205, -2634.1314044379506, 467.28859068643095, 27.309648987149739, 122783.24092048552]
print(f"Error: {hybrid_better_rmse(a,b,c,d,e):,}")
Error: 13,035.853740271548
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
for acc in acceleration_range:
pred = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
predictions.append([mpg, acc, pred])
fig = px.scatter_3d(
hybrid.to_df(),
x="mpg", y="acceleration", z="msrp",
hover_name="vehicle",
color="class",
height=800
)
fig.add_surface(
x = mpg_range, y = acceleration_range,
z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
showscale=False
)