from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better) so I am using the plotly.express library. We won't test you on this but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we will explore the use of optimization to find the "best" model and derive least-squares regression.
In the last lecture we developed the equations of slope and intercept using the correlation coefficient $r$.
def standard_units(x):
    """Converts an array x to standard units"""
    return (x - np.mean(x)) / np.std(x)
def correlation(t, x, y):
    """Computes the correlation between columns x and y"""
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean
def predict_linear(t, x, y):
    """Return an array of the regressions estimates at all the x values"""
    pred_y = slope(t, x, y) * t.column(x) + intercept(t, x, y)
    return pred_y
We can now compute predictions, but how good are they? How do we know that we have a good linear fit? To study this we will consider a new dataset.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)
| State | District | Median Income | Percent voting for Clinton | College% | 
|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 
... (430 rows omitted)
px.scatter(demographics.to_df(), 
           x="College%", 
           y="Median Income",
           color="State")
correlation(demographics, 'College%', 'Median Income')
0.81846485171413352
Discussion Question: Any concerns about the correlation computation being done here?
Here we will try to predict the income for each district as a function of the percent of college educated people.
demo_slope = slope(demographics, 'College%', 'Median Income')
demo_intercept = intercept(demographics, 'College%', 'Median Income')
print("Slope:", demo_slope)
print("Intercept:", demo_intercept)
Slope: 1270.70168946 Intercept: 20802.5777667
Make the actual predictions.
demographics = demographics.with_column(
    "Linear Prediction", predict_linear(demographics, 'College%', 'Median Income')
)
demographics
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | 
|---|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 | 
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 | 
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 | 
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 | 
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 | 
| Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 | 
| Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 | 
| Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 | 
| Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 | 
| Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 | 
... (425 rows omitted)
Visualizing the predictions:
demographics.iscatter('College%', ["Median Income", "Linear Prediction"])
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig
Is this a good fit? Is it the best fit?
The error is the difference between the actual and predicted value:
$$ \text{error} = y - y_\text{predicted} $$In a future lecture, we will refer to this error as the residual.
y = demographics.column('Median Income')
predicted = predict_linear(demographics, 'College%', 'Median Income')
errors = y - predicted
demographics = demographics.with_column('Error', errors)
demographics
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error | 
|---|---|---|---|---|---|---|
| Alabama | Congressional District 1 (115th Congress), Alabama | 47083 | 34.1 | 24 | 51299.4 | -4216.42 | 
| Alabama | Congressional District 2 (115th Congress), Alabama | 42035 | 33 | 21.8 | 48503.9 | -6468.87 | 
| Alabama | Congressional District 3 (115th Congress), Alabama | 46544 | 32.3 | 22.8 | 49774.6 | -3230.58 | 
| Alabama | Congressional District 4 (115th Congress), Alabama | 41110 | 17.4 | 17 | 42404.5 | -1294.51 | 
| Alabama | Congressional District 5 (115th Congress), Alabama | 51690 | 31.3 | 30.3 | 59304.8 | -7614.84 | 
| Alabama | Congressional District 6 (115th Congress), Alabama | 61413 | 26.1 | 36.7 | 67437.3 | -6024.33 | 
| Alabama | Congressional District 7 (115th Congress), Alabama | 34664 | 69.8 | 19.4 | 45454.2 | -10790.2 | 
| Alaska | Congressional District (at Large) (115th Congress), Alaska | 76440 | 37.6 | 29.6 | 58415.3 | 18024.7 | 
| Arizona | Congressional District 1 (115th Congress), Arizona | 50537 | 46.6 | 24.5 | 51934.8 | -1397.77 | 
| Arizona | Congressional District 2 (115th Congress), Arizona | 49072 | 49.6 | 34 | 64006.4 | -14934.4 | 
... (425 rows omitted)
What are the districts with the largest error values?
(demographics
     .with_column("Abs Error", np.abs(demographics.column("Error")))
     .sort("Abs Error", descending=True)
)
| State | District | Median Income | Percent voting for Clinton | College% | Linear Prediction | Error | Abs Error | 
|---|---|---|---|---|---|---|---|
| New York | Congressional District 2 (115th Congress), New York | 93362 | 43.9 | 31.2 | 60448.5 | 32913.5 | 32913.5 | 
| Maryland | Congressional District 5 (115th Congress), Maryland | 95442 | 64.1 | 33.7 | 63625.2 | 31816.8 | 31816.8 | 
| Virginia | Congressional District 10 (115th Congress), Virginia | 120384 | 52.2 | 54.3 | 89801.7 | 30582.3 | 30582.3 | 
| California | Congressional District 15 (115th Congress), California | 106291 | 69.9 | 45 | 77984.2 | 28306.8 | 28306.8 | 
| California | Congressional District 17 (115th Congress), California | 121150 | 73.9 | 57 | 93232.6 | 27917.4 | 27917.4 | 
| California | Congressional District 19 (115th Congress), California | 93139 | 72.9 | 35.5 | 65912.5 | 27226.5 | 27226.5 | 
| California | Congressional District 18 (115th Congress), California | 125790 | 73.4 | 61.5 | 98950.7 | 26839.3 | 26839.3 | 
| New York | Congressional District 1 (115th Congress), New York | 90435 | 42.2 | 34.1 | 64133.5 | 26301.5 | 26301.5 | 
| Pennsylvania | Congressional District 2 (115th Congress), Pennsylvania | 41320 | 72.9 | 36.7 | 67437.3 | -26117.3 | 26117.3 | 
| Georgia | Congressional District 5 (115th Congress), Georgia | 48269 | 85 | 42 | 74172 | -25903 | 25903 | 
... (425 rows omitted)
What would a large error suggest?
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", hover_name="District")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3), 
                y=np.ravel(np.vstack([y, predicted, np.nan * predicted]).T),
                marker_color="gray", line_width=0.75, name="Errors")
fig
demographics.ihist('Error', bins=50, rug=True)
What is the average error?
np.mean(errors)
6.3560089503211536e-13
Mean Absolute Error
np.mean(np.abs(errors))
7235.9076108913441
Mean Squared Error (MSE)
np.mean(errors ** 2)
88332095.268617392
Root Mean Squared Error (RMSE)
np.sqrt(np.mean(errors ** 2))
9398.5155885712811
Assuming $y$ is income in dollars. What are the units of:
def demographics_rmse(slope, intercept):
    predicted = slope * demographics.column("College%") + intercept 
    actual = demographics.column("Median Income")
    errors = predicted - actual
    rmse = np.sqrt(np.mean(errors ** 2))
    return rmse
The value of our error function for the slope and intercept we derived in last lecture is:
demographics_rmse(demo_slope, demo_intercept)
9398.5155885712811
What if we used a different slope and intercept value:
def visualize_demographics_rmse(slope, intercept):
    rmse = demographics_rmse(slope, intercept)
    predicted = slope * demographics.column("College%") + intercept 
    actual = demographics.column("Median Income")
    fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
    xtest = np.arange(0, 75, 1)
    fig.add_scatter(x=xtest, y=slope * xtest + intercept,
                    name = f"{np.round(slope, 2)} x + {np.round(intercept)}")
    fig.add_scatter(x=demographics.column("College%").repeat(3), 
                    y=np.ravel(np.vstack([actual, predicted, np.nan * predicted]).T),
                    marker_color="gray", line_width=0.75, name="Errors")
    fig.update_layout(title=f"RMSE = {np.round(rmse, 2)}")
    return fig
visualize_demographics_rmse(demo_slope, demo_intercept)
visualize_demographics_rmse(demo_slope+1000, demo_intercept - 50000)
Varying the Slope and Intercept and Plotting the RMSE
alt_slopes = demo_slope + np.arange(-20, 20)
rmses = []
for new_slope in alt_slopes:
    rmses = np.append(rmses, demographics_rmse(new_slope, demo_intercept))
variations = Table().with_columns("Slope", alt_slopes, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Slope", y="RMSE")
fig.add_scatter(x=[demo_slope], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10, 
                name="Best Slope")
What if we tried to change the intercept value while using the best slope so far?
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 100)
rmses = []
for new_intercept in alt_intercepts:
    rmses = np.append(rmses, demographics_rmse(demo_slope, new_intercept))
variations = Table().with_columns("Intercept", alt_intercepts, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Intercept", y="RMSE")
fig.add_scatter(x=[demo_intercept], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10, 
                name="Best Intercept")
What if we tried changing both the slope and the intercept at the same time?
alt_slopes = demo_slope + np.arange(-100, 100, 1)
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 10)
variations = Table(["Slope", "Intercept", "RMSE"])
for new_slope in alt_slopes:
    for new_intercept in alt_intercepts:
        rmse = demographics_rmse(new_slope, new_intercept)
        variations.append([new_slope, new_intercept, rmse])
    
variations
go.Figure(data=[
    go.Contour(x=variations.column("Slope"), y=variations.column("Intercept"), z=variations.column("RMSE")), 
    go.Scatter(x=[demo_slope], y=[demo_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))
In three dimensions:
go.Figure(data=[
    go.Surface(x = alt_slopes, y = alt_intercepts,
               z=variations.column("RMSE").reshape(len(alt_slopes), len(alt_intercepts)).T),
    go.Scatter3d(x=[demo_slope], y=[demo_intercept], z=[demographics_rmse(demo_slope, demo_intercept)])], 
          layout=dict(height=1000, 
                      scene_xaxis_title="Slope", scene_yaxis_title="Intercept", 
                      scene_zaxis_title="RMSE"))
If our goal is just to find the parameters of our line that minimize some kind of error, we can use numerical optimization tools. Suppose we wanted to minimize the function:
$$ f(x) = \left(x - 2\right)^2 + 3 $$def f(x):
    return ((x-2)**2) + 3
x = np.arange(1, 3, 0.1)
y = f(x)
px.line(x=x, y=y)
We could use the minimize function in the datascience package. The function returns the arguments to the functions that result in the minimum value:
minimize(f)
1.9999999946252267
print("x_min =", minimize(f))
print("f(x_min) =", f(minimize(f)))
x_min = 1.9999999946252267 f(x_min) = 3.0
fig = px.line(x=x, y=y)
fig.add_scatter(x=[minimize(f)], y=[f(minimize(f))],
                name="Minimum", marker_color="red", marker_size=10)
Minimize works for even more complex functions.
$$ f(x) = 2 * \sin(\pi x) + x^3 + x^4 + \sin(10x) $$def complicated_function(x):
    return 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 + np.sin(x * 10)
x = np.arange(-1.5, 1.5, 0.01)
y2 = complicated_function(x)
px.line(x=x, y=y2)
We can still use minimize to find the minimum:
x_min = minimize(complicated_function)
print("x_min =", x_min)
print("f(x_min) =", complicated_function(x_min))
x_min = -0.740999069950806 f(x_min) = -2.46205656866
fig = px.line(x=x, y=y2)
fig.add_scatter(x=[x_min],
                y=[complicated_function(x_min)],
                name="Minimum", marker_color="red", marker_size=10)
We can even minimize multidimensional functions:
$$ \texttt{surface_function(a,b)} = -\frac{\cos\left(\pi \sqrt{(a+0.5)^2 + b^2}\right)}{\sqrt{(a+0.5)^2 + b^2} + 1} $$def surface_function(a, b):
    d = np.sqrt( (a+0.5)**2 + b**2 )
    return -np.cos(np.pi* d) / (d**2 + 1)
a_min, b_min = minimize(surface_function)
[a_min, b_min]
[-0.49999999726226257, 2.7824593980664285e-11]
Visualizing the surface and the minimum:
xs = np.arange(-1.5, 1.5, 0.01)
ys = np.arange(-1.5, 1.5, 0.01)
x, y = np.meshgrid(xs, ys)
zs = surface_function(x.flatten(), y.flatten())
go.Figure(data=[
    go.Surface(x = xs, y = ys,
               z=zs.reshape(len(xs), len(ys))),
    go.Scatter3d(x=[a_min], y=[b_min], z=[surface_function(a_min, b_min)])
    ], 
    layout=dict(height=1000, 
                scene_xaxis_title="a", scene_yaxis_title="b", 
                scene_zaxis_title="surface"))
We can use minimize to find the slope and intercept that minimize root mean squared error in our predictions:
minimize(demographics_rmse)
array([ 1270.70168805, 20802.57933807])
How does this compare to the slope and intercept we derived earlier?
[demo_slope, demo_intercept]
[1270.70168946388, 20802.577766677925]
What happens if we minimize the mean squared error instead of the root mean squared error?
def demographics_mse(slope, intercept):
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    estimate = slope*x + intercept
    return np.mean(((y - estimate) ** 2))
minimize(demographics_mse)
array([ 1270.70168947, 20802.5777658 ])
What if we wanted to use the absolute error instead?
def demographics_mae(any_slope, any_intercept):
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    estimate = any_slope*x + any_intercept
    return np.mean(np.abs(y - estimate))
minimize(demographics_mae)
array([ 1264.71948402, 20001.85902309])
That is different!
mae_slope, mae_intercept = minimize(demographics_mae)
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", color="State")
xtest = np.arange(0, 75, 0.1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"Least Squares: {np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=xtest, 
                y=mae_slope * xtest + mae_intercept,
                name = f"MAE: {np.round(mae_slope, 2)} x + {np.round(mae_intercept)}")
fig
hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)
| vehicle | year | msrp | acceleration | mpg | class | 
|---|---|---|---|---|---|
| Prius (1st Gen) | 1997 | 24509.7 | 7.46 | 41.26 | Compact | 
| Tino | 2000 | 35355 | 8.2 | 54.1 | Compact | 
| Prius (2nd Gen) | 2000 | 26832.2 | 7.97 | 45.23 | Compact | 
| Insight | 2000 | 18936.4 | 9.52 | 53 | Two Seater | 
| Civic (1st Gen) | 2001 | 25833.4 | 7.04 | 47.04 | Compact | 
... (148 rows omitted)
px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
Suppose we use the model:
$$ y = a * \text{acc} + b * \text{mpg} + c $$def hybrid_rmse(a, b, c):
    actual = hybrid.column("msrp")
    acc = hybrid.column("acceleration")
    mpg = hybrid.column("mpg")
    predicted = a*acc + b*mpg + c
    mse = np.sqrt(np.mean((actual - predicted)**2))
    return mse
a, b, c = minimize(hybrid_rmse)
[a, b, c]
[4119.6597287965033, -513.26014324389826, 7636.7196373778297]
print(f"Error: {hybrid_rmse(a, b, c):,}")
Error: 14,687.539382720333
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
    for acc in acceleration_range: 
        pred = a * acc + b * mpg + c
        predictions.append([mpg, acc, pred])
fig = px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)
We could try to improve our predictions by defining a more complex equation:
$$ \text{y} = a * \text{acc} + b *\text{mpg} + c * \text{acc}^2 + d * \text{mpg}^2 + e $$def hybrid_better_rmse(a, b, c, d, e):
    actual = hybrid.column("msrp")
    acc = hybrid.column("acceleration")
    mpg = hybrid.column("mpg")
    predicted = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
    mse = np.sqrt(np.mean((actual - predicted)**2))
    return mse
a, b, c, d, e = minimize(hybrid_better_rmse)
[a, b, c, d, e]
[-8278.7149344607205, -2634.1314044379506, 467.28859068643095, 27.309648987149739, 122783.24092048552]
print(f"Error: {hybrid_better_rmse(a,b,c,d,e):,}")
Error: 13,035.853740271548
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
    for acc in acceleration_range: 
        pred = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
        predictions.append([mpg, acc, pred])
        
fig = px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)