from datascience import *
import numpy as np


import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


def standard_units(x):
    """Converts an array x to standard units"""
    return (x - np.mean(x)) / np.std(x)


def correlation(t, x, y):
    """Computes the correlation between columns x and y"""
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)


def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd


def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean


def predict_linear(t, x, y):
    """Return an array of the regressions estimates at all the x values"""
    pred_y = slope(t, x, y) * t.column(x) + intercept(t, x, y)
    return pred_y


demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)


px.scatter(demographics.to_df(), 
           x="College%", 
           y="Median Income",
           color="State")


correlation(demographics, 'College%', 'Median Income')

0.81846485171413352


demo_slope = slope(demographics, 'College%', 'Median Income')
demo_intercept = intercept(demographics, 'College%', 'Median Income')
print("Slope:", demo_slope)
print("Intercept:", demo_intercept)

Slope: 1270.70168946
Intercept: 20802.5777667


demographics = demographics.with_column(
    "Linear Prediction", predict_linear(demographics, 'College%', 'Median Income')
)
demographics


demographics.iscatter('College%', ["Median Income", "Linear Prediction"])


fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig


y = demographics.column('Median Income')
predicted = predict_linear(demographics, 'College%', 'Median Income')

errors = y - predicted


demographics = demographics.with_column('Error', errors)
demographics


(demographics
     .with_column("Abs Error", np.abs(demographics.column("Error")))
     .sort("Abs Error", descending=True)
)


fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", hover_name="District")
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=demographics.column("College%").repeat(3), 
                y=np.ravel(np.vstack([y, predicted, np.nan * predicted]).T),
                marker_color="gray", line_width=0.75, name="Errors")
fig


demographics.ihist('Error', bins=50, rug=True)


np.mean(errors)

6.3560089503211536e-13


np.mean(np.abs(errors))

7235.9076108913441


np.mean(errors ** 2)

88332095.268617392


np.sqrt(np.mean(errors ** 2))

9398.5155885712811


def demographics_rmse(slope, intercept):
    predicted = slope * demographics.column("College%") + intercept 
    actual = demographics.column("Median Income")
    errors = predicted - actual
    rmse = np.sqrt(np.mean(errors ** 2))
    return rmse


demographics_rmse(demo_slope, demo_intercept)

9398.5155885712811


def visualize_demographics_rmse(slope, intercept):
    rmse = demographics_rmse(slope, intercept)
    predicted = slope * demographics.column("College%") + intercept 
    actual = demographics.column("Median Income")
    fig = px.scatter(demographics.to_df(), x="College%", y="Median Income")
    xtest = np.arange(0, 75, 1)
    fig.add_scatter(x=xtest, y=slope * xtest + intercept,
                    name = f"{np.round(slope, 2)} x + {np.round(intercept)}")
    fig.add_scatter(x=demographics.column("College%").repeat(3), 
                    y=np.ravel(np.vstack([actual, predicted, np.nan * predicted]).T),
                    marker_color="gray", line_width=0.75, name="Errors")
    fig.update_layout(title=f"RMSE = {np.round(rmse, 2)}")
    return fig


visualize_demographics_rmse(demo_slope, demo_intercept)


visualize_demographics_rmse(demo_slope+1000, demo_intercept - 50000)


alt_slopes = demo_slope + np.arange(-20, 20)
rmses = []
for new_slope in alt_slopes:
    rmses = np.append(rmses, demographics_rmse(new_slope, demo_intercept))

variations = Table().with_columns("Slope", alt_slopes, "RMSE", rmses)


fig = px.scatter(variations.to_df(), x="Slope", y="RMSE")
fig.add_scatter(x=[demo_slope], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10, 
                name="Best Slope")


alt_intercepts = demo_intercept + np.arange(-2000, 2000, 100)
rmses = []
for new_intercept in alt_intercepts:
    rmses = np.append(rmses, demographics_rmse(demo_slope, new_intercept))

variations = Table().with_columns("Intercept", alt_intercepts, "RMSE", rmses)
fig = px.scatter(variations.to_df(), x="Intercept", y="RMSE")
fig.add_scatter(x=[demo_intercept], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10, 
                name="Best Intercept")


alt_slopes = demo_slope + np.arange(-100, 100, 1)
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 10)
variations = Table(["Slope", "Intercept", "RMSE"])
for new_slope in alt_slopes:
    for new_intercept in alt_intercepts:
        rmse = demographics_rmse(new_slope, new_intercept)
        variations.append([new_slope, new_intercept, rmse])
    
variations
go.Figure(data=[
    go.Contour(x=variations.column("Slope"), y=variations.column("Intercept"), z=variations.column("RMSE")), 
    go.Scatter(x=[demo_slope], y=[demo_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))


go.Figure(data=[
    go.Surface(x = alt_slopes, y = alt_intercepts,
               z=variations.column("RMSE").reshape(len(alt_slopes), len(alt_intercepts)).T),
    go.Scatter3d(x=[demo_slope], y=[demo_intercept], z=[demographics_rmse(demo_slope, demo_intercept)])], 
          layout=dict(height=1000, 
                      scene_xaxis_title="Slope", scene_yaxis_title="Intercept", 
                      scene_zaxis_title="RMSE"))


def f(x):
    return ((x-2)**2) + 3


x = np.arange(1, 3, 0.1)
y = f(x)
px.line(x=x, y=y)


minimize(f)

1.9999999946252267


print("x_min =", minimize(f))
print("f(x_min) =", f(minimize(f)))

x_min = 1.9999999946252267
f(x_min) = 3.0


fig = px.line(x=x, y=y)
fig.add_scatter(x=[minimize(f)], y=[f(minimize(f))],
                name="Minimum", marker_color="red", marker_size=10)


def complicated_function(x):
    return 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 + np.sin(x * 10)


x = np.arange(-1.5, 1.5, 0.01)
y2 = complicated_function(x)
px.line(x=x, y=y2)


x_min = minimize(complicated_function)
print("x_min =", x_min)
print("f(x_min) =", complicated_function(x_min))

x_min = -0.740999069950806
f(x_min) = -2.46205656866


fig = px.line(x=x, y=y2)
fig.add_scatter(x=[x_min],
                y=[complicated_function(x_min)],
                name="Minimum", marker_color="red", marker_size=10)


def surface_function(a, b):
    d = np.sqrt( (a+0.5)**2 + b**2 )
    return -np.cos(np.pi* d) / (d**2 + 1)


a_min, b_min = minimize(surface_function)
[a_min, b_min]

[-0.49999999726226257, 2.7824593980664285e-11]


xs = np.arange(-1.5, 1.5, 0.01)
ys = np.arange(-1.5, 1.5, 0.01)
x, y = np.meshgrid(xs, ys)
zs = surface_function(x.flatten(), y.flatten())
go.Figure(data=[
    go.Surface(x = xs, y = ys,
               z=zs.reshape(len(xs), len(ys))),
    go.Scatter3d(x=[a_min], y=[b_min], z=[surface_function(a_min, b_min)])
    ], 
    layout=dict(height=1000, 
                scene_xaxis_title="a", scene_yaxis_title="b", 
                scene_zaxis_title="surface"))


minimize(demographics_rmse)

array([  1270.70168805,  20802.57933807])


[demo_slope, demo_intercept]

[1270.70168946388, 20802.577766677925]


def demographics_mse(slope, intercept):
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    estimate = slope*x + intercept
    return np.mean(((y - estimate) ** 2))


minimize(demographics_mse)

array([  1270.70168947,  20802.5777658 ])


def demographics_mae(any_slope, any_intercept):
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    estimate = any_slope*x + any_intercept
    return np.mean(np.abs(y - estimate))


minimize(demographics_mae)

array([  1264.71948402,  20001.85902309])


mae_slope, mae_intercept = minimize(demographics_mae)
fig = px.scatter(demographics.to_df(), x="College%", y="Median Income", color="State")
xtest = np.arange(0, 75, 0.1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"Least Squares: {np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=xtest, 
                y=mae_slope * xtest + mae_intercept,
                name = f"MAE: {np.round(mae_slope, 2)} x + {np.round(mae_intercept)}")
fig


hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)


px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)


def hybrid_rmse(a, b, c):
    actual = hybrid.column("msrp")
    acc = hybrid.column("acceleration")
    mpg = hybrid.column("mpg")
    predicted = a*acc + b*mpg + c
    mse = np.sqrt(np.mean((actual - predicted)**2))
    return mse


a, b, c = minimize(hybrid_rmse)
[a, b, c]

[4119.6597287965033, -513.26014324389826, 7636.7196373778297]


print(f"Error: {hybrid_rmse(a, b, c):,}")

Error: 14,687.539382720333


mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
    for acc in acceleration_range: 
        pred = a * acc + b * mpg + c
        predictions.append([mpg, acc, pred])


fig = px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)


def hybrid_better_rmse(a, b, c, d, e):
    actual = hybrid.column("msrp")
    acc = hybrid.column("acceleration")
    mpg = hybrid.column("mpg")
    predicted = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
    mse = np.sqrt(np.mean((actual - predicted)**2))
    return mse


a, b, c, d, e = minimize(hybrid_better_rmse)
[a, b, c, d, e]

[-8278.7149344607205,
 -2634.1314044379506,
 467.28859068643095,
 27.309648987149739,
 122783.24092048552]


print(f"Error: {hybrid_better_rmse(a,b,c,d,e):,}")

Error: 13,035.853740271548


mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
predictions = Table(["mpg", "acc", "pred"])
for mpg in mpg_range:
    for acc in acceleration_range: 
        pred = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
        predictions.append([mpg, acc, pred])
        
fig = px.scatter_3d(
    hybrid.to_df(), 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions.column("pred").reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)

State	District	Median Income	Percent voting for Clinton	College%
Alabama	Congressional District 1 (115th Congress), Alabama	47083	34.1	24
Alabama	Congressional District 2 (115th Congress), Alabama	42035	33	21.8
Alabama	Congressional District 3 (115th Congress), Alabama	46544	32.3	22.8
Alabama	Congressional District 4 (115th Congress), Alabama	41110	17.4	17
Alabama	Congressional District 5 (115th Congress), Alabama	51690	31.3	30.3

State	District	Median Income	Percent voting for Clinton	College%	Linear Prediction
Alabama	Congressional District 1 (115th Congress), Alabama	47083	34.1	24	51299.4
Alabama	Congressional District 2 (115th Congress), Alabama	42035	33	21.8	48503.9
Alabama	Congressional District 3 (115th Congress), Alabama	46544	32.3	22.8	49774.6
Alabama	Congressional District 4 (115th Congress), Alabama	41110	17.4	17	42404.5
Alabama	Congressional District 5 (115th Congress), Alabama	51690	31.3	30.3	59304.8
Alabama	Congressional District 6 (115th Congress), Alabama	61413	26.1	36.7	67437.3
Alabama	Congressional District 7 (115th Congress), Alabama	34664	69.8	19.4	45454.2
Alaska	Congressional District (at Large) (115th Congress), Alaska	76440	37.6	29.6	58415.3
Arizona	Congressional District 1 (115th Congress), Arizona	50537	46.6	24.5	51934.8
Arizona	Congressional District 2 (115th Congress), Arizona	49072	49.6	34	64006.4

State	District	Median Income	Percent voting for Clinton	College%	Linear Prediction	Error
Alabama	Congressional District 1 (115th Congress), Alabama	47083	34.1	24	51299.4	-4216.42
Alabama	Congressional District 2 (115th Congress), Alabama	42035	33	21.8	48503.9	-6468.87
Alabama	Congressional District 3 (115th Congress), Alabama	46544	32.3	22.8	49774.6	-3230.58
Alabama	Congressional District 4 (115th Congress), Alabama	41110	17.4	17	42404.5	-1294.51
Alabama	Congressional District 5 (115th Congress), Alabama	51690	31.3	30.3	59304.8	-7614.84
Alabama	Congressional District 6 (115th Congress), Alabama	61413	26.1	36.7	67437.3	-6024.33
Alabama	Congressional District 7 (115th Congress), Alabama	34664	69.8	19.4	45454.2	-10790.2
Alaska	Congressional District (at Large) (115th Congress), Alaska	76440	37.6	29.6	58415.3	18024.7
Arizona	Congressional District 1 (115th Congress), Arizona	50537	46.6	24.5	51934.8	-1397.77
Arizona	Congressional District 2 (115th Congress), Arizona	49072	49.6	34	64006.4	-14934.4

State	District	Median Income	Percent voting for Clinton	College%	Linear Prediction	Error	Abs Error
New York	Congressional District 2 (115th Congress), New York	93362	43.9	31.2	60448.5	32913.5	32913.5
Maryland	Congressional District 5 (115th Congress), Maryland	95442	64.1	33.7	63625.2	31816.8	31816.8
Virginia	Congressional District 10 (115th Congress), Virginia	120384	52.2	54.3	89801.7	30582.3	30582.3
California	Congressional District 15 (115th Congress), California	106291	69.9	45	77984.2	28306.8	28306.8
California	Congressional District 17 (115th Congress), California	121150	73.9	57	93232.6	27917.4	27917.4
California	Congressional District 19 (115th Congress), California	93139	72.9	35.5	65912.5	27226.5	27226.5
California	Congressional District 18 (115th Congress), California	125790	73.4	61.5	98950.7	26839.3	26839.3
New York	Congressional District 1 (115th Congress), New York	90435	42.2	34.1	64133.5	26301.5	26301.5
Pennsylvania	Congressional District 2 (115th Congress), Pennsylvania	41320	72.9	36.7	67437.3	-26117.3	26117.3
Georgia	Congressional District 5 (115th Congress), Georgia	48269	85	42	74172	-25903	25903

vehicle	year	msrp	acceleration	mpg	class
Prius (1st Gen)	1997	24509.7	7.46	41.26	Compact
Tino	2000	35355	8.2	54.1	Compact
Prius (2nd Gen)	2000	26832.2	7.97	45.23	Compact
Insight	2000	18936.4	9.52	53	Two Seater
Civic (1st Gen)	2001	25833.4	7.04	47.04	Compact

Lecture 31¶

Review From Last Lecture¶

Standard Units¶

Correlation¶

Slope and Intercept¶

Linear Prediction¶

Making Predictions with Linear Regression¶

Making Predictions¶

Computing the Error¶

Visualizing the Errors¶

Summarizing the Overall Error¶

Discussion Question¶

Error as Function of our Model (Line)¶

Numerical Optimization¶

Minimizing RMSE¶

Multiple Linear Regression¶

Fitting Non-linear Data¶