from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better), so I am using the plotly.express library. We won't test you on this, but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
Recall that, long ago, in lecture 10 we built a function to predict child heights. We started with Galton's height dataset, which contains the full-grown height of each child and the heights of both of their parents. We then computed the average height of each child's parents.
The following is a simplified version of the data, containing just the parents' average height and the child's height.
# Note: Child heights are the **adult** heights of children in a family
families = Table.read_table('family_heights.csv')
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns(
'Parent Average', parent_avgs,
'Child', families.column('child'),
)
heights
Parent Average | Child |
---|---|
72.75 | 73.2 |
72.75 | 69.2 |
72.75 | 69 |
72.75 | 69 |
71 | 73.5 |
71 | 72.5 |
71 | 65.5 |
71 | 65.5 |
69.5 | 71 |
69.5 | 68 |
... (924 rows omitted)
What was the relationship between the height of the full-grown child and the heights of the parents?
heights.iscatter('Parent Average', 'Child')
Could we use this data to help us predict the height of a newborn child given the parent's height?
In lecture 10, we actually developed a highly sophisticated process for predicting the height of a child given the average height of both their parents. We looked at children of parents with similar heights in our data and then took the average height of those nearby children.
def nearest_neighbor_predictor(parent_average, window=0.5):
    lower_bound = parent_average - window
    upper_bound = parent_average + window
    similar_child_heights = (
        heights
        .where("Parent Average", are.between(lower_bound, upper_bound))
        .column("Child")
    )
    return np.mean(similar_child_heights)
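The same windowed-average idea can be sketched with plain NumPy, independent of the datascience Table API. The arrays below are made-up illustrative values, not Galton's data; also note that `are.between` uses a half-open interval, while this sketch uses a closed one.

```python
import numpy as np

def nn_predict(parent_avgs, child_heights, query, window=0.5):
    """Average the child heights whose parent average is within +/- window of query."""
    mask = np.abs(parent_avgs - query) <= window
    return np.mean(child_heights[mask])

parents = np.array([68.0, 68.7, 69.0, 69.3, 71.0])   # made-up parent averages
children = np.array([67.0, 68.5, 69.5, 70.0, 72.0])  # made-up child heights
print(nn_predict(parents, children, 69.0))  # mean of 68.5, 69.5, 70.0
```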
my_height = 5*12 + 11 # 5 ft 11 inches
spouse_height = 5*12 + 7 # 5 ft 7 inches
our_average = (my_height + spouse_height) / 2.0
our_average
69.0
nearest_neighbor_predictor(our_average)
67.988059701492531
In pictures, this nearest neighbor predictor looks like this:
fig = px.scatter(x=heights.column('Parent Average'), y=heights.column('Child'))
fig.add_vline(our_average - 0.5)
fig.add_vline(our_average + 0.5)
fig.add_scatter(x=[our_average], y=[nearest_neighbor_predictor(our_average)],
name="Prediction", marker_size=10)
To get a sense as to how well our predictor works, we can apply it to each of the records in our dataset. Of course, we already know the height of the children for each of the records but this gives us a simple way to evaluate our predictions when we know the answer.
heights_with_predictions = heights.with_columns(
'Prediction', heights.apply(nearest_neighbor_predictor, 'Parent Average'))
heights_with_predictions.iscatter('Parent Average')
The yellow line above is not actually a line but a curve. It is a fairly advanced model capable of capturing complex non-linear relationships. However, for many activities in data science we will be interested in a simple line that approximates the yellow curve. In this and the next few lectures we will build an intuition for the properties of this line and its derivation.
We will start with a mathematical description of the linear association between two variables: a numerical measure of how closely two variables follow a line.
We already saw one example of an association between two variables:
heights_with_predictions.iscatter('Parent Average')
Let's look at another dataset consisting of hybrid cars. This dataset contains the vehicle model, the year it was released, the manufacturer's suggested retail price (msrp), the acceleration (in km/h/s, so bigger is better), the fuel efficiency (mpg), and the type of car (class).
hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)
vehicle | year | msrp | acceleration | mpg | class |
---|---|---|---|---|---|
Prius (1st Gen) | 1997 | 24509.7 | 7.46 | 41.26 | Compact |
Tino | 2000 | 35355 | 8.2 | 54.1 | Compact |
Prius (2nd Gen) | 2000 | 26832.2 | 7.97 | 45.23 | Compact |
Insight | 2000 | 18936.4 | 9.52 | 53 | Two Seater |
Civic (1st Gen) | 2001 | 25833.4 | 7.04 | 47.04 | Compact |
... (148 rows omitted)
There are some expensive hybrids...
hybrid.sort('msrp', descending=True)
vehicle | year | msrp | acceleration | mpg | class |
---|---|---|---|---|---|
Lexus LS600h/hL | 2007 | 118544 | 17.54 | 21 | Midsize |
ActiveHybrid 7 | 2010 | 104300 | 20.41 | 22.11 | Large |
ActiveHybrid 7i | 2011 | 102606 | 18.18 | 20 | Midsize |
ActiveHybrid X6 | 2009 | 97237.9 | 17.96 | 18.82 | SUV |
S400 Long | 2009 | 96208.9 | 13.89 | 26.34 | Large |
Panamera S | 2013 | 96150 | 18.52 | 25 | Large |
Panamera S | 2012 | 95283.9 | 17.54 | 25 | Large |
S400 | 2013 | 92350 | 13.89 | 21 | Large |
S400 | 2010 | 88212.8 | 12.99 | 21 | Large |
ActiveHybrid 7L | 2013 | 84300 | 18.18 | 25 | Large |
... (143 rows omitted)
The first step in studying an association is to visualize the data.
px.scatter(hybrid.to_df(),
x="msrp",
y="mpg",
hover_name="vehicle",
color="class")
px.scatter(hybrid.to_df(),
x="msrp",
y="acceleration",
hover_name="vehicle",
color="class")
We could even consider plotting MPG, acceleration, and price all at once.
px.scatter(hybrid.to_df(),
x="msrp",
y="acceleration",
size="mpg",
hover_name="vehicle",
color="class")
px.scatter(hybrid.to_df(),
x="mpg",
y="acceleration",
size="msrp",
hover_name="vehicle",
color="class")
What kinds of associations do we observe?
Correlation is a measure of the linear relationship between two variables. Before we show you how to compute correlation, let's build an intuition for what it means. To do that we will use the following helper function to generate data with different correlation values.
This is a helper function that generates and plots synthetic data with a given $r$ value. You are not expected to understand how this function works as it is well beyond the scope of this class.
def make_correlated_data(r, n=500):
    "Generate a table with columns x and y with a correlation of approximately r"
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r*x + np.sqrt(1 - r**2)*z
    return Table().with_columns("x", x, "y", y)
def r_scatter(r, n=500, subplot=False):
    "Generate a scatter plot with a correlation approximately r"
    data = make_correlated_data(r, n)
    plt = go.Scatter(x=data.column("x"), y=data.column("y"),
                     name=str(r), mode="markers")
    if subplot:
        return plt
    else:
        return go.Figure().add_trace(plt)
r_scatter(0.8)
figs = make_subplots(2,3)
n=200
figs.add_trace(r_scatter(0.2, n, subplot=True), 1, 1)
figs.add_trace(r_scatter(0.5, n, subplot=True), 1, 2)
figs.add_trace(r_scatter(0.8, n, subplot=True), 1, 3)
figs.add_trace(r_scatter(-0.2, n, subplot=True), 2, 1)
figs.add_trace(r_scatter(-0.5, n, subplot=True), 2, 2)
figs.add_trace(r_scatter(-0.8, n, subplot=True), 2, 3)
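If you are curious, you can check that the simulated data really does come out with correlation close to $r$. The sketch below redoes the generator with plain NumPy arrays and uses NumPy's built-in np.corrcoef; the seed and sample size are arbitrary choices, not from the lecture.

```python
import numpy as np

np.random.seed(42)  # arbitrary seed, for reproducibility only

def correlated_xy(r, n=100_000):
    """Generate arrays x, y whose correlation is approximately r."""
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r*x + np.sqrt(1 - r**2)*z
    return x, y

x, y = correlated_xy(0.8)
print(np.corrcoef(x, y)[0, 1])  # close to 0.8, up to sampling noise
```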
To derive the correlation, we start by converting our data to standard units.
Recall in previous lectures we introduced a function to transform our data into standard units.
def standard_units(x):
    "Convert any array of numbers to standard units."
    return (x - np.average(x)) / np.std(x)
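A quick sanity check (an addition, not part of the original lecture): any array converted to standard units has mean 0 and standard deviation 1, which is what makes correlation unit-free.

```python
import numpy as np

def standard_units(x):
    "Convert any array of numbers to standard units."
    return (x - np.average(x)) / np.std(x)

heights_example = np.array([60.0, 65.0, 70.0, 75.0])  # made-up values
su = standard_units(heights_example)
print(np.round(np.mean(su), 12), np.round(np.std(su), 12))  # 0.0 1.0
```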
Let's add standard unit (SU) versions of the mpg, msrp, and acceleration columns to our hybrid table.
hybrid = hybrid.with_columns(
"mpg (SU)", standard_units(hybrid.column('mpg')),
"msrp (SU)", standard_units(hybrid.column('msrp')),
"acceleration (SU)", standard_units(hybrid.column('acceleration')),
)
hybrid.show(5)
vehicle | year | msrp | acceleration | mpg | class | mpg (SU) | msrp (SU) | acceleration (SU) |
---|---|---|---|---|---|---|---|---|
Prius (1st Gen) | 1997 | 24509.7 | 7.46 | 41.26 | Compact | 0.59091 | -0.69363 | -1.53501 |
Tino | 2000 | 35355 | 8.2 | 54.1 | Compact | 1.76495 | -0.18568 | -1.2825 |
Prius (2nd Gen) | 2000 | 26832.2 | 7.97 | 45.23 | Compact | 0.953911 | -0.584852 | -1.36098 |
Insight | 2000 | 18936.4 | 9.52 | 53 | Two Seater | 1.66437 | -0.954663 | -0.832081 |
Civic (1st Gen) | 2001 | 25833.4 | 7.04 | 47.04 | Compact | 1.11941 | -0.631636 | -1.67832 |
... (148 rows omitted)
How does this change the plots?
px.scatter(hybrid.to_df(),
x="msrp",
y="acceleration",
hover_name="vehicle",
color="class")
px.scatter(hybrid.to_df(),
x="msrp (SU)",
y="acceleration (SU)",
hover_name="vehicle",
color="class")
I could not scale the marker size by a standard-units column because marker sizes must be non-negative.
The correlation is the average of the product of the standard units of the two variables.
\begin{align} r & = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) \times \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \\ & = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) \times \text{StandardUnits}(y_i)\\ & = \text{Mean}\left(\text{StandardUnits}(x) \times \text{StandardUnits}(y)\right) \end{align}
np.mean(hybrid.column("acceleration (SU)") * hybrid.column("msrp (SU)"))
0.69557789969139783
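NumPy ships its own correlation routine, np.corrcoef, which agrees with the mean-of-products-of-standard-units definition above. Since the hybrid columns aren't reproduced here, this sketch uses made-up arrays:

```python
import numpy as np

def standard_units(x):
    return (x - np.average(x)) / np.std(x)

# Made-up example arrays, not the hybrid data
accel = np.array([7.5, 8.2, 13.9, 17.5, 20.4])
msrp = np.array([24.5, 35.4, 96.2, 118.5, 104.3])

r = np.mean(standard_units(accel) * standard_units(msrp))
print(np.isclose(r, np.corrcoef(accel, msrp)[0, 1]))  # True
```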
A positive correlation close to 1 would mean that when acceleration is larger than its mean, msrp should also be larger than its mean. Looking at the histogram of the products we see:
Table().with_column(
"Product", hybrid.column("acceleration (SU)") * hybrid.column("msrp (SU)")
).ihist("Product", bins=20)
Let's define a function that computes the correlation between two columns in a table.
def correlation(t, x, y):
    """t is a table; x and y are column labels"""
    x_in_su = standard_units(t.column(x))
    y_in_su = standard_units(t.column(y))
    return np.mean(x_in_su * y_in_su)
Can you guess the value of the correlation for each of the following relationships:
fig = make_subplots(1,3)
fig.add_scatter(x=hybrid.column("msrp"), y=hybrid.column("mpg"),
mode="markers", row=1, col=1)
fig.add_scatter(x=hybrid.column("msrp"), y=hybrid.column("acceleration"),
mode="markers", row=1, col=2)
fig.add_scatter(x=hybrid.column("mpg"), y=hybrid.column("acceleration"),
mode="markers", row=1, col=3)
fig.update_xaxes(title_text="msrp", row=1, col=1)
fig.update_yaxes(title_text="mpg", row=1, col=1)
fig.update_xaxes(title_text="msrp", row=1, col=2)
fig.update_yaxes(title_text="acceleration", row=1, col=2)
fig.update_xaxes(title_text="mpg", row=1, col=3)
fig.update_yaxes(title_text="acceleration", row=1, col=3)
fig.update_layout(showlegend=False)
correlation(hybrid, "msrp", "mpg")
-0.53182636336837863
correlation(hybrid, "msrp", "acceleration")
0.69557789969139783
correlation(hybrid, "mpg", "acceleration")
-0.5060703843771186
px.scatter(hybrid.to_df(),
x="msrp",
y="acceleration",
hover_name="vehicle",
color="class")
px.scatter(hybrid.to_df(),
x="acceleration",
y="msrp",
hover_name="vehicle",
color="class")
correlation(hybrid, "msrp", "acceleration")
0.69557789969139783
correlation(hybrid, "acceleration", "msrp")
0.69557789969139783
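The two results above agree because the formula is symmetric in $x$ and $y$. For the same reason, correlation is unchanged by a change of units (any positive linear rescaling of either variable), since standard units subtract the mean and divide by the standard deviation. A sketch with made-up numbers:

```python
import numpy as np

def standard_units(x):
    return (x - np.average(x)) / np.std(x)

def correlation_arrays(x, y):
    """Correlation of two arrays: mean of products of standard units."""
    return np.mean(standard_units(x) * standard_units(y))

accel_kmhs = np.array([7.5, 8.2, 13.9, 17.5, 20.4])                   # made-up values
msrp_dollars = np.array([24510.0, 35355.0, 96209.0, 118544.0, 104300.0])

r_original = correlation_arrays(accel_kmhs, msrp_dollars)
# Convert to mph/s and thousands of dollars: the correlation is identical
r_rescaled = correlation_arrays(accel_kmhs * 0.6214, msrp_dollars / 1000)
print(np.isclose(r_original, r_rescaled))  # True
```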
When computing correlation, it is important to always visualize your data first and then consider each of the following issues.
Low correlation does not imply the absence of a relationship. Correlation measures linear relationships; data with a strong non-linear relationship may have very low correlation.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
'x', new_x,
'y', new_x**2
)
nonlinear.iscatter('x', 'y')
There is clearly a relationship to this data. Given the value of $x$ you can easily predict the value of $y$. What is the correlation?
correlation(nonlinear, 'x', 'y')
0.0
As a quick aside, how would our nearest neighbor predictor work on this non-linear data?
def nn_predictor(x):
    return np.mean(nonlinear.where("x", are.between(x - 0.51, x + 0.51)).column("y"))
(
nonlinear.with_column("Prediction", nonlinear.apply(nn_predictor, "x"))
.iscatter("x")
)
line = Table().with_columns(
'x', make_array(1, 2, 3, 4),
'y', make_array(1, 2, 3, 4)
)
line.iscatter('x', 'y')
correlation(line, 'x', 'y')
1.0
Correlation is also very sensitive to outliers. A single point far from the rest of the data can drastically change the correlation:
outlier = Table().with_columns(
'x', make_array(1, 2, 3, 4, 5),
'y', make_array(1, 2, 3, 4, 0)
)
outlier.iscatter('x', 'y')
correlation(outlier, 'x', 'y')
0.0
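The exact zero is not a rounding artifact: with these five points, the mean of the products equals the product of the means, so the covariance (and hence the correlation) vanishes despite the obvious trend in four of the five points. Checking with plain NumPy:

```python
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 3, 4, 0])
# Covariance is mean(xy) - mean(x)*mean(y); here it is exactly zero,
# so the correlation is zero despite the pattern in 4 of the 5 points.
cov = np.mean(x * y) - np.mean(x) * np.mean(y)
print(cov)                       # 0.0
print(np.corrcoef(x, y)[0, 1])   # 0.0
```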
The correlation between aggregated variables (e.g., after grouping) may be much higher than the correlation between the underlying variables.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014
State | Participation Rate | Critical Reading | Math | Writing | Combined |
---|---|---|---|---|---|
Alabama | 6.7 | 547 | 538 | 532 | 1617 |
Alaska | 54.2 | 507 | 503 | 475 | 1485 |
Arizona | 36.4 | 522 | 525 | 500 | 1547 |
Arkansas | 4.2 | 573 | 571 | 554 | 1698 |
California | 60.3 | 498 | 510 | 496 | 1504 |
Colorado | 14.3 | 582 | 586 | 567 | 1735 |
Connecticut | 88.4 | 507 | 510 | 508 | 1525 |
Delaware | 100 | 456 | 459 | 444 | 1359 |
District of Columbia | 100 | 440 | 438 | 431 | 1309 |
Florida | 72.2 | 491 | 485 | 472 | 1448 |
... (41 rows omitted)
sat2014.iscatter('Critical Reading', 'Math')
correlation(sat2014, 'Critical Reading', 'Math')
0.98475584110674341
That is a very strong correlation. However, each data point is an aggregate (a state average); the individual students within each state had far greater variability in their scores, so the correlation between individual reading and math scores would be much weaker.
While we have the data loaded: does anyone have a guess which dots correspond to which states?
px.scatter(sat2014.to_df(),
x = "Critical Reading",
y = "Math",
hover_name = "State",
size = "Participation Rate")