from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better), so I am using the plotly.express library. We won't test you on this, but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we derive the equation for linear regression using the correlation coefficient $r$.
In the previous lecture, we introduced the correlation coefficient:
\begin{align} r & = \text{Mean}\left(\text{StandardUnits}(x) * \text{StandardUnits}(y)\right)\\ & = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) * \text{StandardUnits}(y_i)\\ & = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \end{align}

We implemented the correlation coefficient:
def standard_units(x):
    "Convert any array of numbers to standard units."
    return (x - np.average(x)) / np.std(x)
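As a quick sanity check (my addition, not in the original notebook), an array in standard units always has mean 0 and standard deviation 1:
v = standard_units(make_array(10, 20, 30, 40))
np.mean(v), np.std(v)  # (0.0, 1.0)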
def correlation(t, x, y):
    """t is a table; x and y are column labels"""
    x_in_su = standard_units(t.column(x))
    y_in_su = standard_units(t.column(y))
    return np.mean(x_in_su * y_in_su)
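Because standard units remove shifts and scalings, correlation is unchanged by linear changes of units. A small illustration (my own, not from the lecture):
t = Table().with_columns("x", make_array(1, 2, 3, 4, 5),
                         "y", make_array(2, 3, 5, 4, 6))
t_cm = t.with_column("x", 2.54 * t.column("x"))  # e.g., convert inches to centimeters
correlation(t, "x", "y"), correlation(t_cm, "x", "y")  # identical values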
We built an intuition about the correlation coefficient using the following code, which you don't need to understand:
def make_correlated_data(r, n=500):
    "Generate a table with columns x and y with a correlation of approximately r."
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    # This is "magic" to sample from a multivariate Gaussian
    y = r*x + (np.sqrt(1-r**2))*z
    return Table().with_columns("x", x, "y", y)
def r_scatter(r, n=500, subplot=False):
    "Generate a scatter plot with a correlation of approximately r."
    data = make_correlated_data(r, n)
    plt = go.Scatter(x=data.column("x"), y=data.column("y"),
                     name=str(r), mode="markers", marker_opacity=0.5)
    if subplot:
        return plt
    else:
        return go.Figure().add_trace(plt)
figs = make_subplots(2,3)
n=500
figs.add_trace(r_scatter(0.2, n, subplot=True), 1, 1)
figs.add_trace(r_scatter(0.5, n, subplot=True), 1, 2)
figs.add_trace(r_scatter(0.8, n, subplot=True), 1, 3)
figs.add_trace(r_scatter(-0.2, n, subplot=True), 2, 1)
figs.add_trace(r_scatter(-0.5, n, subplot=True), 2, 2)
figs.add_trace(r_scatter(-0.8, n, subplot=True), 2, 3)
When computing a correlation, it is important to always visualize your data first and then consider each of the following issues.
Low correlation does not imply the absence of a relationship. Correlation measures linear relationships; data with a strong non-linear relationship may have very low correlation.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
'x', new_x,
'y', new_x**2
)
nonlinear.iscatter('x', 'y')
There is clearly a relationship in this data. Given the value of $x$ you can easily predict the value of $y$. What is the correlation?
correlation(nonlinear, 'x', 'y')
0.0
Correlation is also very sensitive to outliers. Start with a dataset that has a perfect linear relationship:
line = Table().with_columns(
    'x', make_array(1, 2, 3, 4),
    'y', make_array(1, 2, 3, 4)
)
line.iscatter('x', 'y')
correlation(line, 'x', 'y')
1.0
Now add a single outlier:
outlier = Table().with_columns(
    'x', make_array(1, 2, 3, 4, 5),
    'y', make_array(1, 2, 3, 4, 0)
)
outlier.iscatter('x', 'y')
correlation(outlier, 'x', 'y')
0.0
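A single point dropped the correlation from 1 to 0. As a quick follow-up (my addition), removing that point restores the perfect correlation:
correlation(outlier.take(np.arange(4)), 'x', 'y')  # 1.0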
The correlation between aggregated variables (e.g., averages computed after grouping) may be much higher than the correlation between the underlying individual measurements.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014
State | Participation Rate | Critical Reading | Math | Writing | Combined |
---|---|---|---|---|---|
Alabama | 6.7 | 547 | 538 | 532 | 1617 |
Alaska | 54.2 | 507 | 503 | 475 | 1485 |
Arizona | 36.4 | 522 | 525 | 500 | 1547 |
Arkansas | 4.2 | 573 | 571 | 554 | 1698 |
California | 60.3 | 498 | 510 | 496 | 1504 |
Colorado | 14.3 | 582 | 586 | 567 | 1735 |
Connecticut | 88.4 | 507 | 510 | 508 | 1525 |
Delaware | 100 | 456 | 459 | 444 | 1359 |
District of Columbia | 100 | 440 | 438 | 431 | 1309 |
Florida | 72.2 | 491 | 485 | 472 | 1448 |
... (41 rows omitted)
sat2014.iscatter('Critical Reading', 'Math')
correlation(sat2014, 'Critical Reading', 'Math')
0.98475584110674341
That is a very strong correlation. However, each point is a state-level average: it summarizes a large cloud of individual students whose scores vary far more, so the correlation between individual students' scores would be considerably weaker.
While we have the data loaded, does anyone have a guess which dots correspond to which state?
px.scatter(sat2014.to_df(),
x = "Critical Reading",
y = "Math",
hover_name = "State",
size = "Participation Rate")
Here we build an intuition about the relationship between the slope of the nearest neighbor line and the correlation coefficient.
## Load the family height data
families = Table.read_table('family_heights.csv')
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns(
'Parent Average', parent_avgs,
'Child', families.column('child'),
).sort("Parent Average")
Here is a slightly more robust nearest neighbor predictor:
def nn_heights(parent_average, window=0.5):
    lower_bound = parent_average - window
    upper_bound = parent_average + window
    similar_child_heights = (
        heights
        .where("Parent Average", are.between(lower_bound, upper_bound))
        .column("Child")
    )
    if len(similar_child_heights) == 0:  # handle the case when there is no data
        return np.nan  # nan = not a number, a special floating point "number"
    else:
        return np.mean(similar_child_heights)
Make predictions at many different parent heights, not just the heights in the dataset:
test_heights = Table().with_column("Parent Average", np.arange(61,74,0.2))
test_heights = test_heights.with_column(
"NN Prediction", test_heights.apply(nn_heights, "Parent Average"))
## Plot it all
fig = px.scatter(heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights.column("Parent Average"),
y=test_heights.column("NN Prediction"), name="NN Prediction")
However, it will be easier to study this relationship in standard units.
## Transform the heights data into standard units
su_heights = Table().with_columns(
"Parent Average", standard_units(heights.column("Parent Average")),
"Child", standard_units(heights.column("Child")))
## Transform the nearest neighbor predictions to standard units,
## using the mean and SD of the original heights data
su_test_heights = Table().with_columns(
"Parent Average",
(test_heights.column("Parent Average") - heights.column("Parent Average").mean())
/ heights.column("Parent Average").std(),
"NN Prediction",
(test_heights.column("NN Prediction") - heights.column("Child").mean())
/ heights.column("Child").std())
## Plot it all
fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights.column("Parent Average"),
y=su_test_heights.column("NN Prediction"), name="NN Prediction")
Computing the correlation we get:
correlation(heights, "Parent Average", "Child")
0.32244267720033076
What happens if we draw a line with that slope:
r = correlation(su_heights, "Parent Average", "Child")
fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights.column("Parent Average"),
y=su_test_heights.column("NN Prediction"),
name="NN Prediction")
fig.add_scatter(x=np.arange(-3,4,0.1), y= r * np.arange(-3,4,0.1),
name=f"Line(y={np.round(r,4)} x)")
Here we examine the relationship between the nearest neighbor prediction "line" and the correlation for several synthetic datasets.
# You don't need to understand all the parts of this function.
def make_correlation_and_line_plot(r):
    """
    Generates a plot of synthetic data with a correlation coefficient r
    along with the nearest neighbor predictions and
    a line with the slope r and intercept 0
    """
    # Make synthetic data
    example = make_correlated_data(r).sort("x")
    # Compute nearest neighbor predictions
    def nn_prediction_example(x_val):
        """ Predicts y-value for x based on the example table """
        neighbors = example.where('x', are.between(x_val - .25, x_val + .25)).column("y")
        if len(neighbors) == 0:
            return np.nan
        else:
            return np.mean(neighbors)
    example = example.with_columns(
        'NN Prediction', example.apply(nn_prediction_example, 'x'))
    # Generate plots.
    x, y = example.column("x"), example.column("y")
    fig = px.scatter(example.to_df(), x="x", y="y", height=600)
    fig.add_scatter(x=example.column("x"), y=example.column("NN Prediction"),
                    name="NN Prediction")
    fig.add_scatter(x=x, y=r * x, name=f"Line(y={r} x)")
    fig.add_scatter(x=x, y=x, line_color="gray", line_dash="dot", name="Line(y=x)")
    return fig
make_correlation_and_line_plot(r=0.90)
make_correlation_and_line_plot(r=0.60)
make_correlation_and_line_plot(r=0.20)
make_correlation_and_line_plot(r=0.0)
make_correlation_and_line_plot(r=-0.6)
In standard units we developed a simple equation for the regression line:
\begin{align} \text{SU}(y_\text{predicted}) = r * \text{SU}(x_\text{new}) \end{align}

where $r$ is the correlation coefficient and $\text{SU}$ denotes standard units:
\begin{align} \text{SU}(y_\text{predicted}) & = \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} \\ \text{SU}(x_\text{new}) &= \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)} \end{align}

Here we use $x_\text{new}$ to indicate a new $x$ value for which we want to make a prediction $y_\text{predicted}$.
We would like to express this line in the original units of the data. We can do that by substituting the definition of standard units:
\begin{align} \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} = r * \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)} \end{align}

While this equation does describe a line, it would look a little nicer in the form:
\begin{align} y_\text{predicted} = \text{slope} * x_\text{new} + \text{intercept} \end{align}

Let's do some algebra to get that equation:

$$ \require{color} \definecolor{comment}{RGB}{200,100,50} \begin{align} \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}\\ \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{1}{\text{Stdev}(x)} x_\text{new} - r * \frac{1}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Expanding the right side}\\ y_\text{predicted} - \text{Mean}(y) &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Multiplying by $\text{Stdev}(y)$}\\ y_\text{predicted} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} + \text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Adding $\text{Mean}(y)$}\\ y_\text{predicted} &= \left(r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\right) x_\text{new} + \left(\text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x)\right) & \color{comment} \text{Rearranging Terms} \end{align} $$
This means we can define the slope and intercept as: \begin{align} \text{slope} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\\ \text{intercept} & = \text{Mean}(y) - \text{slope} * \text{Mean}(x) \end{align}
Using the above equations, implement the slope and intercept functions:
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean
Testing it out:
example = make_correlated_data(0.5)
slope(example, 'x', 'y')
0.46462086938305275
Computing the slope and intercept for the heights dataset:
heights_slope = slope(heights, 'Parent Average', 'Child')
heights_intercept = intercept(heights, 'Parent Average', 'Child')
[heights_slope, heights_intercept]
[0.66449526235258838, 22.461839955758812]
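As a quick sanity check (my addition), the regression line passes through the point of averages, since the intercept is defined as $\text{Mean}(y) - \text{slope} * \text{Mean}(x)$:
x_mean = np.mean(heights.column('Parent Average'))
heights_slope * x_mean + heights_intercept  # equals np.mean(heights.column('Child'))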
Adding the regression predictions:
heights = heights.with_column(
'Regression Prediction',
heights_slope * heights.column('Parent Average') + heights_intercept
)
heights
Parent Average | Child | Regression Prediction |
---|---|---|
62 | 66 | 63.6605 |
62 | 60 | 63.6605 |
62.5 | 68 | 63.9928 |
62.5 | 67 | 63.9928 |
62.5 | 66.5 | 63.9928 |
62.5 | 66 | 63.9928 |
62.5 | 65.7 | 63.9928 |
62.5 | 65.5 | 63.9928 |
62.5 | 65 | 63.9928 |
62.5 | 65 | 63.9928 |
... (924 rows omitted)
fig = px.scatter(heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights.column("Parent Average"),
y=test_heights.column("NN Prediction"), name="NN Prediction")
line_name = f"y = {np.round(heights_slope,2)} x + {np.round(heights_intercept,2)}"
fig.add_scatter(x=heights.column("Parent Average"),
y=heights.column("Regression Prediction"),
name=line_name)
Finally, let's use the regression line in standard units to make a prediction from summary statistics alone: a student scored 90 on the midterm; what do we predict for their final exam score?
# X-axis: midterm scores
midterm_mean = 70
midterm_sd = 10
# Y-axis: final scores
final_mean = 50
final_sd = 12
# Correlation (relates X to Y values)
corr = 0.75
# X value
midterm_student = 90
midterm_student_su = (midterm_student - midterm_mean) / midterm_sd
midterm_student_su
2.0
final_student_su = midterm_student_su * corr
final_student_su
1.5
final_student = final_student_su * final_sd + final_mean
final_student
68.0
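We can package these steps into a small helper (my own wrapper, not from the lecture):
def predict_final(midterm):
    """Predict a final exam score from a midterm score via the regression line in SU."""
    midterm_su = (midterm - midterm_mean) / midterm_sd  # convert to standard units
    final_su = corr * midterm_su                        # SU(y_predicted) = r * SU(x_new)
    return final_su * final_sd + final_mean             # convert back to final-exam units

predict_final(90)  # 68.0, matching the step-by-step computation above
Note the regression to the mean: the student scored 2 standard deviations above average on the midterm but is predicted to score only 1.5 standard deviations above average on the final.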