from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In this lecture, I am going to use more interactive plots (they look better), so I am using the plotly.express library. We won't test you on this, but it's good to know.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In this lecture, we derive the equation for linear regression using the correlation coefficient $r$.
In the previous lecture, we introduced the correlation coefficient:
\begin{align} r & = \text{Mean}\left(\text{StandardUnits}(x) * \text{StandardUnits}(y)\right)\\ & = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) * \text{StandardUnits}(y_i)\\ & = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \end{align}

We implemented the correlation coefficient:
def standard_units(x):
    "Convert any array of numbers to standard units."
    return (x - np.average(x)) / np.std(x)
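As a quick sanity check (my addition, not in the original notebook), an array in standard units always has mean 0 and standard deviation 1:
v = standard_units(make_array(10, 20, 30, 40))
np.mean(v), np.std(v)  # (0.0, 1.0)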
def correlation(t, x, y):
    """t is a table; x and y are column labels"""
    x_in_su = standard_units(t.column(x))
    y_in_su = standard_units(t.column(y))
    return np.mean(x_in_su * y_in_su)
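Because standard units remove shifts and scalings, correlation is unchanged by linear changes of units. A small illustration (my own, not from the lecture):
t = Table().with_columns("x", make_array(1, 2, 3, 4, 5),
                         "y", make_array(2, 3, 5, 4, 6))
t_cm = t.with_column("x", 2.54 * t.column("x"))  # e.g., convert inches to centimeters
correlation(t, "x", "y"), correlation(t_cm, "x", "y")  # identical values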
We built an intuition about the correlation coefficient using the following code, which you don't need to understand:
def make_correlated_data(r, n=500):
    "Generate a table with columns x and y with a correlation of approximately r."
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    # This is "magic" to sample from a multivariate Gaussian
    y = r*x + (np.sqrt(1-r**2))*z
    return Table().with_columns("x", x, "y", y)
def r_scatter(r, n=500, subplot=False):
    "Generate a scatter plot with a correlation of approximately r."
    data = make_correlated_data(r, n)
    plt = go.Scatter(x=data.column("x"), y=data.column("y"),
                     name=str(r), mode="markers", marker_opacity=0.5)
    if subplot:
        return plt
    else:
        return go.Figure().add_trace(plt)
figs = make_subplots(2,3)
n=500
figs.add_trace(r_scatter(0.2, n, subplot=True), 1, 1)
figs.add_trace(r_scatter(0.5, n, subplot=True), 1, 2)
figs.add_trace(r_scatter(0.8, n, subplot=True), 1, 3)
figs.add_trace(r_scatter(-0.2, n, subplot=True), 2, 1)
figs.add_trace(r_scatter(-0.5, n, subplot=True), 2, 2)
figs.add_trace(r_scatter(-0.8, n, subplot=True), 2, 3)
When computing a correlation, it is important to always visualize your data first and then consider each of the following issues.
Low correlation does not imply the absence of a relationship. Correlation measures linear relationships; data with a strong non-linear relationship may have very low correlation.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
'x', new_x,
'y', new_x**2
)
nonlinear.iscatter('x', 'y')
There is clearly a relationship in this data. Given the value of $x$ you can easily predict the value of $y$. What is the correlation?
correlation(nonlinear, 'x', 'y')
0.0
Correlation is also very sensitive to outliers. Start with a dataset that has a perfect linear relationship:
line = Table().with_columns(
    'x', make_array(1, 2, 3, 4),
    'y', make_array(1, 2, 3, 4)
)
line.iscatter('x', 'y')
correlation(line, 'x', 'y')
1.0
Now add a single outlier:
outlier = Table().with_columns(
    'x', make_array(1, 2, 3, 4, 5),
    'y', make_array(1, 2, 3, 4, 0)
)
outlier.iscatter('x', 'y')
correlation(outlier, 'x', 'y')
0.0
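A single point dropped the correlation from 1 to 0. As a quick follow-up (my addition), removing that point restores the perfect correlation:
correlation(outlier.take(np.arange(4)), 'x', 'y')  # 1.0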
The correlation between aggregated variables (e.g., averages computed after grouping) may be much higher than the correlation between the underlying individual measurements.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014
State | Participation Rate | Critical Reading | Math | Writing | Combined |
---|---|---|---|---|---|
Alabama | 6.7 | 547 | 538 | 532 | 1617 |
Alaska | 54.2 | 507 | 503 | 475 | 1485 |
Arizona | 36.4 | 522 | 525 | 500 | 1547 |
Arkansas | 4.2 | 573 | 571 | 554 | 1698 |
California | 60.3 | 498 | 510 | 496 | 1504 |
Colorado | 14.3 | 582 | 586 | 567 | 1735 |
Connecticut | 88.4 | 507 | 510 | 508 | 1525 |
Delaware | 100 | 456 | 459 | 444 | 1359 |
District of Columbia | 100 | 440 | 438 | 431 | 1309 |
Florida | 72.2 | 491 | 485 | 472 | 1448 |
... (41 rows omitted)
sat2014.iscatter('Critical Reading', 'Math')
correlation(sat2014, 'Critical Reading', 'Math')
0.98475584110674341
That is a very strong correlation. However, each point is a state-level average: it summarizes a large cloud of individual students whose scores vary far more, so the correlation between individual students' scores would be considerably weaker.
While we have the data loaded, does anyone have a guess which dots correspond to which state?
px.scatter(sat2014.to_df(),
x = "Critical Reading",
y = "Math",
hover_name = "State",
size = "Participation Rate")
Here we build an intuition about the relationship between the slope of the nearest neighbor line and the correlation coefficient.
## Load the family height data
families = Table.read_table('family_heights.csv')
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns(
'Parent Average', parent_avgs,
'Child', families.column('child'),
).sort("Parent Average")
Here is a slightly more robust nearest neighbor predictor:
def nn_heights(parent_average, window=0.5):
    lower_bound = parent_average - window
    upper_bound = parent_average + window
    similar_child_heights = (
        heights
        .where("Parent Average", are.between(lower_bound, upper_bound))
        .column("Child")
    )
    if len(similar_child_heights) == 0:  # handle the case when there is no data
        return np.nan  # nan = not a number, a special floating point "number"
    else:
        return np.mean(similar_child_heights)
Make predictions at many different parent heights, not just the heights in the dataset:
test_heights = Table().with_column("Parent Average", np.arange(61,74,0.2))
test_heights = test_heights.with_column(
"NN Prediction", test_heights.apply(nn_heights, "Parent Average"))
## Plot it all
fig = px.scatter(heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights.column("Parent Average"),
y=test_heights.column("NN Prediction"), name="NN Prediction")
However, it will be easier to study this relationship in standard units.
## Transform the heights data into standard units
su_heights = Table().with_columns(
"Parent Average", standard_units(heights.column("Parent Average")),
"Child", standard_units(heights.column("Child")))
## Transform the nearest neighbor predictions to standard units,
## using the mean and SD of the original heights data
su_test_heights = Table().with_columns(
"Parent Average",
(test_heights.column("Parent Average") - heights.column("Parent Average").mean())
/ heights.column("Parent Average").std(),
"NN Prediction",
(test_heights.column("NN Prediction") - heights.column("Child").mean())
/ heights.column("Child").std())
## Plot it all
fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights.column("Parent Average"),
y=su_test_heights.column("NN Prediction"), name="NN Prediction")
Computing the correlation we get:
correlation(heights, "Parent Average", "Child")
0.32244267720033076
What happens if we draw a line with that slope:
r = correlation(su_heights, "Parent Average", "Child")
fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights.column("Parent Average"),
y=su_test_heights.column("NN Prediction"),
name="NN Prediction")
fig.add_scatter(x=np.arange(-3,4,0.1), y= r * np.arange(-3,4,0.1),
name=f"Line(y={np.round(r,4)} x)")
Here we examine the relationship between the nearest neighbor prediction "line" and the correlation for several synthetic datasets.
# You don't need to understand all the parts of this function.
def make_correlation_and_line_plot(r):
    """
    Generates a plot of synthetic data with a correlation coefficient r
    along with the nearest neighbor predictions and
    a line with the slope r and intercept 0
    """
    # Make synthetic data
    example = make_correlated_data(r).sort("x")
    # Compute nearest neighbor predictions
    def nn_prediction_example(x_val):
        """ Predicts y-value for x based on the example table """
        neighbors = example.where('x', are.between(x_val - .25, x_val + .25)).column("y")
        if len(neighbors) == 0:
            return np.nan
        else:
            return np.mean(neighbors)
    example = example.with_columns(
        'NN Prediction', example.apply(nn_prediction_example, 'x'))
    # Generate plots.
    x, y = example.column("x"), example.column("y")
    fig = px.scatter(example.to_df(), x="x", y="y", height=600)
    fig.add_scatter(x=example.column("x"), y=example.column("NN Prediction"),
                    name="NN Prediction")
    fig.add_scatter(x=x, y=r * x, name=f"Line(y={r} x)")
    fig.add_scatter(x=x, y=x, line_color="gray", line_dash="dot", name="Line(y=x)")
    return fig
make_correlation_and_line_plot(r=0.90)
make_correlation_and_line_plot(r=0.60)
make_correlation_and_line_plot(r=0.20)
make_correlation_and_line_plot(r=0.0)
make_correlation_and_line_plot(r=-0.6)
In standard units we developed a simple equation for the regression line:
\begin{align} \text{SU}(y_\text{predicted}) = r * \text{SU}(x_\text{new}) \end{align}

where $r$ is the correlation coefficient and $\text{SU}$ denotes standard units:
\begin{align} \text{SU}(y_\text{predicted}) & = \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} \\ \text{SU}(x_\text{new}) &= \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)} \end{align}

Here we use $x_\text{new}$ to indicate a new $x$ value for which we want to make a prediction $y_\text{predicted}$.
We would like to express this line in the original units of the data. We can do that by substituting the definition of standard units:
\begin{align} \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} = r * \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)} \end{align}

While this equation does describe a line, it would look a little nicer in the form:
\begin{align} y_\text{predicted} = \text{slope} * x_\text{new} + \text{intercept} \end{align}

Let's do some algebra to get that equation:

$$ \require{color} \definecolor{comment}{RGB}{200,100,50} \begin{align} \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}\\ \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{1}{\text{Stdev}(x)} x_\text{new} - r * \frac{1}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Expanding the right side}\\ y_\text{predicted} - \text{Mean}(y) &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Multiplying by $\text{Stdev}(y)$}\\ y_\text{predicted} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} + \text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) & \color{comment} \text{Adding $\text{Mean}(y)$}\\ y_\text{predicted} &= \left(r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\right) x_\text{new} + \left(\text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x)\right) & \color{comment} \text{Rearranging Terms} \end{align} $$
This means we can define the slope and intercept as: \begin{align} \text{slope} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\\ \text{intercept} & = \text{Mean}(y) - \text{slope} * \text{Mean}(x) \end{align}
Using the above equations, implement the slope and intercept functions:
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean
Testing it out:
example = make_correlated_data(0.5)
slope(example, 'x', 'y')
0.46462086938305275
Computing the slope and intercept for the heights dataset:
heights_slope = slope(heights, 'Parent Average', 'Child')
heights_intercept = intercept(heights, 'Parent Average', 'Child')
[heights_slope, heights_intercept]
[0.66449526235258838, 22.461839955758812]
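As a quick sanity check (my addition), the regression line passes through the point of averages, since the intercept is defined as $\text{Mean}(y) - \text{slope} * \text{Mean}(x)$:
x_mean = np.mean(heights.column('Parent Average'))
heights_slope * x_mean + heights_intercept  # equals np.mean(heights.column('Child'))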
Adding the regression predictions:
heights = heights.with_column(
'Regression Prediction',
heights_slope * heights.column('Parent Average') + heights_intercept
)
heights
Parent Average | Child | Regression Prediction |
---|---|---|
62 | 66 | 63.6605 |
62 | 60 | 63.6605 |
62.5 | 68 | 63.9928 |
62.5 | 67 | 63.9928 |
62.5 | 66.5 | 63.9928 |
62.5 | 66 | 63.9928 |
62.5 | 65.7 | 63.9928 |
62.5 | 65.5 | 63.9928 |
62.5 | 65 | 63.9928 |
62.5 | 65 | 63.9928 |
... (924 rows omitted)
fig = px.scatter(heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights.column("Parent Average"),
y=test_heights.column("NN Prediction"), name="NN Prediction")
line_name = f"y = {np.round(heights_slope,2)} x + {np.round(heights_intercept,2)}"
fig.add_scatter(x=heights.column("Parent Average"),
y=heights.column("Regression Prediction"),
name=line_name)
Finally, let's use the regression line in standard units to make a prediction from summary statistics alone: a student scored 90 on the midterm; what do we predict for their final exam score?
# X-axis: midterm scores
midterm_mean = 70
midterm_sd = 10
# Y-axis: final scores
final_mean = 50
final_sd = 12
# Correlation (relates X to Y values)
corr = 0.75
# X value
midterm_student = 90
midterm_student_su = (midterm_student - midterm_mean) / midterm_sd
midterm_student_su
2.0
final_student_su = midterm_student_su * corr
final_student_su
1.5
final_student = final_student_su * final_sd + final_mean
final_student
68.0
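We can package these steps into a small helper (my own wrapper, not from the lecture):
def predict_final(midterm):
    """Predict a final exam score from a midterm score via the regression line in SU."""
    midterm_su = (midterm - midterm_mean) / midterm_sd  # convert to standard units
    final_su = corr * midterm_su                        # SU(y_predicted) = r * SU(x_new)
    return final_su * final_sd + final_mean             # convert back to final-exam units

predict_final(90)  # 68.0, matching the step-by-step computation above
Note the regression to the mean: the student scored 2 standard deviations above average on the midterm but is predicted to score only 1.5 standard deviations above average on the final.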