import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')


values = make_array(2, 3, 3, 9)
values

array([2, 3, 3, 9])


sum(values)/len(values)

4.25


np.average(values)

4.25


np.mean(values)

4.25


(2 + 3 + 3 + 9)/4

4.25


2*(1/4) + 3*(2/4) + 9*(1/4)

4.25


2*0.25 + 3*0.5 + 9*0.25

4.25


values_table = Table().with_columns('value', values)
values_table


bins_for_display = np.arange(0.5, 10.6, 1)
values_table.ihist('value', bins = bins_for_display)


## Make array of 10 2s, 20 3s, and 10 9s

new_vals = make_array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                      9, 9, 9, 9, 9, 9, 9, 9, 9, 9)


Table().with_column('value', new_vals).ihist(bins = bins_for_display)


np.average(new_vals)

4.25


np.mean(new_vals)

4.25


np.average(make_array(2,3,9), weights = make_array(10,20,10))

4.25


Table().with_column('value', new_vals).hist(bins = bins_for_display)
plots.ylim(-0.04, 0.5)
plots.plot([0, 10], [0, 0], color='grey', lw=2)
plots.scatter(4.25, -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Mass');


sd_table = Table().with_columns('Value', values)
sd_table


average_value = np.mean(values)
average_value

4.25


deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table


np.average(deviations)

0.0


np.abs(deviations)

array([ 2.25,  1.25,  1.25,  4.75])


np.average(np.abs(deviations))

2.375


sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table


import plotly.express as px
px.line(x = np.arange(-5, 5, 0.1), y = np.arange(-5, 5, 0.1)**2, height = 800)


# Variance of the data:
# mean squared deviation from average

variance = np.mean(deviations ** 2)
variance

7.6875


np.var(values)

7.6875


# Standard Deviation (SD): 
# root mean squared deviation from average
# = square root of the variance

sd = variance ** 0.5
sd

2.7726341266023544


np.std(values)

2.7726341266023544


def chebyshev(num_SDs):
    """
    Returns a LOWER BOUND on the fraction of the data 
    within num_SDs of the mean.
    
    Put another way AT LEAST chebyshev(z) data is within mean +/- z * SDs.
    """
    return 1 - 1 / (num_SDs**2)


chebyshev(2)

0.75


chebyshev(3.5)

0.9183673469387755


chebyshev(1.1)

0.17355371900826455


chebyshev(0.5)

-3.0


births = Table.read_table('baby.csv').drop('Maternal Smoker')


births.labels

('Birth Weight',
 'Gestational Days',
 'Maternal Age',
 'Maternal Height',
 'Maternal Pregnancy Weight')


births.ihist(overlay = False)


mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

(128.4787052810903, 20.725449704280411)


within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

within_3_SDs.num_rows / births.num_rows # Proportion within 3 SDs of the mean

0.9863713798977853


# Chebyshev's bound: 
# This proportion should be at least
print("Chebyshev says that at least", round(chebyshev(3),2), 
      "of the rows will be within 3 standard deviations of the mean")

Chebyshev says that at least 0.89 of the rows will be within 3 standard deviations of the mean


# See if Chebyshev's bounds work for distributions with various shapes


for feature in births.labels:
    values = births.column(feature)
    mean = np.mean(values)
    sd = np.std(values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(feature, are.between(mean - z*sd, mean + z*sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%',
              " which is > ", np.round(chebyshev(z) *100, 2), "(Chebyshev)")

Birth Weight
Average plus or minus 2 SDs: 94.89 %  which is >  75.0 (Chebyshev)
Average plus or minus 3 SDs: 99.57 %  which is >  88.89 (Chebyshev)
Average plus or minus 4 SDs: 100.0 %  which is >  93.75 (Chebyshev)
Average plus or minus 5 SDs: 100.0 %  which is >  96.0 (Chebyshev)

Gestational Days
Average plus or minus 2 SDs: 93.78 %  which is >  75.0 (Chebyshev)
Average plus or minus 3 SDs: 98.64 %  which is >  88.89 (Chebyshev)
Average plus or minus 4 SDs: 99.57 %  which is >  93.75 (Chebyshev)
Average plus or minus 5 SDs: 99.83 %  which is >  96.0 (Chebyshev)

Maternal Age
Average plus or minus 2 SDs: 94.89 %  which is >  75.0 (Chebyshev)
Average plus or minus 3 SDs: 99.91 %  which is >  88.89 (Chebyshev)
Average plus or minus 4 SDs: 100.0 %  which is >  93.75 (Chebyshev)
Average plus or minus 5 SDs: 100.0 %  which is >  96.0 (Chebyshev)

Maternal Height
Average plus or minus 2 SDs: 97.19 %  which is >  75.0 (Chebyshev)
Average plus or minus 3 SDs: 99.66 %  which is >  88.89 (Chebyshev)
Average plus or minus 4 SDs: 99.91 %  which is >  93.75 (Chebyshev)
Average plus or minus 5 SDs: 100.0 %  which is >  96.0 (Chebyshev)

Maternal Pregnancy Weight
Average plus or minus 2 SDs: 95.06 %  which is >  75.0 (Chebyshev)
Average plus or minus 3 SDs: 98.64 %  which is >  88.89 (Chebyshev)
Average plus or minus 4 SDs: 99.49 %  which is >  93.75 (Chebyshev)
Average plus or minus 5 SDs: 99.91 %  which is >  96.0 (Chebyshev)


def standard_units(x):
    """Convert array x to standard units."""
    return (x - np.mean(x)) / np.std(x)


ages = births.column('Maternal Age')
ages

array([27, 33, 28, ..., 30, 21, 38])


ages_standard_units = standard_units(ages)
ages_standard_units

array([-0.03925456,  0.99249577,  0.13270383, ...,  0.47662061,
       -1.07100488,  1.85228771])


np.mean(ages_standard_units), np.std(ages_standard_units)

(-7.868020072300939e-17, 1.0)


both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both


np.mean(ages), np.std(ages)

(27.228279386712096, 5.8153604041908968)


both.ihist('Age in Years', bins = np.arange(15, 46, 2))


both.ihist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))


births.ihist('Maternal Height', bins = np.arange(56.5, 72.6, 1), height=800)
# plots.xticks(np.arange(57, 72, 2));


heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

(64.049403747870528, 2.5250254409674375)

Value	Deviation	Squared Deviation
2	-2.25	5.0625
3	-1.25	1.5625
3	-1.25	1.5625
9	4.75	22.5625

Lecture 25¶

Average (Mean)¶

Average as Center of Mass¶

Standard Deviation¶

Proving that the Average Deviation is Zero:¶

Variance¶

Units¶

Standard Deviation¶

Chebyshev's Bounds: "Tail" Bounds¶

Testing the Bound on Real Data¶

Examining Maternal Pregnancy Weight¶

Standard Units¶

Discussion Question¶

The SD and Bell Shaped Curves¶

Age in Years	Age in Standard Units
27	-0.0392546
33	0.992496
28	0.132704
23	-0.727088
25	-0.383171
33	0.992496
23	-0.727088
25	-0.383171
30	0.476621
27	-0.0392546

value
2
3
3
9