from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
Recall that we actually have access to the population data. This is not normally the case: the methods we are about to study only make sense when you don't have access to the population.
sf = Table.read_table('san_francisco_2019.csv')
min_salary = 15 * 20 * 50
sf = sf.where('Salary', are.above(min_salary))
sf.num_rows
37103
sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
Here we are interested in the median (50th percentile) of the total compensation.
# Parameter: Median total compensation in the population
def median_comp(t):
    return percentile(50, t.column('Total Compensation'))
We have access to the population, so we can compute the parameter directly. In practice, however, we typically won't have access to the population and will instead have to estimate the parameter from a sample.
pop_median = median_comp(sf)
print("Parameter Value:", pop_median)
Parameter Value: 135747.0
In practice, the data we would get from most studies would be a sample of the entire population. This is because collecting a full census is challenging and expensive.
Here we load the (fictional) sample that was taken by Prof. Gonzalez.
He collected this sample by selecting 400 names at random from the HR database and then directly asking them how much they make. It took all weekend and he had to give a chocolate to everyone who participated.
Actually, everything above is a complete fabrication. Prof. Gonzalez simply wrote:
sf.sample(400, with_replacement=False).to_csv("sf_sample.csv")
sample_sf = Table.read_table("sf_sample.csv")
sample_sf.show(4)
Organization Group | Department | Job Family | Job | Salary | Overtime | Benefits | Total Compensation |
---|---|---|---|---|---|---|---|
Community Health | Public Health | Human Services | Hospital Eligibility Worker | 75854 | 2661 | 35780 | 114296 |
Community Health | Public Health | Nursing | Nurse Practitioner | 236867 | 0 | 77694 | 314562 |
Public Protection | Police | Police Services | Police Officer | 124906 | 12346 | 41706 | 178957 |
Public Works, Transportation & Commerce | Department Of Public Works | Skilled Labor | Hodcarrier | 88812 | 589 | 40995 | 130396 |
... (396 rows omitted)
sample_sf.num_rows
400
We introduced the following function, which computes the statistic on each of many bootstrap samples:
def bootstrapper(sample, statistic, num_repetitions):
    """
    Returns the statistic computed on num_repetitions
    bootstrap samples drawn from sample.
    """
    bstrap_stats = make_array()
    for i in np.arange(num_repetitions):
        # Step 1: sample the sample (same size, with replacement)
        bootstrap_sample = sample.sample()
        # Step 2: compute the statistic on the sample of the sample
        bootstrap_stat = statistic(bootstrap_sample)
        # Accumulate the statistics
        bstrap_stats = np.append(bstrap_stats, bootstrap_stat)
    return bstrap_stats
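The key step is `sample.sample()`, which with no arguments draws a resample of the same size as the original, with replacement. The same idea can be sketched in plain NumPy (a toy illustration with made-up data, not the course's datascience API):

```python
import numpy as np

rng = np.random.default_rng(0)
original_sample = np.array([10, 20, 30, 40, 50])

# One bootstrap resample: same size as the original, drawn with replacement
resample = rng.choice(original_sample, size=len(original_sample), replace=True)

# The bootstrap distribution of the median: resample many times,
# recording the median of each resample
boot_medians = np.array([
    np.median(rng.choice(original_sample, size=len(original_sample), replace=True))
    for _ in range(1000)
])
```

Because each resample only reuses the observed values, every bootstrap median must lie between the smallest and largest value in the original sample.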
bootstrap_medians = bootstrapper(sample_sf, median_comp, 10000)
When using the bootstrap, it is important to always examine the distribution of the statistic to check that it is roughly symmetric and "bell shaped".
Table().with_column("bootstraps", bootstrap_medians).hist()
## Extra Viz code
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2)
plots.title('Bootstrap Medians and the Parameter (Green Dot)');
We compute the confidence interval for our desired *confidence level* using the following code:
def compute_ci(bs_samples, confidence_level):
    """
    Returns the confidence interval for the provided bootstrap samples
    and desired confidence level.
    """
    tail_size = (100 - confidence_level) / 2
    lower = percentile(tail_size, bs_samples)
    upper = percentile(100 - tail_size, bs_samples)
    return [lower, upper]
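The same percentile arithmetic can be checked with NumPy alone (a sketch: `np.percentile` interpolates between values, while the datascience `percentile` function picks an actual element, so endpoints can differ slightly):

```python
import numpy as np

def compute_ci_np(bs_samples, confidence_level):
    """Percentile-method confidence interval, NumPy only."""
    tail_size = (100 - confidence_level) / 2  # e.g. 2.5 for a 95% CI
    lower = np.percentile(bs_samples, tail_size)
    upper = np.percentile(bs_samples, 100 - tail_size)
    return [lower, upper]

# With fake bootstrap statistics 1..1000, a 95% interval
# cuts off roughly the bottom and top 2.5% of values
fake_stats = np.arange(1, 1001)
ci = compute_ci_np(fake_stats, 95)
```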
ci = compute_ci(bootstrap_medians, 95)
ci
[130580.0, 142054.0]
Visualizing the CI
Table().with_column("bootstraps", bootstrap_medians).hist()
# cool python trick to deconstruct the array!
[left, right] = ci
# Plotting parameters; you can ignore this code
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);
Return to Slides
Let's look at what happens if we use the above code repeatedly with separate original samples from the population. How accurate are our bootstrap estimates of the parameter?
intervals = Table(['lower', 'upper', 'median', 'good', 'sample_size'])
sample_sizes = [2, 4, 8, 16]
for sample_size in sample_sizes:
    for trial in np.arange(20):  # Run 20 trials of each configuration
        # Pay for one new random sample from the population
        og_sample = sf.sample(sample_size, with_replacement=False)
        # Compute the statistic on the sample
        sample_median = median_comp(og_sample)
        # Generate the medians of 1000 bootstrap samples
        bootstrap_medians = bootstrapper(og_sample, median_comp, 1000)
        # Construct the confidence interval
        [ci_lower, ci_upper] = compute_ci(bootstrap_medians, 95)
        # Determine whether the confidence interval covers the parameter
        is_good = ci_lower <= pop_median <= ci_upper
        # Add a row to the table
        intervals = intervals.with_row(
            [ci_lower, ci_upper, sample_median, is_good, str(sample_size)])
# Add an experiment number
intervals = intervals.with_column("Experiment", np.arange(intervals.num_rows))
intervals
lower | upper | median | good | sample_size | Experiment |
---|---|---|---|---|---|
148779 | 188554 | 148779 | False | 2 | 0 |
210940 | 234990 | 210940 | False | 2 | 1 |
107907 | 180652 | 107907 | True | 2 | 2 |
121077 | 137292 | 121077 | True | 2 | 3 |
25252 | 59568 | 25252 | False | 2 | 4 |
105799 | 113954 | 105799 | False | 2 | 5 |
20421 | 26208 | 20421 | False | 2 | 6 |
105664 | 123580 | 105664 | False | 2 | 7 |
137439 | 147847 | 137439 | False | 2 | 8 |
79425 | 138245 | 79425 | True | 2 | 9 |
... (70 rows omitted)
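For 95% intervals we expect roughly 95% of the `good` entries to be True once the original samples are reasonably large; as the table shows, coverage is far worse for tiny samples like 2. Measuring coverage is just counting True values; a sketch using a stand-in boolean array in place of `intervals.column('good')`:

```python
import numpy as np

# Stand-in for intervals.column('good'): did each interval cover the parameter?
good = np.array([True, False, True, True, False, True, True, True])

# Fraction of confidence intervals that captured the parameter
coverage = np.count_nonzero(good) / len(good)
```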
Here I render a plot of all the confidence intervals, with the parameter value depicted as a solid vertical line. This is advanced plotting code and is out of scope for this class, but it might be fun to try to understand what it is doing.
import plotly.express as px
# Plotly will draw error bars, which are the distance in each direction
# from the median
intervals = intervals.with_columns(
    "error_left", intervals.column("median") - intervals.column("lower"),
    "error_right", intervals.column("upper") - intervals.column("median"))
# Making the plot
fig = px.scatter(
    intervals.to_df(),     # Plotly requires a pandas dataframe
    x="median",            # X location of the interval's center dot
    y="Experiment",        # Y location of the interval's center dot
    color="sample_size",   # The column to use for color
    symbol="good",         # The column to use for the symbol
    symbol_map={True: "circle", False: "circle-open"},
    error_x="error_right",       # upper error bar size
    error_x_minus="error_left",  # lower error bar size
    height=800)
fig.add_vline(pop_median)
Return to Slides
Now let's look at a more appropriate use of the bootstrap: when we don't have the population. The baby table from the prior lecture contains a random sample of mother-newborn pairs.
# Random sample of mother-newborn pairs
births = Table.read_table('baby.csv')
births
Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
---|---|---|---|---|---|
120 | 284 | 27 | 62 | 100 | False |
113 | 282 | 33 | 64 | 135 | False |
128 | 279 | 28 | 64 | 115 | True |
108 | 282 | 23 | 67 | 125 | True |
136 | 286 | 25 | 62 | 93 | False |
138 | 244 | 33 | 62 | 178 | False |
132 | 245 | 23 | 65 | 140 | False |
120 | 289 | 25 | 62 | 125 | False |
143 | 299 | 30 | 66 | 136 | True |
140 | 351 | 27 | 68 | 120 | False |
... (1164 rows omitted)
What is the average age of moms who are having children in the entire population of the Bay Area?
Parameter: Average age of moms when they give birth to their first child.
Statistic: Average age of moms in our sample.
# Average age of mothers in the sample
np.average(births.column('Maternal Age'))
27.228279386712096
Remember that there is a distribution of ages.
births.hist('Maternal Age')
We could have also returned the median or even a range of ages as our statistic:
percentile(50, births.column('Maternal Age'))
26
Or an interval of 95% of the ages:
[percentile(2.5, births.column('Maternal Age')),
 percentile(97.5, births.column('Maternal Age'))]
[19, 40]
Is this a confidence interval? No: it is an interval covering 95% of the observed ages in the sample, not an interval of estimates of a population parameter.
Since we are interested in estimating the average age of mothers in the population we will use the average age statistic:
def avg_maternal_age(sample):
    return np.average(sample.column("Maternal Age"))
sample_statistic = avg_maternal_age(births)
sample_statistic
27.228279386712096
The interval of estimates is the "middle 95%" of the bootstrap estimates.
This is called a 95% confidence interval for the mean age in the population.
bootstrap_means = bootstrapper(births, avg_maternal_age, 3000)
avg_maternal_age_ci = compute_ci(bootstrap_means, confidence_level=95)
avg_maternal_age_ci
[26.890971039182283, 27.551107325383306]
resampled_means = Table().with_columns(
    'Bootstrap Sample Mean', bootstrap_means
)
resampled_means.hist(bins=15)
[left, right] = avg_maternal_age_ci
plots.plot([left, right], [0, 0], color='yellow', lw=8);
births.hist('Maternal Age')
plots.plot([left, right], [0, 0], color='yellow', lw=8);
Return to Slides
Null: The average age of mothers in the population is 25 years; the random sample average is different due to chance.
Alternative: The average age of the mothers in the population is not 25 years.
Suppose you use the 5% cutoff for the p-value.
Based on the confidence interval, which hypothesis would you pick?
bootstrap_means = bootstrapper(births, avg_maternal_age, 5000)
avg_maternal_age_ci = compute_ci(bootstrap_means, confidence_level=95)
avg_maternal_age_ci
[26.896933560477002, 27.545144804088586]
bootstrap_means = bootstrapper(births, avg_maternal_age, 5000)
avg_maternal_age_ci = compute_ci(bootstrap_means, confidence_level=99.9)
avg_maternal_age_ci
[26.680579216354346, 27.755536626916523]
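This illustrates the duality between confidence intervals and hypothesis tests: with a 5% cutoff, we reject the null value of 25 exactly when 25 falls outside the 95% interval. A sketch using the interval values printed above (hard-coded here for illustration):

```python
# 95% CI for the mean maternal age, from the bootstrap above
ci_lower, ci_upper = 26.896933560477002, 27.545144804088586
null_value = 25

# Reject the null at the 5% cutoff if 25 lies outside the interval
reject_null = not (ci_lower <= null_value <= ci_upper)
```

Since 25 is also below the lower end of the 99.9% interval, we would reject the null even at the much stricter 0.1% cutoff.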
Return to Slides
There are many ways to compute the mean (average).
values = make_array(2, 3, 3, 9)
values
array([2, 3, 3, 9])
sum(values)/len(values)
4.25
np.average(values)
4.25
np.mean(values)
4.25
(2 + 3 + 3 + 9)/4
4.25
The mean can also be framed as a weighted average of the unique values.
2*(1/4) + 3*(2/4) + 9*(1/4)
4.25
2*0.25 + 3*0.5 + 9*0.25
4.25
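`np.average` can compute this weighted form directly via its `weights` argument, where the weights are the proportions of each unique value:

```python
import numpy as np

unique_values = np.array([2, 3, 9])
proportions = np.array([1/4, 2/4, 1/4])  # fraction of the data at each value

weighted_mean = np.average(unique_values, weights=proportions)
# matches the ordinary mean of [2, 3, 3, 9]
```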
values_table = Table().with_columns('value', values)
values_table
value |
---|
2 |
3 |
3 |
9 |
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)
## Make an array of 10 2s, 20 3s, and 10 9s
new_vals = make_array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                      9, 9, 9, 9, 9, 9, 9, 9, 9, 9)
Table().with_column('value', new_vals).hist(bins = bins_for_display)
np.average(new_vals)
4.25
Table().with_column('value', new_vals).hist(bins = bins_for_display)
plots.ylim(-0.04, 0.5)
plots.plot([0, 10], [0, 0], color='grey', lw=2)
plots.scatter(4.25, -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Gravity');