Tutorial: Bayes' Law¶
In this problem, we'll work through a simple application of Bayes' Law, both analytically and numerically (on a grid).
You will learn to:
- use conjugacy relations to find an analytic form for a posterior
- compute a posterior on a grid (brute force)
- apply Bayes' Law to incorporate new data
from os import getcwd
from yaml import safe_load
import numpy as np
import scipy.stats as st
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
thisTutorial = 'bayes_law'
if getcwd() == '/content':
    # assume we are in Colab, and the user's data directory is linked to their drive/Physics267_data
    from google.colab import drive
    drive.mount('/content/drive')
    datapath = '/content/drive/MyDrive/Physics267_data/' + thisTutorial + '/'
else:
    # assume we are running locally somewhere and have the data under ./data/
    datapath = 'data/'
Background¶
We'll re-examine the first example in the Generative Models tutorial. We're interested in knowing what fraction, $f$, of galaxy clusters in the Universe are morphologically relaxed (hence relatively close to equilibrium). We can think of $f$ equivalently as the a priori probability that a randomly chosen, or newly discovered, cluster is relaxed.
From X-ray imaging of 361 clusters, which we will assume are representative, 57 were found to be morphologically relaxed according to our metric. We will use this information to constrain $f$. Our data could be laboriously written as
$X_1,X_2,\ldots,X_{361}$,
where 57 $X$'s are 1 (relaxed) and 304 are 0 (unrelaxed). Equivalently we can just speak of $s=57$ "successes" out of $n=361$ trials.
But that was real life. To make things more fun, you will be using a randomly generated data set with different values of $n$ and $s$, as read in below.
data = safe_load(open(datapath+'data.yaml', 'r').read())
data
{'n': 341, 's': 158}
We write Bayes' Law for this application as
$p(f|s) = \frac{p(f)P(s|f)}{P(s)}$.
First, as always, we choose a prior. Clearly, by definition, $0\leq f \leq 1$. Without thinking too hard about it, let's take $p(f)$ to be uniform over this interval.
As the problem is defined, the sampling distribution, $P(s|f)$, is binomial:
$P(s|f) = {n \choose s} f^s (1-f)^{n-s}$.
Solution and implementation¶
It's always nice to have a simple, analytic way of expressing our solution, and this problem happens to provide one. Namely, the binomial distribution is conjugate to the beta distribution. Not only that, but the uniform prior chosen above happens to be reproduced for a particular choice of the beta distribution's parameters, $\alpha$ and $\beta$. You can consult Wikipedia to find the conjugacy relation itself, and follow links from there to see the form of the beta distribution.
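For reference (you can also read this off the Wikipedia page, and you'll prove it in a moment), the conjugacy relation says that a beta prior combined with a binomial sampling distribution yields a beta posterior with updated parameters:

$p(f|s) = \mathrm{Beta}(f \,|\, \alpha+s, \, \beta+n-s)$.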
To be submitted on Canvas
Before going on, prove the conjugacy relation. Seriously, it's good practice. Multiply $p(f)$ and $P(s|f)$, and simplify until you obtain a normalized posterior distribution expressed in a standard form. Note that you are allowed to multiply by any constants you want - by definition, whatever constant is needed to produce a normalized PDF must be $1/P(s)$. Put another way, you can ignore terms that don't depend on $f$, and just determine which distribution the functional form of the posterior corresponds to, and what its parameters are.
Now that you're extremely familiar with the beta distribution, fill in the values of the prior hyperparameters that reproduce the uniform distribution on [0,1].
# prior_params = {'alpha':..., 'beta':...}
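If you get stuck, here is one possible assignment (a sketch; use whatever you derived above). It relies on the fact that a beta distribution with $\alpha=\beta=1$ reduces to the uniform distribution on [0,1].

prior_params = {'alpha': 1.0, 'beta': 1.0}  # Beta(1,1) is uniform on [0,1]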
Prior¶
Fill in the cells below to provide the function that evaluates the prior (you can call the scipy.stats implementation of the distribution, or spell out the density) and define prior hyperparameter values that make the prior uniform.
def prior(f, alpha, beta):
    # return the prior density evaluated at f, p(f)
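One possible implementation is sketched below, using the scipy.stats beta distribution (you could equally well spell out the density by hand):

def prior(f, alpha, beta):
    # beta density evaluated at f; with alpha=beta=1 this is uniform on [0,1]
    return st.beta.pdf(f, alpha, beta)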
Excellent! Let's now plot the prior with your chosen parameters, to verify that it's the same as a uniform distribution.
fgrid = np.linspace(0.0, 1.0, 500)
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, prior(fgrid, **prior_params), 'k-');
plt.xlabel('f');
plt.ylabel('p(f)');
Likelihood¶
Next, code up the likelihood function. Recall that this is just the sampling distribution evaluated as a function of the model parameters, with the data fixed.
# here f would be our model parameter, and s and n would be from our data dictionary
def likelihood(f, s, n):
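A minimal sketch using scipy.stats.binom (note scipy's argument order: the PMF is evaluated at s, with n and f as the distribution's parameters):

def likelihood(f, s, n):
    # binomial sampling distribution P(s|f), viewed as a function of f with s and n fixed
    return st.binom.pmf(s, n, f)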
Again, let's take a look.
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, prior(fgrid, **prior_params), 'k-', label='p(f)');
plt.plot(fgrid, likelihood(fgrid, **data), 'b-', label='P(s|f)');
plt.xlabel('f');
plt.legend();
Hold on! You didn't (necessarily) make a mistake. Remember that the likelihood function is not a PDF - it is not normalized over $f$ for a given $s$. So there is no reason the normalizations of the two curves above should be comparable.
In fact, run the cell below to numerically verify that $P(s|f)$ is not normalized over $f$.
print('This does NOT need to be 1.0:', np.trapezoid(likelihood(fgrid, **data), x=fgrid))
This does NOT need to be 1.0: 0.002923976608187133
On the other hand, in its guise as the sampling distribution, it is normalized over $s$ for a given $f$. Verify this for the arbitrary choice of $f$ below, or some other value(s) of your choice.
sgrid = np.arange(0.0, data['n']+1) # all possible values of `s`: 0, 1, ..., n
test_f = np.pi/10.0 # an arbitrary choice of `f`
test_integral = likelihood(test_f, sgrid, data['n']).sum() # sum of P(s|f) over all s
print('This had better be exactly 1.0 (to within numerical error):', test_integral)
This had better be exactly 1.0 (to within numerical error): 1.0000000000000004
# test the above claim
assert np.isclose(test_integral, 1.0)
print('Yay!')
Yay!
Just for fun, here is what $P(s|f)$ looks like as a function of $s$ (i.e. as a sampling distribution) for a few different $f$'s.
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(sgrid, likelihood(0.01, sgrid, data['n']), 'b.', label='f=0.01');
plt.plot(sgrid, likelihood(0.1, sgrid, data['n']), 'r.', label='f=0.1');
plt.plot(sgrid, likelihood(0.7, sgrid, data['n']), 'k.', label='f=0.7');
plt.xlabel('s');
plt.ylabel('P(s|f)');
plt.legend();
Posterior¶
Moving on, code up the posterior distribution. Since its functional form is the same as that of the prior, we can go ahead and use the same Python function.
posterior = prior # you can have this one
Then all we need is a function that determines the posterior distribution's parameters given the prior and data.
def get_post_params(alpha, beta, n, s):
    # Note: the first arguments are the prior's parameters.
    # Return the posterior parameters as a dictionary.
    # Since the posterior distribution (function) is the same as the prior distribution
    # (function), this dictionary's keys should be the same as "prior_params" has.
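A sketch based on the conjugacy relation quoted earlier (posterior parameters $\alpha+s$ and $\beta+n-s$):

def get_post_params(alpha, beta, n, s):
    # conjugate update: Beta(alpha, beta) prior + binomial data -> Beta(alpha+s, beta+n-s)
    return {'alpha': alpha + s, 'beta': beta + n - s}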
See what this looks like:
post_params = get_post_params(**prior_params, **data)
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, prior(fgrid, **prior_params), 'k-', label='p(f)');
plt.plot(fgrid, likelihood(fgrid, **data), 'b-', label='p(s|f)');
plt.plot(fgrid, posterior(fgrid, **post_params), 'r-', label='p(f|s)');
plt.xlabel('f');
plt.legend();
# hidden test of your posterior parameter values
Comparison with brute force¶
Just to drive the point home, let's imagine we didn't recognize this problem as conjugate. Evaluate the posterior over fgrid by brute force (i.e. multiplying the likelihood and prior), and we'll compare. Don't forget to normalize.
# post_fgrid = ...posterior evaluated at f = "fgrid"
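One possible sketch: evaluate prior times likelihood on the grid, then normalize numerically with the trapezoid rule.

post_fgrid = prior(fgrid, **prior_params) * likelihood(fgrid, **data)
post_fgrid /= np.trapezoid(post_fgrid, x=fgrid)  # normalize so it integrates to 1 over f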
Plot it along with the conjugate solution:
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, posterior(fgrid, **post_params), 'r-', label='analytic');
plt.plot(fgrid, post_fgrid, 'b.', label='grid');
plt.xlabel('f');
plt.ylabel('p(f|s)');
plt.legend();
# test whether these agree
assert np.allclose(posterior(fgrid, **post_params), post_fgrid)
print('Yay!')
Yay!
You might be thinking that all this messing around with conjugacy relations is silly, and in this simple case, which takes very little time to evaluate numerically, that's fair enough. While conjugacy doesn't apply to every problem, there are a few reasons it's worth knowing about for those times that it is a viable strategy:
- Since the posterior has a well known functional form, we instantly know its mean, median, mode, variance, skewness, kurtosis, etc., etc., etc. to arbitrary precision - things that we might be interested in that would be more annoying to estimate numerically. They are simple functions of the distribution's parameters that we can look up.
- When dealing with multi-parameter distributions and/or large amounts of data, leaping straight to the final answer (after, at most, some linear algebra) can sometimes represent a significant speed-up over more brute-force methods.
Summarizing the constraint on $f$¶
While the posterior for $f$, evaluated one of the ways above, in principle is the entire answer, we normally want to summarize the constraint in terms of a best value and an interval. You'll see more about this in the Credible Region notes and tutorial. For now, we'll go ahead and summarize the constraint according to one convention for doing so.
Write a function that finds the median, 15.85th percentile and 84.15th percentile of the posterior distribution for $f$. Do this based on the parameters of the conjugate posterior, which will let you take advantage of scipy functions (e.g. scipy.stats.<distribution>.median).
def summarize_posterior(post_params):
    # Find the 50th, 15.85th and 84.15th percentiles of the posterior distribution.
    # Return these as a numpy array with shape (3,), in that order.
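A sketch using the beta distribution's percent point function (inverse CDF) in scipy.stats:

def summarize_posterior(post_params):
    # 50th (median), 15.85th and 84.15th percentiles of the beta posterior
    return st.beta.ppf([0.50, 0.1585, 0.8415], post_params['alpha'], post_params['beta'])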
print('My best fit and interval:', summarize_posterior(post_params))
print('**Checkpoint:** ', np.array(safe_load(open(datapath+'interval_checkpoint.yaml', 'r').read())))
My best fit and interval: [0.46348594 0.43661446 0.49049967]
**Checkpoint:**  [0.46348594 0.43661446 0.49049967]
Updating with new data¶
Suppose we get more data:
data2 = safe_load(open(datapath+'data2.yaml', 'r').read())
data2
{'n': 249, 's': 117}
Use your get_post_params function to compute the posterior parameters for
- the second data set combined with the original prior (not using the first data set), and
- the combination of both data sets and the original prior, by using the posterior from the first data set as the prior distribution for an analysis of the second data set.
# post_params_2 = posterior from just the second data set
# post_params_both = posterior from both data sets
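One possible sketch, reusing get_post_params for both updates (the second call uses the posterior from the first data set as its prior):

post_params_2 = get_post_params(**prior_params, **data2)    # original prior + second data set only
post_params_both = get_post_params(**post_params, **data2)  # posterior from first data set, updated with the second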
print('posterior parameters from just the second data set', post_params_2)
print('posterior parameters from both data sets', post_params_both)
posterior parameters from just the second data set {'alpha': 118.0, 'beta': 133.0}
posterior parameters from both data sets {'alpha': 276.0, 'beta': 316.0}
Note that you should get the same result for the second case as you would by analyzing the original prior with the combined data set, i.e. with $s_1+s_2$ successes in $n_1+n_2$ trials. This should be clear from the way the posterior parameters depend on $n$ and $s$ in this particular conjugacy relation, and is hopefully intuitive for a binomial experiment.
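A quick check of that claim (a sketch; data_combined is just a hypothetical name for the pooled data, and the result should equal post_params_both):

data_combined = {'n': data['n'] + data2['n'], 's': data['s'] + data2['s']}  # pooled data
print(get_post_params(**prior_params, **data_combined))  # should match post_params_both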
Here's how the posteriors from the individual data sets compare with the prior:
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, prior(fgrid, **prior_params), 'k-', label='p(f)');
plt.plot(fgrid, posterior(fgrid, **post_params), 'r-', label='p(f|s_1)');
plt.plot(fgrid, posterior(fgrid, **post_params_2), 'b-', label='p(f|s_2)');
plt.xlabel('f');
plt.legend();
This visualizes the accumulation of data, as we add the first and then the second data set to the prior information:
plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.plot(fgrid, prior(fgrid, **prior_params), 'k-', label=r'$p(f)$');
plt.plot(fgrid, posterior(fgrid, **post_params), 'r-', label=r'$p(f|s_1)$');
plt.plot(fgrid, posterior(fgrid, **post_params_both), 'b-', label=r'$p(f|s_1,s_2)$');
plt.xlabel('f');
plt.legend();
Finally, compare the constraints:
print('Just data1:', summarize_posterior(post_params))
print('Just data2:', summarize_posterior(post_params_2))
print('Both data1 and data2', summarize_posterior(post_params_both))
Just data1: [0.46348594 0.43661446 0.49049967]
Just data2: [0.47004003 0.43859668 0.50164294]
Both data1 and data2 [0.46617815 0.44570036 0.48673225]
Check the goodness of fit¶
Because we have absorbed the pointed lessons in the notes so well, we will finish by comparing posterior predictions of the fitted model with the data. That is, we want to make sure that the model, for the posterior distribution we've ended up with, could plausibly have produced the observed data. We'll use the posterior as updated by data2 above, i.e. parametrized by post_params_both. Because the binomial distribution is additive (for the same $f$ parameter), we can compare this with the combined data:
data_both = {'n':data['n']+data2['n'], 's':data['s']+data2['s']}
What we want to do in this case is predict the distribution of $s'$ marginalized over the posterior distribution of $f$, where $s'$ represents an equivalent measurement to $s$ (i.e. from the same total number $n$, but obviously not actually the same clusters). In other words, we should compute
$P(s'|s) = \int df \, p(f|s) \, P(s'|f)$
and see how plausible $s$ is compared with this PDF. (What the expression above is and why we care is spelled out more in the Model Evaluation and Comparison I notes, though you don't need to read ahead to complete the rest of this notebook.)
In conjugate problems, this posterior predictive distribution is often also a standard PDF that can be arrived at analytically (it's on the Wikipedia page above, if you want to add a comparison below), but let's instead go through a procedure that will work more generally. Namely, just like in the Bayes' Law notes, we will use the generative model as, well, a generative model, and produce a list of potential data $s'$ from an equally long list of values of $f$ drawn from the posterior distribution. Fill in the code below (you can use scipy.stats.<distribution>.rvs functions to generate random numbers from a PDF).
Npredict = 1000000
# f_from_posterior = ...
# s_post_predicted = ...
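One possible sketch using scipy.stats rvs functions: draw values of $f$ from the beta posterior, then draw one binomial realization of $s'$ for each of them.

f_from_posterior = st.beta.rvs(post_params_both['alpha'], post_params_both['beta'], size=Npredict)
s_post_predicted = st.binom.rvs(data_both['n'], f_from_posterior)  # one s' draw per posterior sample of f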
Below we compare this distribution to the measured $s$:
plt.hist(s_post_predicted, density=True, label='your samples', bins=20);
plt.axvline(x=data_both['s'], label='measured s', color='C1')
plt.xlabel(r'$s\prime$', fontsize='x-large');
plt.ylabel(r'$P(s\prime|s)$', fontsize='x-large');
plt.legend(fontsize='x-large');
It should look very consistent. Of course, this was an extremely simple model - is it even possible for us to get a "bad fit", where the distribution of $s'$ does not comfortably include $s$? The only circumstance I can think of where that would happen is if data and data2 had not come from the same model, that is, if they had been generated by significantly different values of $f$ somehow. Then we might see the distribution above being less consistent or inconsistent with the value of $s$ in data_both, and conclude that our model with a single $f$ doesn't adequately describe all the data. We would hopefully also be able to identify the issue when comparing the two individual posteriors above. We'll cover more quantitative ways to define "consistency" later on, but for now we have a simple, visual check of how acceptable the fit is.
As it happens, the distribution of $s'$ is also a standard one in this case, so we can do a quick quantitative check to ensure that everything holds together above. The cell below compares the mean of your predictions to the mean of the known distribution. (Due to the finite number of samples, we're allowing a healthy amount of error.)
assert np.isclose(s_post_predicted.mean(), data_both['n']*post_params_both['alpha']/(post_params_both['alpha']+post_params_both['beta']), atol=0.1)
print('Yay!')
Yay!