Tutorial: O-ring Failure Rates Prior to the Challenger Shuttle Loss¶
Coping with missing information¶
In this tutorial, we will use a real data set, one where unwise interpretation of incomplete data had serious consequences, to illustrate how such selection effects can be modeled. You will
- implement and fit a simple model for the probability of O-ring failure as a function of ambient temperature, assuming a complete data set;
- modify the model to account for censoring of the data;
- compare the inferences from the complete data with those from the censored data under "vague" and "specific" prior information.
from os import getcwd
from os.path import exists as file_exists
from yaml import safe_load
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
import emcee
import incredible as cr
from pygtc import plotGTC
thisTutorial = 'missing_data'
if getcwd() == '/content':
    # assume we are in Colab, and the user's data directory is linked to their drive/Physics267_data
    from google.colab import drive
    drive.mount('/content/drive')
    datapath = '/content/drive/MyDrive/Physics267_data/' + thisTutorial + '/'
else:
    # assume we are running locally somewhere and have the data under ./data/
    datapath = 'data/'
Background¶
On January 28, 1986, the Space Shuttle Challenger was destroyed in an explosion during launch. The cause was eventually found to be the failure of an O-ring seal that normally prevents hot gas from leaking between two segments of the solid rocket motors during their burn. The ambient atmospheric temperature of just 36 degrees Fahrenheit, substantially colder than at any previous launch, was determined to be a significant factor in the failure.
A relevant excerpt from the Report of the Presidential Commission on the Space Shuttle Challenger Accident reads:
Temperature Effects¶
The record of the fateful series of NASA and Thiokol meetings, telephone conferences, notes, and facsimile transmissions on January 27th, the night before the launch of flight 51L, shows that only limited consideration was given to the past history of O-ring damage in terms of temperature. The managers compared as a function of temperature the flights for which thermal distress of O-rings had been observed - not the frequency of occurrence based on all flights (Figure 6). In such a comparison, there is nothing irregular in the distribution of O-ring "distress" over the spectrum of joint temperatures at launch between 53 degrees Fahrenheit and 75 degrees Fahrenheit. When the entire history of flight experience is considered, including "normal" flights with no erosion or blow-by, the comparison is substantially different (Figure 7).
This comparison of flight history indicates that only three incidents of O-ring thermal distress occurred out of twenty flights with O-ring temperatures at 66 degrees Fahrenheit or above, whereas, all four flights with O-ring temperatures at 63 degrees Fahrenheit or below experienced O-ring thermal distress.
Consideration of the entire launch temperature history indicates that the probability of O-ring distress is increased to almost a certainty if the temperature of the joint is less than 65.
The data above show the number of incidents of O-ring damage found in previous missions as a function of the temperature at launch.
For this notebook, we will simplify the data for each launch from integer (how many incidents of O-ring damage) to boolean (was there any damage, or not). The temperatures corresponding to "failure" (any incidents) and "success" (no incidents) are:
failure_temps = np.array([53., 56., 57., 63., 70., 70., 75.])
success_temps = np.array([66., 67., 67., 67., 68., 69., 70., 70., 72., 73., 75., 76., 76., 78., 79., 80., 81.])
The above cell is provided for convenience, in case you want to see how this analysis works out for the real pre-Challenger data (it is also used in the public solutions). As usual, we also provide enrolled students with their own realistic but randomly generated data. Lucky you! (Note: it's consequently possible that you won't reach the same conclusions as others and/or the public solutions, so some of the commentary below may not be applicable.)
data = safe_load(open(datapath+'data.yaml', 'r').read())
failure_temps = np.array(data['failure'])
success_temps = np.array(data['success'])
print("Failure temperatures:", failure_temps)
print("Success temperatures:", success_temps)
Failure temperatures: [53. 56. 57. 63. 70. 70. 75.]
Success temperatures: [66. 67. 67. 67. 68. 69. 70. 70. 72. 73. 75. 76. 76. 78. 79. 80. 81.]
To get a more visual feel, below we plot the data similarly to how they are shown above (a small amount of vertical jitter is included so that points don't cover one another completely).
plt.rcParams['figure.figsize'] = (6, 4)
plt.plot(success_temps, st.norm.rvs(success_temps*0.0, 0.05), 'o', label='Successes');
plt.plot(failure_temps, st.norm.rvs(failure_temps*0.0+1.0, 0.05), 'o', label='Failures');
plt.xlim(45,85); plt.ylim(-1,2); plt.xlabel('Temperature (F)'); plt.yticks([]); plt.legend();
1. Defining a model¶
Before worrying about missing data, let's define a model that we might want to fit to the complete data. We're interested in whether the probability of having zero O-ring incidents (or non-zero incidents, conversely) is a function of temperature. One possible parametrization that allows this is the logistic function, which squeezes the real line onto the range (0,1).
For reasons that may be clear later, I suggest defining the model in terms of the probability of success (zero incidents)
$P_\mathrm{success}(T|T_0,\beta,P_\mathrm{cold},P_\mathrm{hot}) = P_\mathrm{cold} + \frac{P_\mathrm{hot} - P_\mathrm{cold}}{1 + e^{-\beta(T-T_0)}}$,
with parameters $T_0$ and $\beta$ respectively determining the center and width of the logistic function, and $P_\mathrm{cold}$ and $P_\mathrm{hot}$ determining the probabilities of success at very low and very high temperatures (which need not be 0 or 1).
As we'll see in a moment, a model like this provides a smooth, linear-ish transition between two extreme values, without imposing the strong prior that $P_\mathrm{success}$ must drop to zero at some point, for example.
1a. Implement this function and have a look¶
def P_success(T, T0, beta, Pcold, Phot):
    """
    Evaluate Psuccess as given above, as a function of T, for parameters T0, beta, Pcold, Phot.
    """
Below we plot the function for a few different parameter values. If you've never worked with the logistic function (or a similar sigmoid function) before, this will give you an idea of how flexible it is and isn't.
plt.rcParams['figure.figsize'] = (6, 4)
T_axis = np.arange(32., 100.)
plt.plot(T_axis, P_success(T_axis, 70.0, 0.3, 0.0, 1.0));
plt.plot(T_axis, P_success(T_axis, 65.0, 0.1, 0.4, 0.9));
plt.plot(T_axis, P_success(T_axis, 45.0, 1.0, 0.1, 0.5));
plt.plot(T_axis, P_success(T_axis, 80.0, 0.5, 0.9, 0.2));
plt.xlabel('temperature (F)');
plt.ylabel('probability of a clean launch');
1b. PGM and priors¶
Given the definition of the data and model above, draw the PGM for this problem, write down the corresponding probability expressions, and write down the likelihood (all assuming we have the complete data set).
Space for your PGM etc.
Choosing priors is a little tricky because we're interested in the model's predictions at $T=36$ degrees F, which is an extrapolation even for the complete data set.
We'd like our model to be consistent with no trend a priori - that way we can see relatively straightforwardly whether the data require there to be a trend. A pleasingly symmetric way to allow this is to put identical, independent priors on $P_\mathrm{cold}$ and $P_\mathrm{hot}$, in particular including the possibility that $P_\mathrm{cold} > P_\mathrm{hot}$ even though that isn't what we're looking for. Thus, a solution with $P_\mathrm{cold}=P_\mathrm{hot}$, i.e. no trend, is perfectly allowed.
Our temperature data are given in integer degrees, so it doesn't make sense to allow values of $\beta$ too much greater than 1, since the data would not resolve such a sudden change (which would increasingly make $P_\mathrm{success}$ resemble a step function). By definition, $\beta>0$ (it's a "rate" parameter).
In principle, we might allow $T_0$ to take any value. But, arguably, the most sensible thing we can do with such limited information is test whether there is evidence for a trend in the probability of O-ring failure within the range of the available data (or, a little more casually, the range of the figure from the report, above). Given the flexibility already provided by the choices above, there's little obvious benefit to allowing $T_0$ to vary more than this.
As usual, note: in class we will discuss this more and agree on a common set of priors that everyone should use. Once we have done so, implement the log-prior function below.
def ln_prior(T0, beta, Pcold, Phot):
    """
    Return the log-prior density for parameters T0, beta, Pcold, Phot
    """
Let's make sure that parameter values which should definitely be excluded by the prior are, in fact, excluded:
assert ln_prior(-70.0, 0.3, 0.0, 1.0) == -np.inf
assert ln_prior(70.0, -0.3, 0.0, 1.0) == -np.inf
assert ln_prior(70.0, 0.3, -0.1, 1.0) == -np.inf
assert ln_prior(70.0, 0.3, 1.1, 1.0) == -np.inf
assert ln_prior(70.0, 0.3, 0.0, -1.0) == -np.inf
assert ln_prior(70.0, 0.3, 0.0, 2.0) == -np.inf
Finally, let's check what your prior implies for the failure probability at a temperature of 36 degrees. This is a good idea whenever we are interested in inferring a quantity whose prior is "indirect" (usually because it is not itself a sampled parameter). We'll need a function that samples our free parameters from the prior:
def prior_sample():
    """ Return a dictionary with keys T0, beta, Pcold, Phot containing a random draw from the prior """
Have a look:
plt.rcParams['figure.figsize'] = (5,3)
prior_samples = [1.0 - P_success(T=36., **prior_sample()) for i in range(5000)]
plt.hist(prior_samples, density=True); plt.xlabel('Prior probability of failure at 36F');
Is this distribution as expected based on the priors for the explicit model parameters? Any surprises?
I_have_thought_about_this_and_it_makes_sense = False # change to True when true
assert I_have_thought_about_this_and_it_makes_sense
1c. Model fitting code¶
Since the point of this tutorial is model design rather than carrying out a fit, a bunch of code is given below. Naturally, you should ensure that you understand what the code is doing, even though there's nothing to add.
Here we follow a similar, though simpler, approach to the object-oriented code used in the model evaluation notebook, since the models we'll compare all have the same set of free parameters. The `Model` object will take log-prior and log-likelihood functions as inputs in its constructor (instead of deriving new classes corresponding to different likelihoods), and will deal with the computational aspects of fitting the parameters. It will also provide a posterior prediction for the thing we actually care about, the failure probability at a given temperature. To do this, we need to marginalize over the model parameters; that is, we evaluate $1-P_\mathrm{success}$ at some temperature of interest for each posterior sample, and summarize the resulting distribution (e.g. with its median and quantiles).
class Model:
    def __init__(self, log_prior, log_likelihood):
        self.log_prior = log_prior
        self.log_likelihood = log_likelihood
        self.param_names = ['T0', 'beta', 'Pcold', 'Phot']
        self.param_labels = [r'$T_0$', r'$\beta$', r'$P_\mathrm{cold}$', r'$P_\mathrm{hot}$']
        self.sampler = None
        self.samples = None
    def log_posterior(self, pvec=None, **params):
        '''
        Our usual log-posterior function, able to take a vector argument to satisfy emcee
        '''
        if pvec is not None:
            pdict = {k:pvec[i] for i,k in enumerate(self.param_names)}
            return self.log_posterior(**pdict)
        lnp = self.log_prior(**params)
        if lnp != -np.inf:
            lnp += self.log_likelihood(**params)
        return lnp
    def sample_posterior(self, nwalkers=8, nsteps=10000, guess=[65.0, 0.1, 0.25, 0.75], threads=1):
        # use emcee to sample the posterior
        npars = len(self.param_names)
        self.sampler = emcee.EnsembleSampler(nwalkers, npars, self.log_posterior, threads=threads)
        start = np.array([np.array(guess)*(1.0 + 0.01*np.random.randn(npars)) for j in range(nwalkers)])
        self.sampler.run_mcmc(start, nsteps)
        plt.rcParams['figure.figsize'] = (16.0, 3.0*npars)
        fig, ax = plt.subplots(npars, 1);
        cr.plot_traces(self.sampler.chain[:min(8,nwalkers),:,:], ax, labels=self.param_labels);
    def check_chains(self, burn=500, maxlag=500):
        '''
        Ignoring `burn` samples from the front of each chain, compute convergence criteria and
        effective number of samples.
        '''
        nwalk, nsteps, npars = self.sampler.chain.shape
        if burn < 1 or burn >= nsteps:
            return
        tmp_samples = [self.sampler.chain[i,burn:,:] for i in range(nwalk)]
        R = cr.GelmanRubinR(tmp_samples)
        print('R =', R)
        neff = cr.effective_samples(tmp_samples, maxlag=maxlag, throw=True)
        print('neff =', neff)
        print('NB: Since walkers are not independent, these will be optimistic!')
        return R,neff
    def remove_burnin(self, burn=500):
        '''
        Remove `burn` samples from the front of each chain, and concatenate.
        Store the result in self.samples.
        '''
        nwalk, nsteps, npars = self.sampler.chain.shape
        if burn < 1 or burn >= nsteps:
            return
        self.samples = self.sampler.chain[:,burn:,:].reshape(nwalk*(nsteps-burn), npars)
    def posterior_prediction_Pfailure(self, temperatures=np.arange(30., 85.), probs=[0.5, 0.16, 0.84]):
        '''
        For the given temperatures, compute and store quantiles of the posterior predictive distribution for O-ring failure.
        By default, return the median and a 68% credible interval (defined via quantiles).
        '''
        Pfail = np.array([1.0-P_success(T, self.samples[:,0], self.samples[:,1], self.samples[:,2], self.samples[:,3]) for T in temperatures])
        res = {'T':temperatures, 'p':[str(p) for p in probs]}
        for p in probs:
            res[str(p)] = np.quantile(Pfail, p, axis=1)
        self.post_failure = res
    def plot_Pfailure(self, ax, color, label):
        '''
        Plot summaries of the posterior predictive distribution for O-ring failure.
        Show the center as a solid line and credible interval(s) bounded by dashed lines.
        '''
        ax.plot(self.post_failure['T'], self.post_failure[self.post_failure['p'][0]], color+'-', label=label)
        n = len(self.post_failure['p'])
        if n > 1:
            for j in range(1,n):
                ax.plot(self.post_failure['T'], self.post_failure[self.post_failure['p'][j]], color+'--')
        ax.set_xlabel('T (F)');
        ax.set_ylabel(r'$P_\mathrm{failure}(T)$');
        ax.legend();
    def post1d_Pfailure(self, T):
        self.whist = cr.whist(1.0-P_success(T, *self.samples.T))
2. Solution for complete data¶
First, let's see what the solution looks like when there are no missing data. Complete the likelihood function appropriate for a complete data set below.
def ln_like_complete(T0, beta, Pcold, Phot):
    """
    Return the log-likelihood corresponding to a complete data set
    (Go ahead and access `failure_temps' and `success_temps' from global scope, even though this is not good programming practice)
    """
Now we put the `Model` code to work. The defaults below should work well enough, but do keep an eye on the usual basic diagnostics as provided below, and make any necessary changes.
guess = [65.0, 0.1, 0.25, 0.75] # starting position
burn = 1000 # burn-in
maxlag = 1500 # max lag for Neffective estimate
nsteps = 15000
First we instantiate the model...
complete_model = Model(ln_prior, ln_like_complete)
... and run the fit. Note that the parameters are not likely to be individually well constrained by the data, compared with the prior. We don't necessarily care about this - the important question is what the posterior predictive distribution for the probability of failure at a given temperature looks like. (We do, of course, need the chains to be converged and adequately sampled, however.)
%%time
complete_model.sample_posterior(nwalkers=8, nsteps=nsteps, guess=guess)
CPU times: user 11.1 s, sys: 54.6 ms, total: 11.2 s Wall time: 11.2 s
Check the usual diagnostics (note: autocorrelation is likely to be relatively high):
R,neff = complete_model.check_chains(burn=burn, maxlag=maxlag)
assert np.all(R < 1.05)
assert np.all(neff > 400)
R = [1.00425898 1.00280167 1.0091491 1.00267955]
neff = [ 726.51284986 1218.01819689  723.05536844 1183.78155128]
NB: Since walkers are not independent, these will be optimistic!
Finally, remove burn-in and plot the marginal posteriors:
complete_model.remove_burnin(burn=burn)
plotGTC(complete_model.samples, paramNames=complete_model.param_labels,
figureSize=6, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});
There's a good chance that the chains and posterior look considerably uglier than you're used to in this course. (See comments in earlier notes about the difference between convergence and having tight constraints.) Again, this is a situation where we are less concerned with constraints on individual parameters than with predictions for the failure probability as a function of temperature. So let's visualize that. The solid and dashed lines below-left show the posterior-predictive median and percentile-based 68% credible interval for $P_\mathrm{failure} = 1 - P_\mathrm{success}$ at each temperature. The right panel shows the posterior for $P_\mathrm{failure}$ at a temperature of 36 F specifically. Note that this is a case where the median and mode of the posterior are quite different.
complete_model.posterior_prediction_Pfailure()
complete_model.post1d_Pfailure(36.)
plt.rcParams['figure.figsize'] = (12., 4.)
fig, ax = plt.subplots(1, 2);
complete_model.plot_Pfailure(ax[0], 'C0', 'complete')
ax[1].plot(complete_model.whist['x'], complete_model.whist['density']);
ax[1].set_xlabel(r'$P_\mathrm{failure}(T=36$ F$)$'); ax[1].set_ylabel('posterior density');
Does this outcome make sense compared with inspection of the data and the prior? Does, as the report concludes, "the probability of O-ring distress [increase] to almost a certainty" at temperatures below 65F based on these data?
I_have_thought_about_these_questions = False # change to True when true
assert I_have_thought_about_these_questions
3. Censored (but somewhat informed) success temperatures¶
Imagine we are in a slightly better situation than that shown in the top panel of Figure 6 from the report. Namely, we are given
1. the temperatures of launches where there were O-ring failures (`failure_temps` above),
2. the number of launches with no failures,
3. a range of temperatures containing the successful launches, but not the precise temperatures of each.
For (3), we'll just use the actual min and max of `success_temps`. In the next section, we'll look at the results with even less information about the success temperatures.
success_Tmin = np.min(success_temps)
success_Tmax = np.max(success_temps)
Nsuccess = len(success_temps)
We will now discard `success_temps` entirely, to remove any temptation.
del success_temps
3a. Censored model definition¶
Adjust your PGM and expression for the likelihood to reflect our ignorance of the temperatures of successful launches. Since they are now unknown and not dictated by anything in our previous model, the success temperatures will need a prior distribution, which we will take to be uniform over the range [`success_Tmin`, `success_Tmax`].
Space for your censored PGM, etc.
We will need to marginalize over the parameters corresponding to the success temperatures, since they are no longer fixed data. We could do so by sampling them with MCMC, but in this case direct integration is simpler. To be explicit, if $\theta$ abbreviates the 4 model parameters we had previously and $\phi$ stands for all the success temperatures, this marginalization looks like
$p(\theta|\mathrm{data}) = \int d\phi \, p(\theta,\phi|\mathrm{data}) \propto p(\theta) \int d\phi \, p(\phi) \, p(\mathrm{data}|\theta,\phi) \equiv p(\theta) \, p(\mathrm{data}|\theta)$.
As the last equivalence indicates, the integral of the sampling distribution, $p(\mathrm{data}|\theta,\phi)$, over the prior for the success temperatures will function as our marginalized likelihood.
The reason for marginalizing this way is that, for our particular definition of $P_\mathrm{success}$ and a uniform prior $p(\phi)$, the integral is analytic (see Wikipedia). Take advantage of this when implementing the marginalized likelihood function below.
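Concretely, for the $P_\mathrm{success}$ defined above and a uniform prior on $[T_\mathrm{min}, T_\mathrm{max}]$, the antiderivative of the logistic term is a "softplus", so the prior-averaged success probability for a single censored launch has the closed form (worth verifying for yourself)
$\frac{1}{T_\mathrm{max}-T_\mathrm{min}} \int_{T_\mathrm{min}}^{T_\mathrm{max}} P_\mathrm{success}(T) \, dT = P_\mathrm{cold} + \frac{P_\mathrm{hot}-P_\mathrm{cold}}{\beta \, (T_\mathrm{max}-T_\mathrm{min})} \left[ \ln\left(1+e^{\beta(T_\mathrm{max}-T_0)}\right) - \ln\left(1+e^{\beta(T_\mathrm{min}-T_0)}\right) \right]$.
Because the censored temperatures are independent and share the same uniform prior, this factor enters the marginalized likelihood once for each of the $N_\mathrm{success}$ successful launches.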
def ln_like_censored(T0, beta, Pcold, Phot):
    """
    Return the log-likelihood for the case of censored success temperatures.
    This prototype assumes the success temperatures will be marginalized over within this function; otherwise
    they would need to be additional parameters to be sampled.
    (Go ahead and access `failure_temps', `Nsuccess', `success_Tmin' and `success_Tmax' from global scope, even though this is poor programming practice)
    """
3b. Censored model fit¶
We can now carry out the usual steps. Again, the choices made below will probably work, but change them if need be.
guess = [65.0, 0.1, 0.25, 0.75] # starting position
burn = 1000 # burn-in
maxlag = 1500 # max lag for Neffective estimate
nsteps = 15000
censored_model = Model(ln_prior, ln_like_censored)
%%time
censored_model.sample_posterior(nwalkers=8, nsteps=nsteps, guess=guess)
CPU times: user 10.8 s, sys: 59.4 ms, total: 10.9 s Wall time: 11.1 s
R,neff = censored_model.check_chains(burn=burn, maxlag=maxlag)
assert np.all(R < 1.05)
assert np.all(neff > 400)
R = [1.00191858 1.00457386 1.00214692 1.00012395]
neff = [1185.11198257 1046.53614751 1135.71056118 1171.0699351 ]
NB: Since walkers are not independent, these will be optimistic!
censored_model.remove_burnin(burn=burn)
plotGTC([complete_model.samples, censored_model.samples], paramNames=complete_model.param_labels,
chainLabels=['complete', 'censored'],
figureSize=6, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});
Now let's compare the posterior predictions to the previous result.
censored_model.posterior_prediction_Pfailure()
censored_model.post1d_Pfailure(36.)
plt.rcParams['figure.figsize'] = (12., 4.)
fig, ax = plt.subplots(1, 2);
complete_model.plot_Pfailure(ax[0], 'C0', 'complete')
censored_model.plot_Pfailure(ax[0], 'C1', 'censored')
ax[1].plot(complete_model.whist['x'], complete_model.whist['density']);
ax[1].plot(censored_model.whist['x'], censored_model.whist['density']);
ax[1].set_xlabel(r'$P_\mathrm{failure}(T=36$ F$)$'); ax[1].set_ylabel('posterior density');
Questions to ponder:
- Does your censored model make predictions consistent with those of the model fitted to the complete data?
- If there are clear differences, do they make sense in light of what information has been hidden?
- Is there still evidence for a temperature-dependent failure rate?
- Does, as the report concludes, "the probability of O-ring distress [increase] to almost a certainty" at temperatures below 65F based on these (censored) data?
I_have_pondered_them = False # change to True when true
assert I_have_pondered_them
4. Censored (less informed) success temperatures¶
As a point of comparison, let's fit a model in which the temperature range for the censored (success) data is much less well constrained. This is arguably more analogous to what we might do by eye if presented with the first figure in this notebook, knowing that successful launches were absent from the figure, but without the context that those launches had all taken place in warm weather.
In particular, let's take the prior on the success temperatures to be uniform over the range shown in the figure. We followed poor practice by defining `success_Tmin` and `success_Tmax` at global scope earlier and then using them from global scope in `ln_like_censored`, and will continue to do so by reusing those variables below. The upshot is that all of the functions you've already completed should work as-is, as long as the notebook is never run out of order.
success_Tmin = 45.0
success_Tmax = 80.0
Once again, we expect that the defaults below will work, but be alert and change them if needed.
guess = [65.0, 0.1, 0.25, 0.75] # starting position
burn = 1000 # burn-in
maxlag = 1500 # max lag for Neffective estimate
nsteps = 15000
We proceed just as before, reusing `ln_like_censored`, since it looks up `success_Tmin` and `success_Tmax` at global scope.
verycensored_model = Model(ln_prior, ln_like_censored)
%%time
verycensored_model.sample_posterior(nwalkers=8, nsteps=nsteps, guess=guess)
CPU times: user 10.1 s, sys: 36.7 ms, total: 10.1 s Wall time: 10.2 s
R,neff = verycensored_model.check_chains(burn=burn, maxlag=maxlag)
assert np.all(R < 1.05)
assert np.all(neff > 400)
R = [1.00177103 1.00534173 1.00994032 1.00358337]
neff = [562.09379088 874.29490823 639.40908906 865.3949379 ]
NB: Since walkers are not independent, these will be optimistic!
verycensored_model.remove_burnin(burn=burn)
plotGTC([complete_model.samples, censored_model.samples, verycensored_model.samples], paramNames=complete_model.param_labels,
chainLabels=['complete', 'censored', 'very censored'],
figureSize=6, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});
This seems like it will lead to somewhat different posterior predictions. Let's check.
verycensored_model.posterior_prediction_Pfailure()
verycensored_model.post1d_Pfailure(36.)
plt.rcParams['figure.figsize'] = (12., 4.)
fig, ax = plt.subplots(1, 2);
complete_model.plot_Pfailure(ax[0], 'C0', 'complete')
censored_model.plot_Pfailure(ax[0], 'C1', 'censored')
verycensored_model.plot_Pfailure(ax[0], 'C2', 'very censored')
ax[1].plot(complete_model.whist['x'], complete_model.whist['density']);
ax[1].plot(censored_model.whist['x'], censored_model.whist['density']);
ax[1].plot(verycensored_model.whist['x'], verycensored_model.whist['density']);
ax[1].set_xlabel(r'$P_\mathrm{failure}(T=36$ F$)$'); ax[1].set_ylabel('posterior density');
Ruminate upon the same questions as before:
- Does this more censored model make predictions consistent with those of the model fitted to the complete data and/or the less censored data?
- If there are clear differences, do they make sense in light of what information has been hidden?
- Is there still evidence for a temperature-dependent failure rate?
- Does, as the report concludes, "the probability of O-ring distress [increase] to almost a certainty" at temperatures below 65F based on these (very censored) data?
I_have_ruminated_sufficiently = False # change to True when true
assert I_have_ruminated_sufficiently
Parting thoughts¶
We approached this problem backwards in a sense, knowing the answer provided by the complete data set, and progressively hiding information to find out what it would take to avoid the conclusion that O-ring failure probability increases significantly at lower temperatures. One could argue about whether the simplified model we fit above is ideal, but (we promise) it isn't contrived to reach the following conclusion: even without knowing the specific temperatures of successful launches, it's hard to avoid concluding that a low-temperature launch is less likely to succeed. To avoid it, one would also need to ignore the much less specific (and presumably widely known) fact that every incident-free launch had occurred in much warmer weather than Challenger experienced, while, conversely, every launch in remotely chilly weather had had an O-ring incident. This is, effectively, the distinction between the less and more censored versions of the data that you analyzed above.
In this notebook, we didn't look at the case of truncated data, where the number of missing points is itself unknown. Something to think about: how would our results above change if we didn't know the number of successful launches, in addition to not knowing their temperatures?