TutorialName = 'cepheid1'
exec(open('tbc.py').read()) # define TBC and TBC_above
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as st
import emcee
import incredible as cr
from pygtc import plotGTC


data_path = 'data/' 
cepheids = np.loadtxt(data_path + 'R11ceph.dat')
galaxies = np.loadtxt(data_path + 'R11redsh.dat')
ngc_numbers = [int(g) for g in galaxies[:,0]]
data = {int(x[0]):{'z':x[1]} for x in galaxies}
for g in ngc_numbers:
    j = np.where(cepheids[:,1] == g)[0]
    data[g]['id'] = np.array([int(i) for i in cepheids[j,0]])
    data[g]['Ngal'] = len(data[g]['id'])
    data[g]['m'] = cepheids[j,2]
    data[g]['merr'] = cepheids[j,3]
    data[g]['P'] = cepheids[j,4]
    data[g]['logO_H'] = cepheids[j,5]
    data[g]['bias'] = cepheids[j,6]
    data[g]['logP'] = np.log10(data[g]['P'])
c = 3.0e5 # km/s
H0 = 70.0 # km/s/Mpc

TBC()
# for g in ngc_numbers:
#     data[g]['dL'] = ...
#     data[g]['M'] = ...


g = ngc_numbers[0]
g


data[g]['Ngal']


TBC() # answer in Markdown


TBC() # answer in Markdown


global_pivot = np.mean([data[i]['logP'].mean() for i in ngc_numbers])
for i in ngc_numbers:
    data[i]['pivot'] = global_pivot


def meanfunc(x, xpivot, a, b):
    '''
    x is log10(period/days)
    returns an absolute magnitude
    '''
    return a + b*(x - xpivot)


param_names = ['a', 'b', 'sigma']
param_labels = [r'$a$', r'$b$', r'$\sigma$']


TBC() # guess = {'a': ..., 'b': ..., 'c': ...}

guessvec = [guess[p] for p in param_names] # it will be useful to have `guess` as a vector also

plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.errorbar(data[g]['logP'], data[g]['M'], yerr=data[g]['merr'], fmt='none');
plt.xlabel('log10 period/days', fontsize=14);
plt.ylabel('absolute magnitude', fontsize=14);
xx = np.linspace(0.5, 2.25, 100)
plt.plot(xx, meanfunc(xx, data[g]['pivot'], guess['a'], guess['b']))
plt.gca().invert_yaxis();


# prior, likelihood, posterior functions for a SINGLE galaxy

# generic prior for use in all parts of the notebook
def log_prior(a, b, sigma, Mtrue=None):
    '''
    `a`, `b`, and `sigma` are scalars; `Mtrue` is an array (or None)
    '''
    TBC()
    
# likelihood specifically for part A
def log_likelihood_A(gal, a, b, sigma, Mtrue):
    '''
    `gal` is an entry in the `data` dictionary; `a`, `b`, and `sigma` are scalars; `Mtrue` is an array
    '''
    TBC()
    
# generic posterior, again for all parts of the problem
def log_posterior(gal, loglike, **params):
    lnp = log_prior(**params)
    if lnp != -np.inf:
        lnp += loglike(gal, **params)
    return lnp

# posterior for part A, taking a parameter array argument for compatibility with emcee
def logpost_vecarg_A(pvec):
    params = {name:pvec[i] for i,name in enumerate(param_names)}
    params['Mtrue'] = pvec[len(param_names):] # everything after the named parameters is an M_{ij}
    return log_posterior(data[g], log_likelihood_A, **params)

TBC_above()


guess_A = np.concatenate((guessvec, data[g]['M']))
logpost_vecarg_A(guess_A)


%%time

nsteps = 1000 # or whatever

npars = len(guess_A)
nwalkers = 2*npars
sampler = emcee.EnsembleSampler(nwalkers, npars, logpost_vecarg_A)
start = np.array([np.array(guess_A)*(1.0 + 0.01*np.random.randn(npars)) for j in range(nwalkers)])
sampler.run_mcmc(start, nsteps)
print('Yay!')


npars = len(guess)+1
plt.rcParams['figure.figsize'] = (16.0, 3.0*npars)
fig, ax = plt.subplots(npars, 1);
cr.plot_traces(sampler.chain[:min(8,nwalkers),:,:npars], ax, labels=param_labels+[r'$M_1$']);
npars = len(guess_A)


TBC()
# burn = ...
# maxlag = ...

tmp_samples = [sampler.chain[i,burn:,:4] for i in range(nwalkers)]
print('R =', cr.GelmanRubinR(tmp_samples))
print('neff =', cr.effective_samples(tmp_samples, maxlag=maxlag))
print('NB: Since walkers are not independent, these will be optimistic!')
print("Plus, there's a good chance that the results in this section are garbage...")


samples_A = sampler.chain[:,burn:,:].reshape(nwalkers*(nsteps-burn), npars)

plotGTC([samples_A[:,:4]], paramNames=param_labels+[r'$M_1$'], chainLabels=['emcee/brute'],
        figureSize=8, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});


plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.errorbar(data[g]['logP'], data[g]['M'], yerr=data[g]['merr'], fmt='none');
plt.xlabel('log10 period/days', fontsize=14);
plt.ylabel('absolute magnitude', fontsize=14);
xx = np.linspace(0.5, 2.25, 100)
plt.plot(xx, meanfunc(xx, data[g]['pivot'], guess['a'], guess['b']), label='starting point')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_A[:,0].mean(), samples_A[:,1].mean()), label='emcee/brute')
plt.gca().invert_yaxis();
plt.legend();


TBC() # answer in markdown


def log_likelihood_B(gal, a, b, sigma):
    '''
    `gal` is an entry in the `data` dictionary; `a`, `b`, and `sigma` are scalars
    '''
    TBC()

def logpost_vecarg_B(pvec):
    params = {name:pvec[i] for i,name in enumerate(param_names)}
    return log_posterior(data[g], log_likelihood_B, **params)

TBC_above()


logpost_vecarg_B(guessvec)


%%time

nsteps = 10000

npars = len(param_names)
nwalkers = 2*npars
sampler = emcee.EnsembleSampler(nwalkers, npars, logpost_vecarg_B)
start = np.array([np.array(guessvec)*(1.0 + 0.01*np.random.randn(npars)) for j in range(nwalkers)])
sampler.run_mcmc(start, nsteps)
print('Yay!')


plt.rcParams['figure.figsize'] = (16.0, 3.0*npars)
fig, ax = plt.subplots(npars, 1);
cr.plot_traces(sampler.chain[:min(8,nwalkers),:,:], ax, labels=param_labels);


TBC()
# burn = ...
# maxlab = ...

tmp_samples = [sampler.chain[i,burn:,:] for i in range(nwalkers)]
print('R =', cr.GelmanRubinR(tmp_samples))
print('neff =', cr.effective_samples(tmp_samples, maxlag=maxlag))
print('NB: Since walkers are not independent, these will be optimistic!')


samples_B = sampler.chain[:,burn:,:].reshape(nwalkers*(nsteps-burn), npars)

plotGTC([samples_A[:,:3], samples_B], paramNames=param_labels, chainLabels=['emcee/brute', 'emcee/analytic'],
        figureSize=8, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});


sol = np.loadtxt('solutions/ceph1.dat.gz')
plotGTC([sol, samples_B], paramNames=param_labels, chainLabels=['solution', 'my emcee/analytic'],
        figureSize=8, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});


plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.errorbar(data[g]['logP'], data[g]['M'], yerr=data[g]['merr'], fmt='none');
plt.xlabel('log10 period/days', fontsize=14);
plt.ylabel('absolute magnitude', fontsize=14);
xx = np.linspace(0.5, 2.25, 100)
plt.plot(xx, meanfunc(xx, data[g]['pivot'], guess['a'], guess['b']), label='starting point')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_A[:,0].mean(), samples_A[:,1].mean()), label='emcee/brute')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_B[:,0].mean(), samples_B[:,1].mean()), label='emcee/analytic')
plt.gca().invert_yaxis();
plt.legend();


TBC() # answer in Markdown


import lrgs


x = np.asmatrix(data[g]['logP'] - data[g]['pivot']).T
y = np.asmatrix(data[g]['M']).T
M = [np.matrix([[1e-6, 0], [0, err**2]]) for err in data[g]['merr']]


import multiprocessing


nsteps = 2000 # some arbitrary number of steps to run

def do_gibbs(i):
    # every parallel process will have the same random seed if we don't reset them to different values here
    if i > 0:
        np.random.seed(i*42)
    # lrgs.Parameters set up a sampler that assumes the x's are known precisely.
    # Other classes would correspond to different possible priors on x.
    par = lrgs.Parameters(x, y, M)
    chain = lrgs.Chain(par, nsteps)
    chain.run(fix='x') # fix='x' isn't necessary here, but it shows how one would fix other parameters if we wanted to
    # Extracts the chain as a dictionary. Note that we have the option of hanging onto the samples of the magnitude
    #  parameters in addition to the intercept, slope and scatter, though this is not the default.
    dchain = chain.to_dict(["B", "Sigma"])
    # since $sigma^2$ is sampled rather than $\sigma$, take the square root here
    return np.array([dchain['B_0_0'], dchain['B_1_0'], np.sqrt(dchain['Sigma_0_0'])]).T


%%time
#with multiprocessing.Pool() as pool:
#    gibbs_samples = pool.map(do_gibbs, range(2)) # 2 parallel processes - change if you want

# latest Python doesn't appreciate my use of multiprocessing, so I guess we'll just wait twice as long

gibbs_samples = [do_gibbs(i) for i in range(2)]


plt.rcParams['figure.figsize'] = (16.0, 3.0*npars)
fig, ax = plt.subplots(npars, 1);
cr.plot_traces(gibbs_samples, ax, labels=param_labels);


burn = 50
maxlag = 1000

tmp_samples = [x[burn:,:] for x in gibbs_samples]
print('R =', cr.GelmanRubinR(tmp_samples))
print('neff =', cr.effective_samples(tmp_samples, maxlag=maxlag))


samples_C = np.concatenate(tmp_samples, axis=0)

plotGTC([samples_A[:,:3], samples_B, samples_C], paramNames=param_labels,
        chainLabels=['emcee/brute', 'emcee/analytic', 'LRGS/Gibbs'],
        figureSize=8, customLabelFont={'size':12}, customTickFont={'size':12}, customLegendFont={'size':16});


plt.rcParams['figure.figsize'] = (7.0, 5.0)
plt.errorbar(data[g]['logP'], data[g]['M'], yerr=data[g]['merr'], fmt='none');
plt.xlabel('log10 period/days', fontsize=14);
plt.ylabel('absolute magnitude', fontsize=14);
xx = np.linspace(0.5, 2.25, 100)
plt.plot(xx, meanfunc(xx, data[g]['pivot'], guess['a'], guess['b']), label='starting point')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_A[:,0].mean(), samples_A[:,1].mean()), label='emcee/brute')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_B[:,0].mean(), samples_B[:,1].mean()), label='emcee/analytic')
plt.plot(xx, meanfunc(xx, data[g]['pivot'], samples_C[:,0].mean(), samples_C[:,1].mean()), '--', label='LRGS/Gibbs', color='k')
plt.gca().invert_yaxis();
plt.legend();


TBC() # answer in Markdown

Tutorial: The Cepheid Period-Luminosity Relation for a Single Galaxy¶

Fitting a line... hierarchically!¶

1. Data¶

2. Model specification¶

3. Strategy¶

Sampling:¶

Direct integration:¶

4. Obtain the posterior¶

4a. Brute force sampling of all parameters¶

IMPORTANT¶

4b. Sampling with analytic marginalization¶

4c. Conjugate Gibbs sampling¶

Parting thoughts¶