# Copyright (c) 2015-2023 Patricio Cubillos and contributors.
# mc3 is open-source software under the MIT license (see LICENSE).
__all__ = [
'sample',
]
import os
import sys
import importlib
import multiprocessing as mpr
from datetime import date
import numpy as np
import matplotlib as mpl
if os.environ.get('DISPLAY', '') == '':
mpl.use('Agg')
import matplotlib.pyplot as plt
from .fit_driver import fit
from .mcmc_driver import mcmc
from .ns_driver import nested_sampling
from . import utils as mu
from . import stats as ms
from . import plots as mp
from .version import __version__
def sample(
    data=None, uncert=None, func=None, params=None,
    indparams=None, indparams_dict=None,
    pmin=None, pmax=None, pstep=None,
    prior=None, priorlow=None, priorup=None,
    sampler=None, ncpu=None, leastsq=None, chisqscale=False,
    nchains=7, nsamples=None, burnin=0, thinning=1,
    grtest=True, grbreak=0.0, grnmin=0.5, wlike=False,
    fgamma=1.0, fepsilon=0.0, hsize=10, kickoff='normal',
    plots=False, theme='blue', statistics='med_central',
    ioff=False, showbp=True,
    savefile=None, resume=False,
    rms=False, log=None, pnames=None, texnames=None,
    **kwargs):
    """
    This beautiful piece of code executes an MCMC or NS posterior sampling.

    Parameters
    ----------
    data: 1D float ndarray or string
        Data to be fit by func. If string, path to file containing data.
    uncert: 1D float ndarray
        Uncertainties of data.
    func: Callable or string-iterable
        The callable function that models data as:
            model = func(params, *indparams, **indparams_dict)
        Or an iterable of 3 strings (funcname, modulename, path)
        that specifies the function name, function module, and module path.
        If the module is already in the python-path scope, path can be omitted.
    params: 1D float ndarray or string
        Set of initial fitting parameters for func.
        If string, path to file containing data.
    indparams: tuple or string
        Additional arguments required by func. If string, path to file
        containing indparams.
    indparams_dict: dict
        Additional keyword arguments required by func (if needed).
    pmin: 1D ndarray
        Lower boundaries for the posterior exploration.
    pmax: 1D ndarray
        Upper boundaries for the posterior exploration.
    pstep: 1D ndarray
        Parameter stepping behavior.
        - Free parameters have pstep>0.
        - Fixed parameters have pstep=0.
        - Negative values indicate a shared parameter, with pstep set to
          the negative index of the sharing parameter (starting the count
          from 1), e.g.: to share second parameter and first one, do:
          pstep[1] = -1.
        For MCMC, the pstep value of free parameters set the scale of the
        initial jump proposal.
    prior: 1D ndarray
        Parameter priors. The type of prior is determined by priorlow
        and priorup:
            if both priorlow>0 and priorup>0   Gaussian
            else                               Uniform between [pmin,pmax]
    priorlow: 1D ndarray
        Lower prior uncertainty values.
    priorup: 1D ndarray
        Upper prior uncertainty values.
    sampler: String
        Sampling algorithm:
        - 'mrw': Metropolis random walk.
        - 'demc': Differential Evolution Markov chain.
        - 'snooker': DEMC-z with snooker update.
        - 'dynesty': DynamicNestedSampler() sampler from dynesty.
    ncpu: Integer
        Number of processors for the MCMC chains (mc3 defaults to
        one CPU for each chain plus a CPU for the central hub).
    leastsq: String
        If not None, perform a least-square optimization before the MCMC run.
        Select from:
            'lm': Levenberg-Marquardt (most efficient, but doesn't obey bounds)
            'trf': Trust Region Reflective
    chisqscale: Boolean
        Scale the data uncertainties such that the reduced chi-square = 1.
    nchains: Scalar
        Number of simultaneous chains to run.
    nsamples: Scalar
        Total number of samples.
    burnin: Integer
        Number of burned-in (discarded) number of iterations at the beginning
        of the chains.
    thinning: Integer
        Thinning factor of the chains (use every thinning-th iteration) used
        in the GR test and plots.
    wlike: Bool
        If True, calculate the likelihood in a wavelet-base. This requires
        three additional parameters (TBD: this needs documentation).
    grtest: Boolean
        If True, run Gelman & Rubin test.
    grbreak: Float
        Gelman-Rubin convergence threshold to stop the MCMC (I'd suggest
        grbreak ~ 1.01). Do not break if grbreak=0.0 (default).
    grnmin: Integer or float
        Minimum number of samples required for grbreak to stop the MCMC.
        If grnmin > 1: grnmin sets the minimum required number of samples.
        If 0 < grnmin < 1: grnmin sets the minimum required nsamples fraction.
    fgamma: Float
        Proposals jump scale factor for DEMC's gamma.
        The code computes: gamma = fgamma * 2.38 / sqrt(2*Nfree)
    fepsilon: Float
        Jump scale factor for DEMC's support distribution.
        The code computes: e = fepsilon * Normal(0, pstep)
    hsize: Integer
        Number of initial samples per chain.
    kickoff: String
        Flag to indicate how to start the chains:
        'normal' for normal distribution around initial guess, or
        'uniform' for uniform distribution withing the given boundaries.
    plots: Bool
        If True plot parameter traces, pairwise-posteriors, and posterior
        histograms.
    theme:
        The color theme for plots. Can have any format recognized as a
        matplotlib color.
    statistics: String
        Statistics to adopt for the plots. Select from:
        - 'med_central' Median and central quantile
        - 'max_like' Max marginal likelihood (mode) and HPD
        - 'global_max_like' Max a posteriori (best-fit) and HPD
    ioff: Bool
        If True, set plt.ioff(), i.e., do not display figures on screen.
    showbp: Bool
        If True, show best-fitting values in histogram and pairwise plots.
    savefile: String
        If not None, filename to store allparams and other MCMC results.
    resume: Boolean
        If True resume a previous run (identified by the .npz file name).
    rms: Boolean
        If True, calculate the RMS of the residuals: data - best_model.
    log: String or mc3.utils.Log instance
        Filename (as string) or log handler (as Log instance) handle logging.
    pnames: 1D string iterable
        List of parameter names (including fixed and shared parameters)
        to display on output screen and figures. See also texnames.
        Screen output trims up to the 11th character.
        If not defined, default to texnames.
    texnames: 1D string iterable
        Parameter names for figures, which may use latex syntax.
        If not defined, default to pnames.
    kwargs: Dict
        Additional keyword arguments passed to the sampler.

    Returns
    -------
    mc3_output: Dict
        A Dictionary containing the MCMC posterior distribution and related
        stats, including:
        - posterior: thinned posterior distribution of shape [nsamples, nfree],
          including the burn-in phase.
        - zchain: chain indices for the posterior samples.
        - zmask: posterior mask to remove the burn-in.
        - chisq: chi^2 values for the posterior samples.
        - log_post: log(posterior) for the posterior samples (see Notes).
        - burnin: number of burned-in samples per chain.
        - ifree: Indices of the free parameters.
        - pnames: Parameter names.
        - texnames: Parameter names in Latex format.
        - meanp: mean of the marginal posteriors.
        - stdp: standard deviation of the marginal posteriors.
        - CRlo: lower boundary of the marginal 68%-highest posterior
          density (the credible region).
        - CRhi: upper boundary of the marginal 68%-HPD.
        - bestp: model parameters for the optimal log(posterior) in the sample.
        - best_log_post: optimal log(posterior) in the sample (see Notes).
        - best_model: model evaluated at bestp.
        - best_chisq: chi^2 for the optimal log(posterior) in the sample.
        - red_chisq: reduced chi-square: chi^2/(ndata-nfree) for the
          best-fitting sample.
        - BIC: Bayesian Information Criterion: chi^2 - nfree*log(ndata)
          for the best-fitting sample.
        - chisq_factor: Uncertainties scale factor to enforce chi^2_red = 1.
        - stddev_residuals: standard deviation of the residuals.
        - acceptance_rate: sample's acceptance rate.

    Notes
    -----
    The log_post variable is defined here as:
        log_post = log(posterior)
                 = log(likelihood) + log(prior)
                 = -0.5*chi-square + log_prior
                 = sum_i -0.5*((data[i] - model[i])/uncert[i])**2 + log_prior
    with log_prior defined as:
        log_prior = sum_j -0.5*((params[j] - prior[j])/prior_uncert[j])**2
    For each parameter with a Gaussian prior.
    Note that constant terms have been neglected.

    Examples
    --------
    >>> import numpy as np
    >>> import mc3

    >>> def quad(p, x):
    >>>     return p[0] + p[1]*x + p[2]*x**2.0

    >>> # Preamble, create a noisy synthetic dataset:
    >>> np.random.seed(3)
    >>> x = np.linspace(0, 10, 100)
    >>> p_true = [3, -2.4, 0.5]
    >>> y = quad(p_true, x)
    >>> uncert = np.sqrt(np.abs(y))
    >>> data = y + np.random.normal(0, uncert)

    >>> # Initial guess for fitting parameters:
    >>> params = np.array([ 3.0, -2.0, 0.1])
    >>> pstep = np.array([ 1.0, 1.0, 1.0])
    >>> pmin = np.array([ 0.0, -5.0, -1.0])
    >>> pmax = np.array([10.0, 5.0, 1.0])

    >>> # Gaussian prior on first parameter, uniform on second and third:
    >>> prior = np.array([3.5, 0.0, 0.0])
    >>> priorlow = np.array([0.1, 0.0, 0.0])
    >>> priorup = np.array([0.1, 0.0, 0.0])
    >>> indparams = [x]
    >>> func = quad
    >>> ncpu = 7

    >>> # MCMC sampling:
    >>> mcmc_output = mc3.sample(
    >>>     data, uncert, func, params, indparams=indparams,
    >>>     sampler='snooker', pstep=pstep, ncpu=ncpu, pmin=pmin, pmax=pmax,
    >>>     prior=prior, priorlow=priorlow, priorup=priorup,
    >>>     leastsq='lm', nsamples=1e5, burnin=1000, plots=True)

    >>> # Nested sampling:
    >>> ns_output = mc3.sample(
    >>>     data, uncert, func, params, indparams=indparams,
    >>>     sampler='dynesty', pstep=pstep, ncpu=ncpu, pmin=pmin, pmax=pmax,
    >>>     prior=prior, priorlow=priorlow, priorup=priorup,
    >>>     leastsq='lm', plots=True)

    >>> # See more examples and details at:
    >>> # https://mc3.readthedocs.io/en/latest/mcmc_tutorial.html
    >>> # https://mc3.readthedocs.io/en/latest/ns_tutorial.html
    """
    # Replace the None sentinels for the (formerly mutable) defaults.
    # Using None avoids sharing one list/dict object across calls:
    if indparams is None:
        indparams = []
    if indparams_dict is None:
        indparams_dict = {}

    # Logging object: open a new Log when given a filename (and remember
    # to close it on exit); reuse the handler when given a Log instance:
    if isinstance(log, str):
        log = mu.Log(log, append=resume)
        closelog = True
    else:
        closelog = False
        if log is None:
            log = mu.Log()

    log.msg(
        f"\n{log.sep}\n"
         "  Multi-core Markov-chain Monte Carlo (mc3).\n"
        f"  Version {__version__}.\n"
        f"  Copyright (c) 2015-{date.today().year} Patricio Cubillos "
         "and collaborators.\n"
         "  mc3 is open-source software under the MIT license (see LICENSE).\n"
        f"{log.sep}\n\n")

    # Basic argument validation (log.error() raises/aborts):
    if sampler is None:
        log.error("'sampler' is a required argument")
    # Note: sampler names are lowercase everywhere else in this module
    # ('mrw', 'demc', 'snooker'); the original uppercase 'MRW'/'DEMC'
    # made this check unreachable for those samplers:
    if nsamples is None and sampler in ['mrw', 'demc', 'snooker']:
        log.error("'nsamples' is a required argument for MCMC runs")
    if leastsq not in [None, 'lm', 'trf']:
        log.error(
            f"Invalid 'leastsq' input ({leastsq}). Must select from "
             "['lm', 'trf']")

    # Read the model parameters:
    params = mu.isfile(params, 'params', log, 'ascii', False, not_none=True)
    # Unpack if necessary: a 2D params input bundles (in row order) the
    # initial guess, boundaries, stepsize, and priors:
    if np.ndim(params) > 1:
        ninfo, ndata = np.shape(params)
        if ninfo == 7:  # The priors
            prior = params[4]
            priorlow = params[5]
            priorup = params[6]
        if ninfo >= 4:  # The stepsize
            pstep = params[3]
        if ninfo >= 3:  # The boundaries
            pmin = params[1]
            pmax = params[2]
        else:
            log.error('Invalid format/shape for params input file')
        params = params[0]  # The initial guess
    params = np.array(params)

    # Process data and uncertainties:
    data = mu.isfile(data, 'data', log, 'bin', False, not_none=True)
    if np.ndim(data) > 1:
        data, uncert = data
    # Make local 'uncert' a copy, to avoid overwriting:
    if uncert is None:
        log.error("'uncert' is a required argument")
    uncert = np.copy(uncert)

    # Process the independent parameters:
    if indparams != []:
        indparams = mu.isfile(indparams, 'indparams', log, 'bin', unpack=False)

    if ioff:
        plt.ioff()

    # A resume only makes sense when there is a savefile to resume from:
    resume = resume and (savefile is not None)
    if resume:
        log.msg(f"\n\n{log.sep}\n{log.sep}  Resuming previous MCMC run.\n\n")

    # Import the model function when given as (funcname, module[, path]):
    if isinstance(func, (list, tuple, np.ndarray)):
        if len(func) == 3:
            sys.path.append(func[2])
        else:
            sys.path.append(os.getcwd())
        fmodule = importlib.import_module(func[1])
        func = getattr(fmodule, func[0])
    elif not callable(func):
        log.error(
            "'func' must be either a callable or an iterable of strings "
            "with the model function, file, and path names")

    # Default CPU count: one per chain for MCMC, single CPU for dynesty:
    if ncpu is None and sampler in ['snooker', 'demc', 'mrw']:
        ncpu = nchains
    elif ncpu is None and sampler == 'dynesty':
        ncpu = 1
    # Cap the number of processors (leave one CPU for the central hub):
    if ncpu >= mpr.cpu_count():
        log.warning(
            f"The number of requested CPUs ({ncpu}) is >= than the number "
            f"of available CPUs ({mpr.cpu_count()}). "
            f"Enforced ncpu to {mpr.cpu_count()-1}.")
        ncpu = mpr.cpu_count() - 1

    nparams = len(params)
    ndata = len(data)

    # Setup array of parameter names (each defaults to the other):
    if pnames is None and texnames is not None:
        pnames = texnames
    elif pnames is not None and texnames is None:
        texnames = pnames
    elif pnames is None and texnames is None:
        pnames = texnames = mu.default_parnames(nparams)
    pnames = np.asarray(pnames)
    texnames = np.asarray(texnames)

    # Default boundaries are unbound (but dynesty needs a finite box):
    if pmin is None:
        pmin = np.tile(-np.inf, nparams)
    if pmax is None:
        pmax = np.tile( np.inf, nparams)
    pmin = np.asarray(pmin)
    pmax = np.asarray(pmax)
    if (np.any(np.isinf(pmin)) or np.any(np.isinf(pmax))) \
            and sampler=='dynesty':
        log.error('Parameter space must be constrained by pmin and pmax')

    if pstep is None:
        pstep = 0.1 * np.abs(params)
    pstep = np.asarray(pstep)

    # Set prior parameter indices:
    if prior is None or priorup is None or priorlow is None:
        prior = priorup = priorlow = np.zeros(nparams)
    # Work on local copies (consistent with 'uncert' above), so the
    # in-place zeroing below does not clobber the caller's arrays:
    priorlow = np.copy(priorlow)
    priorup = np.copy(priorup)
    # Override priors for non-free parameters:
    priorlow[pstep<=0] = 0.0
    priorup [pstep<=0] = 0.0

    # Check that initial values lie within the boundaries:
    if np.any(params < pmin) or np.any(params > pmax):
        pout = ""
        for pname, par, minp, maxp in zip(pnames, params, pmin, pmax):
            if par < minp:
                pout += f"\n{pname[:11]:11s} {minp: 12.5e} < {par: 12.5e}"
            if par > maxp:
                pout += f"\n{pname[:11]:26s} {par: 12.5e} > {maxp: 12.5e}"
        log.error(
            "Some initial-guess values are out of bounds:\n"
            "Param name pmin value pmax\n"
            "----------- ------------ ------------ ------------"
            f"{pout}"
        )

    nfree = int(np.sum(pstep > 0))
    ifree = np.where(pstep > 0)[0]  # Free parameter indices
    ishare = np.where(pstep < 0)[0]  # Shared parameter indices

    # Check output dimension by evaluating the model once:
    model0 = func(params, *indparams, **indparams_dict)
    if np.shape(model0) != np.shape(data):
        log.error(
            f"The size of the data array ({np.size(data)}) does not "
            f"match the size of the func() output ({np.size(model0)})"
        )

    # Check that output path exists:
    if savefile is not None:
        fpath, fname = os.path.split(os.path.realpath(savefile))
        if not os.path.exists(fpath):
            log.warning(
                f"Output folder path: '{fpath}' does not exist. "
                "Creating new folder."
            )
            os.makedirs(fpath)

    # At the moment, skip optimization when these dynesty inputs exist:
    if sampler == 'dynesty' \
            and ('loglikelihood' in kwargs or 'prior_transform' in kwargs):
        leastsq = None

    # Least-squares minimization:
    chisq_factor = 1.0
    if leastsq is not None:
        fit_output = fit(
            data, uncert, func, np.copy(params),
            indparams, indparams_dict,
            pstep, pmin, pmax, prior, priorlow, priorup, leastsq)
        fit_bestp = fit_output['bestp']
        log.msg(
            f"Least-squares best-fitting parameters:\n  {fit_bestp}\n\n",
            si=2)

        # Scale data-uncertainties such that reduced chisq = 1:
        if chisqscale:
            chisq_factor = np.sqrt(fit_output['best_chisq']/(ndata-nfree))
            uncert *= chisq_factor

            # Re-calculate best-fitting parameters with new uncertainties:
            fit_output = fit(
                data, uncert, func, np.copy(params),
                indparams, indparams_dict,
                pstep, pmin, pmax, prior, priorlow, priorup, leastsq)
            log.msg(
                "Least-squares best-fitting parameters (rescaled chisq):"
                f"\n  {fit_output['bestp']}\n\n",
                si=2)
        params = np.copy(fit_output['bestp'])
    else:
        fit_output = None

    # When resuming, re-apply the previous run's uncertainty scaling:
    if resume:
        with np.load(savefile) as oldrun:
            uncert *= float(oldrun['chisq_factor'])/chisq_factor
            chisq_factor = float(oldrun['chisq_factor'])

    # Here's where the magic happens:
    if sampler in ['mrw', 'demc', 'snooker']:
        output = mcmc(
            data, uncert, func,
            params, indparams, indparams_dict,
            pmin, pmax, pstep,
            prior, priorlow, priorup, nchains, ncpu, nsamples, sampler,
            wlike, fit_output, grtest, grbreak, grnmin, burnin, thinning,
            fgamma, fepsilon, hsize, kickoff, savefile, resume, log,
            pnames, texnames,
        )
    elif sampler == 'dynesty':
        output = nested_sampling(
            data, uncert, func, params, indparams, indparams_dict,
            pmin, pmax, pstep, prior, priorlow, priorup, ncpu,
            thinning, resume, log, **kwargs,
        )

    # Get some stats:
    output['chisq_factor'] = chisq_factor

    # Warn if the sampler stumbled on a better fit than the minimizer:
    if leastsq is not None:
        delta_log_post = output['best_log_post'] - fit_output['best_log_post']
        delta_pars = output['bestp'] - fit_output['bestp']
        if delta_log_post > 5.0e-8 and np.any(delta_pars != 0.0):
            log.warning(
                "MCMC found a better fit than the minimizer:\n"
                "MCMC best-fitting parameters: (chisq={:.8g})\n{}\n"
                "Minimizer best-fitting parameters: (chisq={:.8g})\n{}".
                format(
                    -2*output['best_log_post'], output['bestp'],
                    -2*fit_output['best_log_post'], fit_output['bestp']))

    # Stats without burn-in samples:
    posterior, zchain, zmask = mu.burn(
        Z=output['posterior'], zchain=output['zchain'], burnin=output['burnin'])

    bestp = output['bestp']
    post = mp.Posterior(
        posterior, pnames=texnames[ifree], theme=theme,
        bestp=bestp[ifree], statistics=statistics,
    )
    # (removed stray debug print of post.statistics; the statistics are
    # written to the summary file below)
    # Let Posterior to turn the theme into a Theme() object:
    theme = post.theme

    # Parameter statistics:
    sample_stats = ms.calc_sample_statistics(
        post.posterior, bestp, pstep, calc_hpd=True,
    )
    median = output['medianp'] = sample_stats[0]
    mean = output['meanp'] = sample_stats[1]
    stdp = output['stdp'] = sample_stats[2]
    med_low_bounds = output['median_low_bounds'] = sample_stats[3]
    med_high_bounds = output['median_high_bounds'] = sample_stats[4]
    mode = output['mode'] = sample_stats[5]
    hpd_low_bounds = output['hpd_low_bounds'] = sample_stats[6]
    hpd_high_bounds = output['hpd_high_bounds'] = sample_stats[7]
    # Legacy (this will be deprecated at some point)
    output['CRlo'] = hpd_low_bounds - bestp
    output['CRhi'] = hpd_high_bounds - bestp
    output['CRlo'][pstep==0] = output['CRhi'][pstep==0] = 0.0

    # Print a per-parameter summary table:
    log.msg(
        "\nParameter name best fit median 1sigma_low 1sigma_hi S/N"
        "\n--------------- ----------- ----------------------------------- ---------",
        width=80)
    for i in range(nparams):
        pname = f'{pnames[i][0:15]:<15}'
        lo = med_low_bounds[i] - median[i]
        hi = med_high_bounds[i] - median[i]
        if i in ifree:
            snr = f"{np.abs(bestp[i])/stdp[i]:.1f}"
        elif i in ishare:
            # Shared parameters point (1-indexed, negated) at their source:
            idx = -int(pstep[i])
            snr = f"[share{idx:02d}]"
        else:
            snr = "[fixed]"
            lo = hi = 0.0
        log.msg(
            f"{pname} {bestp[i]:11.4e} {median[i]:11.4e} "
            f"{lo:11.4e} {hi:11.4e} {snr:>9s}",
            width=160,
        )

    # Fit statistics:
    best_chisq = output['best_chisq']
    log_post = -2.0*output['best_log_post']
    bic = output['BIC']
    red_chisq = output['red_chisq']
    std_dev = output['stddev_residuals']

    chisqscale_txt = f"sqrt(reduced chi-squared) factor: {chisq_factor:.4f}\n"
    if not chisqscale:
        chisqscale_txt = ''

    fmt = len(f"{bic:.4f}")  # Length of string formatting
    log.msg(
        f"\n{chisqscale_txt}"
        f"Best-parameter's chi-squared: {best_chisq:{fmt}.4f}\n"
        f"Best-parameter's -2*log(posterior): {log_post:{fmt}.4f}\n"
        f"Bayesian Information Criterion: {bic:{fmt}.4f}\n"
        f"Reduced chi-squared: {red_chisq:{fmt}.4f}\n"
        f"Standard deviation of residuals: {std_dev:.6g}\n",
        indent=2,
    )

    # Extract filename from savefile or default one:
    if savefile is not None:
        savefile_root = os.path.splitext(savefile)[0]
    else:
        savefile_root = 'mc3'

    stats_file = f'{savefile_root}_statistics.txt'
    ms.summary_stats(post, output, filename=stats_file)
    log.msg(
        '\nFor a detailed summary with all parameter posterior statistics '
        f'see {stats_file}',
    )

    log.msg("\nOutput sampler files:")
    log.msg(stats_file, indent=2)
    # Save results (pop unpickables before saving, then put back):
    if savefile is not None:
        unpickables = ['dynesty_sampler']
        unpickables = np.intersect1d(unpickables, list(output.keys()))
        tmp_outputs = {key: output.pop(key) for key in unpickables}
        np.savez(savefile, **output)
        output.update(tmp_outputs)
        log.msg(savefile, indent=2)

    if plots:
        # Trace plot:
        savefile = f'{savefile_root}_trace.png'
        mp.trace(
            output['posterior'], zchain=output['zchain'],
            burnin=output['burnin'], pnames=texnames[ifree],
            savefile=savefile, color=theme.color,
        )
        log.msg(savefile, indent=2)
        # Pairwise posteriors:
        savefile = f'{savefile_root}_pairwise_posterior.png'
        post.plot(savefile=savefile, show_estimates=showbp)
        log.msg(savefile, indent=2)
        # Histograms:
        savefile = f'{savefile_root}_marginal_posterior.png'
        post.plot_histogram(savefile=savefile, show_estimates=showbp)
        log.msg(savefile, indent=2)
        # RMS vs bin size:
        if rms:
            savefile = f'{savefile_root}_RMS.png'
            residuals = output['best_model'] - data
            data_rms, rms_lo, rms_hi, stderr, binsize = ms.time_avg(residuals)
            mp.rms(
                binsize, data_rms, stderr, rms_lo, rms_hi,
                binstep=len(binsize)//500+1,
                savefile=savefile,
            )
            log.msg(savefile, indent=2)

    # Close the log file if necessary:
    if closelog:
        log.msg(log.logname, indent=2)
        log.close()

    return output