Source code for tools.statistics

#!/usr/bin/env python3

"""
.. module:: statistics
   :synopsis: a module meant to collect various statistical algorithms. It contains
   the procedure that computes an approximate Gaussian likelihood from an
   expected and an observed upper limit. See https://arxiv.org/abs/1202.3415.

.. moduleauthor:: Wolfgang Waltenberger <wolfgang.waltenberger@gmail.com>

"""

from scipy import stats, optimize
from smodels.tools.smodelsLogging import logger
from scipy.special import erf
import numpy as np
from smodels.experiment.exceptions import SModelSExperimentError as SModelSError


def likelihoodFromLimits(upperLimit, expectedUpperLimit, nsig, nll=False, allowNegativeMuhat=True, corr=0.6):
    """ computes the likelihood from an expected and an observed upper limit,
    modelling the likelihood as a Gaussian truncated at zero
    (see https://arxiv.org/abs/1202.3415).

    :param upperLimit: observed upper limit, as a yield (i.e. unitless)
    :param expectedUpperLimit: expected upper limit, also as a yield
    :param nsig: number of signal events, if None then nsig = mumax
    :param nll: if True, return negative log likelihood
    :param allowNegativeMuhat: if True, then allow muhat to become negative,
           else demand that muhat >= 0. In the presence of underfluctuations
           in the data, setting this to True results in more realistic
           approximate likelihoods.
    :param corr: correction factor:
           ULexp_mod = ULexp / (1. - corr*((ULobs-ULexp)/(ULobs+ULexp)))
           When comparing with likelihoods constructed from efficiency maps,
           a factor of corr = 0.6 has been found to result in the best
           approximations.
    :returns: likelihood (float), and muhat
    """
    assert (upperLimit > 0.)
    # rescale the expected limit in case of an overfluctuation, see :param corr:
    if corr > 0. and upperLimit > expectedUpperLimit:
        rel = (upperLimit - expectedUpperLimit) / (upperLimit + expectedUpperLimit)
        expectedUpperLimit = expectedUpperLimit / (1. - corr * rel)

    # the expected scale, eq 3.24 in arXiv:1202.3415 (truncated Gaussian,
    # 95% upper limit at 1.96 sigma for a central value of zero)
    sigma_exp = expectedUpperLimit / 1.96
    denominator = np.sqrt(2.) * sigma_exp

    def root_func(x):
        # zero of this function is the center x whose truncated-Gaussian
        # 95% CL upper limit equals the observed one
        return (erf((upperLimit - x) / denominator) + erf(x / denominator)) / \
               (1. + erf(x / denominator)) - .95

    def find_mumax_negative(upperLimit, expectedUpperLimit, xa, xb):
        # widen the left edge until [xa, xb] brackets the root, then solve
        tries = 0
        while root_func(xa) * root_func(xb) > 0:
            xa = 2 * xa
            tries += 1
            if tries > 10:
                logger.error(f"cannot find bracket for brent bracketing ul={upperLimit}, eul={expectedUpperLimit},xa={xa}, xb={xb}")
        return optimize.brentq(root_func, xa, xb, rtol=1e-03, xtol=1e-06)

    def truncated_llhd(nsig, mumax, sigma_exp, nll):
        # evaluate the truncated-Gaussian likelihood at nsig
        if nsig is None:
            nsig = mumax
        # account for the truncation at zero: normalize by the surviving
        # area of the Gaussian (mumax / sigma_exp sigmas right of zero)
        A = stats.norm.cdf(mumax / sigma_exp)
        if nll:
            return np.log(A) - stats.norm.logpdf(nsig, mumax, sigma_exp)
        return float(stats.norm.pdf(nsig, mumax, sigma_exp) / A)

    if upperLimit < expectedUpperLimit:
        ## underfluctuation.
        if allowNegativeMuhat:
            mumax = find_mumax_negative(upperLimit, expectedUpperLimit,
                                        -expectedUpperLimit, 1)
            return truncated_llhd(nsig, mumax, sigma_exp, nll), mumax
        return truncated_llhd(nsig, 0., sigma_exp, nll), 0.

    hi = max(upperLimit, expectedUpperLimit)
    fA = root_func(0.)
    fB = root_func(hi)
    if np.sign(fA * fB) > 0.:
        ## same sign at both edges: brentq cannot bracket the root
        logger.error("when computing likelihood: fA and fB have same sign")
        return None
    mumax = optimize.brentq(root_func, 0., hi, rtol=1e-03, xtol=1e-06)
    return truncated_llhd(nsig, mumax, sigma_exp, nll), mumax
def rootFromNLLs(nllA, nll0A, nll, nll0):
    """ compute the CLs - alpha from the NLLs, using the asymptotic
    formulae for the q_mu test statistic.

    :param nllA: NLL at the hypothesized signal strength (presumably for
                 the Asimov dataset, as the name qA suggests -- TODO confirm)
    :param nll0A: corresponding NLL at the best-fit point
    :param nll0: NLL of the data at the best-fit point
    :param nll: NLL of the data at the hypothesized signal strength
    :returns: CLs - 1 + cl with cl = 0.95, so a root (zero) of this
              function corresponds to CLs = 0.05
    """
    # test statistics, clipped at zero: small negative values can arise
    # from numerical inaccuracies in the minimization
    qmu = max(2 * (nll - nll0), 0.)
    sqmu = np.sqrt(qmu)
    qA = max(2 * (nllA - nll0A), 0.)
    sqA = np.sqrt(qA)
    # stats.norm.cdf is the exact univariate standard normal CDF; the
    # previous stats.multivariate_normal.cdf call computed the same scalar
    # quantity via numerical integration (slower, finite tolerance)
    CLsb = 1. - stats.norm.cdf(sqmu)
    CLb = 0.
    if qA >= qmu:
        CLb = stats.norm.cdf(sqA - sqmu)
    else:
        if qA == 0.:
            # degenerate case: no discrimination power at all
            CLsb = 1.
            CLb = 1.
        else:
            CLsb = 1. - stats.norm.cdf((qmu + qA) / (2 * sqA))
            CLb = 1. - stats.norm.cdf((qmu - qA) / (2 * sqA))
    CLs = 0.
    if CLb > 0.:
        CLs = CLsb / CLb
    cl = .95
    root = CLs - 1. + cl
    return root
def determineBrentBracket(mu_hat, sigma_mu, rootfinder):
    """ find a, b for brent bracketing

    :param mu_hat: mu that maximizes likelihood
    :param sigma_mu: error on mu_hat (not too reliable)
    :param rootfinder: function that finds the root (usually root_func)
    :returns: a bracket (a, b) with rootfinder(a) >= 0. >= rootfinder(b)
    :raises SModelSError: if no bracket can be found
    """
    sigma_mu = max(sigma_mu, .5)  # there is a minimum on sigma_mu
    # the root should be roughly at mu_hat + 2*sigma_mu
    a = mu_hat + 1.5 * sigma_mu
    max_steps = 20
    step = 0
    has_fallback = False
    # march a to the left with quadratically growing strides
    # until rootfinder(a) is non-negative
    while rootfinder(a) < 0.:
        step += 1
        a -= (step ** 2.) * sigma_mu
        if step > max_steps or a < -10000.:
            # marching failed; probe a fixed list of candidate points
            for a in [0., 1., -1., 3., -3., 10., -10., .1, -.1, .01, -.01]:
                if rootfinder(a) > 0.:
                    has_fallback = True
                    break
            if not has_fallback:
                logger.error(f"cannot find an a that is left of the root. last attempt, a={a:.2f}, root = {rootfinder(a):.2f}.")
                logger.error(f"mu_hat={mu_hat:.2f}, sigma_mu={sigma_mu:.2f}")
                raise SModelSError(f"cannot find an a that is left of the root. last attempt, a={a:.2f}, root = {rootfinder(a):.2f}.")
    step = 0
    has_fallback = False
    b = mu_hat + 2.5 * sigma_mu
    # symmetrically, march b to the right until rootfinder(b) is non-positive
    while rootfinder(b) > 0.:
        step += 1
        b += (step ** 2.) * sigma_mu
        if step > max_steps:
            for b in [1., 0., 3., -1., 10., -3., .1, -.1, -10., 100.]:
                if rootfinder(b) < 0.:
                    has_fallback = True
                    break
            if not has_fallback:
                logger.error(f"cannot find an b that is right of the root. last attempt, b={b:.2f}, root = {rootfinder(b):.2f}.")
                logger.error(f"mu_hat was at {mu_hat:.2f} sigma_mu at {sigma_mu:.2f}")
                raise SModelSError(f"cannot find an b that is right of the root. last attempt, b={b:.2f}, root = {rootfinder(b):.2f}.")
    return a, b
def rvsFromLimits(upperLimit, expectedUpperLimit, n=1, corr=0.):
    """
    Generates a sample of random variates, given expected and observed
    likelihoods. The likelihood is modelled as a truncated Gaussian.

    :param upperLimit: observed upper limit, as a yield (i.e. unitless)
    :param expectedUpperLimit: expected upper limit, also as a yield
    :param n: sample size
    :param corr: correction term
    :returns: sample of random variates
    """
    # rescale the expected limit in case of an overfluctuation
    if corr > 0. and upperLimit > expectedUpperLimit:
        rel = (upperLimit - expectedUpperLimit) / (upperLimit + expectedUpperLimit)
        expectedUpperLimit = expectedUpperLimit / (1. - corr * rel)
    sigma_exp = expectedUpperLimit / 1.96  # the expected scale
    denominator = np.sqrt(2.) * sigma_exp

    def cl95_root(x):
        # zero of this function is the center of the truncated Gaussian
        return (erf((upperLimit - x) / denominator) + erf(x / denominator)) / \
               (1. + erf(x / denominator)) - .95

    hi = max(upperLimit, expectedUpperLimit)
    if np.sign(cl95_root(0.) * cl95_root(hi)) > 0.:
        # same sign at both edges: brentq cannot bracket the root
        logger.error("when computing likelihood: fA and fB have same sign")
        return None
    mumax = optimize.brentq(cl95_root, 0., hi, rtol=1e-03, xtol=1e-06)
    sample = []
    while len(sample) < n:
        draw = stats.norm.rvs(mumax, sigma_exp)
        # truncation at zero: keep only positive draws
        if draw > 0.:
            sample.append(draw)
    return sample
def deltaChi2FromLlhd(likelihood):
    """ compute the delta chi2 value from a likelihood (convenience function)

    :param likelihood: likelihood value, may be None or zero
    :returns: -2 ln(likelihood); 1e10 for a vanishing likelihood,
              None if likelihood is None
    """
    if likelihood is None:
        return None
    if likelihood == 0.:
        # a very high number stands in for "infinitely bad"
        return 1e10
    return -2. * np.log(likelihood)
def chi2FromLmax(llhd, lmax):
    """ compute the chi2 from likelihood and lmax

    :param llhd: likelihood value
    :param lmax: maximum likelihood value
    :returns: 2 ln(lmax/llhd), or 0. for (numerically) vanishing llhd
    """
    if llhd <= 1e-200:
        # likelihood is numerically zero: no meaningful chi2
        return 0.
    from math import log
    chi2 = 2 * log(lmax / llhd)
    if chi2 < 0. and llhd < 1e-100:
        # numerical inaccuracy
        return 0.
    return chi2
def chi2FromLimits(likelihood, upperLimit, expectedUpperLimit, corr=0.):
    """ compute the chi2 value from a likelihood (convenience function).

    :param likelihood: likelihood value, may be None
    :param upperLimit: observed upper limit, as a yield
    :param expectedUpperLimit: expected upper limit, also as a yield
    :param corr: correction term
    :returns: delta chi2 plus the offset at the origin, or None
    """
    # rescale the expected limit in case of an overfluctuation
    if corr > 0. and upperLimit > expectedUpperLimit:
        rel = (upperLimit - expectedUpperLimit) / (upperLimit + expectedUpperLimit)
        expectedUpperLimit = expectedUpperLimit / (1. - corr * rel)
    sigma_exp = expectedUpperLimit / 1.96  # the expected scale
    dchi2 = deltaChi2FromLlhd(likelihood)
    if dchi2 is None:
        return None
    # offset: twice the log pdf of the Gaussian evaluated at the origin
    l0 = 2. * stats.norm.logpdf(0., 0., sigma_exp)
    return dchi2 + l0