Source code for statistics.basicStats

#!/usr/bin/env python3

"""
.. module:: basicStats
   :synopsis: a module thats collects various statistical codes, that \
   are shared among other submodules.

.. moduleauthor:: Wolfgang Waltenberger <wolfgang.waltenberger@gmail.com>
"""

from scipy import stats
from smodels.base.smodelsLogging import logger
import numpy as np
from smodels.statistics.exceptions import SModelSStatisticsError as SModelSError
from typing import Text, Union, Tuple, Optional
from collections.abc import Callable

__all__ = [ "NllEvalType", "CLsfromNLL", "determineBrentBracket",
            "exponentiateNLL" ]

from enum import Enum

[docs]def exponentiateNLL( nll : Optional[float], doIt : bool = True ) -> float: """ if doIt, then compute likelihood from nll, else return nll. Should become obsolete longterm """ if nll == None: return None #if doIt: # return 0.0 #return 9000.0 if doIt: return np.exp(-nll ) return nll
[docs]class NllEvalType(Enum): """ an enum to account for the different types of likelihood values: observed, a priori evaluationType, a posteriori evaluationType """ observed = 0 aposteriori = 1 apriori = 2
[docs] @classmethod def init ( cls, evaluationType : Union[str,bool] ): """ get evaluationtype either from a string (e.g. 'posteriori') or a bool (true is priori, false is observed) """ evaluationType = str(evaluationType).lower().replace("_","") if evaluationType in [ "posteriori", "aposteriori", "posterior" ]: return cls.aposteriori if evaluationType in [ "apriori", "prior", "priori", "true" ]: return cls.apriori if evaluationType in [ "false", "observed", "obs" ]: return cls.observed raise SModelSError ( f"NllEvalType {evaluationType} unknown" )
def __hash__(self): return hash(self.value) def __eq__ ( self, other ): if type ( other ) == NllEvalType: return super().__eq__ ( other ) if type ( other ) in [ bool, str ]: return super().__eq__ ( NllEvalType.init ( other ) ) raise SModelSError ( f"comparing a NllEvalType with {other}({type(other)})" )
## convenience observed, aposteriori, apriori = NllEvalType.observed, NllEvalType.aposteriori, \ NllEvalType.apriori def clsType ( CLs : float, return_type : str, cl : float = 0.95 ) -> float: assert return_type in [ "1-CLs", "CLs", "CLs-alpha", "alpha-CLs" ], \ f"return_type {return_type} unknown" if return_type == "1-CLs": return 1.0 - CLs if return_type == "CLs": return CLs if return_type == "CLs-alpha": return CLs - 1 + cl return 1 - cl - CLs
[docs]def CLsfromNLL( nllA: float, nll_minA: float, nll: float, nll_min: float, big_muhat : bool, return_type: Text = "CLs-alpha", nSigma : int = 0, report_all : bool = False ) -> Union[float,dict]: """ compute CLs (or similar) from the NLLs :param nllA: negative log likelihood for Asimov data :param nll_minA: negative log likelihood at muhat for Asimov data :param nll: negative log likelihood :param nll_min: negative log likelihood at muhat :param big_muhat: true if muhat>mu :param return_type: (Text) one of "CLs-alpha", "alpha-CLs", "1-CLs", or "CLs": ========== ======================= CLs-alpha: returns CLs - 0.05 alpha-CLs: returns 0.05 - CLs 1-CLs: returns 1-CLs value CLs: returns "raw" CLs value ========== ======================= :param nSigma: compute CLs not for central value but for this number of sigmas (positive or negative), see Equations 86 - 89 in the CCGV paper. :param report_all: if true, report CLs, CLsb :returns: Cls-type value, see above. if report_all, report dict with several entries """ assert return_type in ["CLs-alpha", "alpha-CLs", "1-CLs", "CLs"], \ f"Unknown return type: {return_type}." qmu = 0.0 if ( nll < nll_min or big_muhat ) else 2 * (nll - nll_min) sqmu = np.sqrt(qmu) qA = 2 * (nllA - nll_minA) if qA < 0.0: qA = 0.0 sqA = np.sqrt(qA) if qA >= qmu: CLsb = 1.0 - stats.norm.cdf(sqmu + nSigma ) CLb = stats.norm.cdf(sqA - sqmu + nSigma ) else: CLsb = 1. CLb = 1. if qA != 0.: teststat = (qmu - qA) / (2 * sqA) ## for pyhf comparison CLsb = 1. - stats.norm.cdf((qmu + qA) / (2 * sqA) + nSigma ) CLb = 1. - stats.norm.cdf( teststat + nSigma ) CLs = CLsb / CLb if CLb > 0 else 0.0 ret = float ( clsType ( CLs, return_type, 0.95 ) ) if report_all: r = { "CLs": float(CLs), "CLsb": float(CLsb), "CLb": float(CLb), "returns": ret } r["sqmu"]=float(sqmu) r["sqA"]=float(sqA) return r return ret
def CLsWithErrorsfromNLL( nllA: float, nll_minA: float, nll: float, nll_min: float, s_nllA: float, s_nll_minA : float, s_nll: float, s_nll_min : float, big_muhat : bool, return_type: Text = "CLs-alpha", nSigma : int = 0 ) -> Tuple[float,float]: """ compute CLs (or similar) from the NLLs :param nllA: negative log likelihood for Asimov data :param nll_minA: negative log likelihood at muhat for Asimov data :param nll: negative log likelihood :param nll_min: negative log likelihood at muhat :param s_nll*: the nlls' errors :param big_muhat: true if muhat>mu :param return_type: (Text) one of "CLs-alpha", "alpha-CLs", "1-CLs", or "CLs": ========== ======================= CLs-alpha: returns CLs - 0.05 alpha-CLs: returns 0.05 - CLs 1-CLs: returns 1-CLs value CLs: returns "raw" CLs value ========== ======================= :param nSigma: compute CLs not for central value but for this number of sigmas (positive or negative), see Equations 86 - 89 in the CCGV paper. :returns: Cls-type value, see above """ assert return_type in ["CLs-alpha", "alpha-CLs", "1-CLs", "CLs"], \ f"Unknown return type: {return_type}." qmu = 0.0 if ( nll < nll_min or big_muhat ) else 2 * (nll - nll_min) var_qmu = 0.0 if nll >= nll_min and not big_muhat: var_qmu = 4 * ( s_nll**2 + s_nll_min**2 ) sqrt_qmu = np.sqrt(qmu) var_sqrt_qmu = 0. if sqrt_qmu > 0.: var_sqrt_qmu = var_qmu / ( 4 * qmu ) qA = 2 * (nllA - nll_minA) var_qA = 4 * ( s_nllA**2 + s_nll_minA**2 ) if qA < 0.0: qA = 0.0 var_qA = 0. sqrt_qA = np.sqrt(qA) var_sqrt_qA = 0. if sqrt_qA > 0.: var_sqrt_qA = var_qA / ( 4 * qA ) if qA >= qmu: CLsb = 1.0 - stats.norm.cdf(sqrt_qmu + nSigma ) var_CLsb = stats.norm.pdf(sqrt_qmu + nSigma )**2 * ( var_sqrt_qmu ) x = sqrt_qA - sqrt_qmu + nSigma var_x = var_sqrt_qA + var_sqrt_qmu CLb = stats.norm.cdf( x ) var_CLb = stats.norm.pdf( x )**2 * var_x else: s_CLsb, s_CLb = 0., 0. CLsb, CLb = 1., 1. var_CLsb, var_CLb = 0., 0. # FIXME is that a good choice? if qA != 0.: x = (qmu + qA) / (2 * sqrt_qA) + nSigma var_x = var_qmu / ( 4* qA ) + var_sqrt_qA / 4 CLsb = 1. - stats.norm.cdf( x ) var_CLsb = stats.norm.pdf( x )**2 * var_x x2 = ( qmu - qA) / (2 * sqrt_qA ) + nSigma # s_x2 = s_sqrt_qmu / ( 2*sqrt_qA ) + s_sqrt_qA / 2. # (var_x2) is the same as var_x CLb = 1. - stats.norm.cdf( x2 ) var_CLb = stats.norm.pdf ( x2 )**2 * var_x CLs = 0. s_CLs = 0. if CLb > 0: CLs = CLsb / CLb if abs(CLs)>1e-8: ## CLs is close to zero, so this wont work var_CLs = var_CLsb / CLb**2 + var_CLb * CLsb**2 / (CLb**4) s_CLs = float ( np.sqrt ( var_CLs )) # s_CLs is the same for them all ret = float ( clsType ( CLs, return_type, 0.95 ) ) return ( ret, s_CLs ) def findRoot ( func : Callable, lower_bound : float, upper_bound : float, args : tuple = (), rtol : float = 8.881784197001252e-16, xtol : float = 2e-12 ) -> float: """ find the root of the function "func", within [lower_bound,upper_bound]. We first try the faster toms748. If that doesnt converge, we fall back to brentq. :param func: the callable function to find the root for :param lower_bound: lower end of bracket :param upper_bound: upper end of bracket :param rtol: rtol for both toms748 and brentq :param xtol: xtol for both toms748 and brentq :returns: place where function evaluates to zero """ logger.debug("Starting root finding") from scipy import optimize root = optimize.toms748( func, lower_bound, upper_bound, args=args, rtol=rtol, xtol=xtol, full_output=True ) if root[1].converged: root = root[0] else: root = optimize.brentq( func, lower_bound, upper_bound, args=args, rtol=rtol, xtol=xtol ) return root
[docs]def determineBrentBracket( mu_hat : float, sigma_mu : float , rootfinder : Callable, allowNegative : bool = True, args : dict = {}, verbose : bool = True ) -> Tuple: """find a, b for brent bracketing :param mu_hat: mu that maximizes likelihood :param sigm_mu: error on mu_hat (not too reliable) :param rootfinder: function that finds the root (usually root_func) :param allowNegative: if False, then do not allow a or b to become negative :returns: the interval a,b """ msg = logger.error if verbose else logger.debug sigma_mu = max(sigma_mu, 0.5) # there is a minimum on sigma_mu sigma_mu = min(sigma_mu, 100.) # there is a maximum on sigma_mu # the root should be roughly at mu_hat + 2*sigma_mu a = mu_hat + 1.5 * sigma_mu ra = rootfinder(a,**args) ntrials = 20 i = 0 foundExtra = False avalues = [ 0.0, 1.0, -1.0, 3.0, -3.0, 10.0, -10.0, 0.1, -0.1, 0.01, -0.01, .001, -.001, 100., -100., .0001, -.0001 ] if not allowNegative: avalues = [0.0, 1.0, 3.0, 10.0, 0.1, 0.01, .001, 100., .0001 ] while ra < 0.0: # if this is negative, we move it to the left i += 1 a -= (i**2.0) * sigma_mu if not allowNegative and a < 0: a = 0 ra = rootfinder(a,**args) if i > ntrials or a < -10000.0 or ra is None: for a in avalues: ra = rootfinder(a,**args) if ra is None: # if cls computation failed, try with next a value continue if ra > 0.0: foundExtra = True break if not foundExtra: msg( f"cannot find an a that is left of the root. last attempt, a={a:.2f}, root = {ra:.2f}." ) msg(f"mu_hat={mu_hat:.2f}, sigma_mu={sigma_mu:.2f}") raise SModelSError( f"cannot find an a that is left of the root. last attempt, a={a:.2f}, root = {ra:.2f}." ) i = 0 foundExtra = False b = mu_hat + 2.5 * sigma_mu rb = rootfinder(b,**args) bvalues = [1.0, 0.0, 3.0, -1.0, 10.0, -3.0, 0.1, -0.1, -10.0, 100.0, -100.0, 1000.0, .01, -.01, .001, -.001, 10000.0, 100000.0, 1000000.0 ] if not allowNegative: bvalues = [ 1.0, 0.0, 3.0, 10.0, 0.1, 100.0, 1000.0, .01, .001, 10000.0, 100000.0, 1000000.0 ] while rb > 0.0: # if this is positive, we move it to the right i += 1 b += (i**2.0) * sigma_mu rb = rootfinder(b,**args) if rb is None: # if cls computation failed, try with a bigger b value continue closestr, closest = float("inf"), None memoize = {} if i > ntrials or ( b < 0 and not allowNegative ): for b in bvalues: rb = rootfinder(b,**args) memoize[b]=float(rb) if rb is None: # if cls computation failed, try with next b value continue if rb < 0.0: foundExtra = True break if rb < closestr: closestr = rb closest = b if not foundExtra: msg(f"cannot find a b that is right of the root (i.e. rootfinder(b) < 0).") msg(f"memoize {memoize}" ) msg(f"closest to zero rootfinder({closest})={closestr}") msg(f"mu_hat was at {mu_hat:.2f} sigma_mu at {sigma_mu:.2f}") raise SModelSError( f"cannot find b right of the root" ) return a, b
def deltaChi2FromLlhd(likelihood): """compute the delta chi2 value from a likelihood (convenience function)""" if likelihood == 0.0: return 1e10 # a very high number is good elif likelihood is None: return None return -2.0 * np.log(likelihood)