Source code for experiment.databaseObj

#!/usr/bin/env python3

"""
.. module:: databaseObj
   :synopsis: Contains Database class that represents the database of experimental results.

.. moduleauthor:: Veronika Magerl <v.magerl@gmx.at>
.. moduleauthor:: Andre Lessa <lessa.a.p@gmail.com>
.. moduleauthor:: Wolfgang Waltenberger <wolfgang.waltenberger@gmail.com>
.. moduleauthor:: Matthias Wolf <matthias.wolf@wot.at>
"""

from __future__ import print_function
import os
import hashlib
import pathlib
import sys
import time
import copy
import io
from smodels.experiment import datasetObj
from smodels.installation import cacheDirectory
from smodels.experiment.metaObj import Meta
from smodels.experiment.expResultObj import ExpResult
from smodels.experiment.expSMSDict import ExpSMSDict
from smodels.experiment.exceptions import DatabaseNotFoundException
from smodels.base.physicsUnits import TeV
from smodels.experiment.expAuxiliaryFuncs import cleanWalk
from smodels.experiment.exceptions import SModelSExperimentError as SModelSError
from smodels.base.smodelsLogging import logger
import logging
os.environ["OMP_NUM_THREADS"] = "2"

scipyver = ""
try:
    from importlib.metadata import version
    scipyver = version("scipy")
except Exception as e:
    try:
        from scipy import __version__ as scipyver
    except Exception as e:
        pass
if not any(scipyver.startswith(v) for v in ("1.8.", "1.9.", "1.10.", "2.0.", "2.1.")):
    # fix for pickling different scipy versions (1.7.x vs 1.8.x)
    # so that databases pickled with scipy 1.8.x still work with scipy 1.7.x
    import scipy.spatial
    if not hasattr(scipy.spatial, "_qhull") and hasattr(scipy.spatial, "qhull"):
        sys.modules["scipy.spatial._qhull"] = scipy.spatial.qhull

try:
    import cPickle as serializer
except ImportError as e:
    import pickle as serializer


def _getSHA1(filename):
    return hashlib.sha1(pathlib.Path(filename).read_bytes()).hexdigest()
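
# Illustrative sketch (not part of the original module): _getSHA1 is used further
# below, in fetchFromScratch, to compare a downloaded database pickle against the
# "sha1" field of the server-side json description. The cache file name used here
# is a hypothetical example.
def _exampleChecksumCheck(expected_sha1):
    """ return True if a cached pickle exists and matches the published sha1 """
    cached = os.path.join(cacheDirectory(create=True), "official.pcl")  # hypothetical file name
    return os.path.exists(cached) and _getSHA1(cached) == expected_sha1
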

# some mechanism to remove lock files if the download got interrupted
import atexit
lockfiles = set() 

def removeLockFiles(lockfiles):
    """ remove cruft lockfiles """
    for l in lockfiles:
        if os.path.exists(l):
            try:
                os.unlink(l)
            except FileNotFoundError as e:
                pass
    lockfiles = set()

atexit.register ( removeLockFiles, lockfiles )

class Database(object):
    """
    Database object. Holds a list of SubDatabases and the ExpSMS map.
    Delegates all calls to SubDatabases.
    """

    def __init__(self, base=None, force_load=None, progressbar=False,
                 subpickle=True, combinationsmatrix=None):
        """
        :param base: path to the database, or pickle file (string), or http address.
                     If None, "official", or "official_fastlim", use the official
                     database for your code version (including fastlim results,
                     if specified). If "latest", or "latest_fastlim", check for the
                     latest database. Multiple databases may be specified using
                     '+' as a delimiter.
        :param force_load: force loading the text database ("txt"), or binary
                           database ("pcl"), dont force anything if None
        :param progressbar: show a progressbar when building the pickle file
                            (needs the python-progressbar module)
        :param subpickle: produce small pickle files per exp result. Should only
                          be used when working on the database.
        :param combinationsmatrix: an optional dictionary that contains info about
                     combinable analyses, e.g. { "anaid1": ( "anaid2", "anaid3" ) },
                     optionally specifying signal regions, e.g.
                     { "anaid1:SR1": ( "anaid2:SR2", "anaid3" ) }
        """
        self.subs = []
        if "_fastlim" in base:  # for backwards compatibility
            base = base.replace("_fastlim", "+fastlim")
        sstrings = base.split("+")
        for ss in sstrings:
            self.subs.append(SubDatabase(ss, force_load, progressbar,
                                         subpickle, combinationsmatrix))
        # Compute SMS dict with all results
        self._allExpSMSDict = ExpSMSDict(self.expResultList)
        # Filter results using the default options
        # (i.e. remove non-validated results, ...)
        self.selectExpResults()

    @property
    def expResultList(self):
        """
        The combined list of results, compiled from the active results
        in each subdatabase.
        """
        if len(self.subs) == 0:
            return []
        else:
            lists = [x.expResultList for x in self.subs]
            return self.mergeLists(lists)

    def mergeLists(self, lists):
        """ small function, merges lists of ERs """
        D = {}
        for tmp in lists:
            for t in tmp:
                if len(t.datasets) == 0:  # skip empty entries
                    logger.warning(f"Analysis {t.globalInfo.id} has no datasets. Will remove it.")
                    continue
                anaid = t.globalInfo.id + t.datasets[0].getType()
                if anaid not in D:
                    D[anaid] = t
                else:
                    D[anaid] = self.mergeERs(D[anaid], t)
        return list(D.values())

    def mergeERs(self, o1, r2):
        """ merge the content of exp results o1 and r2 """
        r1 = copy.deepcopy(o1)
        r1.globalInfo = r2.globalInfo
        dids = [x.getID() for x in o1.datasets]
        for ds in r2.datasets:
            if not ds.getID() in dids:
                # completely new dataset
                r1.datasets.append(ds)
            else:
                # just overwrite the old txnames
                idx = dids.index(ds.getID())  # ds index
                r2txs = ds.txnameList
                r1txnames = [x.txName for x in r1.datasets[idx].txnameList]
                for txn in r2txs:
                    if txn.txName in r1txnames:
                        # overwrite
                        tidx = r1txnames.index(txn.txName)
                        r1.datasets[idx].txnameList[tidx] = txn
                    else:
                        # a new txname
                        r1.datasets[idx].txnameList.append(txn)
        return r1

    def createBinaryFile(self, filename=None):
        """ create a pcl file from all the subs """
        ## make sure we have a model to pickle with the database!
        logger.debug(" * create %s" % filename)
        if filename is None:
            filename = self.pcl_meta.pathname
        with open(filename, "wb") as f:
            logger.debug(" * load text database")
            logger.debug(" * write %s db version %s" % (filename, self.databaseVersion))
            ## 4 is the default protocol in python3.8, and the highest protocol in 3.7
            ptcl = min(4, serializer.HIGHEST_PROTOCOL)
            serializer.dump(self.txt_meta, f, protocol=ptcl)
            serializer.dump(self.expResultList, f, protocol=ptcl)
            serializer.dump(self.databaseParticles, f, protocol=ptcl)
            logger.info("%s created." % (filename))

    def __str__(self):
        # r = [ str(x) for x in self.subs ]
        # return "+".join(r)
        idList = "Database version: " + self.databaseVersion
        idList += "\n"
        idList += "-" * len(idList) + "\n"
        if self.expResultList is None:
            idList += "no experimental results available! "
            return idList
        idList += "%d experimental results: " % len(self.expResultList)
        atlas, cms = [], []
        datasets = 0
        txnames = 0
        s = {8: 0, 13: 0}
        for expRes in self.expResultList:
            Id = expRes.globalInfo.getInfo('id')
            sqrts = expRes.globalInfo.getInfo('sqrts').asNumber(TeV)
            if not sqrts in s.keys():
                s[sqrts] = 0
            s[sqrts] += 1
            datasets += len(expRes.datasets)
            for ds in expRes.datasets:
                txnames += len(ds.txnameList)
            if "ATLAS" in Id:
                atlas.append(expRes)
            if "CMS" in Id:
                cms.append(expRes)
        idList += "%d CMS, %d ATLAS, " % (len(cms), len(atlas))
        for sqrts in s.keys():
            idList += "%d @ %d TeV, " % (s[sqrts], sqrts)
            # idList += expRes.globalInfo.getInfo('id') + ', '
        idList = idList[:-2] + '\n'
        idList += "%d datasets, %d txnames.\n" % (datasets, txnames)
        return idList

    def __eq__(self, other):
        if type(other) != type(self):
            return False
        for x, y in zip(self.subs, other.subs):
            if x != y:
                return False
        return True

    def getExpResults(self, analysisIDs=['all'], datasetIDs=['all'], txnames=['all'],
                      dataTypes=['all'], useNonValidated=False, onlyWithExpected=False):
        """
        Select (filter) the results within the database satisfying the restrictions
        set by the arguments, and return the corresponding results.
        """
        self.selectExpResults(analysisIDs=analysisIDs, datasetIDs=datasetIDs,
                              txnames=txnames, dataTypes=dataTypes,
                              useNonValidated=useNonValidated,
                              onlyWithExpected=onlyWithExpected)
        return self.expResultList[:]

    def selectExpResults(self, analysisIDs=['all'], datasetIDs=['all'], txnames=['all'],
                         dataTypes=['all'], useNonValidated=False, onlyWithExpected=False):
        """
        Selects (filters) the results within the database satisfying the restrictions
        set by the arguments and updates the centralized SMS dictionary.

        :param analysisIDs: list of analysis ids ([CMS-SUS-13-006,...]).
                            Can be wildcarded with usual shell wildcards: * ? [<letters>]
                            Furthermore, the centre-of-mass energy can be chosen as
                            suffix, e.g. ":13*TeV". Note that the asterisk in the
                            suffix is not a wildcard.
        :param datasetIDs: list of dataset ids ([ANA-CUT0,...]).
                           Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param txnames: list of txnames ([TChiWZ,...]).
                        Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param dataTypes: dataType of the analysis (all, efficiencyMap or upperLimit).
                          Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param useNonValidated: If False, the results with validated = False
                                will not be included
        :param onlyWithExpected: Return only those results that also have expected
                                 values. Note that this is trivially fulfilled for
                                 all efficiency maps.
        """
        for sub in self.subs:
            sub.setActiveExpResults(analysisIDs, datasetIDs, txnames,
                                    dataTypes, useNonValidated, onlyWithExpected)
        # Update SMS map
        expDict = self._allExpSMSDict
        self.expSMSDict = expDict.filter(self.expResultList)

    def getExpSMS(self):
        """ Returns all the SMS present in the selected experimental results """
        return list(self.expSMSDict._smsDict.keys())

    @property
    def databaseParticles(self):
        """ Database particles, a list, one entry per sub """
        r = [x.databaseParticles for x in self.subs]
        return r[0]  # FIXME do sth smarter?

    @property
    def databaseVersion(self):
        """
        The version of the database, concatenation of the individual versions
        """
        r = [x.databaseVersion for x in self.subs]
        for i, ri in enumerate(r):
            # avoid repetitions
            if ri is None:
                r[i] = "unknown"
            for j, rj in enumerate(r[i+1:]):
                if ri in rj:
                    r[i+j+1] = rj.replace(ri, "")
        return "+".join(r)

    @property
    def txt_meta(self):
        """
        The meta info of the text version, a merger of the original ones
        """
        r = [x.txt_meta for x in self.subs]
        ret = r[0]
        return ret

    @property
    def pcl_meta(self):
        """
        The meta info of the pickled version, a merger of the original ones
        """
        ret = None
        r = []
        for x in self.subs:
            if hasattr(x, "pcl_meta"):
                r.append(x.pcl_meta)
        ret = r[0]
        return ret

    def createLinksToCombinationsMatrix(self):
        """
        in all globalInfo objects, create a shallow link to the combinations matrix
        """
        for x in self.subs:
            if not hasattr(x, "combinationsmatrix") or x.combinationsmatrix is None:
                x.combinationsmatrix = self.combinationsmatrix
            x.createLinksToCombinationsMatrix()

    def clearLinksToCombinationsMatrix(self):
        """ clear all shallow links to the combinations matrix """
        self.combinationsmatrix = None
        for x in self.subs:
            x.combinationsmatrix = None
            x.clearLinksToCombinationsMatrix()

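
# Usage sketch (illustrative only, not called anywhere in this module): construct a
# Database and narrow it down to a subset of results, as described in the docstrings
# above. "official" and the analysis ids below are example values; shell-style
# wildcards are allowed in all filter arguments.
def _exampleDatabaseUsage():
    db = Database("official")  # a path, a .pcl file, a URL, or a label like "official"
    db.selectExpResults(analysisIDs=["CMS-SUS-13-006", "ATLAS-SUSY-2018-*"],
                        dataTypes=["upperLimit"])
    print(db)  # summary via Database.__str__
    for er in db.expResultList:
        print(er.globalInfo.id)
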

class SubDatabase(object):
    """
    SubDatabase object. Holds a list of ExpResult objects.
    """

    def __init__(self, base=None, force_load=None, progressbar=False,
                 subpickle=True, combinationsmatrix=None):
        """
        :param base: path to the database, or pickle file (string), or http address.
                     If None, "official", or "official_fastlim", use the official
                     database for your code version (including fastlim results,
                     if specified). If "latest", or "latest_fastlim", check for the
                     latest database. Multiple databases may be named, use "+" as
                     delimiter. Order matters: results with the same name will be
                     overwritten according to the sequence.
        :param force_load: force loading the text database ("txt"), or binary
                           database ("pcl"), dont force anything if None
        :param progressbar: show a progressbar when building the pickle file
                            (needs the python-progressbar module)
        :param subpickle: produce small pickle files per exp result. Should only
                          be used when working on the database.
        :param combinationsmatrix: an optional dictionary that contains info about
                     combinable analyses, e.g. { "anaid1": ( "anaid2", "anaid3" ) },
                     optionally specifying signal regions, e.g.
                     { "anaid1:SR1": ( "anaid2:SR2", "anaid3" ) }
        """
        self.url = base
        self.combinationsmatrix = combinationsmatrix
        self.source = ""
        if force_load is None and base.endswith(".pcl"):
            force_load = "pcl"
        self.force_load = force_load
        self.subpickle = subpickle
        obase = base  # keep old name for more checks for 'latest'
        from smodels.installation import __dblabels__
        if base in __dblabels__:
            from smodels.installation import databasePath
            base = databasePath(base)

        base, pclfile = self.checkPathName(base)
        self.pcl_meta = Meta(pclfile)
        self._allExpResults = []
        self._activeResults = []
        self.txt_meta = self.pcl_meta
        if not self.force_load == "pcl":
            self.txt_meta = Meta(base)
        self.progressbar = None
        if progressbar:
            try:
                import progressbar as P
                self.progressbar = P.ProgressBar(widgets=["Building Database ", P.Percentage(),
                                                 P.Bar(marker=P.RotatingMarker()), P.ETA()])
            except ImportError as e:
                logger.warning("progressbar requested, but python-progressbar is not installed.")

        if self.force_load == "txt":
            self._setParticles()
            self.loadTextDatabase()
            self.txt_meta.printFastlimBanner()
            return
        if self.force_load == "pcl":
            self.loadBinaryFile()
            self._setParticles()
            self.pcl_meta.printFastlimBanner()
            if "latest" in obase:
                from smodels import installation
                codeVersion = installation.version()
                pclVersion = self.pcl_meta.databaseVersion
                if codeVersion[0] != pclVersion[0]:
                    logger.error("major versions of code and database differ! code=%s, database=%s"
                                 % (codeVersion[0], pclVersion[0]))
            return
        if self.force_load in [None, "none", "None"]:
            self.loadDatabase()
            self._setParticles()
            self.txt_meta.printFastlimBanner()
            return
        logger.error("when initialising database: force_load=%s is not "
                     "recognized. Valid values are: pcl, txt, None." % force_load)
        raise SModelSError()

    def __eq__(self, other):
        """ compare two databases """
        if type(self) != type(other):
            return False
        if not self.txt_meta.sameAs(other.txt_meta):
            return False
        if len(self.expResultList) != len(other.expResultList):
            return False
        for (myres, otherres) in zip(self.expResultList, other.expResultList):
            if myres != otherres:
                return False
        return True

    @property
    def expResultList(self):
        """
        The list of active results.
        """
        return self._activeResults[:]

    @expResultList.setter
    def expResultList(self, value):
        """
        If a results list is defined for the database, store it in
        _allExpResults and reset the active list.
        """
        self._allExpResults = value
        self._activeResults = self._allExpResults[:]

    def loadDatabase(self):
        """
        If no binary file is available, load the text database and create the
        binary file. If a binary file is available, check whether it needs an
        update and create a new binary file in case it does.
        """
        if not os.path.exists(self.pcl_meta.pathname):
            logger.info("Creating binary database ")
            logger.info("(this may take a few minutes, but it's done only once!)")
            self.loadTextDatabase()
            self.createBinaryFile()
        else:
            if self.needsUpdate():
                self.createBinaryFile()
            else:
                self.loadBinaryFile(lastm_only=False)

    def loadTextDatabase(self):
        """ simply loads the text database """
        if self.txt_meta.databaseVersion and len(self.expResultList) > 0:
            logger.debug("Asked to load database, but it has already been loaded. Ignore.")
            return
        logger.info("Parsing text database at %s" % self.txt_meta.pathname)
        self.expResultList = self._loadExpResults()
        self.createLinksToModel()
        self.createLinksToCombinationsMatrix()

    def createLinksToModel(self):
        """
        in all globalInfo objects, create links to self.databaseParticles
        """
        if not hasattr(self, "databaseParticles"):
            return
        if self.databaseParticles is None:
            return
        for ctr, er in enumerate(self.expResultList):
            if not hasattr(er.globalInfo, "_databaseParticles"):
                er.globalInfo._databaseParticles = self.databaseParticles
            elif er.globalInfo._databaseParticles is None:
                er.globalInfo._databaseParticles = self.databaseParticles

    def createLinksToCombinationsMatrix(self):
        """
        in all globalInfo objects, create links to self.combinationsmatrix
        """
        if not hasattr(self, "combinationsmatrix"):
            return
        if self.combinationsmatrix is None:
            return
        for er in self.expResultList:
            if not hasattr(er.globalInfo, "_combinationsmatrix"):
                er.globalInfo._combinationsmatrix = self.combinationsmatrix
            elif er.globalInfo._combinationsmatrix is None:
                er.globalInfo._combinationsmatrix = self.combinationsmatrix

    def clearLinksToCombinationsMatrix(self):
        for er in self.expResultList:
            if hasattr(er.globalInfo, "_combinationsmatrix"):
                del er.globalInfo._combinationsmatrix

    def removeLinksToModel(self):
        """
        remove the links of globalInfo._databaseParticles to the model.
        Currently not used.
        """
        for er in self.expResultList:
            if hasattr(er.globalInfo, "_databaseParticles"):
                del er.globalInfo._databaseParticles

    def loadBinaryFile(self, lastm_only=False):
        """
        Load a binary database, returning last modified, file count, database.

        :param lastm_only: if true, the database itself is not read.
        :returns: database object, or None, if lastm_only == True.
        """
        if lastm_only and self.pcl_meta.mtime:
            ## no need to load the database, mtime is already loaded
            return None
        if not os.path.exists(self.pcl_meta.pathname):
            return None
        try:
            with open(self.pcl_meta.pathname, "rb") as f:
                t0 = time.time()
                pclfilename = self.pcl_meta.pathname
                self.pcl_meta = serializer.load(f)
                self.pcl_meta.pathname = pclfilename
                if self.force_load == "pcl":
                    self.txt_meta = self.pcl_meta
                if not lastm_only:
                    if not self.force_load == "pcl" and self.pcl_meta.needsUpdate(self.txt_meta):
                        logger.warning("Something changed in the environment. Regenerating.")
                        self.createBinaryFile()
                        return self
                    logger.info("loading binary db file %s format version %s" %
                                (self.pcl_meta.pathname, self.pcl_meta.format_version))
                    if sys.version[0] == "2":
                        self.expResultList = serializer.load(f)
                    else:
                        self.expResultList = serializer.load(f, encoding="latin1")
                    t1 = time.time() - t0
                    logger.info("Loaded database from %s in %.1f secs." %
                                (self.pcl_meta.pathname, t1))
                    self.databaseParticles = None
                    try:
                        self.databaseParticles = serializer.load(f)
                    except EOFError as e:
                        pass  # a model does not *have* to be defined
                    self.createLinksToModel()
                    self.createLinksToCombinationsMatrix()
        except (EOFError, ValueError) as e:
            os.unlink(self.pcl_meta.pathname)
            if lastm_only:
                self.pcl_meta.format_version = -1
                self.pcl_meta.mtime = 0
                return self
            logger.error("%s is not readable (%s)." % (self.pcl_meta.pathname, str(e)))
            if self.source in ["http", "ftp", "pcl"]:
                logger.error("source cannot be rebuilt. supply a different path to the database in your ini file.")
                raise SModelSError()
            self.createBinaryFile()
        # self.txt_meta = self.pcl_meta
        return self

    def checkBinaryFile(self):
        nu = self.needsUpdate()
        logger.debug("Checking binary db file.")
        logger.debug("Binary file dates to %s(%d)" %
                     (time.ctime(self.pcl_meta.mtime), self.pcl_meta.filecount))
        logger.debug("Database dates to %s(%d)" %
                     (time.ctime(self.txt_meta.mtime), self.txt_meta.filecount))
        if nu:
            logger.info("Binary db file needs an update.")
        else:
            logger.info("Binary db file does not need an update.")
        return nu

    def needsUpdate(self):
        """ does the binary db file need an update? """
        try:
            self.loadBinaryFile(lastm_only=True)
            # logger.error( "needs update?" )
            return (self.pcl_meta.needsUpdate(self.txt_meta))
        except (IOError, DatabaseNotFoundException, TypeError, ValueError):
            # if we encounter a problem, we rebuild the database.
            return True

    def createBinaryFile(self, filename=None):
        """
        create a pcl file from the text database,
        potentially overwriting an old pcl file.
        """
        ## make sure we have a model to pickle with the database!
        if self.txt_meta is None:
            logger.error("Trying to create database pickle, but no txt_meta defined.")
            raise SModelSError()
        logger.debug("database timestamp: %s, filecount: %s" %
                     (time.ctime(self.txt_meta.mtime), self.txt_meta.filecount))
        binfile = filename
        if binfile is None:
            binfile = self.pcl_meta.pathname
        if not hasattr(self, 'databaseParticles') or self.databaseParticles is None:
            self._setParticles(self._getParticles())
        logger.debug(" * create %s" % binfile)
        with open(binfile, "wb") as f:
            logger.debug(" * load text database")
            self.loadTextDatabase()
            logger.debug(" * write %s db version %s, format version %s, %s" %
                         (binfile, self.txt_meta.databaseVersion,
                          self.txt_meta.format_version, self.txt_meta.cTime()))
            # ptcl = serializer.HIGHEST_PROTOCOL
            # 4 is the default protocol in python3.8, and the highest protocol in 3.7
            ptcl = min(4, serializer.HIGHEST_PROTOCOL)
            serializer.dump(self.txt_meta, f, protocol=ptcl)
            serializer.dump(self.expResultList, f, protocol=ptcl)
            serializer.dump(self.databaseParticles, f, protocol=ptcl)
            logger.info("%s created." % (binfile))

    @property
    def databaseVersion(self):
        """
        The version of the database, read from the 'version' file.
        """
        return self.txt_meta.databaseVersion

    @databaseVersion.setter
    def databaseVersion(self, x):
        self.txt_meta.databaseVersion = x
        self.pcl_meta.databaseVersion = x

    def inNotebook(self):
        """
        Are we running within a notebook? Has an effect on the
        progressbar we wish to use.
        """
        try:
            cfg = get_ipython().config
            if 'IPKernelApp' in cfg.keys():
                return True
            else:
                return False
        except NameError:
            return False

    @property
    def base(self):
        """
        This is the path to the base directory.
        """
        return self.txt_meta.pathname

    def lockFile(self, filename: os.PathLike):
        """ lock the file <filename> """
        lockfile = os.path.join(os.path.dirname(filename),
                                ".lock_" + os.path.basename(filename))
        ctr = 0
        while ctr < 5:
            ctr += 1
            if not os.path.exists(lockfile):
                f = open(lockfile, "wt")
                f.write(f"# this is a temporary lockfile created {time.asctime()}\n")
                f.write(f"# meant to prevent multiple, parallel downloads of\n")
                f.write(f"# {filename}\n")
                f.close()
                lockfiles.add(lockfile)
                return True
            # we have a lockfile. lets see how old it is
            s = os.stat(lockfile)
            t = (time.time() - s.st_mtime) / 60.  # time in minutes
            # if older than 3 hours, then disregard the lockfile
            if t > 180:
                lockfiles.add(lockfile)
                return True
            time.sleep(ctr)
        logger.error(f"File {filename} is locked, probably because another process is already downloading. Remove {lockfile} if you feel it is safe to retry.")
        sys.exit()
        """
        import fcntl  # does not work on all file systems
        fcntl.lockf( handle, fcntl.LOCK_EX)
        """

    def unlockFile(self, filename: os.PathLike):
        """ unlock the file <filename> """
        lockfile = os.path.join(os.path.dirname(filename),
                                ".lock_" + os.path.basename(filename))
        if lockfile in lockfiles:
            lockfiles.remove(lockfile)
        if os.path.exists(lockfile):
            try:
                os.unlink(lockfile)
            except FileNotFoundError as e:
                pass
        # import fcntl  # does not work on all filesystems
        # fcntl.lockf(handle, fcntl.LOCK_UN)

    def fetchFromScratch(self, path, store):
        """
        fetch database from scratch, together with description.

        :param store: filename to store the json file.
        """
        def sizeof_fmt(num, suffix='B'):
            for unit in ['', 'K', 'M', 'G', 'T', 'P']:
                if abs(num) < 1024.:
                    return "%3.1f%s%s" % (num, unit, suffix)
                num /= 1024.0
            return "%.1f%s%s" % (num, 'Yi', suffix)

        import requests
        try:
            r = requests.get(path, timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error("Exception when trying to fetch database: %s" % e)
            logger.error("Consider supplying a different database path in the ini file (possibly a local one)")
            raise SModelSError()
        if r.status_code != 200:
            line = "Error %d: could not fetch '%s' from server: '%s'" % \
                   (r.status_code, path, r.reason)
            logger.error(line)
            raise SModelSError(line)
        ## its new so store the description
        with open(store, "w") as f:
            f.write(r.text)
        if not "url" in r.json().keys():
            logger.error("cannot parse json file %s." % path)
            raise SModelSError()
        size = r.json()["size"]
        cDir, defused = cacheDirectory(create=True, reportIfDefault=True)
        t0 = time.time()
        filename = os.path.join(cDir, r.json()["url"].split("/")[-1])
        if os.path.exists(filename):
            # if the file exists and the checksums match, we dont download
            if "sha1" in r.json():
                sha = _getSHA1(filename)
                if sha == r.json()["sha1"]:
                    ## seems it hasnt changed
                    self.force_load = "pcl"
                    return ("./", "%s" % filename)
        r2 = requests.get(r.json()["url"], stream=True, timeout=(250, 2000))
        # filename = os.path.join ( cDir, r2.url.split("/")[-1] )
        msg = "downloading the database from %s and caching in %s." % (path, cDir)
        if defused:
            msg += " If you want the pickled database file to be cached in a different location, set the environment variable SMODELS_CACHEDIR, e.g. to '/tmp'."
        logger.warning(msg)
        logger.info("need to fetch %s and store in %s. size is %s." %
                    (r.json()["url"], filename, sizeof_fmt(size)))
        self.lockFile(filename)
        with open(filename, "wb") as dump:
            if not self.inNotebook():  # \r doesnt work in notebook
                print(" " + " " * 51 + "<", end="\r")
            print("loading >", end="")
            for x in r2.iter_content(chunk_size=int(size / 50)):
                dump.write(x)
                dump.flush()
                print(".", end="")
                sys.stdout.flush()
            if self.inNotebook():
                print("done.")
            else:
                print("")
            dump.close()
        self.unlockFile(filename)
        sha = _getSHA1(filename)
        testsha = r.json()["sha1"]
        if sha != testsha:
            logger.error(f"error: downloaded file has different checksum {sha}!={testsha}. This should not happen. Contact the smodels-developers <smodels-developers@lists.oeaw.ac.at>")
            # sys.exit()
        logger.info("fetched %s in %d secs." % (r2.url, time.time() - t0))
        logger.debug("store as %s" % filename)
        self.force_load = "pcl"
        return ("./", "%s" % filename)

    def fetchFromServer(self, path):
        import requests
        import time
        import json
        self.source = "http"
        if "ftp://" in path:
            self.source = "ftp"
        cDir = cacheDirectory(create=True)
        store = os.path.join(cDir, path.replace(":", "_").replace("/", "_").replace(".", "_"))
        logger.debug("need to fetch from server: %s and store to %s" % (path, store))
        if not os.path.isfile(store):
            ## completely new! fetch the description and the db!
            return self.fetchFromScratch(path, store)
        with open(store, "r") as f:
            jsn = json.load(f)
        filename = os.path.join(cDir, jsn["url"].split("/")[-1])

        class _:  # pseudo class for pseudo requests
            def __init__(self):
                self.status_code = -1
        r = _()
        try:
            r = requests.get(path, timeout=2)
        except requests.exceptions.RequestException as e:
            pass
        if r.status_code != 200:
            logger.warning("Error %d: could not fetch %s from server." % (r.status_code, path))
            if not os.path.isfile(filename):
                logger.error("Cant find a local copy of the pickle file. Exit.")
                sys.exit()
            logger.warning("I do however have a local copy of the file at %s. I work with that." % filename)
            self.force_load = "pcl"
            return (cDir, filename)
            # return ( cDir, os.path.basename ( filename ) )

        if not os.path.exists(filename):
            return self.fetchFromScratch(path, store)
        stats = os.stat(filename)
        if abs(stats.st_size - jsn["size"]) > 4096:
            ## size doesnt match (4096 is to allow for slightly different file
            ## sizes reported by the OS). redownload!
            return self.fetchFromScratch(path, store)
        """
        # dont do this b/c its slowish
        if "sha1" in r.json():
            t0 = time.time()
            sha = _getSHA1 ( filename )
            print ( "it took", time.time()-t0 )
            if sha != r.json()["sha1"]:
                return self.fetchFromScratch ( path, store )
        """
        if r.json()["lastchanged"] > jsn["lastchanged"]:
            ## has changed! redownload everything!
            return self.fetchFromScratch(path, store)
        if not os.path.isfile(filename):
            return self.fetchFromScratch(path, store)
        self.force_load = "pcl"
        return ("./", filename)

    def checkPathName(self, path):
        """
        Checks the path name. If path starts with http or ftp, fetch the
        description file and the database.

        :returns: the base directory and the pickle file name
        """
        logger.debug('Try to set the path for the database to: %s', path)
        if path.startswith(("http://", "https://", "ftp://")):
            return self.fetchFromServer(path)
        if path.startswith(("file://")):
            path = path[7:]

        tmp = os.path.realpath(path)
        if os.path.isfile(tmp):
            base = os.path.dirname(tmp)
            return (base, tmp)

        if tmp[-4:] == ".pcl":
            self.source = "pcl"
            if not os.path.exists(tmp):
                if self.force_load == "pcl":
                    logger.error("File not found: %s" % tmp)
                    raise SModelSError()
                logger.info("File not found: %s. Will generate." % tmp)
                base = os.path.dirname(tmp)
                return (base, tmp)
            logger.error("Supplied a pcl filename, but %s is not a file." % tmp)
            raise SModelSError()

        path = tmp + '/'
        if not os.path.exists(path):
            logger.error('%s is no valid path!' % path)
            raise DatabaseNotFoundException("Database not found")
        m = Meta(path)
        self.source = "txt"
        return (path, path + m.getPickleFileName())

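    # Note on accepted "base" values (handled by checkPathName and fetchFromServer):
    # a local database directory, a pickle file ending in ".pcl", a "file://..." path,
    # an "http(s)://..." or "ftp://..." address pointing to a json description file,
    # or one of the labels resolved via smodels.installation (e.g. "official", "latest").
    # At the Database level, several such sources can be combined with "+".
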
    def __str__(self):
        idList = "Database version: " + self.databaseVersion
        idList += "\n"
        idList += "-" * len(idList) + "\n"
        if self.expResultList is None:
            idList += "no experimental results available! "
            return idList
        idList += "%d experimental results: " % len(self.expResultList)
        atlas, cms = [], []
        datasets = 0
        txnames = 0
        s = {8: 0, 13: 0}
        for expRes in self.expResultList:
            Id = expRes.globalInfo.getInfo('id')
            sqrts = expRes.globalInfo.getInfo('sqrts').asNumber(TeV)
            if not sqrts in s.keys():
                s[sqrts] = 0
            s[sqrts] += 1
            datasets += len(expRes.datasets)
            for ds in expRes.datasets:
                txnames += len(ds.txnameList)
            if "ATLAS" in Id:
                atlas.append(expRes)
            if "CMS" in Id:
                cms.append(expRes)
        idList += "%d CMS, %d ATLAS, " % (len(cms), len(atlas))
        for sqrts in s.keys():
            idList += "%d @ %d TeV, " % (s[sqrts], sqrts)
            # idList += expRes.globalInfo.getInfo('id') + ', '
        idList = idList[:-2] + '\n'
        idList += "%d datasets, %d txnames.\n" % (datasets, txnames)
        return idList

    def _setParticles(self, databaseParticles=None):
        """
        Set the databaseParticles attribute. If databaseParticles is None and
        self.databaseParticles is None, try to use the particles stored in the
        first ExpResult in the database (ExpResult.globalInfo._databaseParticles).
        If not found, fall back to the final states defined in defaultFinalStates.py.

        :param databaseParticles: Model object containing the final state
                                  particles used in the database.
        """
        # If not yet defined, set the attribute to None:
        if not hasattr(self, 'databaseParticles'):
            self.databaseParticles = None
        # If input is given, use it to set the databaseParticles attribute:
        if databaseParticles:
            logger.debug("Setting database particles from %s" % str(databaseParticles))
            self.databaseParticles = databaseParticles
        # If still None, fall back to the default:
        if self.databaseParticles is None:
            logging.debug("databaseParticles not found. Using default state.")
            from smodels.experiment.defaultFinalStates import finalStates
            self.databaseParticles = finalStates

    def _getParticles(self, particlesFile='databaseParticles.py'):
        """
        Load the particle objects used in the database. The particles are
        searched for in the database folder. If not found, the default
        particles will be loaded.
        """
        fulldir = os.path.join(self.txt_meta.pathname, particlesFile)
        if os.path.isfile(fulldir):
            from importlib import import_module
            sys.path.append(self.txt_meta.pathname)
            pFile = os.path.splitext(particlesFile)[0]
            logger.debug("Loading database particles from: %s" % fulldir)
            modelFile = import_module(pFile, package='smodels')
            if not hasattr(modelFile, 'finalStates'):
                logger.error("Model definition (finalStates) not found in %s" % fulldir)
            else:
                # set model name to file location:
                modelFile.finalStates.label = os.path.basename(fulldir)
                return modelFile.finalStates
        return None

    def _loadExpResults(self):
        """
        Checks the database folder and generates a list of ExpResult objects
        for each (globalInfo.txt, sms.py) pair.

        :returns: list of ExpResult objects
        """
        # Try to load particles from databaseParticles.py
        self._setParticles(self._getParticles())
        folders = []
        # for root, _, files in os.walk(self.txt_meta.pathname):
        # for root, _, files in cleanWalk(self._base):
        for root, _, files in cleanWalk(self.txt_meta.pathname):
            folders.append((root, files))
        folders.sort()

        roots = []
        for root, files in folders:
            if "/.git/" in root:
                continue
            if root[-11:] == "/validation":
                continue
            if root[-5:] == "/orig":
                continue
            if not 'globalInfo.txt' in files:
                continue
            else:
                roots.append(root)

        if self.progressbar:
            self.progressbar.maxval = len(roots)
            self.progressbar.start()
        resultsList = []
        for ctr, root in enumerate(roots):
            if self.progressbar:
                self.progressbar.update(ctr)
            expres = self.createExpResult(root)
            if expres:
                resultsList.append(expres)

        if not resultsList:
            logger.warning("Zero results loaded.")
        if self.progressbar:
            self.progressbar.finish()
        return resultsList

    def createExpResult(self, root):
        """ create, from pickle file or text files """
        txtmeta = Meta(root, hasFastLim=None, databaseVersion=self.databaseVersion)
        pclfile = "%s/.%s" % (root, txtmeta.getPickleFileName())
        logger.debug("Creating %s, pcl=%s" % (root, pclfile))
        expres = None
        try:
            # logger.info( "%s exists? %d" % ( pclfile, os.path.exists( pclfile ) ) )
            if not self.force_load == "txt" and os.path.exists(pclfile):
                # logger.info( "%s exists" % ( pclfile ) )
                with open(pclfile, "rb") as f:
                    logger.debug("Loading: %s" % pclfile)
                    ## read meta from pickle
                    pclmeta = serializer.load(f)
                    if not pclmeta.needsUpdate(txtmeta):
                        logger.debug("we can use expres from pickle file %s" % pclfile)
                        expres = serializer.load(f)
                    else:
                        logger.debug("we cannot use expres from pickle file %s" % pclfile)
                        logger.debug("txt meta %s" % txtmeta)
                        logger.debug("pcl meta %s" % pclmeta)
                        logger.debug("pcl meta needs update %s" % pclmeta.needsUpdate(txtmeta))
        except IOError as e:
            logger.error("exception %s" % e)
        if not expres:
            # create from text file
            expres = ExpResult(root, databaseParticles=self.databaseParticles)
            if self.subpickle and expres:
                expres.writePickle(self.databaseVersion)
        if expres:
            contact = expres.globalInfo.getInfo("contact")
            if contact and "fastlim" in contact.lower():
                self.txt_meta.hasFastLim = True
        return expres

    def setActiveExpResults(self, analysisIDs=['all'], datasetIDs=['all'], txnames=['all'],
                            dataTypes=['all'], useNonValidated=False, onlyWithExpected=False):
        """
        Filter the experimental results and store them in _activeResults.

        :param analysisIDs: list of analysis ids ([CMS-SUS-13-006,...]).
                            Can be wildcarded with usual shell wildcards: * ? [<letters>]
                            Furthermore, the centre-of-mass energy can be chosen as
                            suffix, e.g. ":13*TeV". Note that the asterisk in the
                            suffix is not a wildcard.
        :param datasetIDs: list of dataset ids ([ANA-CUT0,...]).
                           Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param txnames: list of txnames ([TChiWZ,...]).
                        Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param dataTypes: dataType of the analysis (all, efficiencyMap or upperLimit).
                          Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param useNonValidated: If False, the results with validated = False
                                will not be included
        :param onlyWithExpected: Keep only those results that also have expected
                                 values. Note that this is trivially fulfilled for
                                 all efficiency maps.
        """
        self._activeResults = self.getExpResults(analysisIDs, datasetIDs, txnames,
                                                 dataTypes, useNonValidated,
                                                 onlyWithExpected)

    def getExpResults(self, analysisIDs=['all'], datasetIDs=['all'], txnames=['all'],
                      dataTypes=['all'], useNonValidated=False, onlyWithExpected=False):
        """
        Returns a list of ExpResult objects.

        Each object refers to an analysisID containing one (for UL) or more
        (for Efficiency maps) datasets (signal regions), and each dataset contains
        one or more TxNames. If analysisIDs is defined, returns only the results
        matching one of the IDs in the list. If dataTypes is defined, returns only
        the results matching a dataType in the list. If datasetIDs is defined,
        returns only the results matching one of the IDs in the list. If txnames
        is defined, returns only the results matching one of the Tx names in the list.

        :param analysisIDs: list of analysis ids ([CMS-SUS-13-006,...]).
                            Can be wildcarded with usual shell wildcards: * ? [<letters>]
                            Furthermore, the centre-of-mass energy can be chosen as
                            suffix, e.g. ":13*TeV". Note that the asterisk in the
                            suffix is not a wildcard.
        :param datasetIDs: list of dataset ids ([ANA-CUT0,...]).
                           Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param txnames: list of txnames ([TChiWZ,...]).
                        Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param dataTypes: dataType of the analysis (all, efficiencyMap or upperLimit).
                          Can be wildcarded with usual shell wildcards: * ? [<letters>]
        :param useNonValidated: If False, the results with validated = False
                                will not be included
        :param onlyWithExpected: Return only those results that also have expected
                                 values. Note that this is trivially fulfilled for
                                 all efficiency maps.
        :returns: list of ExpResult objects
        """
        if type(analysisIDs) == str:
            analysisIDs = [analysisIDs]
        if type(datasetIDs) == str:
            datasetIDs = [datasetIDs]
        if type(txnames) == str:
            txnames = [txnames]
        if type(dataTypes) == str:
            dataTypes = [dataTypes]

        import fnmatch
        expResultList = []
        for expResult in self._allExpResults:
            analysisID = expResult.globalInfo.getInfo('id')
            sqrts = expResult.globalInfo.getInfo('sqrts')

            # Skip analyses not containing any of the required ids:
            if analysisIDs != ['all']:
                hits = False
                for patternString in analysisIDs:
                    # Extract centre-of-mass energy, assuming 0 or 1 colons.
                    pattern = patternString.split(':')
                    hits = fnmatch.filter([analysisID], pattern[0])
                    if len(pattern) > 1:
                        # Parse the suffix.
                        # Accepted strings: ":13", ":13*TeV", ":13TeV", ":13 TeV".
                        # Everything else will yield an error at the unum
                        # conversion (eval()).
                        if pattern[1].endswith('TeV'):
                            pattern[1] = pattern[1][:-3]
                        if pattern[1][-1] in [" ", "*"]:
                            pattern[1] = pattern[1][:-1]
                        pattern[1] += "*TeV"
                        if sqrts != eval(pattern[1]):
                            hits = False
                    if hits:
                        break
                        # continue
                if not hits:
                    continue

            newExpResult = ExpResult()
            newExpResult.path = expResult.path
            newExpResult.globalInfo = expResult.globalInfo
            newExpResult.datasets = []
            newExpResult.origdatasets = expResult.datasets

            for dataset in expResult.datasets:
                if dataTypes != ['all']:
                    hits = False
                    for pattern in dataTypes:
                        hits = fnmatch.filter([dataset.dataInfo.dataType], pattern)
                        if hits:
                            break
                            # continue
                    if not hits:
                        continue

                if hasattr(dataset.dataInfo, 'dataId') and datasetIDs != ['all']:
                    hits = False
                    if datasetIDs is None:
                        datasetIDs = [None]
                    for pattern in datasetIDs:
                        hits = fnmatch.filter([str(dataset.dataInfo.dataId)], str(pattern))
                        if hits:
                            break
                            # continue
                    if not hits:
                        continue

                newDataSet = datasetObj.DataSet(dataset.path, dataset.globalInfo, False)
                newDataSet.dataInfo = dataset.dataInfo
                newDataSet.txnameList = []
                for txname in dataset.txnameList:
                    validated = txname.validated
                    if type(validated) == str:
                        validated = validated.lower()
                    if validated not in [True, False, "true", "false", "n/a", "tbd", None, "none"]:
                        logger.error("value of validated field '%s' in %s unknown." %
                                     (validated, expResult))
                    if validated in [None, "none"]:
                        # FIXME after 1.1.1 this becomes a warning msg?
                        logger.debug("validated is None in %s/%s/%s. Please set to True, False, N/A, or tbd." %
                                     (expResult.globalInfo.id, dataset.dataInfo.dataId, txname))
                    if validated not in [None, True, "true", "n/a", "tbd"] and (not useNonValidated):
                        continue
                    if txnames != ['all']:
                        # Replaced by wildcard-evaluation below (2018-04-06 mat)
                        hits = False
                        for pattern in txnames:
                            hits = fnmatch.filter([txname.txName], pattern)
                            if hits:
                                # one match is enough
                                break
                        if not hits:
                            continue
                    if onlyWithExpected and dataset.dataInfo.dataType == "upperLimit" \
                            and not txname.txnameDataExp:
                        continue
                    newDataSet.txnameList.append(txname)
                # Skip data sets not containing any of the required txnames:
                if not newDataSet.txnameList or newDataSet.txnameList == []:
                    continue
                newExpResult.datasets.append(newDataSet)
            # Skip analyses not containing any of the required txnames:
            if not newExpResult.getTxNames():
                continue
            expResultList.append(newExpResult)
        return expResultList

    def updateBinaryFile(self):
        """ write a binary db file, but only if necessary. """
        if self.needsUpdate():
            logger.debug("Binary db file needs an update.")
            self.createBinaryFile()
        else:
            logger.debug("Binary db file does not need an update.")

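
# Illustrative sketch of the filter semantics documented in getExpResults and
# setActiveExpResults: ids and txnames are matched with fnmatch-style wildcards,
# and an optional suffix such as ":13*TeV" restricts the centre-of-mass energy.
# The ids used here are example values; db is assumed to be a Database instance.
def _exampleFiltering(db):
    # only 13 TeV CMS upper-limit results containing the TChiWZ txname
    return db.getExpResults(analysisIDs=["CMS*:13*TeV"],
                            dataTypes=["upperLimit"],
                            txnames=["TChiWZ"])
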
if __name__ == "__main__": import argparse from smodels.base.smodelsLogging import setLogLevel """ Run as a script, this checks and/or writes dbX.pcl files """ argparser = argparse.ArgumentParser(description='simple script to check \ and/or write dbX.pcl files') argparser.add_argument('-c', '--check', help='check binary db file', action='store_true') argparser.add_argument('-t', '--time', help='time reading db', action='store_true') argparser.add_argument('-r', '--read', help='read binary db file', action='store_true') argparser.add_argument('-w', '--write', help='force writing binary db file', action='store_true') argparser.add_argument('-u', '--update', help='update binary db file, if necessary', action='store_true') argparser.add_argument('-d', '--debug', help='debug mode', action='store_true') argparser.add_argument('-D', '--database', help='directory name of database', default="../../../smodels-database/") args = argparser.parse_args() logger.setLevel(level=logging.INFO) if args.debug: setLogLevel(level=logging.DEBUG) if args.write: db = Database(args.database, force_load="txt") db.createBinaryFile() sys.exit() db = Database(args.database) if args.update: db.updateBinaryFile() if args.check: db.checkBinaryFile() if args.time: t0 = time.time() expResult = db.loadBinaryFile(lastm_only=False) t1 = time.time() print("Time it took reading binary db file: %.1f s." % (t1-t0)) txtdb = db.loadTextDatabase() t2 = time.time() print("Time it took reading text file: %.1f s." % (t2-t1)) if args.read: db = db.loadBinaryFile(lastm_only=False) listOfExpRes = db.expResultList for expResult in listOfExpRes: print(expResult) for expResult in listOfExpRes: print(expResult)