Source code for matching.clusterTools

"""
.. module:: clusterTools
   :synopsis: Module holding the SMSCluster class and cluster methods used to combine similar SMS according
      to the analysis.

.. moduleauthor:: Andre Lessa <lessa.a.p@gmail.com>

"""

from smodels.base.particleNode import ParticleNode
from smodels.base.particle import MultiParticle
from smodels.base.physicsUnits import fb
from smodels.decomposition.theorySMS import TheorySMS
from smodels.experiment.datasetObj import DataSet, CombinedDataSet
from smodels.matching.exceptions import SModelSMatcherError as SModelSError
from smodels.matching.matcherAuxiliaryFuncs import average
from smodels.base.smodelsLogging import logger
import numpy as np



[docs]class AverageSMS(TheorySMS):
    """
    Represents an SMS or list of SMS containing only
    the basic attributes required for clustering or computing efficiencies/upper limits.
    Its properties are given by the average properties of the SMS
    it represents and its weight is given by the total weight of
    all SMS.
    """

    def __init__(self, smsList=[]):
        if any(not isinstance(sms, TheorySMS) for sms in smsList):
            raise SModelSError("An AverageSMS must be created from a list of TheorySMS objects.")

        TheorySMS.__init__(self)

        # Get the relevant properties needed by the txnames
        # (in addition to mass, totalwidth and isSM)
        attrList = ['mass', 'totalwidth', 'isSM']
        for sms in smsList:
            dataMap = sms.txname.dataMap
            attrList += [attr for node, attr, unit in dataMap.values()]
        attrList = list(set(attrList))

        # Define base SMS to copy (common) global properties from
        smsBase = smsList[0]

        #  Define relevant properties to be stored and averaged over:
        self.properties = attrList
        self.smsList = smsList[:]
        self.txname = smsBase.txname

        #  Consistency checks:
        if any(sms.canonName != smsBase.canonName for sms in self.smsList):
            logger.error("Can not build average SMS from SMS with distinct topologies.")
            raise SModelSError()
        if any(sms.txname != self.txname for sms in self.smsList):
            logger.error("Can not build average SMS from SMS for distinct txnames.")
            raise SModelSError()


        # Create new node objects holding particles
        # with the average attributes
        avgNodesDict = {}
        if len(smsList) == 1:
            avgNodesDict = {n : node for n,node in zip(smsBase.nodeIndices,smsBase.nodes)}
        else:
            for nodeIndex in smsBase.nodeIndices:
                if nodeIndex == smsBase.rootIndex:
                    # For the root node make a dummy copy:
                    avgNode = smsBase.indexToNode(nodeIndex).copy()
                else:
                    # For all the other nodes compute the average
                    allParticles = [sms.indexToNode(nodeIndex).particle for sms in smsList]
                    attrDict = {}
                    for attr in self.properties:
                        values = [getattr(ptc, attr) for ptc in allParticles]
                        avgAttr = self.getAverage(values)
                        attrDict[attr] = avgAttr
                    mp = allParticles[0]
                    for ptc in allParticles[1:]:
                        mp = mp + ptc
                    if isinstance(mp,MultiParticle):
                        avgParticle = MultiParticle(particles=mp.particles,
                                                attributesDict=attrDict)
                    else:
                        avgParticle = MultiParticle(particles=[mp],
                                                attributesDict=attrDict)
                    avgNode = ParticleNode(particle=avgParticle)
                avgNodesDict[nodeIndex] = avgNode

        # Copy the tree structure from smsBase,
        # but replacing the node objects by the average nodes
        self.copyTreeFrom(smsBase,nodesObjDict=avgNodesDict)
        # Compute the weight
        self.weight = smsBase.weight
        for sms in self.smsList[1:]:
            self.weight = self.weight + sms.weight

    def __cmp__(self, other):
        """
        Compares the SMS with other. Only the properties
        defined in self.properties are used for comparison.
        :param other:  SMS to be compared (SMS object)
        :return: -1 if self < other, 0 if self == other, +1, if self > other.
        """

        if not isinstance(other, TheorySMS):
            return -1

        for nodeIndex in self.nodeIndices:
            nodeA = self.indexToNode(nodeIndex)
            nodeB = other.indexToNode(nodeIndex)
            if nodeA.isSM and nodeB.isSM:
                continue
            for attr in self.properties:
                attrA = getattr(nodeA,attr)
                attrB = getattr(nodeB,attr)
                if attrA != attrB:
                    if attrA > attrB:
                        return 1
                    else:
                        return -1
        return 0

    def __lt__(self, other):
        return self.__cmp__(other) == -1

    def __gt__(self, other):
        return self.__cmp__(other) == 1

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

[docs]    def getAverage(self, values, weighted=True, nround=5):
        """
        Compute the average value for the attribute using
        the SMS list in self.smsList.
        If weighted = True, compute the weighted average
        using the SMS weights.

        :param values: List of values to be averaged over
        :param weighted: If True, uses the SMS weights to compute a weighted average
        :param nround: If greater than zero and the returning attibute is numeric, will round it
                      to this number of significant digits.

        :return: Average value of attribute.
        """

        if weighted:
            weights = [sms.weight.asNumber(fb) for sms in self.smsList]
        else:
            weights = [1.]*len(self.smsList)

        return average(values, weights, nround)

[docs]    def contains(self, sms):
        """
        Check if the average SMS contains the SMS

        :param sms: TheorySMS object

        :return: True/False
        """

        if any(sms is selfSMS for selfSMS in self.smsList):
            return True
        return False


[docs]class SMSCluster(object):
    """
    An instance of this class represents a cluster of SMS.
    This class is used to store the relevant information about a cluster of
    SMS and to manipulate this information.
    """

    def __init__(self, smsList=[]):

        self.smsList = smsList

    def __eq__(self, other):

        if type(self) != type(other):
            return False
        elif set(self.indices()) != set(other.indices()):
            return False
        else:
            return True

    def __iter__(self):
        return iter(self.smsList)

    def __getitem__(self, isms):
        return self.smsList[isms]

    def __len__(self):
        return len(self.smsList)

    def __str__(self):
        return str(self.smsList)

    def __repr__(self):
        return str(self.smsList)

[docs]    def indices(self):
        """
        Return a list of SMS indices appearing in cluster
        """

        indices = [sms._index for sms in self]

        return indices

[docs]    def getTotalXSec(self):
        """
        Return the sum over the cross sections of all SMS belonging to
        the cluster.

        :returns: sum of weights of all the SMS in the cluster (XSectionList object)
        """
        totxsec = 0.0*fb
        for sms in self.smsList:
            totxsec += sms.weight
        return totxsec

    @property
    def averageSMS(self):
        """
        Computes the average SMS for the cluster.
        The average SMS has generic ParticleNodes
        with the attributes set to the average values of self.smsList.
        It can only be defined if all SMS share the same canonical name
        and the same txname. Otherwise, returns None

        :return: AverageSMS object or None (if it can not be defined)
        """

        # Check if an average SMS can be defined:
        cName = self.smsList[0].canonName
        tx = self.smsList[0].txname
        if any(sms.canonName != cName for sms in self.smsList):
            return None
        if any(sms.txname != tx for sms in self.smsList):
            return None
        # Define the average SMS with the required properties averaged over:
        avgSMS = AverageSMS(self.smsList[:])

        avgSMS._index = None

        return avgSMS

[docs]def relativeDistance(sms1, sms2, dataset):
    """
    Defines the relative distance between two SMS according to their
    upper limit values.
    The distance is defined as d = 2*|ul1-ul2|/(ul1+ul2).

    :parameter sms1: SMS object
    :parameter sms2: SMS object

    :returns: relative distance
    """

    if not hasattr(sms1, '_upperLimit'):
        sms1._upperLimit = dataset.getUpperLimitFor(sms1,
                                                   txnames=sms1.txname)
    if not hasattr(sms2, '_upperLimit'):
        sms2._upperLimit = dataset.getUpperLimitFor(sms2,
                                                   txnames=sms2.txname)

    ul1 = sms1._upperLimit
    ul2 = sms2._upperLimit

    if ul1 is None or ul2 is None:
        return None
    if (ul1+ul2).asNumber(fb) == 0.:
        return 0.
    ulDistance = 2.*abs(ul1 - ul2)/(ul1 + ul2)

    return ulDistance

[docs]def clusterSMS(smsList, maxDist, dataset):
    """
    Cluster the original SMS according to their distance in upper limit space.

    :parameter smsList: list of sms (TheorySMS objects)
    :parameter dataset: Dataset object to be used when computing distances in upper limit space
    :parameter maxDist: maximum distance for clustering two SMS

    :returns: list of clusters (SMSCluster objects)
    """
    if len(smsList) == 0:
        return []

    if any(not isinstance(sms, TheorySMS) for sms in smsList):
        raise SModelSError("Asked to cluster non TheorySMS objects")
    if not isinstance(dataset, (DataSet, CombinedDataSet)):
        raise SModelSError("A dataset object must be defined for clustering")

    # Make sure only unique SMS are clustered together (avoids double counting weights)
    # Sort SMS, so the ones with highest contribution (weight) come first:
    smsList = sorted(smsList, key=lambda sms: sms.weight, reverse=True)

    # Remove duplicated SMS:
    smsUnique = []
    for sms in smsList:
        # Skip the SMS if it is related to any another SMS in the list
        if any(sms.isRelatedTo(smsB) for smsB in smsUnique):
            continue
        smsUnique.append(sms)

    # Get txname list only with the txnames from unique SMS used for clustering
    txnames = list(set([sms.txname for sms in smsUnique]))
    if dataset.getType() == 'upperLimit' and len(txnames) != 1:
        logger.error("Clustering SMS with different Txnames for an UL result.")
        raise SModelSError()

    if dataset.getType() == 'upperLimit':  # Group according to upper limit values
        clusters = doCluster(smsUnique, dataset, maxDist)
    elif dataset.getType() == 'efficiencyMap':  # Group all SMS together
        cluster = SMSCluster()
        for isms, sms in enumerate(smsUnique):
            sms._index = isms
        cluster.smsList = smsUnique
        clusters = [cluster]

    for cluster in clusters:
        cluster.txnames = txnames
    return clusters

[docs]def groupSMS(smsList, dataset):
    """
    Group SMS into groups where the average SMS
    identical to all the SMS in group.
    The groups contain all SMS which share the same mass,width and upper limit
    and can be replaced by their average SMS when building clusters.

    :parameter smsList: list of all SMS to be grouped
    :parameter dataset: Dataset object to be used when computing distances in upper limit space

    :returns: a list of AverageSMS objects
              which represents a group of SMS with same mass, width and upper limit.
    """


    #  First make sure all SMS contain their upper limits
    for sms in smsList:
        if not hasattr(sms, '._upperLimit'):
            sms._upperLimit = dataset.getUpperLimitFor(sms, txnames=sms.txname)
        if sms._upperLimit is None:
            raise SModelSError("Trying to cluster SMS (id = %i, txname = %s) outside the grid for dataset %s." 
                                %(sms.smsID,sms.txname,dataset.longStr()))

    # Group SMS if they have the same UL
    # and give the same average SMS (same BSM attributes)
    avgSMSList = []
    for iA, smsA in enumerate(smsList):
        avgSMS = AverageSMS([smsA])
        avgSMS._upperLimit = smsA._upperLimit
        for iB, smsB in enumerate(smsList):
            if iB <= iA:
                continue
            if smsA._upperLimit != smsB._upperLimit:
                continue
            if avgSMS != smsB:
                continue
            avgSMS.smsList.append(smsB)
            avgSMS.weight += smsB.weight
        if avgSMS not in avgSMSList:
            avgSMSList.append(avgSMS)

    # Make sure each SMS belongs to a average SMS:
    for sms in smsList:
        nclusters = sum([avgSMS.contains(sms) for avgSMS in avgSMSList])
        if nclusters != 1:
            raise SModelSError("Error computing average SMS. SMS %s belongs to %i average SMS."
                               % (str(sms), nclusters))
    return avgSMSList

[docs]def clusterTo(centroids,smsList,dataset,maxDist):
    """
    Assign a SMS from smsList to one of the centroids
    """
    
    dMatrix = np.full((len(smsList),len(centroids)),fill_value=maxDist)
    for isms,sms in enumerate(smsList):
        for ic,c in enumerate(centroids):
            dMatrix[isms,ic] = relativeDistance(c,sms,dataset)

    clusters = [[] for ic in range(len(centroids))]
    notClustered = []
    for isms,sms in enumerate(smsList):
        ic = np.argmin(dMatrix[isms])
        d = dMatrix[isms,ic]
        if d < maxDist:
            clusters[ic].append(isms)
        else:
            notClustered.append(isms)
    
    clusterObjs = [SMSCluster((np.array(smsList)[indexList]).tolist())
                    for indexList in clusters]
    clusterObjs = sorted(clusterObjs, key = lambda c: sorted([sms.smsID for sms in c.smsList]))

    # Sort not clustered by largest distance first
    notClustered = sorted(notClustered, key = lambda isms: min(dMatrix[isms,:]),reverse=True)
    notClustered = np.array(smsList)[notClustered].tolist()

    return clusterObjs,notClustered   

[docs]def kmeansCluster(initialCentroids,sortedSMSList,dataset,maxDist):

    # First cluster around the initial centroids
    clusters,notClustered = clusterTo(initialCentroids,sortedSMSList,dataset,maxDist)
    
    stop = False
    # Now iterate until the clusters converge
    # (at this stage do not impose a distance limit for clustering)
    while (not stop):
        # Compute new centroids:
        centroids = [cluster.averageSMS for cluster in clusters]
        # Compute new clusters
        newClusters,notClustered = clusterTo(centroids,sortedSMSList,dataset,maxDist=float('inf'))
        # Stop when clusters no longer change
        if all(clusters[ic] == c for ic,c in enumerate(newClusters)):
            stop = True
        else:
            clusters = newClusters[:]

    # Using the converged centroids remove the SMS with d(centroid,sms) > maxDist
    newClusters,notClustered = clusterTo(centroids,sortedSMSList,dataset,maxDist=maxDist)
    return  newClusters,notClustered

[docs]def doCluster(smsList, dataset, maxDist,nmax=100):
    """
    Cluster algorithm to cluster SMS using a modified K-Means method.

    :parameter smsList: list of all SMS to be clustered
    :parameter dataset: Dataset object to be used when computing distances in upper limit space
    :parameter maxDist: maximum distance for clustering two SMS
    :parameter nmax: maximum number of iterations

    :returns: a list of SMSCluster objects containing the SMS
              belonging to the cluster
    """

    # Get average SMS:
    averageSMSList = groupSMS(smsList, dataset)

    # Index average SMS:
    sortedSMSList = sorted(averageSMSList, key=lambda sms: sms._upperLimit)
    for isms, sms in enumerate(sortedSMSList):
        sms._index = isms


    # Choose initial centroids as the first SMS in a chain of
    # SMS all with dist < maxDist:
    centroids = [sortedSMSList[0]]
    for sms in sortedSMSList:
        if relativeDistance(sms,centroids[-1],dataset) > maxDist:
            centroids.append(sms)

    clusters,notClustered = kmeansCluster(centroids,sortedSMSList,dataset,maxDist)

    niter = 1
    while (len(notClustered) !=0) and (niter < nmax):
        centroids = [c.averageSMS for c in clusters] + [notClustered[0]]
        clusters,notClustered = kmeansCluster(centroids,sortedSMSList,dataset,maxDist)
        niter += 1

    if (niter == nmax) and len(notClustered) != 0:
        logger.warning("SMSCluster failed, using unclustered topologies")
        clusters = [SMSCluster([sms]) for sms in sortedSMSList]

    # Replace average SMS by the original SMS:
    for cluster in clusters:
        originalSMS = []
        for avgSMS in cluster.smsList[:]:
            originalSMS += avgSMS.smsList[:]
        cluster.smsList = originalSMS[:]        
    
    # Finally sort clusters by total xsection and length
    clusters = sorted(clusters, key = lambda c: (c.getTotalXSec(),len(c)),reverse=True)

    return clusters