Source code for fastkde.convergence_tests

import fastkde.fastKDE as fastKDE
import time
import scipy.optimize as opt
import scipy.stats as stat
import pylab as PP
from multiprocessing import Pool
import numpy as npy

berkeleyLabText = """                                                           
.++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++';   :+++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++;+++++   ++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++':.. ..:;++++++++++++++++++++++++++++++,+++++++++++
++++++++++++++++;     `....       ;+++++++++++++++++++++++++.;++++++++++
+++++++++++++;  :+++++++++++++++:    :+++++++++++++++++++++++ ++++++++++
+++++++++++`,++++++++++++++++++++++;    +++++++++++++++++++ +,++++++++++
+++++++++`++++++++++++++++++++++++++++    ++++++++++++++++++++ +++++++++
+++++++;++++++++++++++++++++++++++++++++   ,++++++++++++++:+++ +++++++++
++++++++++++++++++++++++++++++++++++++++++   ++++++++++++:+++++:++++++++
+++++++++++++++++++++++++++++++++++++++++++`  ;++++++++++ ++++++`+++++++
+++++++++++++++++++++++++++++++++++++++++++++  `+++++++++ '+:+:; +++++++
++++++++++++++++++++++++++++++++++++++++++++++   ++++++++ .; + ; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++   ...++++ .; + ; +++++++
++++++++++++++++++++++++++++++++++++++++++++++++     ++++ .; + ; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ .; + ; +++++++
+++++++.  ,++:   +++. `+++,  `+++   +++   ,++'  `++; ++++ .' + ; +++++++
++++++  ;+++.  ++++  ++++  .++++  ++++  ;+++:  ++++; ++++ +++++; +++++++
++++++  ++++  `++++  ++++  ;++++  ++++  ++++:  ++++; ++++ +++++; +++++++
++++++  ++++  .++++  ++++  ;++++  ++++  ++++.  ++++; ++++ +++++; +++++++
++++++ .++++ .'++++ .++++ .+++++ :++++ :++++  :++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++ +++++; +++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; ++++:+++++':+++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++; +++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++    +    +    +  +  +    + +++;    ; ++ `+++ ;++++  .++    +++++++
++++++ ++ . :+++ ++ :  ` ++ ++++ +++; ++++ .' ++++ ;++++ : ++ ++ ;++++++
++++++ .  + `..+ :. :   ;++ ..,+ +++; ..+++  :++++ ;++++ + ++ .  +++++++
++++++ :, '    +    +    ++   .+ +++;   +++  +++++ ;+++` + ;+ :` +++++++
++++++ ++ : :+++ +  +  + ,+ ++++ +++; +++++. +++++ ;+++     + ++ .++++++
++++++    ;    ; ++ :  +. +    +    ;    ++. +++++    + ++; +    +++++++
++++++:::++::::':++::::++:;::::+::::'::::++;:+++++::::':+++:+:::++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
,++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++,
"""  # Adapted from http://picascii.com/



[docs]
def underlyingFunction(x, x0=305, y0=200, yrange=4):
    """A sigmoid with a transition centered on x0"""
    return (yrange / 2) * npy.tanh(x - x0) + y0




[docs]
def stochasticSample(
    x0=305, y0=200, yrange=4, numSamples=1e3, xmid=305, xp1=5, xp2=2, yp1=0, yp2=None
):
    if yp2 is None:
        # By default, have the range of the noise be larger than the range in y
        yp2 = 3 * yrange
    # Generate random samples of X from the gamma distribution
    # Note: I flip the gamma distribution around here so that the upper range has a short tail
    x = -(npy.random.gamma(xp1, xp2, int(numSamples)) - xp1 * xp2) + xmid
    # Generate random samples of y from x and add normally distributed noise
    y = underlyingFunction(x, x0, y0, yrange) + npy.random.normal(
        loc=yp1, scale=yp2, size=numSamples
    )

    # Concatenate the paired samples together
    xy = npy.concatenate((x[npy.newaxis, :], y[npy.newaxis, :]), axis=0)
    return xy




[docs]
def conditionalPDF(
    y, x, x0=305, y0=200, yrange=4, xmid=305, xp1=5, xp2=2, yp1=0, yp2=None
):
    if yp2 is None:
        # By default, have the range of the noise be larger than the range in y
        yp2 = 3 * yrange

    mu = underlyingFunction(x, x0, y0, yrange)

    return stat.norm.pdf(y, loc=mu, scale=yp2)




[docs]
def marginalX(y, x, x0=305, y0=200, yrange=4, xmid=305, xp1=5, xp2=2, yp1=0, yp2=None):
    xbar = xp1 * xp2
    return stat.gamnpy.ma.pdf(x0 - x + xbar, a=xp1, scale=xp2)




[docs]
def jointXY(y, x, x0=305, y0=200, yrange=4, xmid=305, xp1=5, xp2=2, yp1=0, yp2=None):
    # Return the product of the conditional and the marginal
    return marginalX(y, x, x0, y0, yrange, xmid, xp1, xp2, yp1, yp2) * conditionalPDF(
        y, x, x0, y0, yrange, xmid, xp1, xp2, yp1, yp2
    )




[docs]
def getModeCurve(conditional, axes):
    """Extract the mode curve from a 2D conditional (assumes conditioning on axis 0)"""
    modeCurve = npy.array(
        [axes[1][conditional[:, i].argmax()] for i in range(conditional.shape[1])]
    )
    # Remove points whre all of the PDF is missing
    allMising = npy.prod((conditional.mask), axis=0)
    modeCurve = npy.ma.masked_where(allMising == 1, modeCurve)
    return modeCurve




[docs]
def asciiToPoints(text, convertCharacter=" "):
    """Given ascii art text, convert the whitespace into a list of point"""

    # Get the lines from the text
    textLines = text.split("\n")[1:-1]

    # Convert the text into a point array
    pointArray = npy.array(
        [[1 if char == convertCharacter else 0 for char in line] for line in textLines]
    ).T[:, ::-1]

    return npy.where(pointArray == 1)




[docs]
def powLaw(x, a, c):
    return c * x**a



# A simple timer for comparing ECF calculation methods

[docs]
class Timer:
    """A simple timer class

    Example::

        ```python
        myTimer = Timer()
        with myTimer:
            doSomething()

        print myTimer.duration
        ```
    """

    def __enter__(self):
        self.start = time.time()

    def __exit__(self, *args):
        self.duration = time.time() - self.start




[docs]
def genCovMat(varianceList, correlationDict):
    """Generates a covariance matrix, given a list of variances for all variables and a dict of correlation
    coefficients for all pairs of variables.
    
    The dict's keys must be tuples that describe the pair of points for the correlation coefficient.
    
    For example, the following would be valid input for for a 3 variable distribution::
    
        ```python
        varianceList = [0.1, 10.0, 100.0]
        correlationDict = { (0,1) :  0.9, \
                            (0,2) :  0.0, \
                            (1,2) : -0.4 }

        covMat = genCovMat(varianceList,correlationDict)
        ```
    """

    # Generate a list of all possible unique variable pairs
    numVars = len(varianceList)
    pairList = correlationDict.keys()
    for i, j in pairList:
        # Check that the indices are different from one another for this pair
        assert i != j, "The (i,j) tuple keys for correlationDict must all be different"
        assert i >= 0 and j >= 0, "i and j must be >= 0"
        assert i < numVars and j < numVars, "i and j must be < {}".format(numVars)
        # Check that the correlation coefficients are valid
        assert (
            abs(correlationDict[(i, j)]) <= 1
        ), "All correlation coefficients must have a magnitude less than or equal to 1"

    # Make sure we have enough correlation pairs
    numPairsNeeded = (numVars**2 - numVars) / 2
    assert (
        len(pairList) == numPairsNeeded
    ), "{} correlation pairs were provided; {} are needed".format(
        len(pairList), numPairsNeeded
    )

    # Initialize the covariance matrix
    covMat = npy.zeros([numVars, numVars])

    # Insert the variances
    for i in range(numVars):
        covMat[i, i] = varianceList[i]

    # Insert the covariances
    for i, j in pairList:
        covMat[i, j] = covMat[j, i] = (
            correlationDict[(i, j)]
            * npy.sqrt(varianceList[i])
            * npy.sqrt(varianceList[j])
        )

    return covMat




[docs]
def estimatePDFWrapper(arg, **kwargs):
    """A timed wrapper for doing a PDF estimate and evaluating its error"""

    j, numSamplesMax, sampleList, scKWArgs, conditionVar, pdfTestObj = arg

    # Seed the random number generator
    npy.random.seed(j)

    # Pull samples from the distribution
    mySamples = pdfTestObj.sampleFromDistribution(numSamplesMax)

    ISEVals = []
    timingVals = []
    for numSamples in sampleList:
        # Extract the current sample set
        mySample = mySamples[..., :numSamples]

        # Initialize the timer
        myTimer = Timer()

        # Do the KDE estimate and time it
        with myTimer:
            _pdf = fastKDE.fastKDE(mySample, **scKWArgs)
            if conditionVar is not None:
                cpdf = _pdf.estimateConditionals(conditionVar, mySample, peakFrac=0.01)

        # Calculate the integrated squared error
        deltaXs = [npy.diff(ax) for ax in _pdf.axes]
        deltaXs = [npy.concatenate((dx, [dx[-1]])) for dx in deltaXs]
        deltaXProducts = npy.prod(npy.meshgrid(*tuple(deltaXs)), axis=0)

        if conditionVar is None:
            ISE = npy.ma.sum(
                deltaXProducts * (pdfTestObj.pdfStandard(_pdf.axes) - _pdf.pdf) ** 2
            )
        else:
            ISE = npy.ma.sum(
                deltaXProducts * (pdfTestObj.pdfStandard(_pdf.axes) - cpdf) ** 2
            ) / len(cpdf.compressed())

        ISEVals.append(ISE)
        timingVals.append(myTimer.duration)

    return timingVals, ISEVals, mySamples




[docs]
class testDistribution(object):
    """A generic distribution--meant to be overridden--for testing the fast KDE method"""

    def __init__(self, **kwargs):
        # Save all incoming keyword arguments
        for key, val in kwargs.items():
            self.__setitem__(key, val)

        self.pdfName = "None"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        pass



[docs]
    def pdfStandard(self, axes):
        pass



[docs]
    def doTesting(
        self,
        numSamplesMax=2097152,
        numRepetitions=30,
        scKWArgs={
            "frac_contiguous_hyper_volumes": 1,
            "num_points": 257,
            "positive_shift": False,
        },
        numProcs=1,
    ):
        self.numProcs = int(numProcs)

        try:
            conditionVar = self.conditionVar
        except AttributeError:
            conditionVar = None

        # Set the number of test samples to be a power of 2
        powmax = int(npy.floor(npy.log2(numSamplesMax)))
        numSamplesMax = 2**powmax

        # Set the list of numbers-of-samples
        sampleList = 2 ** (npy.arange(4, powmax + 1)) + 1

        # Save the sampling list
        self.sampleList = sampleList

        if self.numProcs == 1:
            returnList = [
                estimatePDFWrapper(
                    (j, numSamplesMax, sampleList, scKWArgs, conditionVar, self)
                )
                for j in range(numRepetitions)
            ]

        else:
            # Do the PDF estimation using multiple processors
            estimationPool = Pool(self.numProcs)
            returnList = estimationPool.map(
                estimatePDFWrapper,
                zip(
                    range(numRepetitions),
                    [numSamplesMax] * numRepetitions,
                    [sampleList] * numRepetitions,
                    [scKWArgs] * numRepetitions,
                    [conditionVar] * numRepetitions,
                    [self] * numRepetitions,
                ),
            )

        # Save the longest sample of each repetition
        # self.sampleObjs = [ ra[2] for ra in returnList]
        # Save the timing values
        self.masterTiming = npy.array([ra[0] for ra in returnList])
        # Save the ISE values
        self.masterISE = npy.array([ra[1] for ra in returnList])

        self.meanISE = npy.average(self.masterISE, axis=0)
        self.meanTiming = npy.average(self.masterTiming, axis=0)
        self.stdISE = npy.std(self.masterISE, axis=0)
        self.stdTiming = npy.std(self.masterTiming, axis=0)

        popt, pcov = opt.curve_fit(
            powLaw, self.sampleList, self.meanISE, p0=(1, -1), sigma=self.stdISE
        )

        self._popt = popt
        self._pcov = pcov
        self.ISESlope = popt[0]
        self.ISENorm = popt[1]
        self.ISEFit = popt[1] * npy.array(self.sampleList) ** popt[0]



[docs]
    def generatePlots(self, saveType=None, show=True):
        """Generate plots of the error rate and the timing"""

        fig = PP.figure(figsize=(8, 8))
        ax = fig.add_subplot(111, xscale="log", yscale="log")

        ax.errorbar(self.sampleList, self.meanISE, yerr=self.stdISE)
        ax.plot(self.sampleList, self.ISEFit, linewidth=3, alpha=0.5, color="gray")

        ax.set_xlabel("# of Samples")
        ax.set_ylabel("I.S.E.")

        ax.legend(
            [
                "{} ISE".format(self.pdfName),
                r"N$^{" + "{:0.02f}".format(self.ISESlope) + "}$",
            ]
        )

        if show:
            PP.show()





[docs]
class testNormal1D(testDistribution):
    def __init__(self, **kwargs):
        # Call the class constructor
        super(testNormal1D, self).__init__(**kwargs)

        try:
            self.mu
        except AttributeError:
            self.mu = 0.0

        try:
            self.sig
        except AttributeError:
            self.sig = 1.0

        self.pdfName = "Normal"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        """Samples from a random normal distribution"""
        sig = self.sig
        mu = self.mu
        randsample = sig * npy.random.normal(size=numSamples) + mu
        return randsample[npy.newaxis, :]



[docs]
    def pdfStandard(self, axes):
        """Returns the value of the normal distribution at location `axes`"""
        x = axes[0]
        sig = self.sig
        mu = self.mu
        pdfVal = (1.0 / (sig * npy.sqrt(2 * npy.pi))) * npy.exp(
            -((x - mu) ** 2) / (2.0 * sig**2)
        )

        return pdfVal





[docs]
class testNormal2D(testDistribution):
    def __init__(self, **kwargs):
        # Call the class constructor
        super(testNormal2D, self).__init__(**kwargs)

        try:
            self.mu
        except AttributeError:
            self.mu = npy.array([0.0, 0.0])

        try:
            self.sig
        except AttributeError:
            self.sig = npy.array([1.0, 3.0])

        try:
            self.rho
        except AttributeError:
            self.rho = 0.7

        self.pdfName = "Bivariate Normal"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        """Samples from a bivariate normal distribution"""
        sx, sy = self.sig
        mu = self.mu
        r = self.rho
        covMat = [[sx**2, r * sx * sy], [r * sx * sy, sy**2]]

        randsample = npy.random.multivariate_normal(mu, covMat, (numSamples,)).T
        return randsample



[docs]
    def pdfStandard(self, axes):
        """Returns the value of the bivariate normal distribution at location `axes`"""
        sx, sy = self.sig
        mu = self.mu
        r = self.rho

        C = [[sx**2, r * sx * sy], [r * sx * sy, sy**2]]

        xarrays = npy.array(npy.meshgrid(axes[0], axes[1])).T

        return stat.multivariate_normal.pdf(xarrays, mu, C).T





[docs]
class testNormalND(testDistribution):
    def __init__(self, **kwargs):
        # Call the class constructor
        super(testNormalND, self).__init__(**kwargs)

        try:
            self.mu
        except AttributeError:
            self.mu = [0.0, 50.0, -50.0]

        try:
            self.covMat
        except AttributeError:
            varianceList = [0.1, 10.0, 100.0]
            correlationDict = {(0, 1): 0.9, (0, 2): 0.0, (1, 2): -0.4}
            self.covMat = genCovMat(varianceList, correlationDict)

        self.rank = len(self.mu)

        self.pdfName = "Multivariate Normal"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        """Samples from a multivariate normal distribution"""
        mu = self.mu
        covMat = self.covMat

        # randsample = npy.random.multivariate_normal(mu,covMat,(numSamples,)).T
        randsample = stat.multivariate_normal.rvs(mu, covMat, (numSamples,)).T
        return randsample



[docs]
    def pdfStandard(self, axes):
        """Returns the value of the multivariate normal distribution at location `axes`"""

        mu = self.mu
        C = self.covMat
        k = self.rank

        lastFirst = npy.zeros(k + 1, dtype=int)
        lastFirst[-1] = 0
        # lastFirst[:-1] = roll(npy.arange(k) + 1,1)
        lastFirst[:-1] = npy.arange(k) + 1
        xarrays = npy.transpose(
            npy.array(npy.meshgrid(*axes, indexing="ij")), axes=lastFirst
        )

        return stat.multivariate_normal.pdf(xarrays, mu, C).T





[docs]
class testMixtureModel(testDistribution):
    def __init__(self, **kwargs):
        # Call the class constructor
        super(testMixtureModel, self).__init__(**kwargs)

        try:
            self.muList
        except AttributeError:
            xp, yp = asciiToPoints(berkeleyLabText)
            xbar = 0.9 * npy.median(xp)
            ybar = npy.median(yp)

            # Transform to center
            xp = xp - xbar
            yp = yp - ybar

            # Rotate the image
            theta = 37.0 * npy.pi / 180.0
            xpd = xp * npy.cos(theta) - yp * npy.sin(theta)
            ypd = xp * npy.sin(theta) + yp * npy.cos(theta)
            xp = xpd
            yp = ypd

            # Calculate the variance
            varx = npy.var(xp)
            vary = npy.var(yp)

            self.muList = list(zip(xp, yp))
            self.covList = list(0.3 * npy.ones(len(self.muList)))

            for i in range(int(len(xp))):
                self.muList.append((0.0, 0.0))
                self.covList.append(genCovMat([varx, vary], {(0, 1): 0.0}))

        self.pdfName = "Mixture of Standard Normals"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        """Samples from a multivariate normal distribution"""
        numNormals = len(self.muList)

        # Generate a sequence of random draws, representing randomly (uniformly) choosing
        # from among the given normal distributions
        iDraws = npy.random.randint(0, numNormals, size=numSamples)

        # From these draws, construct the lengths of samples for each multivariate normal
        numDraws = [len(npy.nonzero(iDraws == i)[0]) for i in range(numNormals)]

        # randsample = npy.random.multivariate_normal(mu,covMat,(numSamples,)).T
        sampleList = []
        for mu, cov, N in zip(self.muList, self.covList, numDraws):
            if N > 0:
                sample = stat.multivariate_normal.rvs(mu, cov, N)
                if len(npy.shape(sample)) == 1:
                    sample = sample[npy.newaxis, :]
                sampleList.append(sample)

        # Concatenate the samples
        randsample = npy.concatenate(sampleList, axis=0)

        # Shuffle the samples in-place
        npy.random.shuffle(randsample)

        # Return the transpose [var,sample]
        return randsample.T



[docs]
    def pdfStandard(self, axes):
        """Returns the value of the multivariate normal distribution at location `axes`"""

        k = len(axes)

        # Create the x grid to feed to multivariate_normal
        lastFirst = npy.zeros(k + 1, dtype=int)
        lastFirst[-1] = 0
        lastFirst[:-1] = npy.arange(k) + 1
        xarrays = npy.transpose(
            npy.array(npy.meshgrid(*axes, indexing="ij")), axes=lastFirst
        )

        pdfStandard = npy.zeros(npy.shape(xarrays[..., 0]))

        # Go through the distribution centers
        numNormals = float(len(self.muList))
        for mu, cov in zip(self.muList, self.covList):
            pdfStandard += stat.multivariate_normal.pdf(xarrays, mu, cov) / numNormals

        return pdfStandard.T





[docs]
class transitionPDF(testDistribution):
    """A test for transition PDF convergence"""

    def __init__(self, **kwargs):
        # Save all incoming keyword arguments
        for key, val in kwargs.items():
            self.__setitem__(key, val)

        self.pdfName = "Transition"


[docs]
    def sampleFromDistribution(self, numSamples=2097152):
        return stochasticSample(numSamples=numSamples)



[docs]
    def pdfStandard(self, axes):
        xx, yy = npy.meshgrid(*axes)
        return jointXY(yy, xx)





[docs]
class testConditional(transitionPDF):
    """A test for transition PDF convergence of the conditional"""

    def __init__(self, **kwargs):
        # Save all incoming keyword arguments
        for key, val in kwargs.items():
            self.__setitem__(key, val)

        self.pdfName = "Conditional"
        self.conditionVar = 0


[docs]
    def pdfStandard(self, axes):
        xx, yy = npy.meshgrid(*axes)
        return conditionalPDF(yy, xx)