Source code for fastkde.fastKDE

# !/usr/bin/env python

import numpy as npy
import scipy.optimize
import fastkde.empiricalCharacteristicFunction as ecf
import copy
import time
import warnings
from fastkde.nufft import calc_t_from_x, dft_points

import fastkde.floodFillSearch as flood


# A simple timer for comparing ECF calculation methods
[docs] class Timer: def __init__(self, n=None): self.n = n def __enter__(self): self.start = time.time() def __exit__(self, *args): print("N = {}, t = {} seconds".format(self.n, time.time() - self.start))
[docs] def next_highest_power_of_two(number): """Returns the nearest power of two that is greater than or equal to number""" return int(2 ** (npy.ceil(npy.log2(number))))
[docs] class fastKDE: """ Estimates the density function of a given dataset using the self-consistent method of Bernacchia and Pigolotti (2011, J. R. Statistic Soc. B.). Prior to estimating the PDF, the data are standardized to have a mean of 0 and a variance of 1. Standardization is done so that PDFs of varying widths can be calculated on a unified grid; the original PDF can be re-obtained by scaling, offsetting, and renormalizing the calculated PDF. Assuming the PDF is reasonably narrow, then most of the information in the PDF should be contained in the returned domain. The width of the domain is set in terms of multiples of unit standard deviations of the data; the default is 20-sigma. input: ------ data (array_like) : the data from which to estimate the PDF. Should be 1- or 2-dimensional. If 2-dimensional, this flags calculation of an N-dimensional PDF. The first index should refer to each variable and the second index the observations of the variables. axes : the axis-values of the estimated PDF. They must be evenly spaced and they should have a length that is a power of two plus one (e.g., 33). log_axes : Flags whether axes should be log spaced (i.e., the PDF is calculated based on log(data) and then transformed back to sample space). Should be a logical value (True or False) or a list of logical values with an item for each variable (i.e, len(log_axes) == shape(data)[0]) specifying which axes should use log spacing. If only True or False is given, that value is used for all variables. num_points_per_sigma : the number of points on the data grid per standard deviation; this influences the total size of the axes that are automatically calculated if no other aspects of the grid are specified. num_points : the number of points to use for the pdf grid. If provided as a scalar, each axis will have the same number of points. Otherwise, it should be an iterable with a value for each axis length. Axis lengths should be a power of two plus one (e.g., 33) deltaX : if given, this specifies the spacing between domain values. do_approximate_ecf : flags whether to approximate the ECF using a (much faster) FFT. In tests, this is accurate to ~1e-14 over low frequencies, but is inaccurate to ~1e-2 for the highest ~5% of frequencies. ecf_precision : sets the precision of the approximate ECF. If set to 2, it uses double precision accuracy; 1 otherwise do_fft : flags whether to calculate phiSC and its FFT to obtain pdf do_save_marginals : flags whether to calculate and save the marginal distributions frac_contiguous_hyper_volumes : the fraction of contiguous hypervolumes of the ECF, that are above the ECF threshold, to use in the density estimate num_contiguous_hyper_volumes : like frac_contiguous_hyper_volumes, but specify an integer number to use. frac_contiguous_hyper_volumes will be ignored if this is provided as an argument. positive_shift : translate the PDF vertically such that the estimate is positive or 0 everywhere axis_expansion_factor : sets the amount by which the KDE grid will be expanded relative to the original min-max spread for each variable: 1.0 means a 100% (2x) expansion in the range. Such an expansion is necessary to avoid kernel power from one end of the grid leaking into the opposite end due to the perioidicity of the Fourier transform. """ def __init__( self, data=None, axes=None, log_axes=False, num_points_per_sigma=10, num_points=None, do_approximate_ecf=True, ecf_precision=1, do_save_transformed_kernel=False, do_fft=True, do_save_marginals=True, be_verbose=False, frac_contiguous_hyper_volumes=1, num_contiguous_hyper_volumes=None, positive_shift=True, axis_expansion_factor=1.0, ): def vprint(msg): """Only print if be_verbose is True""" if be_verbose: print(msg) addOne = True # Force x grids to be (2**n) + 1 if data is not None: # Save the original data for the marginal calculation original_data = npy.array(data) data = npy.array(data) # First check the rank of the data data_rank = len(npy.shape(data)) # If the data are a vector, promote the data to a rank-1 array with only 1 column if data_rank == 1: data = npy.array(original_data[npy.newaxis, :], dtype=npy.float64) else: data = npy.array(original_data, dtype=npy.float64) if data_rank > 2: raise ValueError( "data must be a rank-2 array of shape [num_variables,num_data_points]" ) # Set the rank of the data self.data_rank = data_rank # Set the number of variables self.num_variables = npy.shape(data)[0] # Set the number of data points self.num_data_points = npy.shape(data)[1] # Check if we need log axes for any variables try: # Check if an iterable was provided for log_axes log_axes[0] except TypeError: # Otherwise set the array to be a list filled with the same value log_axes = self.num_variables * [log_axes] # Save the log_axes variable self.log_axes = log_axes # Loop over variables and take the logarithm of any with log axes for v in range(self.num_variables): # Take the logarithm of the given variable if log_axes[v]: # Check wheter all the data are positive if npy.amin(data[v, :]) <= 0: raise ValueError( ( "logarithmic axes were specified for variable {}, but" + "that variable contains values less than 0: min = {}" ).format(v, npy.amin(data[v, :])) ) data[v, :] = npy.log(data[v, :]) self.frac_contiguous_hyper_volumes = frac_contiguous_hyper_volumes if num_contiguous_hyper_volumes is not None: self.frac_contiguous_hyper_volumes = num_contiguous_hyper_volumes vprint( ( "Operating on data with num_variables = {}, " + "num_data_points = {}" ).format(self.num_variables, self.num_data_points) ) else: self.num_data_points = 0 # Create a variable to hold the original PDF and axes self.originalPDF = None self.originalAxes = None # Store the do_fft flag self.do_fft = do_fft # Save the marginals flag self.do_save_marginals = do_save_marginals if self.num_variables == 1: self.do_save_marginals = False # Set whether to approximate the ECF using the FFT method self.do_approximate_ecf = do_approximate_ecf # Set the approximate ECF precision self.ecf_precision = ecf_precision # Preinitialize the ecf threshold self.ecfThreshold = None # Flag whether to save the transformed kernel self.do_save_transformed_kernel = do_save_transformed_kernel # initialize the kernel and its transform self.kappaSC = None self.kSC = None self.positive_shift = positive_shift # *********************** # Calculate the x grids # *********************** if axes is None: # Get the range of the data self.xMin = npy.amin(data, 1) self.xMax = npy.amax(data, 1) vprint("Data stats:") vprint("\tminima: {}".format(self.xMin)) vprint("\tmaxima: {}".format(self.xMax)) # Get the grid mid-points midPoint = 0.5 * (self.xMax + self.xMin) # inflate the range by axis_expansion_factor to ensure that KDE power # doesn't leak through the periodic axis boundary self.xMin += axis_expansion_factor * (self.xMin - midPoint) self.xMax += axis_expansion_factor * (self.xMax - midPoint) if num_points is None: # Calculate the number of standard deviations there # are in the data range dataRange = self.xMax - self.xMin numSigma = dataRange / npy.std(data, axis=1) # Set the number of points for each dimensions self.numXPoints = npy.array( [ next_highest_power_of_two(ns * num_points_per_sigma) + int(addOne) for ns in numSigma ] ) else: # If we can iterate through try: lenNum = len(num_points) isIterable = True except TypeError: isIterable = False lenNum = 1 if isIterable: if lenNum == self.num_variables: self.numXPoints = num_points else: raise ValueError( ( "len(num_points) = {}, but it should match" + "num_variables = {}" ).format(lenNum, self.num_variables) ) else: self.numXPoints = npy.array(self.num_variables * (num_points,)) # Set the grids for each dimension self.axes = [ npy.linspace(xmin, xmax, np) for xmin, xmax, np in zip(self.xMin, self.xMax, self.numXPoints) ] vprint( ("Grids created with xmin: {}, xmax: {}, " + "npoints: {}").format( self.xMin, self.xMax, self.numXPoints ) ) else: # Set the xgrid from the function argument self.axes = axes self.xMin = npy.array([npy.amin(xg) for xg in axes]) self.xMax = npy.array([npy.amax(xg) for xg in axes]) self.numXPoints = npy.array([len(xg) for xg in axes]) # Get the grid mid-points self.midPoint = 0.5 * (self.xMax + self.xMin) # Set the midpoint of the incoming grid self.dataMid = 0.5 * (self.xMax + self.xMin) # Set the range to be +/- pi self.dataNorm = (self.xMax - self.xMin) / (2.0 * npy.pi) # Get the grid spacings self.deltaX = npy.array([xg[1] - xg[0] for xg in self.axes]) # Save xgrids as axes for backward compatibility self.xgrids = self.axes # Check that the axes are regular and proper powers of two for v in range(self.num_variables): xg = self.axes[v] dx = (xg[1:] - self.dataMid[v]) / self.dataNorm[v] - ( xg[:-1] - self.dataMid[v] ) / self.dataNorm[v] dxdiff = dx - self.deltaX[v] / self.dataNorm[v] fTolerance = self.deltaX[v] / (1e4 * self.dataNorm[v]) # Check that these differences are less than 1/1e6 if not all(npy.abs(dxdiff) < fTolerance): raise ValueError("All grids in axes must be regularly spaced") log2size = npy.log2(len(xg) - addOne) if log2size != npy.floor(log2size): if addOne: extraStr = " + 1" else: extraStr = "" raise ValueError( "All grids in axes must be powers of 2" + extraStr + ", but got {}".format(len(xg)) ) # Calculate the frequency point grids (for 0-centered data) self.tgrids = [ calc_t_from_x((xg - av) / sd) for xg, av, sd in zip(self.axes, self.dataMid, self.dataNorm) ] self.numTPoints = npy.array([len(tg) for tg in self.tgrids]) self.deltaT = npy.array([tg[2] - tg[1] for tg in self.tgrids]) self.phiSC = (0.0 + 0.0j) * npy.zeros(self.numTPoints) self.ECF = (0.0 + 0.0j) * npy.zeros(self.numTPoints) # Initialize the good distribution index self.goodDistributionInds = [] # Set the verbosity flag self.be_verbose = be_verbose self.convolvedData = None # Initialize the marginals self.marginalObjects = None if data is not None: # ************************************************* # Calculate the Empirical Characteristic Function # ************************************************* # Note that this routine also standardizes the data on-the-fly vprint("Calculating the ECF") # Transfrom the data to 0-centered coordinates for v in range(self.num_variables): data[v, :] = (data[v, :] - self.dataMid[v]) / self.dataNorm[v] # Calculate the ECF (see empiricalCharacteristicFunction.py) ecfObj = ecf.ECF( input_data=data, tgrids=self.tgrids, use_fft_approximation=self.do_approximate_ecf, precision=self.ecf_precision, be_verbose=self.be_verbose, ) # Extract the ECF from the ECF object self.ECF = ecfObj.ECF if self.do_fft: # ************************************************* # Apply the filter # ************************************************* # Apply the Bernacchia and Pigolotti (2011) filter to the ECF to obtain # the fourier representation of the self-consistent density vprint("Applying the filter") self.applyBernacchiaFilter() # ************************************************* # Transform to real space # ************************************************* # Transform the optimal distribution to real space vprint("Transforming to real space") self.__transformphiSC__() # Calculate and save the marginal distribution objects if self.do_save_marginals: self.marginalObjects = [] for i in range(self.num_variables): self.marginalObjects.append( fastKDE( original_data[i, :], axes=[self.originalAxes[i]], positive_shift=self.positive_shift, frac_contiguous_hyper_volumes=self.frac_contiguous_hyper_volumes, log_axes=self.log_axes[i], do_save_marginals=False, ) ) return # ***************************************************************************** # ** fastKDE: *********************************************** # ******************* applyBernacchiaFilter() ********************************* # ***************************************************************************** # *****************************************************************************
[docs] def applyBernacchiaFilter(self, doFlushArrays=True): """Given an ECF, calculate the self-consistent density in fourier-space by applying the BP11 filter.""" # Make an easy-to-read and float version of self.num_data_points N = float(self.num_data_points) # Calculate the stability threshold for the ECF ecfThresh = 4.0 * (N - 1.0) / (N * N) self.ecfThreshold = ecfThresh # Calculate the squared magnitude of the ECF ecfSq = abs(self.ECF) ** 2 # Find all hypervolumes where ecfSq is greater than the stability threshold contiguousInds = flood.flood_fill_search( ecfSq, search_threshold=self.ecfThreshold ) if contiguousInds == []: raise RuntimeError( ( "No ECF values found above the ECF threshold. " + "max(ecfSq) = {}, ecfThresh = {}" ).format(npy.amax(ecfSq), ecfThresh) ) # Sort them by distance from the center sortedInds = flood.sort_by_distance_from_center( contiguousInds, npy.shape(ecfSq) ) # Convert the fraction of hypervolumes to a number if needbe numVolumes = len(sortedInds) if self.frac_contiguous_hyper_volumes >= 1: numVolumesToUse = int(self.frac_contiguous_hyper_volumes) else: numVolumesToUse = int(self.frac_contiguous_hyper_volumes * numVolumes) if numVolumesToUse < 1: numVolumesToUse = 1 # Check that we don't exceed the number of provided volumes if numVolumesToUse > numVolumes: numVolumesToUse = numVolumes # Initialize the filtered value list iCalcPhi = self.num_variables * [npy.array([], dtype="int")] # Pull out frac_contiguous_hyper_volumes of contiguous hyper volumes, in # order of distance from the origin for i in range(numVolumesToUse): for n in range(self.num_variables): iCalcPhi[n] = npy.concatenate((iCalcPhi[n], sortedInds[i][n])) # Convert iCalcPhi to a list of tuples, such that it is compatible with the # output of where() if self.num_variables != 1: iCalcPhi = [tuple(ii) for ii in iCalcPhi] # convert to a tuple, to avoid a numpy warning iCalcPhi = tuple(iCalcPhi) # Save the filter self.iCalcPhi = iCalcPhi # If flagged, clear the phiSC array. This is needed if the same fastKDE object # is reused for multiple data. if doFlushArrays: self.phiSC[:] = 0.0 + 0.0j # Calculate the transform of the self-consistent Kernel (and only calculate it at # points where ecfSq is above ecfThresh) kappaSC = (1.0 + 0.0j) * npy.zeros(npy.shape(self.ECF)) kappaSC[iCalcPhi] = (N / (2 * (N - 1))) * ( 1 + npy.sqrt(1 - ecfThresh / ecfSq[iCalcPhi]) ) # Store the fourier kernel if we are going to save the transformed kernel if self.do_save_transformed_kernel: self.kappaSC = kappaSC midPointAccessor = tuple([(tp - 1) // 2 for tp in self.numTPoints]) # Calculate the transform of the self-consistent density estimate self.phiSC[iCalcPhi] = self.ECF[iCalcPhi] * kappaSC[iCalcPhi] if self.be_verbose: print( ("Normalization of kappaSC, ECF, and phiSC: " + "{}, {}, {}").format( kappaSC[midPointAccessor], self.ECF[midPointAccessor], self.phiSC[midPointAccessor], ) )
# ***************************************************************************** # ** fastKDE: *********************************************** # ******************* findGoodDistributionInds() ****************************** # ***************************************************************************** # *****************************************************************************
[docs] def findGoodDistributionInds(self): """Find indices of the optimal distribution that are above 0.0""" return npy.where(self.pdf >= 0.0)
# ***************************************************************************** # ** fastKDE: *********************************************** # ******************* findBadDistributionInds() ******************************* # ***************************************************************************** # *****************************************************************************
[docs] def findBadDistributionInds(self): """Find indices of the optimal distribution that are below 0.0""" return npy.where(self.pdf < 0.0)
# ***************************************************************************** # ** fastKDE: *********************************************** # ******************* __transformphiSC__() ************************************ # ***************************************************************************** # ***************************************************************************** def __transformphiSC__(self): """Transform the self-consistent estimate of the distribution from frequency space to real space""" # Transform the PDF estimate to real space pdf = ( npy.fft.fftshift(npy.real(npy.fft.fftn(npy.fft.ifftshift(self.phiSC)))) * npy.prod(self.deltaT) * (1.0 / (2 * npy.pi)) ** self.num_variables ) # Unnormalize it pdf /= npy.prod(self.dataNorm) # transpose the self-consistent density estimate self.pdf = pdf.transpose() # Shift the PDF such that the negative areas can be set to 0, while the # positive area is still normalized to 1 if self.positive_shift: if len(npy.where(self.pdf < 0)[0]) != 0: # Define a function f(delta), such that f(delta) is how far off # self.pdf-delta is from being normalized; hence, we want to find # the zero of this function def normFunc(delta): """Calculate how far off from normal is the shifted PDF""" ipos = npy.where((self.pdf - delta) >= 0.0) return 1 - sum((self.pdf[ipos] - delta) * npy.prod(self.deltaX)) # Set the initial guess for the newton-raphson search # a = -normFunc(0) a = 0.0 # Find the zero of the above function; i.e., find delta, such that # the shifted PDF is normalized try: delta = scipy.optimize.newton(normFunc, a, maxiter=10000) except RuntimeError: delta = 0.0 # Check if the positive shift method failed if not npy.isfinite(delta) or delta < 0 or delta >= npy.amax(self.pdf): if self.be_verbose: print( "positive_shift algorithm failure: defaulting to no shift" ) delta = 0.0 # If a shift is provided, do the shift if delta != 0.0: # Shift the PDF self.pdf -= delta # And set the negative values to 0 self.pdf[npy.where(self.pdf < 0)] = 0.0 if self.be_verbose: normConst = sum(pdf * npy.prod(self.deltaX)) midPointAccessor = tuple([(tp - 1) // 2 for tp in self.numTPoints]) print( ("Normalization of pdf = {}. " + "phiSC[0] = {}").format( normConst, self.phiSC[midPointAccessor] ) ) # Save the original PDF and axes self.originalPDF = npy.array(self.pdf) self.originalAxes = list(self.axes) # Check if any variables need to be transformed due to use # of logarithmic axes for v in range(self.num_variables): if self.log_axes[v]: # Transform the axis back to data space self.axes[v] = npy.exp(self.axes[v]) # Generate a slice to help the axis conform in shape to the PDF conformSlice = self.num_variables * [npy.newaxis] conformSlice[v] = slice(None, None, None) conformSlice = tuple(conformSlice) # Transform the PDF self.pdf /= self.axes[v][conformSlice[::-1]] # Set self.fSC for backward compatibility self.fSC = self.pdf # Take the transform of the self-consistent kernel if flagged if self.do_save_transformed_kernel: kSC = ( npy.fft.fftshift( npy.real(npy.fft.fftn(npy.fft.ifftshift(self.kappaSC))) ) * npy.prod(self.deltaT) * (1.0 / (2 * npy.pi)) ** self.num_variables ) kSC /= npy.prod(self.dataNorm) self.kSC = kSC.transpose() # ***************************************************************************** # ** fastKDE: *********************************************** # ******************* __transformphiSC_points__() ************************************ # ***************************************************************************** # ***************************************************************************** def __transformphiSC_points__(self, list_of_points): """Transform the self-consistent estimate of the distribution from frequency space to a set of points in real space""" # Transfrom the point to 0-centered coordinates list_of_points = npy.array(list_of_points, copy=True) for v in range(self.num_variables): list_of_points[v, :] = ( list_of_points[v, :] - self.dataMid[v] ) / self.dataNorm[v] # Set the fill value for the frequency grids fill_value = -1e20 # Get the maximum frequency grid length tgrids = self.tgrids ntmax = npy.amax([len(tgrid) for tgrid in tgrids]) # Create the frequency grids array frequencyGrids = fill_value * npy.ones([self.num_variables, ntmax]) # Fill the frequency grids array for v in range(self.num_variables): frequencyGrids[v, : len(tgrids[v])] = tgrids[v] # do the inverse direct Fourier transform pdf = ( dft_points( frequencyGrids, self.phiSC.ravel(), list_of_points, missing_freq_val=fill_value, ) * npy.prod(self.deltaT) * (1.0 / (2 * npy.pi)) ** self.num_variables ) # unstandarize the PDF pdf /= npy.prod(self.dataNorm) # TODO: implement positive_shift for point-based PDF estimates # TODO: implement log_axes for point-based PDF estimates return pdf
[docs] def getTransformedPDF(self): """Returns a copy of the PDF. This function exists for backward compatibility""" return npy.array(self.pdf)
[docs] def getTransformedAxes(self): """Returns a copy of the axes. This function exists for backward compatibility""" return tuple([npy.array(xg) for xg in self.axes])
[docs] def getTransformedCopula(self, data=None): """A wrapper for getCopula; this function is deprecated.""" return self.getCopula(data)
[docs] def estimateConditionals(self, variables, data, peak_frac=0.0, reApplyFilter=False): """For a multidimensional PDF, estimates the conditional P(x_i | x_j). input: ------ variables : A integer or tuple of array indicies indicating the variables on which to condition e.g., For a 2D PDF, obj.estimateConditionals(1) estimates P(x_0 | x_1) from the joint PDF P(x_0,x_1) that is the result of the self-consistent density estimate. For a 3D PDF: obj.estimateConditionals( (0,2) ) estimates P( x_0, x_2 | x_1) from P(x_0,x_1,x_2) If all possible variables are listed, the copula is returned instead. If negative values are provided, variables are wrapped (i.e., index -1 indicates the last variable) data : The data original used to create the fastKDE object. This is needed to calculated the various marginals required in the conditional computation. peak_frac : The fractional threshold below which to truncate the marginal PDF (to avoid divding by small numbers); this is the fraction of the height of the mode. reapplyFilter : Flags whether to reapply the ECF filter to the conditional output: ------- Returns P( x_i | x_j ) """ data = npy.array(data) # If the data are univariate, simply return the PDF itself if self.num_variables == 1: return self.pdf # Check that we can interpret the variables tuple try: len(variables) except TypeError: try: range(variables) variables = (variables,) except TypeError: raise ValueError( "variables appears to be neither a tuple or an integer" ) # Check that the variable indices are sane rightSideVariableIndices = [] for ind in tuple(variables): if ind > self.num_variables - 1: raise ValueError("out-of-bounds positive index found in 'variables'") if ind < 0: dum = self.num_variables + ind if dum < 0: raise ValueError( "out-of-bounds negative index found in 'variables'" ) else: dum = ind rightSideVariableIndices.append(dum) # Pull the unique indices and make sure they are sorted rightSideVariableIndices = tuple(sorted(list(set(rightSideVariableIndices)))) if len(rightSideVariableIndices) > self.num_variables: raise ValueError( "More indices were provided in 'variables' than there are " "variables." ) # Check if all variables were provided if len(rightSideVariableIndices) == self.num_variables: return self.getCopula(data) # If there are no right side variables, return the PDF if len(rightSideVariableIndices) == 0: return self.pdf # Create the list of left-side variable indices leftSideVariableIndices = list(range(self.num_variables)) for ind in sorted(rightSideVariableIndices)[::-1]: leftSideVariableIndices.pop(ind) # Calculate the marginal PDF marginalObject = fastKDE( data[rightSideVariableIndices, :], axes=[self.originalAxes[i] for i in rightSideVariableIndices], positive_shift=self.positive_shift, frac_contiguous_hyper_volumes=self.frac_contiguous_hyper_volumes, log_axes=[self.log_axes[i] for i in rightSideVariableIndices], do_save_marginals=False, ) # Make the shape of the new marginal object match that of the original PDF # (using the magic of the numpy newaxis) conformantSlice = list(self.num_variables * (slice(None, None, None),)) # Insert a newaxis for each of the left-side indices sumAxes = [] for ind in leftSideVariableIndices: # The PDF object has var0 in its rightmost axis, so transform ind # accordingly (it references as though var0 is the leftmost axis) ip = self.num_variables - ind - 1 conformantSlice[ip] = npy.newaxis # Add this index to the list of axes over which to sum for normalization sumAxes.append(ip) conformantSlice = tuple(conformantSlice) marginalThreshold = peak_frac * npy.amax(marginalObject.pdf) # Create and mask the marginal PDF marginalPDF = npy.ma.masked_less_equal( marginalObject.pdf[conformantSlice], marginalThreshold ) # Calculate the conditional PDF conditionalPDF = npy.ma.array(self.pdf) / marginalPDF # Refilter the conditional if reApplyFilter: conditionalPDF = npy.ma.masked_less_equal( self.reApplyFilter(conditionalPDF), 0.0 ) # Calculate the normalization matrix dxs = [npy.diff(self.axes[i]) for i in leftSideVariableIndices] dxs = [npy.concatenate((dx, [dx[-1]])) for dx in dxs] if len(dxs) == 1: dxProd = npy.array(dxs[0]) else: dxProd = npy.prod(npy.meshgrid(*dxs), axis=0) cslice = self.num_variables * [npy.newaxis] for i in leftSideVariableIndices: cslice[i] = slice(None, None, None) cslice = tuple(cslice) dxProd = dxProd[cslice[::-1]] normFactor = npy.ma.masked_equal( npy.sum(conditionalPDF * dxProd, axis=tuple(sumAxes)), 0.0 ) # Normalize the conditional PDF for the leftside variables conditionalPDF /= normFactor[conformantSlice] return conditionalPDF
# ***************************************************************************** # ** fastKDE: ******************************************* # ******************* getCopula ****************************************** # ***************************************************************************** # *****************************************************************************
[docs] def getCopula(self, data=None, peak_frac=0.0): """Estimates the copula of the underlying PDF""" # If the data are univariate, simply return the PDF itself if self.num_variables == 1: return self.pdf # Check if we need to calculate the marginal distributions if not self.do_save_marginals: if data is None: raise ValueError( "the data must be provided as argument 'data', if " "do_save_marginals=False when the original PDF was calculated" ) else: # Estimate the marginal distributions marginalObjects = [] for i in range(self.num_variables): marginalObjects.append( fastKDE( data[i, :], axes=[self.originalAxes[i]], positive_shift=self.positive_shift, frac_contiguous_hyper_volumes=self.frac_contiguous_hyper_volumes, log_axes=self.log_axes[i], do_save_marginals=False, ) ) else: # If not, just use the saved marginals marginalObjects = self.marginalObjects # Calculate the marginal distributions and mask bad (or zero) values marginals = [] for obj in marginalObjects: # Add the marginal to the list while masking <0 values marginalThreshold = peak_frac * npy.amax(obj.pdf) # Create and mask the marginal PDF marginals.append(npy.ma.masked_less_equal(obj.pdf, marginalThreshold)) # Calculate the PDF assuming independent marginals independencePDF = npy.ma.prod(npy.meshgrid(*tuple(marginals)), axis=0) # Divide off the indepdencnce PDF to calculate the copula # actualPDF = ma.array(self.pdf) # actualPDF[self.findBadDistributionInds()] = ma.masked actualPDF = npy.ma.array(self.pdf) copulaPDF = actualPDF / independencePDF return copulaPDF
[docs] def reApplyFilter(self, pdf): """Reapplies the filter to a PDF estimate. This is used, e.g., to remove high-frequency noise that results from calculting the conditionals. """ # Transform the PDF to fourier space phiTilde_tmp = npy.fft.fftshift( npy.fft.ifftn(npy.fft.ifftshift(npy.ma.filled(pdf, 0.0))) ) # Normalize the transform midPointAccessor = tuple([(tp - 1) // 2 for tp in self.numTPoints]) phiTilde_tmp /= phiTilde_tmp[midPointAccessor] # Reapply the filter phiTilde = (0.0 + 0.0j) * npy.zeros(npy.shape(phiTilde_tmp)) phiTilde[self.iCalcPhi] = phiTilde_tmp[self.iCalcPhi] # Transform back to real space # Transform the PDF estimate to real space pdf = ( npy.fft.fftshift(npy.real(npy.fft.fftn(npy.fft.ifftshift(phiTilde)))) * npy.prod(self.deltaT) * (1.0 / (2 * npy.pi)) ** self.num_variables ) # Return the transpose of the PDF return pdf.transpose()
# ***************************************************************************** # ** fastKDE: *********************************************** # ******************* Addition operator __add__ ******************************* # ***************************************************************************** # ***************************************************************************** def __add__(self, rhs): """Addition operator for the fastKDE object. Adds the empirical characteristic functions of the two estimates, reapplies the BP11 filter, and transforms back to real space. This is useful for parallelized calculation of densities. Note that this only works if the axes are the same for both operands.""" # Check for proper typing if not isinstance(rhs, fastKDE): raise TypeError( "unsupported operand type(s) for +: " "{} and {}".format(type(self), type(rhs)) ) # Check that the axes are the same for both objects for sxg, rxg in zip(self.axes, rhs.axes): if not all(npy.isclose(sxg, rxg)): raise NotImplementedError( "addition for operands with different axes is not yet " "implemented." ) retObj = copy.deepcopy(self) retObj.phiSC = (0.0 + 0.0j) * npy.zeros(self.numTPoints) retObj.num_data_points += rhs.num_data_points # Convert the returned variance back into standard deviation retObj.dataStandardDeviation = npy.sqrt(retObj.dataStandardDeviation) # Average the Empirical Characteristic Function of the two objects retObj.ECF = ( self.num_data_points * self.ECF + rhs.num_data_points * rhs.ECF ) / retObj.num_data_points if retObj.do_fft: retObj.applyBernacchiaFilter() retObj.__transformphiSC__() # Return the new object return retObj
[docs] def pdf(*args, **kwargs): """Estimate the self-consistent kernel density estimate of the input data input: ------ var0 : An input variable. var1, var2... : Additional input varibles whose length corresponds to the length of var0. As input variables are added, the dimensionality of the resulting PDF increases (e.g., supplying var0 and var1 results in a 2D PDF). use_xarray : If True, returns an xarray DataArray object; otherwise returns a tuple of numpy arrays. If None, defaults to True if xarray is installed. var_names : A list of names for the input variables. If None, the variables are named var0, var1, etc. do_xarray_subset : If True, returns a subset of the xarray DataArray limited to the range of the input variables. If False, returns the full xarray DataArray. If None, defaults to True. ``**kwargs`` : Any additional keyword arguments get passed directly to fastKDE.fastKDE(); see the docstring of fastKDE.fastKDE() for details of kwargs. returns: -------- pdf,axes : The pdf and the axes of the PDF (i.e., this is analogous to hist,bins for a histogram). If there are multiple input variables, the axes variable is a list of the axes, with each axis corresponding to an input variable. .. warning:: The computational expense and the memory requirement of this method grows exponentially with the number of input variables. It is likely that memory errors will occur if a moderately large number of variables are used (e.g., four or more). Try modifying num_points to reduce memory usage if needed. .. note:: If `use_xarray` is True, the number of points in the returned xarray object coordinates will not match the `num_points` argument; set `do_xarray_subset = False` if this is not desired """ # try to get the use_xarray keyword argument try: use_xarray = kwargs["use_xarray"] del kwargs["use_xarray"] except KeyError: use_xarray = None # try to get the var_names keyword argument try: var_names = kwargs["var_names"] del kwargs["var_names"] except KeyError: var_names = None # try to get the do_xarray_subset keyword argument try: do_xarray_subset = kwargs["do_xarray_subset"] del kwargs["do_xarray_subset"] except KeyError: do_xarray_subset = None # set the default value of do_xarray_subset if do_xarray_subset is None: do_xarray_subset = True # Check if use_xarray was given; set defaults if use_xarray is None: # Check if xarray is installed try: import xarray use_xarray = True except ImportError: use_xarray = False # if use_xarray was given, check that xarray is installed if use_xarray: try: import xarray except ImportError: raise ImportError( "xarray is not installed, but use_xarray=True was provided" ) # Try to get var0 from the args or kwargs try: var0 = args[0] except IndexError: try: var0 = kwargs["var0"] except KeyError: raise ValueError("No input data were provided.") # Check that var0 is arraylike try: var0Shape = npy.shape(var0) assert len(var0Shape) != 0, "var0 is not an array" except TypeError or AssertionError: raise ValueError( "Could not get shape of var0; it does not appear to be array-like." ) # Check that var0 is a vector if len(var0Shape) != 1: raise ValueError( "var0 should be a vector. If multiple variables are combined in a " "single array, please use the fastKDE class interface instead." ) # Get the length of var0 N = var0Shape[0] # Check for input varibles provided as key word arguments var_args = [] varKeys = sorted([v for v in kwargs if "var" in v]) for key in varKeys: # Ignore var0 since this was either provided as an argument # or was read as a keyword argument above if key != "var0": try: int(key[3:]) except ValueError: raise ValueError( "Incomprehensible variable-like keyword provided: " "{}".format(key) ) # Append this variable var_args.append(kwargs[key]) # Check if a mixture of keyword and arguments were provided for additional variables if len(var_args) != 0 and len(args) > 1: raise ValueError( "additional variables were provided as a mixture of arguments and " "keyword arguments. They all must be one or the other." ) # Set the additional variables to be the rest of the input arguments # if none were provided as key word arguments if len(args) > 1: var_args = args[1:] # Remove the variables from kwargs for key in list(varKeys): del kwargs[key] # Start preparing the input data for # concatenation inputVariables = npy.array(var0[npy.newaxis, :]) # get the number of variables (add one because we already popped var0) num_variables = len(var_args) + 1 # get or set the variable names if var_names is None: var_names = ["var{}".format(i) for i in range(num_variables)] else: # check that var_names is a list of strings of appropriate length if len(var_names) != num_variables: raise ValueError( "var_names must be a list of strings of length {}".format(num_variables) ) for i in range(num_variables): if not isinstance(var_names[i], str): raise ValueError( "var_names must be a list of strings of length {}".format( num_variables ) ) # Attempt to read additional variables # and concatenate them to the input variable for i in range(num_variables - 1): try: varn = npy.array(var_args[i][npy.newaxis, :]) except BaseException as e: print(e) raise ValueError( "Could not convert var{} into a numpy arrray".format(i + 1) ) lenN = npy.shape(varn)[1] if lenN != N: raise ValueError( ( "len(var{}) is {}, but it should be the same of len(var0) " + "= {}" ).format(i + 1, lenN, N) ) inputVariables = npy.concatenate((inputVariables, varn)) # Remove the do_save_marginals keyword argument try: kwargs["do_save_marginals"] del kwargs["do_save_marginals"] except KeyError: pass # Calculate the PDF _pdfobj = fastKDE(inputVariables, do_save_marginals=False, **kwargs) if len(_pdfobj.axes) == 1: pdf_tuple = _pdfobj.pdf, _pdfobj.axes[0] else: pdf_tuple = _pdfobj.pdf, _pdfobj.axes # Return the PDF if use_xarray: # set the coordinates coords = {} dims = [] for i in range(num_variables): # set the variable name varname = var_names[i] # set the coordinate if num_variables == 1: coords[varname] = pdf_tuple[1] else: coords[varname] = pdf_tuple[1][i] # set the dimension dims.append(varname) # construct the xarray object; the two transpose operations ensure # that the dimensions are in an intuitive order pdf_da = xarray.DataArray(pdf_tuple[0].T, coords=coords, dims=dims).T # set the long name pdf_da.attrs["long_name"] = f"PDF({','.join(var_names)})" # subset the xarray object if requested if do_xarray_subset: subset_dict = {} for i in range(num_variables): varname = var_names[i] # get the min and max of the variable varmin = npy.amin(inputVariables[i]) varmax = npy.amax(inputVariables[i]) # set the subset dictionary entry subset_dict[varname] = slice(varmin, varmax) # subset the xarray object pdf_da = pdf_da.sel(**subset_dict) return pdf_da else: return pdf_tuple
[docs] def conditional(input_vars, conditioning_vars, **kwargs): """Estimates the conditional PDF of `input_vars` given `conditioning_vars` input_vars : A vector of input values, or a list of such vectors conditioning_vars : A vector of conditioning values, or a list of such vectors use_xarray : If True, returns an xarray DataArray object; otherwise returns a tuple of numpy arrays. If None, defaults to True if xarray is installed. var_names : A list of names for the input variables, starting with the conditioning variables. If None, the variables are named var0, var1, etc. do_xarray_subset : If True, returns a subset of the xarray DataArray limited to the range of the input variables. If False, returns the full xarray DataArray. If None, defaults to True. ``**kwargs`` : Any additional keyword arguments get passed directly to fastKDE.fastKDE() or fastKDE.estimateConditionals(); see the docstrings of fastKDE.fastKDE() and fastKDE.estimateConditionals() for details of kwargs. Note the following two arguments have different default values here:: positive_shift=True by default, and peak_frac = 0.01 by default. :returns: (cPDF, axes) \ where cPDF is the PDF(input_vars | conditioning_vars), and axes is a list of axis vectors giving the points at which cPDF is defined. If N conditioning_vars were provided, then axes[0:N-1] corresponds to the variables provided in conditioning_vars, in the order they were provided; axes[N:M-1] corresponds to the M input_vars provided, in the order provided. Ex:: ```python import pylab as PP from numpy import * # *************************** # Generate random samples # *************************** # Stochastically sample from the function underlyingFunction() (a sigmoid): # sample the absicissa values from a gamma distribution # relate the ordinate values to the sample absicissa values and add # noise from a normal distribution # Set the number of samples numSamples = int(1e6) # Define a sigmoid function def underlyingFunction(x,x0=305,y0=200,yrange=4): return (yrange/2)*tanh(x-x0) + y0 xp1,xp2,xmid = 5,2,305 # Set gamma distribution parameters yp1,yp2 = 0,12 # Set normal distribution parameters (mean and std) # Generate random samples of X from the gamma distribution x = -(random.gamma(xp1,xp2,int(numSamples))-xp1*xp2) + xmid # Generate random samples of y from x and add normally distributed noise y = underlyingFunction(x) + random.normal(loc=yp1,scale=yp2,size=numSamples) # *************************** # Calculate the conditional # *************************** pOfYGivenX,axes = fastKDE.conditional(y,x) ``` """ # try to get the use_xarray keyword argument try: use_xarray = kwargs["use_xarray"] del kwargs["use_xarray"] except KeyError: use_xarray = None # try to get the input_var_names keyword argument try: var_names = kwargs["var_names"] del kwargs["var_names"] except KeyError: var_names = None # try to get the do_xarray_subset keyword argument try: do_xarray_subset = kwargs["do_xarray_subset"] del kwargs["do_xarray_subset"] except KeyError: do_xarray_subset = None # set the default value of do_xarray_subset if do_xarray_subset is None: do_xarray_subset = True # Check if use_xarray was given; set defaults if use_xarray is None: # Check if xarray is installed try: import xarray use_xarray = True except ImportError: use_xarray = False # if use_xarray was given, check that xarray is installed if use_xarray: try: import xarray except ImportError: raise ImportError( "xarray is not installed, but use_xarray=True was provided" ) # Check whether input_vars is an iterable of vectors or a single vector; # ensure it is an iterable try: ivarLengths = [len(v) for v in input_vars] except TypeError: input_vars = [input_vars] ivarLengths = [len(v) for v in input_vars] # Check whether conditioning_vars is an iterable of vectors or a single vector; # ensure it is an iterable try: cvarLengths = [len(v) for v in conditioning_vars] except TypeError: conditioning_vars = [conditioning_vars] cvarLengths = [len(v) for v in conditioning_vars] # Create a list of all variables fullVarList = conditioning_vars + input_vars # Check that all input variable lengths are the same if not all(npy.array([len(v) for v in fullVarList]) == ivarLengths[0]): raise ValueError( ( "input_vars and conditioning_vars all must be the same length. " + "Got {} for input_vars and {} for conditioning_vars" ).format(ivarLengths, cvarLengths) ) # Extract the peak_frac argument if "peak_frac" in kwargs: peak_frac = kwargs["peak_frac"] del kwargs["peak_frac"] else: peak_frac = 0.01 # Default to positive_shift=True if "positive_shift" in kwargs: positive_shift = kwargs["positive_shift"] del kwargs["positive_shift"] else: positive_shift = True # set the input variable names if var_names is None: var_names = ["var{}".format(i) for i in range(len(fullVarList))] else: # check that var_names is a list of strings of appropriate length if len(var_names) != len(fullVarList): raise ValueError( "var_names must be a list of strings of length {}".format( len(fullVarList) ) ) for i in range(len(fullVarList)): if not isinstance(var_names[i], str): raise ValueError( "var_names must be a list of strings of length {}".format( len(fullVarList) ) ) # extract the conditioning variable names conditioning_var_names = var_names[:len(conditioning_vars)] # extract the input variable names input_var_names = var_names[len(conditioning_vars):] # Estimate the full joint PDF _pdf = fastKDE(npy.array(fullVarList), positive_shift=positive_shift, **kwargs) # Set the indices of the conditional variables cvarInds = list(range(len(conditioning_vars))) # Estimate the conditional cpdf = _pdf.estimateConditionals( cvarInds, npy.array(fullVarList), peak_frac=peak_frac ) # Return the PDF if use_xarray: # set the coordinates coords = {} dims = [] for i in range(len(fullVarList)): # set the variable name varname = var_names[i] # set the coordinate coords[varname] = _pdf.axes[i] # set the dimension dims.append(varname) # construct the xarray object; the two transpose operations ensure # that the dimensions are in an intuitive order cdf_da = xarray.DataArray(cpdf.T, coords=coords, dims=dims).T # set the long name cdf_da.attrs[ "long_name" ] = f"PDF({','.join(input_var_names)}|{','.join(conditioning_var_names)})" # subset the xarray object if requested if do_xarray_subset: subset_dict = {} for i in range(len(fullVarList)): varname = var_names[i] # get the min and max of the variable varmin = npy.amin(fullVarList[i]) varmax = npy.amax(fullVarList[i]) # set the subset dictionary entry subset_dict[varname] = slice(varmin, varmax) # subset the xarray object cdf_da = cdf_da.sel(**subset_dict) return cdf_da else: # Return the conditional and the axes return cpdf, _pdf.axes
[docs] def pdf_at_points(*args, **kwargs): """Estimate the self-consistent kernel density estimate of the input data at a fixed set of points. input: ------ var1 : An input variable. var2, var3... : Additional input varibles whose length corresponds to the length of var1. As input variables are added, the dimensionality of the resulting PDF increases (e.g., supplying var1 and var2 results in a 2D PDF). list_of_points : Points at which the PDF should be estimated. Points should be provided as a list of tuples, where each tuple contains a point at which the PDF should be estimated. If not provided, the input data points will be used by default. ``**kwargs`` : Any additional keyword arguments get passed directly to fastKDE.fastKDE(); see the docstring of fastKDE.fastKDE() for details of kwargs. returns: -------- pdf : The pdf evaluated at list_of_points (or at the input data points if list_of_points was not provided) NOTE: The computational expense and the memory requirement of this method grows exponentially with the number of input variables. NOTE: `pdf_at_points()` is potentially slow relative to `pdf()` becuase it does not take advantage of the inverse FFT for transforming from Fourier space to data space. However, if few input points are requested, it may actually be faster. """ # Try to get var1 from the args or kwargs try: var1 = args[0] except IndexError: try: var1 = kwargs["var1"] except KeyError: raise ValueError("No input data were provided.") # Check that var1 is arraylike try: var1Shape = npy.shape(var1) assert len(var1Shape) != 0, "var1 is not an array" except ValueError or AssertionError: raise ValueError( "Could not get shape of var1; it does not appear to be array-like." ) # Check that var1 is a vector if len(var1Shape) != 1: raise ValueError( "var1 should be a vector. If multiple variables are combined in a " "single array, please use the fastKDE class interface instead." ) # Get the length of var1 N = var1Shape[0] # Check for input varibles provided as key word arguments var_args = [] varKeys = sorted([v for v in kwargs if "var" in v]) for key in varKeys: # Ignore var1 since this was either provided as an argument # or was read as a keyword argument above if key != "var1": try: int(key[3:]) except ValueError: raise ValueError( "Incomprehensible variable-like keyword provided: " "{}".format(key) ) # Append this variable var_args.append(kwargs[key]) # Check if a mixture of keyword and arguments were provided for additional variables if len(var_args) != 0 and len(args) > 1: raise ValueError( "additional variables were provided as a mixture of arguments and " "keyword arguments. They all must be one or the other." ) # Set the additional variables to be the rest of the input arguments # if none were provided as key word arguments if len(args) > 1: var_args = args[1:] # Remove the variables from kwargs for key in list(varKeys): del kwargs[key] # Start preparing the input data for # concatenation inputVariables = npy.array(var1[npy.newaxis, :]) # Attempt to read additional variables # and concatenate them to the input variable for i in range(len(var_args)): try: varn = npy.array(var_args[i][npy.newaxis, :]) except BaseException as e: print(e) raise ValueError( "Could not convert var{} into a numpy arrray".format(i + 1) ) lenN = npy.shape(varn)[1] if lenN != N: raise ValueError( ( "len(var{}) is {}, but it should be the same of len(var1) " + "= {}" ).format(i + 1, lenN, N) ) inputVariables = npy.concatenate((inputVariables, varn)) # check if list_of_points was provided list_of_points_provided_in_kwargs = False try: # extract the list of points list_of_points = kwargs["list_of_points"] # delete it from the keyword argument dictionary del kwargs["list_of_points"] # flag that this argument was provided list_of_points_provided_in_kwargs = True except KeyError: # default to using the input points as the list of points list_of_points = inputVariables # make sure list_of_points is in the expected format if list_of_points_provided_in_kwargs: try: list_of_points = npy.array(list_of_points, copy=True, dtype=npy.float64).T except ValueError: raise RuntimeError("Could not convert list_of_points to a numpy array.") # check the rank of the input data points data_rank = len(npy.shape(list_of_points)) # If the data are a vector, promote the data to a rank-1 array with only # 1 column if data_rank == 1: list_of_points = npy.array(list_of_points[npy.newaxis, :], dtype=npy.float64) if data_rank > 2: # raise an error indicating the proper shape for list_of_points raise ValueError( "list_of_points must be able to be broadcast to a rank-2 array " "of shape [num_data_points,num_variables]" ) # Remove the do_save_marginals keyword argument try: _ = kwargs["do_save_marginals"] del kwargs["do_save_marginals"] except KeyError: pass # Remove the log_axes argument try: _ = kwargs["log_axes"] del kwargs["log_axes"] warnings.warn( "fastKDE.pdf_at_points() does not currently support the log_axes " "option; it will be ignored." ) except KeyError: pass # Remove the positive_shift argument try: _ = kwargs["positive_shift"] del kwargs["positive_shift"] warnings.warn( "fastKDE.pdf_at_points() does not currently support the " "positive_shift option; it will be ignored." ) except KeyError: pass # Calculate the PDF in Fourier space _pdfobj = fastKDE( inputVariables, do_save_marginals=False, do_fft=False, positive_shift=False, log_axes=False, **kwargs, ) # complete the Fourier-space calculation of the PDF _pdfobj.applyBernacchiaFilter() # calculate the PDF at the requested points pdf = _pdfobj.__transformphiSC_points__(list_of_points) # return the pdf return pdf
# ******************************************************************************* # ******************************************************************************* # ***************************** Unit testing code ******************************* # ******************************************************************************* # ******************************************************************************* # Test this implementation of the BP11 density estimate against a normal # distribution. Calculate the estimate for a variety of sample sizes and show # how the distribution error decreases as sample size increases. As of revision # 9 of the code, this unit testing shows that this implementation of the BP11 # estimate converges on the true normal distribution like N**-1, which agrees # the theoretical and empirical convergence rate given in BP11. if __name__ == "__main__": # set a seed so that results are repeatable npy.random.seed(0) doOneDimensionalTests = True if doOneDimensionalTests: import matplotlib.pyplot as plt import scipy.stats as stats mu = -1e3 sig = 1e3 # Define a gaussian function for evaluation purposes def mygaus(x): return (1.0 / (sig * npy.sqrt(2 * npy.pi))) * npy.exp( -((x - mu) ** 2) / (2.0 * sig**2) ) # Set the size of the sample to calculate powmax = 19 npow = npy.asarray(range(powmax)) + 1.0 # Set the maximum sample size nmax = 2**powmax # Create a random normal sample of this size randsample = sig * npy.random.normal(size=nmax) + mu # Pre-define sample size and error-squared arrays nsample = npy.zeros([len(npow)]) esq = npy.zeros([len(npow)]) epct = npy.zeros([len(npow)]) evaluateError = True if evaluateError: # Do the optimal calculation on a number of different random draws for i, n in zip(range(len(npow)), npow): # Extract a sample of length 2**n + 1 from the previously-created # random sample randgauss = randsample[: (2**n + 1)] # Set the sample size nsample[i] = len(randgauss) with Timer(nsample[i]): # Do the BP11 density estimate bkernel = fastKDE( randgauss, do_approximate_ecf=True, num_points=513 ) # Calculate the mean squared error between the estimated density and # the gaussian esq[i] = average(abs(mygaus(bkernel.x)-bkernel.pdf)**2 # *bkernel.deltaX) esq[i] = npy.average( abs(mygaus(bkernel.axes[0]) - bkernel.pdf[:]) ** 2 * bkernel.deltaX[0] ) epct[i] = 100 * sum( abs(mygaus(bkernel.axes[0]) - bkernel.pdf[:]) * bkernel.deltaX[0] ) # Plot the optimal distribution plt.subplot(2, 2, 1) # ,yscale="log") # pdfmask = ma.masked_less(bkernel.pdf,bkernel.distributionThreshold) pdfmask = bkernel.pdf plt.plot(bkernel.axes[0], pdfmask, "b-") # Plot the empirical characteristic function plt.subplot(2, 2, 2, xscale="log", yscale="log") plt.plot(bkernel.tgrids[0][1:], abs(bkernel.ECF[1:]) ** 2, "b-") # Plot the sample gaussian plt.subplot(2, 2, 1) # ,yscale="log") plt.plot(bkernel.axes[0], mygaus(bkernel.axes[0]), "r-") # Do a simple power law fit to the scaling [m, b, _, _, _] = stats.linregress(npy.log(nsample), npy.log(esq)) # Print the error scaling (following BP11, this is expected to be m ~ -1) print("Error scales ~ N**{}".format(m)) # Plot the error vs sample size on a log-log curve plt.subplot(2, 2, 3) plt.loglog(nsample, esq) plt.plot(nsample, npy.exp(b) * nsample**m, "r-") print("") bDemoSum = False if not bDemoSum: plt.show() else: # ********************************************************************* # Demonstrate the capability to sum fastKDE objects # ********************************************************************* nsamp = 512 nloop = nmax / nsamp # Pre-define sample size and error-squared arrays nsample2 = npy.zeros([nloop]) esq2 = npy.zeros([nloop]) for i in range(nloop): randgauss = randsample[i * nsamp : (i + 1) * nsamp] if i == 0: bkernel2 = fastKDE(randgauss) nsample2[i] = len(randgauss) else: bkernel2 += fastKDE(randgauss) nsample2[i] = nsample2[i - 1] + len(randgauss) # Calculate the mean squared error between the estimated density # And the gaussian esq2[i] = npy.average( npy.abs(mygaus(bkernel2.axes[0]) - bkernel2.pdf) ** 2 * bkernel2.deltaX[0] ) # Print the sample size and the error to show that the code is proceeeding print("{}, {}".format(nsample2[i], esq2[i])) # Plot the distribution plt.subplot(2, 2, 1) plt.plot(bkernel2.axes[0], bkernel2.pdf, "g-") # Plot the ECF plt.subplot(2, 2, 2, xscale="log", yscale="log") plt.plot(bkernel2.tgrids[0][1:], abs(bkernel2.ECF[0, 1:]) ** 2, "b-") # Plot the error-rate change plt.subplot(2, 2, 3) plt.loglog(nsample2, esq2, "g-") # Plot the difference between the two distributions plt.subplot(2, 2, 4) plt.plot( bkernel2.axes[0], abs(bkernel.pdf - bkernel2.pdf) * bkernel.deltaX[0], ) # Show the plots plt.show() else: print(randsample) # Simply do the BP11 density estimate and plot it bkernel = fastKDE( randsample, do_approximate_ecf=True, be_verbose=True, num_points=513 ) # Plot the optimal distribution plt.subplot(2, 1, 1) # pdfmask = ma.masked_less(bkernel.pdf,bkernel.distributionThreshold) pdfmask = bkernel.pdf plt.plot(bkernel.axes[0], pdfmask, "b-") # Plot the sample gaussian plt.plot(bkernel.axes[0], mygaus(bkernel.axes[0]), "r-") # for d in randsample: # plt.plot([d,d],[0,1./len(randsample)],'k-',alpha=0.5) # Plot the transforms plt.subplot(2, 1, 2) plt.plot(bkernel.tgrids[0], abs(bkernel.phiSC), "b-") ecfStandard = npy.fft.ifft(mygaus(bkernel.axes[0])) ecfStandard /= ecfStandard[0] ecfStandard = npy.fft.fftshift(ecfStandard) plt.plot(bkernel.tgrids[0], abs(ecfStandard), "r-") mean = sum(bkernel.axes[0] * bkernel.pdf * bkernel.deltaX[0]) plt.show() doTwoDimensionalTests = True if doTwoDimensionalTests: import matplotlib.pyplot as plt import scipy.stats as stats nvariables = 2 # Seed with 0 so results are reproducable npy.random.seed(0) # Define a bivariate normal function def norm2d(x, y, mux=0, muy=0, sx=1, sy=1, r=0): coef = 1.0 / (2 * npy.pi * sx * sy * npy.sqrt(1.0 - r**2)) expArg = -(1.0 / (2 * (1 - r**2))) * ( (x - mux) ** 2 / sx**2 + (y - muy) ** 2 / sy**2 - 2 * r * (x - mux) * (y - muy) / (sx * sy) ) return coef * npy.exp(expArg) # Set the size of the sample to calculate powmax = 16 npow = npy.asarray(range(1, powmax)) + 1.0 # Set the maximum sample size nmax = 2**powmax def covMat(sx, sy, r): return [[sx**2, r * sx * sy], [r * sx * sy, sy**2]] gausParams = [] gausParams.append([0.0, 0.0, 1.0, 1.0, 0.0]) # Standard, uncorrelated bivariate gausParams.append([2.0, 0.0, 1.0, 1.0, 0.7]) # correlation 0.7, mean x+2 gausParams.append([0.0, 2.0, 1.0, 0.5, 0.0]) # Flat in y-direction, mean y+2 gausParams.append([2.0, 2.0, 0.5, 1.0, 0.0]) # Flat in x-direction, mean xy+2 # Define the corresponding standard function def pdfStandard(x, y): pdfStandard = npy.zeros(npy.shape(x)) for gg in gausParams: pdfStandard += norm2d(x2d, y2d, *tuple(gg)) * (1.0 / ngg) return pdfStandard # Generate samples from this distribution randsamples = [] ngg = len(gausParams) for gg in gausParams: mu = gg[:2] gCovMat = covMat(*tuple(gg[2:])) size = tuple([2, nmax / ngg]) # Append a 2D gaussian to the list randsamples.append( npy.random.multivariate_normal(mu, gCovMat, (nmax / ngg,)).transpose() ) # Concatenate the gaussian samples randsample = npy.concatenate(tuple(randsamples), axis=1) # Shuffle the samples along the long axis so that we # can draw successively larger samples ishuffle = npy.asarray(range(nmax)) npy.random.shuffle(ishuffle) randsample = randsample[:, ishuffle] doSaveCSV = False if doSaveCSV: npy.savetxt("bp11_2d_samples.csv", randsample.transpose(), delimiter=",") # Pre-define sample size and error-squared arrays nsample = npy.zeros([len(npow)]) esq = npy.zeros([len(npow)]) epct = npy.zeros([len(npow)]) evaluateError = True if evaluateError: # Do the optimal calculation on a number of different random draws for z, n in zip(range(len(npow)), npow): # Extract a sample of length 2**n + 1 from the previously-created # random sample randsub = randsample[:, : (2**n)] # Set the sample size nsample[z] = npy.shape(randsub)[1] with Timer(nsample[z]): # Do the BP11 density estimate bkernel = fastKDE( randsub, be_verbose=False, do_save_marginals=False, num_points=129, ) x, y = tuple(bkernel.axes) x2d, y2d = npy.meshgrid(x, y) # Calculate the mean squared error between the estimated density # and the gaussian absdiffsq = abs(pdfStandard(x2d, y2d) - bkernel.pdf) ** 2 dx = x[1] - x[0] dy = y[1] - y[0] esq[z] = sum(dy * sum(absdiffsq * dx, axis=0)) / (len(x) * len(y)) # Print the sample size and the error to show that the code is proceeeding # print "{}: {}, {}".format(n,nsample[z],esq[z]) # Do a simple power law fit to the scaling [m, b, _, _, _] = stats.linregress(npy.log(nsample), npy.log(esq)) # Print the error scaling (following BP11, this is expected to be m ~ -1) print("Error scales ~ N**{}".format(m)) else: with Timer(npy.shape(randsample)[1]): bkernel = fastKDE( randsample, be_verbose=True, do_save_marginals=False, num_points=129 ) doPlot = True if doPlot: x, y = tuple(bkernel.axes) x2d, y2d = npy.meshgrid(x, y) fig = plt.figure() ax1 = fig.add_subplot(121) clevs = npy.asarray(range(2, 10)) / 100.0 ax1.contour(x2d, y2d, bkernel.pdf, levels=clevs) ax1.contour(x2d, y2d, pdfStandard(x2d, y2d), levels=clevs, colors="k") # ax1.plot(randsample[0,:],randsample[1,:],'k.',markersize=1) plt.xlim([-4, 6]) plt.ylim([-4, 6]) if evaluateError: # Plot the error vs sample size on a log-log curve ax3 = fig.add_subplot(122, xscale="log", yscale="log") ax3.plot(nsample, esq) ax3.plot(nsample, npy.exp(b) * nsample**m, "r-") # ax3 = fig.add_subplot(223) # ax3.plot(randsample[0,::16],randsample[1,::16],'k.',markersize=1) # plt.xlim([-4,6]) # plt.ylim([-4,6]) else: ax3 = fig.add_subplot(122) errorStandardSum = sum( abs(pdfStandard(x2d, y2d) - bkernel.pdf) ** 2, axis=0 ) ax3.plot(x, errorStandardSum) plt.show()