Source code for csep.utils.stats

import numpy
import scipy.stats
import scipy.special
# PyCSEP imports
from csep.core import regions

[docs] def sup_dist(cdf1, cdf2): """ given two cumulative distribution functions, compute the supremum of the set of absolute distances. note: this function does not check that the ecdfs are ordered or balanced. beware! """ return numpy.max(numpy.absolute(cdf2 - cdf1))
[docs] def sup_dist_na(data1, data2): """ computes the ks statistic for two ecdfs that are not necessarily aligned on the same values. performs this operation by merging the two datasets together. this is taken from the 2sample ks test in the scipy codebase Args: data1: (numpy array like) data2: (numpy array like) Returns: ks: sup dist from the two cdf functions """ data1, data2 = map(numpy.asarray, (data1, data2)) n1 = len(data1) n2 = len(data2) data1 = numpy.sort(data1) data2 = numpy.sort(data2) data_all = numpy.concatenate([data1,data2]) cdf1 = numpy.searchsorted(data1,data_all,side='right')/(1.0*n1) cdf2 = (numpy.searchsorted(data2,data_all,side='right'))/(1.0*n2) d = numpy.max(numpy.absolute(cdf1-cdf2)) return d
[docs] def cumulative_square_diff(cdf1, cdf2): """ given two cumulative distribution functions, compute the cumulative sq. diff of the set of distances. note: this function does not check that the ecdfs are ordered or balanced. beware! Args: cdf1: ndarray cdf2: ndarray Returns: cum_dist: scalar distance metric for the histograms """ return numpy.sum((cdf2 - cdf1)**2)
[docs] def binned_ecdf(x, vals): """ returns the statement P(X ≤ x) for val in vals. vals must be monotonically increasing and unqiue. returns: tuple: sorted vals, and ecdf computed at vals """ # precompute ecdf for x: returns(sorted(x), ecdf()) if len(x) == 0: return None ex, ey = ecdf(x) cdf = numpy.array(list(map(lambda val: less_equal_ecdf(x, val, cdf=(ex, ey)), vals))) return vals, cdf
[docs] def ecdf(x): """ Compute the ecdf of vector x. This does not contain zero, should be equal to 1 in the last value to satisfy F(x) == P(X ≤ x). Args: x (numpy.array): vector of values Returns: xs (numpy.array), ys (numpy.array) """ xs = numpy.sort(x) ys = numpy.arange(1, len(x) + 1) / float(len(x)) return xs, ys
[docs] def greater_equal_ecdf(x, val, cdf=()): """ Given val return P(x ≥ val). Args: x (numpy.array): set of values val (float): value ecdf (tuple): ecdf of x, should be tuple (sorted(x), ecdf(x)) Returns: (float): probability that x ≤ val """ x = numpy.asarray(x) if x.shape[0] == 0: return None if not cdf: ex, ey = ecdf(x) else: ex, ey = cdf eyc = ey[::-1] # some short-circuit cases for discrete distributions; x is sorted, but reversed. if val > ex[-1]: return 0.0 if val < ex[0]: return 1.0 return eyc[numpy.searchsorted(ex, val)]
[docs] def less_equal_ecdf(x, val, cdf=()): """ Given val return P(x ≤ val). Args: x (numpy.array): set of values val (float): value Returns: (float): probability that x ≤ val """ x = numpy.asarray(x) if x.shape[0] == 0: return None if not cdf: ex, ey = ecdf(x) else: ex, ey = cdf # some short-circuit cases for discrete distributions if val > ex[-1]: return 1.0 if val < ex[0]: return 0.0 # uses numpy implementation of binary search return ey[numpy.searchsorted(ex, val, side='right') - 1]
[docs] def min_or_none(x): """ Given an array x, returns the min value. If x = [], returns None. """ if len(x) == 0: return None else: return numpy.min(x)
[docs] def max_or_none(x): """ Given an array x, returns the max value. If x = [], returns None. """ if len(x) == 0: return None else: return numpy.max(x)
[docs] def get_quantiles(sim_counts, obs_count): """ Computes delta1 and delta2 quantile scores from empirical distribution and observation """ # delta 1 prob of observation at least n_obs events given the forecast delta_1 = greater_equal_ecdf(sim_counts, obs_count) # delta 2 prob of observing at most n_obs events given the catalog delta_2 = less_equal_ecdf(sim_counts, obs_count) return delta_1, delta_2
[docs] def poisson_log_likelihood(observation, forecast): """ Wrapper around scipy to compute the Poisson log-likelihood Args: observation: Observed (Grided) seismicity forecast: Forecast of a Model (Grided) Returns: Log-Liklihood values of between binned-observations and binned-forecasts """ return numpy.log(scipy.stats.poisson.pmf(observation, forecast))
[docs] def poisson_joint_log_likelihood_ndarray(target_event_log_rates, target_observations, n_fore): """ Efficient calculation of joint log-likelihood of grid-based forecast. Note: log(w!) = 0 Args: target_event_log_rates: natural log of bin rates where target events occurred target_observations: counts of target events n_fore: expected number from the forecasts Returns: joint_log_likelihood """ sum_log_target_event_rates = numpy.sum(target_event_log_rates) # factorial(n) = loggamma(n+1) discrete_penalty_term = numpy.sum(scipy.special.loggamma(target_observations+1)) return sum_log_target_event_rates - discrete_penalty_term - n_fore
[docs] def poisson_inverse_cdf(random_matrix, lam): """ Wrapper around scipy inverse poisson cdf function Args: random_matrix: Matrix of dimenions equal to forecast, containing random numbers between 0 and 1. lam: vector of parameters for poisson distribution Returns: sample from the poisson distribution """ return scipy.stats.poisson.ppf(random_matrix, lam)
def get_Kagan_I1_score(forecasts, catalog): """ A program for scoring (I_1) earthquake-forecast grids by the methods of: Kagan, Yan Y. [2009] Testing long-term earthquake forecasts: likelihood methods and error diagrams, Geophys. J. Int., v. 177, pages 532-542. Some advantages of these methods are that they: - are insensitive to the grid used to cover the Earth; - are insensitive to changes in the overall seismicity rate; - do not modify the locations or magnitudes of test earthquakes; - do not require simulation of virtual catalogs; - return relative quality measures, not just "pass" or "fail;" and - indicate relative specificity of forecasts as well as relative success. Written by Han Bao, UCLA, March 2021. Modified June 2021 Note that: (1) The testing catalog and forecast should have exactly the same time-window (duration) (2) Forecasts and catalogs have identical regions Args: forecasts: csep.forecast or a list of csep.forecast (one catalog to test against different forecasts) catalog: csep.catalog Returns: I_1 (numpy.array): containing I1 for each forecast in inputs """ ### Determine if input 'forecasts' is a list of csep.forecasts or a single csep.forecasts try: n_forecast = len(forecasts) # the input forecasts is a list of csep.forecast except: n_forecast = 1 # the input forecasts is a single csep.forecast forecasts = [forecasts] # Sanity checks can go here for forecast in forecasts: if forecast.region != catalog.region: raise RuntimeError("Catalog and forecasts must have identical regions.") # Initialize array I_1 = numpy.zeros(n_forecast, dtype=numpy.float64) # Compute cell areas area_km2 = catalog.region.get_cell_area() total_area = numpy.sum(area_km2) for j, forecast in enumerate(forecasts): # Eq per cell per duration in forecast; note, if called on a CatalogForecast this could require computed expeted rates rate = forecast.spatial_counts() # Get Rate Density and uniform_forecast of the Forecast rate_den = rate / area_km2 uniform_forecast = numpy.sum(rate) / total_area # Compute I_1 score n_event = catalog.event_count counts = catalog.spatial_counts() non_zero_idx = numpy.argwhere(rate_den > 0) non_zero_idx = non_zero_idx[:,0] I_1[j] = numpy.dot(counts[non_zero_idx], numpy.log2(rate_den[non_zero_idx] / uniform_forecast)) / n_event return I_1