# Source code for csep.utils.stats

```import numpy
import scipy.stats
import scipy.special

[docs]def sup_dist(cdf1, cdf2):
"""
given two cumulative distribution functions, compute the supremum of the set of absolute distances.

note:
this function does not check that the ecdfs are ordered or balanced. beware!
"""
return numpy.max(numpy.absolute(cdf2 - cdf1))

[docs]def sup_dist_na(data1, data2):
"""
computes the ks statistic for two ecdfs that are not necessarily aligned on the same values. performs this
operation by merging the two datasets together. this is taken from the 2sample ks test in the scipy codebase

Args:
data1: (numpy array like)
data2: (numpy array like)

Returns:
ks: sup dist from the two cdf functions
"""
data1, data2 = map(numpy.asarray, (data1, data2))
n1 = len(data1)
n2 = len(data2)
data1 = numpy.sort(data1)
data2 = numpy.sort(data2)
data_all = numpy.concatenate([data1,data2])
cdf1 = numpy.searchsorted(data1,data_all,side='right')/(1.0*n1)
cdf2 = (numpy.searchsorted(data2,data_all,side='right'))/(1.0*n2)
d = numpy.max(numpy.absolute(cdf1-cdf2))
return d

[docs]def cumulative_square_diff(cdf1, cdf2):
"""
given two cumulative distribution functions, compute the cumulative sq. diff of the set of distances.

note:
this function does not check that the ecdfs are ordered or balanced. beware!

Args:
cdf1: ndarray
cdf2: ndarray

Returns:
cum_dist: scalar distance metric for the histograms

"""
return numpy.sum((cdf2 - cdf1)**2)

[docs]def binned_ecdf(x, vals):
"""
returns the statement P(X ≤ x) for val in vals.
vals must be monotonically increasing and unqiue.

returns:
tuple: sorted vals, and ecdf computed at vals
"""
# precompute ecdf for x: returns(sorted(x), ecdf())
if len(x) == 0:
return None
ex, ey = ecdf(x)
cdf = numpy.array(list(map(lambda val: less_equal_ecdf(x, val, cdf=(ex, ey)), vals)))
return vals, cdf

[docs]def ecdf(x):
"""
Compute the ecdf of vector x. This does not contain zero, should be equal to 1 in the last value
to satisfy F(x) == P(X ≤ x).

Args:
x (numpy.array): vector of values

Returns:
xs (numpy.array), ys (numpy.array)
"""
xs = numpy.sort(x)
ys = numpy.arange(1, len(x) + 1) / float(len(x))
return xs, ys

[docs]def greater_equal_ecdf(x, val, cdf=()):
"""
Given val return P(x ≥ val).

Args:
x (numpy.array): set of values
val (float): value
ecdf (tuple): ecdf of x, should be tuple (sorted(x), ecdf(x))

Returns:
(float): probability that x ≤ val
"""
x = numpy.asarray(x)
if x.shape == 0:
return None
if not cdf:
ex, ey = ecdf(x)
else:
ex, ey = cdf

eyc = ey[::-1]

# some short-circuit cases for discrete distributions; x is sorted, but reversed.
if val > ex[-1]:
return 0.0
if val < ex:
return 1.0
return eyc[numpy.searchsorted(ex, val)]

[docs]def less_equal_ecdf(x, val, cdf=()):
"""
Given val return P(x ≤ val).

Args:
x (numpy.array): set of values
val (float): value

Returns:
(float): probability that x ≤ val
"""
x = numpy.asarray(x)
if x.shape == 0:
return None
if not cdf:
ex, ey = ecdf(x)
else:
ex, ey = cdf
# some short-circuit cases for discrete distributions
if val > ex[-1]:
return 1.0
if val < ex:
return 0.0
# uses numpy implementation of binary search
return ey[numpy.searchsorted(ex, val, side='right') - 1]

[docs]def min_or_none(x):
"""
Given an array x, returns the min value. If x = [], returns None.
"""
if len(x) == 0:
return None
else:
return numpy.min(x)

[docs]def max_or_none(x):
"""
Given an array x, returns the max value. If x = [], returns None.
"""
if len(x) == 0:
return None
else:
return numpy.max(x)

[docs]def get_quantiles(sim_counts, obs_count):
""" Computes delta1 and delta2 quantile scores from empirical distribution and observation """
# delta 1 prob of observation at least n_obs events given the forecast
delta_1 = greater_equal_ecdf(sim_counts, obs_count)
# delta 2 prob of observing at most n_obs events given the catalog
delta_2 = less_equal_ecdf(sim_counts, obs_count)
return delta_1, delta_2

[docs]def poisson_log_likelihood(observation, forecast):
""" Wrapper around scipy to compute the Poisson log-likelihood

Args:
observation: Observed (Grided) seismicity
forecast: Forecast of a Model (Grided)

Returns:
Log-Liklihood values of between binned-observations and binned-forecasts
"""
return numpy.log(scipy.stats.poisson.pmf(observation, forecast))

[docs]def poisson_joint_log_likelihood_ndarray(target_event_log_rates, target_observations, n_fore):
""" Efficient calculation of joint log-likelihood of grid-based forecast.

Note: log(w!) = 0

Args:
target_event_log_rates: natural log of bin rates where target events occurred
target_observations: counts of target events
n_fore: expected number from the forecasts

Returns:
joint_log_likelihood

"""
sum_log_target_event_rates = numpy.sum(target_event_log_rates)
# factorial(n) = loggamma(n+1)
discrete_penalty_term = numpy.sum(scipy.special.loggamma(target_observations+1))
return sum_log_target_event_rates - discrete_penalty_term - n_fore

[docs]def poisson_inverse_cdf(random_matrix, lam):
""" Wrapper around scipy inverse poisson cdf function

Args:
random_matrix: Matrix of dimenions equal to forecast, containing random
numbers between 0 and 1.
lam: vector of parameters for poisson distribution

Returns:
sample from the poisson distribution
"""
return scipy.stats.poisson.ppf(random_matrix, lam)
```