"""
This module provides utility functions that are used within scikit-extremes
and that are also useful for external consumption.
"""
import warnings as _warnings
from numpy.random import randint as _randint
import numpy as _np
import scipy.stats as _st
from scipy import optimize as _op
from scipy.special import gamma as _gamma
###############################################################################
# Bootstrap confidence intervals calculations using percentile interval method
###############################################################################
class InstabilityWarning(UserWarning):
"""Issued when results may be unstable."""
pass
# On import, make sure that InstabilityWarnings are not filtered out.
_warnings.simplefilter('always', InstabilityWarning)
_warnings.simplefilter('always', UserWarning)
def bootstrap_ci(data, statfunction=_np.average, alpha=0.05,
                 n_samples=100):
"""
Given a set of data ``data``, and a statistics function ``statfunction`` that
applies to that data, computes the bootstrap confidence interval for
``statfunction`` on that data. Data points are assumed to be delineated by
axis 0.
This function has been derived and simplified from scikits-bootstrap
package created by cgevans (https://github.com/cgevans/scikits-bootstrap).
All the credits shall go to him.
**Parameters**
data : array_like, shape (N, ...) OR tuple of array_like all with shape (N, ...)
Input data. Data points are assumed to be delineated by axis 0. Beyond this,
the shape doesn't matter, so long as ``statfunction`` can be applied to the
array. If a tuple of array_likes is passed, then samples from each array (along
axis 0) are passed in order as separate parameters to the statfunction. The
type of data (single array or tuple of arrays) can be explicitly specified
by the multi parameter.
statfunction : function (data, weights = (weights, optional)) -> value
This function should accept samples of data from ``data``. It is applied
to these samples individually.
alpha : float, optional
The percentiles to use for the confidence interval (default=0.05). The
returned values are (alpha/2, 1-alpha/2) percentile confidence
intervals.
n_samples : int or float, optional
The number of bootstrap samples to use (default=100)
**Returns**
confidences : tuple of floats
The confidence percentiles specified by alpha
**Calculation Methods**
'pi' : Percentile Interval (Efron 13.3)
The percentile interval method simply returns the 100*alphath bootstrap
sample's values for the statistic. This is an extremely simple method of
confidence interval calculation. However, it has several disadvantages
compared to the bias-corrected accelerated method.
If you want to use more complex calculation methods, please, see
`scikits-bootstrap package
<https://github.com/cgevans/scikits-bootstrap>`_.
**References**
Efron (1993): 'An Introduction to the Bootstrap', Chapman & Hall.
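
    **Examples**

    A minimal usage sketch with synthetic data (the exact bounds vary with
    the random bootstrap draws; the import path ``skextremes.utils`` is
    assumed here)::

        import numpy as np
        from skextremes.utils import bootstrap_ci

        sample = np.random.normal(loc=10, scale=2, size=200)
        # 95 % percentile-interval CI for the sample mean.
        low, high = bootstrap_ci(sample, statfunction=np.mean,
                                 alpha=0.05, n_samples=1000)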
"""
def bootstrap_indexes(data, n_samples=10000):
"""
Given data points data, where axis 0 is considered to delineate points, return
an generator for sets of bootstrap indexes. This can be used as a list
of bootstrap indexes (with list(bootstrap_indexes(data))) as well.
"""
for _ in range(n_samples):
yield _randint(data.shape[0], size=(data.shape[0],))
    alphas = _np.array([alpha / 2, 1 - alpha / 2])

    data = _np.array(data)
    tdata = (data,)

    # We don't need to generate actual samples; that would take more memory.
    # Instead, we can generate just the indexes, and then apply the statfun
    # to those indexes.
    bootindexes = bootstrap_indexes(tdata[0], n_samples)
    stat = _np.array([statfunction(*(x[indexes] for x in tdata))
                      for indexes in bootindexes])
    stat.sort(axis=0)

    # Percentile Interval Method
    avals = alphas
    nvals = _np.round((n_samples - 1) * avals).astype('int')

    if _np.any(nvals == 0) or _np.any(nvals == n_samples - 1):
        _warnings.warn("Some values used extremal samples; "
                       "results are probably unstable.",
                       InstabilityWarning)
    elif _np.any(nvals < 10) or _np.any(nvals >= n_samples - 10):
        _warnings.warn("Some values used top 10 low/high samples; "
                       "results may be unstable.",
                       InstabilityWarning)

    if nvals.ndim == 1:
        # All nvals are the same. Simple broadcasting
        return stat[nvals]
    else:
        # Nvals are different for each data point. Not simple broadcasting.
        # Each set of nvals along axis 0 corresponds to the data at the same
        # point in other axes.
        return stat[(nvals, _np.indices(nvals.shape)[1:].squeeze())]

###############################################################################
# Function to estimate parameters of GEV using method of moments
###############################################################################
def gev_momfit(data):
"""
Estimate parameters of Generalised Extreme Value distribution using the
method of moments. The methodology has been extracted from appendix A.4
on EVA (see references below).
**Parameters**
data : array_like
Sample extreme data
**Returns**
tuple
tuple with the shape, location and scale parameters. In this,
case, the shape parameter is always 0.
**References**
DHI, (2003): '`EVA(Extreme Value Analysis - Reference manual)
<http://www.tnmckc.org/upload/document/wup/1/1.3/Manuals/MIKE%2011/eva/EVA_RefManual.pdf>`_',
DHI.
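
    **Examples**

    A minimal sketch with made-up annual maxima (the values are purely
    illustrative; the import path ``skextremes.utils`` is assumed)::

        import numpy as np
        from skextremes.utils import gev_momfit

        maxima = np.array([4.2, 5.1, 3.8, 6.0, 4.9,
                           5.5, 4.4, 6.3, 5.0, 4.7])
        shape, loc, scale = gev_momfit(maxima)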
"""
g = lambda n, x : _gamma(1 + n * x)
mean = _np.mean(data)
std = _np.std(data)
skew = _st.skew(data)
def minimize_skew(x):
a = -g(3, x) + 3 * g(1, x) * g(2, x) - 2 * g(1, x)**3
b = (g(2, x) - (g(1, x))**2)**1.5
c = abs(a / b - skew)
return c
c = _op.fmin(minimize_skew, 0)[0] # first guess is set to 0
scale = std * abs(c) / _np.sqrt((g(2, c) - g(1, c)**2))
loc = mean - scale * (1 - g(1, c)) / c
return c, loc, scale
###############################################################################
# Function to estimate parameters of Gumbel using method of moments
###############################################################################
def gum_momfit(data):
"""
Estimate parameters of Gumbel distribution using the
method of moments. The methodology has been extracted from Wilks
(see references below).
**Parameters**
data : array_like
Sample extreme data
**Returns**
tuple
tuple with the shape, location and scale parameters. In this,
case, the shape parameter is always 0.
**References**
Wilks,D.S. (2006): '`Statistical Methods in the Atmospheric Sciences,
second edition <http://store.elsevier.com/Statistical-Methods-in-the-Atmospheric-Sciences/Daniel-Wilks/isbn-9780080456225/>`_',
Academic Press.
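
    **Examples**

    A minimal sketch with made-up annual maxima (the values are purely
    illustrative; the import path ``skextremes.utils`` is assumed). The
    returned shape is always 0 because the Gumbel distribution has no
    shape parameter::

        import numpy as np
        from skextremes.utils import gum_momfit

        maxima = np.array([4.2, 5.1, 3.8, 6.0, 4.9,
                           5.5, 4.4, 6.3, 5.0, 4.7])
        shape, loc, scale = gum_momfit(maxima)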
"""
mean = _np.mean(data)
std = _np.std(data)
euler_cte = 0.5772156649015328606065120900824024310421
scale = std * _np.sqrt(6) / _np.pi
loc = mean - scale * euler_cte
return 0, loc, scale