Source code for skextremes.utils

"""
This module provides utility functions that are used within scikit-extremes
that are also useful for external consumption.
"""

import warnings as _warnings

from numpy.random import randint as _randint
import numpy as _np
import scipy.stats as _st
from scipy import optimize as _op
from scipy.special import gamma as _gamma


###############################################################################
# Bootstrap confidence intervals calculations using percentile interval method
###############################################################################
class InstabilityWarning(UserWarning):
    """
    Warning category used to flag results that may be numerically unstable.

    It derives from ``UserWarning`` so that generic ``UserWarning`` filters
    also apply to it.
    """

# On import, make sure that InstabilityWarnings are not filtered out.
# NOTE: these calls modify the interpreter-wide warning filters as a side
# effect of importing this module. The second call applies the 'always'
# action to every UserWarning (InstabilityWarning is itself a UserWarning
# subclass), not only to the warnings emitted by this module.
_warnings.simplefilter('always', InstabilityWarning)
_warnings.simplefilter('always', UserWarning)

def bootstrap_ci(data, statfunction=_np.average, alpha=0.05, n_samples=100):
    """
    Compute a bootstrap confidence interval for ``statfunction`` applied to
    ``data`` using the percentile interval method.

    Data points are assumed to be delineated by axis 0.

    This function has been derived and simplified from scikits-bootstrap
    package created by cgevans (https://github.com/cgevans/scikits-bootstrap).
    All the credits shall go to him.

    **Parameters**

    data : array_like, shape (N, ...)
        Input data. It is converted to a single array and resampled (with
        replacement) along axis 0. Beyond this, the shape doesn't matter, so
        long as ``statfunction`` can be applied to the resampled array.
    statfunction : function (data) -> value
        Statistic to evaluate. It is called once per bootstrap sample with
        the resampled data as its only argument.
    alpha : float, optional
        Significance level of the interval (default=0.05). The returned
        values are the (alpha/2, 1-alpha/2) percentiles of the bootstrap
        distribution of the statistic. Must lie in [0, 1].
    n_samples : int or float, optional
        The number of bootstrap samples to use (default=100). Float values
        are truncated to an integer.

    **Returns**

    confidences : numpy.ndarray
        Array whose first axis has length 2: the lower and upper confidence
        bounds specified by alpha.

    **Raises**

    ValueError
        If ``n_samples`` is smaller than 1 or ``alpha`` is outside [0, 1].

    **Warns**

    InstabilityWarning
        If the requested percentiles fall on (or within 10 positions of) the
        extreme bootstrap samples.

    **Calculation Methods**

    'pi' : Percentile Interval (Efron 13.3)
        The percentile interval method simply returns the 100*alphath
        bootstrap sample's values for the statistic. This is an extremely
        simple method of confidence interval calculation. However, it has
        several disadvantages compared to the bias-corrected accelerated
        method.

    If you want to use more complex calculation methods, please, see
    `scikits-bootstrap package
    <https://github.com/cgevans/scikits-bootstrap>`_.

    **References**

    Efron (1993): 'An Introduction to the Bootstrap', Chapman & Hall.
    """
    # The docstring has always advertised float values; range() needs an int.
    n_samples = int(n_samples)
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1.")
    # An alpha outside [0, 1] would otherwise index the sorted statistics
    # with inverted, negative or out-of-range positions.
    if _np.any(_np.less(alpha, 0)) or _np.any(_np.greater(alpha, 1)):
        raise ValueError("alpha must lie in the interval [0, 1].")

    alphas = _np.array([alpha / 2, 1 - alpha / 2])
    data = _np.array(data)
    n_points = data.shape[0]

    # We don't need to keep the resampled data sets around; only the random
    # indexes are generated and the statistic is evaluated on the fly.
    stat = _np.array([
        statfunction(data[_randint(n_points, size=(n_points,))])
        for _ in range(n_samples)
    ])
    stat.sort(axis=0)

    # Percentile Interval Method: positions of the requested percentiles in
    # the sorted bootstrap statistics.
    nvals = _np.round((n_samples - 1) * alphas).astype('int')

    if _np.any(nvals == 0) or _np.any(nvals == n_samples - 1):
        _warnings.warn("Some values used extremal samples; results are probably unstable.", InstabilityWarning)
    elif _np.any(nvals < 10) or _np.any(nvals >= n_samples - 10):
        _warnings.warn("Some values used top 10 low/high samples; results may be unstable.", InstabilityWarning)

    if nvals.ndim == 1:
        # All nvals are the same. Simple broadcasting
        return stat[nvals]

    # Nvals are different for each data point (array-valued alpha). Not
    # simple broadcasting: each set of nvals along axis 0 corresponds to the
    # data at the same point in other axes.
    return stat[(nvals, _np.indices(nvals.shape)[1:].squeeze())]


###############################################################################
# Function to estimate parameters of GEV using method of moments
###############################################################################
def gev_momfit(data):
    """
    Estimate parameters of Generalised Extreme Value distribution using the
    method of moments. The methodology has been extracted from appendix A.4
    on EVA (see references below).

    The shape is found by matching the theoretical GEV skewness to the sample
    skewness; scale and location then follow from the sample standard
    deviation and mean.

    **Parameters**

    data : array_like
        Sample extreme data

    **Returns**

    tuple
        tuple with the shape, location and scale parameters. The shape uses
        the sign convention of ``scipy.stats.genextreme`` (negative values
        correspond to a heavy upper tail).

    **References**

    DHI, (2003): '`EVA(Extreme Value Analysis - Reference manual)
    <http://www.tnmckc.org/upload/document/wup/1/1.3/Manuals/MIKE%2011/eva/EVA_RefManual.pdf>`_',
    DHI.
    """
    # Skewness of the Gumbel distribution, i.e. the limit of the GEV skewness
    # when the shape tends to 0: 12 * sqrt(6) * zeta(3) / pi**3.
    gumbel_skew = 1.1395470994
    # Below this magnitude the closed-form skewness is dominated by
    # floating-point cancellation, so the Gumbel limit is used instead.
    small_shape = 1e-4

    def g(n, x):
        return _gamma(1 + n * x)

    mean = _np.mean(data)
    std = _np.std(data)
    skew = _st.skew(data)

    def theoretical_skew(k):
        """Skewness of a GEV distribution with shape ``k``."""
        if abs(k) < small_shape:
            return gumbel_skew
        a = -g(3, k) + 3 * g(1, k) * g(2, k) - 2 * g(1, k)**3
        b = (g(2, k) - g(1, k)**2)**1.5
        # The sign factor keeps the expression continuous through k = 0 and
        # gives the (positive, growing) skewness of heavy-tailed shapes.
        return _np.sign(k) * a / b

    def minimize_skew(x):
        k = _np.ravel(x)[0]  # fmin passes a length-1 array
        if k <= -1.0 / 3.0:
            # The third moment does not exist here; keep the search out.
            return _np.inf
        return abs(theoretical_skew(k) - skew)

    c = _op.fmin(minimize_skew, 0)[0]  # first guess is set to 0

    if c == 0:
        # Gumbel limit: the general expressions below become 0 / 0.
        scale = std * _np.sqrt(6) / _np.pi
        loc = mean - scale * _np.euler_gamma
    else:
        scale = std * abs(c) / _np.sqrt((g(2, c) - g(1, c)**2))
        loc = mean - scale * (1 - g(1, c)) / c

    return c, loc, scale


###############################################################################
# Function to estimate parameters of Gumbel using method of moments
###############################################################################
def gum_momfit(data):
    """
    Fit a Gumbel distribution to ``data`` with the method of moments.

    The scale is obtained from the sample standard deviation and the
    location from the sample mean, following Wilks (see references below).

    **Parameters**

    data : array_like
        Sample extreme data

    **Returns**

    tuple
        tuple with the shape, location and scale parameters. In this case,
        the shape parameter is always 0.

    **References**

    Wilks,D.S. (2006): '`Statistical Methods in the Atmospheric Sciences,
    second edition
    <http://store.elsevier.com/Statistical-Methods-in-the-Atmospheric-Sciences/Daniel-Wilks/isbn-9780080456225/>`_',
    Academic Press.
    """
    sample_mean = _np.mean(data)
    sample_std = _np.std(data)

    # Gumbel moments: std = scale * pi / sqrt(6), mean = loc + gamma * scale,
    # with gamma the Euler-Mascheroni constant.
    scale = sample_std * _np.sqrt(6) / _np.pi
    loc = sample_mean - scale * _np.euler_gamma

    return 0, loc, scale