Source code for yatsm.algorithms.postprocess

""" Result post-processing utilities

Includes comission and omission tests and robust linear model result
calculations
"""
import logging

import numpy as np
import numpy.lib.recfunctions as nprf
import scipy.stats
import statsmodels.api as sm

from ..regression.diagnostics import rmse
from ..utils import date2index

logger = logging.getLogger('yatsm')


# POST-PROCESSING
[docs]def commission_test(yatsm, alpha=0.10):
    """ Merge adjacent records based on Chow Tests for nested models

    Use Chow Test to find false positive, spurious, or unnecessary breaks
    in the timeseries by comparing the effectiveness of two separate
    adjacent models with one single model that spans the entire time
    period.

    Chow test is described:

    .. math::
        \\frac{[RSS_r - (RSS_1 + RSS_2)] / k}{(RSS_1 + RSS_2) / (n - 2k)}

    where:

        - :math:`RSS_r` is the RSS of the combined, or, restricted model
        - :math:`RSS_1` is the RSS of the first model
        - :math:`RSS_2` is the RSS of the second model
        - :math:`k` is the number of model parameters
        - :math:`n` is the number of total observations

    Because we look for change in multiple bands, the RSS used to compare
    the unrestricted versus restricted models is the mean RSS
    values from all ``model.test_indices``.

    Args:
        yatsm (YATSM model): fitted YATSM model to check for commission errors
        alpha (float): significance level for F-statistic (default: 0.10)

    Returns:
        np.ndarray: updated copy of ``yatsm.record`` with spurious models
            combined into unified model

    """
    if yatsm.record.size == 1:
        return yatsm.record

    k = yatsm.record[0]['coef'].shape[0]

    # Allocate memory outside of loop
    m_1_rss = np.zeros(yatsm.test_indices.size)
    m_2_rss = np.zeros(yatsm.test_indices.size)
    m_r_rss = np.zeros(yatsm.test_indices.size)

    models = []
    merged = False
    for i in range(len(yatsm.record) - 1):
        if merged:
            m_1 = models[-1]
        else:
            m_1 = yatsm.record[i]
        m_2 = yatsm.record[i + 1]

        # Unrestricted model starts/ends
        m_1_start = date2index(yatsm.dates, m_1['start'])
        m_1_end = date2index(yatsm.dates, m_1['end'])
        m_2_start = date2index(yatsm.dates, m_2['start'])
        m_2_end = date2index(yatsm.dates, m_2['end'])
        # Restricted start/end
        m_r_start = m_1_start
        m_r_end = m_2_end

        # Need enough obs to fit models (n > k)
        if (m_1_end - m_1_start) <= k or (m_2_end - m_2_start) <= k:
            logger.debug('Too few obs (n <= k) to merge segment')
            merged = False
            if i == 0:
                models.append(m_1)
            models.append(m_2)
            continue

        n = m_r_end - m_r_start
        F_crit = scipy.stats.f.ppf(1 - alpha, k, n - 2 * k)

        for i_b, b in enumerate(yatsm.test_indices):
            m_1_rss[i_b] = np.linalg.lstsq(yatsm.X[m_1_start:m_1_end, :],
                                           yatsm.Y[b, m_1_start:m_1_end])[1]
            m_2_rss[i_b] = np.linalg.lstsq(yatsm.X[m_2_start:m_2_end, :],
                                           yatsm.Y[b, m_2_start:m_2_end])[1]
            m_r_rss[i_b] = np.linalg.lstsq(yatsm.X[m_r_start:m_r_end, :],
                                           yatsm.Y[b, m_r_start:m_r_end])[1]

        # Collapse RSS across all test indices for F statistic
        F = (
            ((m_r_rss.mean() - (m_1_rss.mean() + m_2_rss.mean())) / k) /
            ((m_1_rss.mean() + m_2_rss.mean()) / (n - 2 * k))
        )
        if F > F_crit:
            # Reject H0 and retain change
            # Only add in previous model if first model
            if i == 0:
                models.append(m_1)
            models.append(m_2)
            merged = False
        else:
            # Fail to reject H0 -- ignore change and merge
            m_new = np.copy(yatsm.record_template)[0]

            # Remove last previously added model from list to merge
            if i != 0:
                del models[-1]

            m_new['start'] = m_1['start']
            m_new['end'] = m_2['end']
            m_new['break'] = m_2['break']

            # Re-fit models and copy over attributes
            yatsm.fit_models(yatsm.X[m_r_start:m_r_end, :],
                             yatsm.Y[:, m_r_start:m_r_end])
            for i_m, _m in enumerate(yatsm.models):
                m_new['coef'][:, i_m] = _m.coef
                m_new['rmse'][i_m] = _m.rmse

            if 'magnitude' in yatsm.record.dtype.names:
                # Preserve magnitude from 2nd model that was merged
                m_new['magnitude'] = m_2['magnitude']

            models.append(m_new)

            merged = True

    return np.array(models)


[docs]def omission_test(model, crit=0.05, behavior='ANY', indices=None):
    """ Add omitted breakpoint into records based on residual stationarity

    Uses recursive residuals within a CUMSUM test to check if each model
    has omitted a "structural change" (e.g., land cover change). Returns
    an array of True or False for each timeseries segment record depending
    on result from `statsmodels.stats.diagnostic.breaks_cusumolsresid`.

    Args:
        crit (float, optional): Critical p-value for rejection of null
            hypothesis that data contain no structural change
        behavior (str, optional): Method for dealing with multiple
            `test_indices`. `ANY` will return True if any one test index
            rejects the null hypothesis. `ALL` will only return True if ALL
            test indices reject the null hypothesis.
        indices (np.ndarray, optional): Array indices to test. User provided
            indices must be a subset of `model.test_indices`.

    Returns:
        np.ndarray: Array of True or False for each record where
            True indicates omitted break point

    """
    if behavior.lower() not in ['any', 'all']:
        raise ValueError('`behavior` must be "any" or "all"')

    if not indices:
        indices = model.test_indices

    if not np.all(np.in1d(indices, model.test_indices)):
        raise ValueError('`indices` must be a subset of '
                         '`model.test_indices`')

    if not model.ran:
        return np.empty(0, dtype=bool)

    omission = np.zeros((model.record.size, len(indices)), dtype=bool)

    for i, r in enumerate(model.record):
        # Skip if no model fit
        if r['start'] == 0 or r['end'] == 0:
            continue
        # Find matching X and Y in data
        index = np.where(
            (model.dates >= min(r['start'], r['end'])) &
            (model.dates <= max(r['end'], r['start'])))[0]
        # Grab matching X and Y
        _X = model.X[index, :]
        _Y = model.Y[:, index]

        for i_b, b in enumerate(indices):
            # Create OLS regression
            ols = sm.OLS(_Y[b, :], _X).fit()
            # Perform CUMSUM test on residuals
            test = sm.stats.diagnostic.breaks_cusumolsresid(
                ols.resid, _X.shape[1])

            if test[1] < crit:
                omission[i, i_b] = True
            else:
                omission[i, i_b] = False

    # Collapse band answers according to `behavior`
    if behavior.lower() == 'any':
        return np.any(omission, 1)
    else:
        return np.all(omission, 1)


[docs]def refit_record(model, prefix, estimator,
                 fitopt=None, keep_regularized=False):
    """ Refit YATSM model segments with a new estimator and update record

    YATSM class model must be ran and contain at least one record before this
    function is called.

    Args:
        model (YATSM model): YATSM model to refit
        prefix (str): prefix for refitted coefficient and RMSE (don't include
            underscore as it will be added)
        estimator (object): instance of a scikit-learn compatible estimator
            object
        fitopt (dict, optional): dict of options for the ``fit`` method of the
            ``estimator`` provided (default: None)
        keep_regularized (bool, optional): do not use features with coefficient
            estimates that are fit to 0 (i.e., if using L1 regularization)

    Returns:
        np.array: updated model.record NumPy structured array with refitted
            coefficients and RMSE

    """
    if not model:
        return None

    fitopt = fitopt or {}

    refit_coef = prefix + '_coef'
    refit_rmse = prefix + '_rmse'

    # Create new array for robust coefficients and RMSE
    n_coef, n_series = model.record[0]['coef'].shape
    refit = np.zeros(model.record.shape[0], dtype=[
        (refit_coef, 'float32', (n_coef, n_series)),
        (refit_rmse, 'float32', (n_series)),
    ])

    for i_rec, rec in enumerate(model.record):
        # Find matching X and Y in data
        # start/end dates are considered in case ran backward
        index = np.where((model.dates >= min(rec['start'], rec['end'])) &
                         (model.dates <= max(rec['start'], rec['end'])))[0]

        X = model.X.take(index, axis=0)
        Y = model.Y.take(index, axis=1)

        # Refit each band
        for i_y, y in enumerate(Y):
            if keep_regularized:
                # Find nonzero in case of regularized regression
                nonzero = np.nonzero(rec['coef'][:, i_y])[0]
                if nonzero.size == 0:
                    refit[i_rec][refit_rmse][:] = rec['rmse']
                    continue
            else:
                nonzero = np.arange(n_series)

            # Fit
            estimator.fit(X[:, nonzero], y, **fitopt)
            # Store updated coefficients
            refit[i_rec][refit_coef][nonzero, i_y] = estimator.coef_
            refit[i_rec][refit_coef][0, i_y] += getattr(
                estimator, 'intercept_', 0.0)

            # Update RMSE
            refit[i_rec][refit_rmse][i_y] = rmse(
                y, estimator.predict(X[:, nonzero]))

    # Merge
    refit = nprf.merge_arrays((model.record, refit), flatten=True)

    return refit