# Source code for yatsm.classifiers.diagnostics

import logging

import numpy as np
import scipy.ndimage

from sklearn.utils import check_random_state
# from sklearn.cross_validation import KFold, StratifiedKFold

logger = logging.getLogger('yatsm')


def kfold_scores(X, y, algo, kf_generator):
    """ Performs KFold crossvalidation and reports mean/std of scores

    The classifier is refit on the training rows of every fold and scored on
    the held-out rows of that fold.

    Args:
        X (np.ndarray): X feature input used in classification
        y (np.ndarray): y labeled examples
        algo (sklearn classifier): classifier used from scikit-learn; must
            provide `fit(X, y)` returning the fitted estimator and
            `score(X, y)`
        kf_generator (sklearn crossvalidation generator): iterable yielding
            `(train, test)` index pairs used in crossvalidation. Any iterable
            of such pairs is accepted; an `n_folds` attribute is not needed.

    Returns:
        (mean, std): mean and standard deviation of crossvalidation scores

    """
    # Collect one score per fold straight from the iterable so the number of
    # folds never has to be known ahead of time.
    scores = np.array(
        [algo.fit(X[train, :], y[train]).score(X[test, :], y[test])
         for train, test in kf_generator],
        dtype=float)

    logger.info('scores: {0}'.format(scores))
    logger.info('score mean/std: {0}/{1}'.format(scores.mean(), scores.std()))

    return scores.mean(), scores.std()


class SpatialKFold(object):
    """ Spatial cross validation iterator

    Training data samples physically located next to test samples are likely
    to be strongly related due to spatial autocorrelation. This violation of
    independence will artificially inflate crossvalidated measures of
    algorithm performance.

    Provides training and testing indices to split data into training and
    testing sets. Splits a "Region of Interest" image into k consecutive
    folds. Each fold is used as a validation set once while k - 1 remaining
    folds form the training set.

    Iterating yields `(train, test)` pairs of integer sample indices into
    `y`, the same order used by scikit-learn cross validation generators.
    Samples whose label in `y` is 0 are treated as background when regions
    are labeled: they never belong to a test fold and therefore always end up
    in the training indices.

    Parameters:
        y (np.ndarray): Labeled features
        row (np.ndarray): Row (y) pixel location for each `y`
        col (np.ndarray): Column (x) pixel location for each `y`
        n_folds (int, optional): Number of folds (default: 3)
        shuffle (bool, optional): Shuffle the unique training data regions
            before splitting into batches (default: False)
        random_state (None, int, or np.random.RandomState): Pseudo-random
            number generator to use for random sampling. If None, default to
            numpy RNG for shuffling

    Raises:
        ValueError: if `y`, `row`, and `col` are not all the same size

    """

    shuffle = False

    def __init__(self, y, row, col, n_folds=3, shuffle=False,
                 random_state=None):
        if y.size != row.size or y.size != col.size:
            raise ValueError('Labels provided (y) must be the same size as '
                             'the row and columns provided')

        self.y = y
        self.row = row
        self.col = col
        self.n_folds = n_folds
        if shuffle:
            self.shuffle = True
            self.rng = check_random_state(random_state)

        self._recreate_labels()

    def __iter__(self):
        # Spread the `n` regions over the folds as evenly as possible; the
        # first `n % n_folds` folds receive one extra region.
        fold_sizes = np.full(self.n_folds, self.n // self.n_folds, dtype=int)
        fold_sizes[:self.n % self.n_folds] += 1

        sample_index = np.arange(self.y.size)
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_mask = self._labels_to_indices(self.labels[start:stop])
            # (train, test) order, as consumed by `kfold_scores`
            yield sample_index[~test_mask], sample_index[test_mask]
            current = stop

    def _recreate_labels(self):
        """ Internal method to label regions of `self.y` from pixel locations
        """
        # Rasterize the samples into the smallest image that holds them all;
        # pixels without a sample stay 0 (background).
        roi = np.zeros((self.row.max() + 1, self.col.max() + 1),
                       dtype=self.y.dtype)
        roi[self.row, self.col] = self.y

        # Connected groups of non-zero pixels become the regions that are
        # kept together within a fold.
        self.labeled, _ = scipy.ndimage.label(roi)
        self.labels = np.unique(self.labeled[self.labeled != 0])
        self.n = self.labels.size

        if self.shuffle:
            self.rng.shuffle(self.labels)

        self.indices = []

    def _labels_to_indices(self, labels):
        """ Return boolean mask of the samples located in regions `labels`

        Args:
            labels (np.ndarray): region ids from `self.labeled`

        Returns:
            np.ndarray: boolean array, one element per sample in `self.y`

        """
        # Look up each sample's own region id. Testing row membership and
        # column membership separately would also select samples from other
        # regions that merely share a row and a column with the selection.
        return np.isin(self.labeled[self.row, self.col], labels)


class SpatialKFold_ROI(object):
    """ Spatial cross validation iterator on ROI images

    Training data samples physically located next to test samples are likely
    to be strongly related due to spatial autocorrelation. This violation of
    independence will artificially inflate crossvalidated measures of
    algorithm performance.

    Provides training and testing indices to split data into training and
    testing sets. Splits a "Region of Interest" image into k consecutive
    folds. Each fold is used as a validation set once while k - 1 remaining
    folds form the training set.

    Iterating yields `((train_rows, train_cols), (test_rows, test_cols))`,
    i.e. training pixel locations first and testing pixel locations second,
    the same order used by scikit-learn cross validation generators.

    Parameters:
        roi (np.ndarray): "Region of interest" matrix providing training data
            samples of some class
        n_folds (int, optional): Number of folds (default: 3)
        mask_values (int, float, list, tuple, or np.ndarray, optional): one
            or more values within roi to ignore from sampling (default: 0)
        shuffle (bool, optional): Shuffle the unique training data regions
            before splitting into batches (default: False)
        random_state (None, int, or np.random.RandomState): Pseudo-random
            number generator to use for random sampling. If None, default to
            numpy RNG for shuffling

    Raises:
        TypeError: if `mask_values` is not a number, list, tuple, or
            np.ndarray

    """

    shuffle = False

    def __init__(self, roi, n_folds=3, mask_values=(0,), shuffle=False,
                 random_state=None):
        self.roi = roi
        self.n_folds = n_folds

        # NumPy scalar types are accepted alongside the builtin numbers
        if isinstance(mask_values, (float, int, np.integer, np.floating)):
            self.mask_values = np.array([mask_values])
        elif isinstance(mask_values, (list, tuple)):
            self.mask_values = np.array(mask_values)
        elif isinstance(mask_values, np.ndarray):
            self.mask_values = mask_values
        else:
            raise TypeError('mask_values must be float, int, list, tuple,'
                            ' or np.ndarray')

        if shuffle:
            self.shuffle = True
            self.rng = check_random_state(random_state)

        self._label_roi()

    def __iter__(self):
        n = self.n
        n_folds = self.n_folds

        # Spread the `n` regions over the folds as evenly as possible; the
        # first `n % n_folds` folds receive one extra region.
        fold_sizes = np.full(n_folds, n // n_folds, dtype=int)
        fold_sizes[:n % n_folds] += 1

        region_ids = self.indices[:, 0]
        rows = self.indices[:, 1]
        cols = self.indices[:, 2]

        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_i = np.isin(region_ids, self.labels[start:stop])
            # Every sample outside the test fold is used for training, which
            # includes the folds that come before this one.
            train_i = ~test_i
            yield ((rows[train_i], cols[train_i]),
                   (rows[test_i], cols[test_i]))
            current = stop

    def _label_roi(self):
        """ Internal method to label region of interest image

        Sets `self.labels` (region ids), `self.n` (number of regions), and
        `self.indices`, an integer array with one row per unmasked pixel
        holding `(region id, row, column)`.
        """
        # Label connected groups of unmasked pixels. Masking on ROI values
        # before labeling keeps the region ids, the sample count, and the
        # index table consistent for any choice of `mask_values`.
        valid = ~np.isin(self.roi, self.mask_values)
        labeled, _ = scipy.ndimage.label(valid)

        self.labels = np.unique(labeled[labeled != 0])
        self.n = self.labels.size

        rows, cols = np.nonzero(labeled)
        region_ids = labeled[rows, cols]
        # Stable sort groups pixels by region id while keeping the row-major
        # pixel order inside each region.
        order = np.argsort(region_ids, kind='mergesort')
        self.indices = np.column_stack(
            (region_ids[order], rows[order], cols[order])).astype(int)

        if self.shuffle:
            self.rng.shuffle(self.labels)