import logging
import numpy as np
import scipy.ndimage
from sklearn.utils import check_random_state
# from sklearn.cross_validation import KFold, StratifiedKFold
logger = logging.getLogger('yatsm')
[docs]def kfold_scores(X, y, algo, kf_generator):
""" Performs KFold crossvalidation and reports mean/std of scores
Args:
X (np.ndarray): X feature input used in classification
y (np.ndarray): y labeled examples
algo (sklean classifier): classifier used from scikit-learn
kf_generator (sklearn crossvalidation generator): generator for indices
used in crossvalidation
Returns:
(mean, std): mean and standard deviation of crossvalidation scores
"""
scores = np.zeros(kf_generator.n_folds)
for i, (train, test) in enumerate(kf_generator):
scores[i] = algo.fit(X[train, :], y[train]).score(X[test, :], y[test])
logger.info('scores: {0}'.format(scores))
logger.info('score mean/std: {0}/{1}'.format(scores.mean(), scores.std()))
return scores.mean(), scores.std()
[docs]class SpatialKFold(object):
""" Spatial cross validation iterator
Training data samples physically located next to test samples are likely to
be strongly related due to spatial autocorrelation. This violation of
independence will artificially inflate crossvalidated measures of
algorithm performance.
Provides training and testing indices to split data into training and
testing sets. Splits a "Region of Interest" image into k consecutive
folds. Each fold is used as a validation set once while k - 1 remaining
folds form the training set.
Parameters:
y (np.ndarray): Labeled features
row (np.ndarray): Row (y) pixel location for each `y`
col (np.ndarray): Column (x) pixel location for each `x`
n_folds (int, optional): Number of folds (default: 3)
shuffle (bool, optional): Shuffle the unique training data regions before
splitting into batches (default: False)
random_state (None, int, or np.random.RandomState): Pseudo-random number
generator to use for random sampling. If None, default to numpy RNG
for shuffling
"""
shuffle = False
def __init__(self, y, row, col, n_folds=3, shuffle=False,
random_state=None):
if y.size != row.size or y.size != col.size:
raise ValueError('Labels provided (y) must be the same size as '
'the row and columns provided')
self.y = y
self.row = row
self.col = col
self.n_folds = n_folds
if shuffle:
self.shuffle = True
self.rng = check_random_state(random_state)
self._recreate_labels()
def __iter__(self):
fold_sizes = (self.n // self.n_folds) * np.ones(self.n_folds,
dtype=np.int)
fold_sizes[:self.n % self.n_folds] += 1
current = 0
ind = np.arange(self.y.size)
for fold_size in fold_sizes:
start, stop = current, current + fold_size
test_i = self._labels_to_indices(self.labels[start:stop])
yield ind[test_i], ind[~test_i]
current = stop
def _recreate_labels(self):
""" Internal method to label regions of `self.y` from pixel locations
"""
roi = np.zeros((self.row.max() + 1, self.col.max() + 1),
dtype=self.y.dtype)
roi[self.row, self.col] = self.y
self.labeled, _ = scipy.ndimage.label(roi)
self.labels = np.unique(self.labeled[self.labeled != 0])
self.n = self.labels.size
if self.shuffle:
self.rng.shuffle(self.labels)
self.indices = []
def _labels_to_indices(self, labels):
lab_row, lab_col = np.where(np.in1d(
self.labeled, labels).reshape(self.labeled.shape))
return np.logical_and(np.in1d(self.row, lab_row),
np.in1d(self.col, lab_col))
[docs]class SpatialKFold_ROI(object):
""" Spatial cross validation iterator on ROI images
Training data samples physically located next to test samples are likely to
be strongly related due to spatial autocorrelation. This violation of
independence will artificially inflate crossvalidated measures of
algorithm performance.
Provides training and testing indices to split data into training and
testing sets. Splits a "Region of Interest" image into k consecutive
folds. Each fold is used as a validation set once while k - 1 remaining
folds form the training set.
Parameters:
roi (np.ndarray): "Region of interest" matrix providing training data
samples of some class
n_folds (int, optional): Number of folds (default: 3)
mask_values (int, list, tuple, or np.ndarray, optional): one or more
values within roi to ignore from sampling (default: [0])
shuffle (bool, optional): Shuffle the unique training data regions before
splitting into batches (default: False)
random_state (None, int, or np.random.RandomState): Pseudo-random number
generator to use for random sampling. If None, default to numpy RNG
for shuffling
"""
shuffle = False
def __init__(self, roi, n_folds=3, mask_values=[0], shuffle=False,
random_state=None):
self.roi = roi
self.n_folds = n_folds
if isinstance(mask_values, (float, int)):
self.mask_values = np.array([mask_values])
elif isinstance(mask_values, (list, tuple)):
self.mask_values = np.array(mask_values)
elif isinstance(mask_values, np.ndarray):
self.mask_values = mask_values
else:
raise TypeError('mask_values must be float, int, list, tuple,'
' or np.ndarray')
if shuffle:
self.shuffle = True
self.rng = check_random_state(random_state)
self._label_roi()
def __iter__(self):
n = self.n
n_folds = self.n_folds
fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int)
fold_sizes[:n % n_folds] += 1
current = 0
for fold_size in fold_sizes:
start, stop = current, current + fold_size
test_i = np.in1d(self.indices[:, 0], self.labels[start:stop])
train_i = np.in1d(self.indices[:, 0], self.labels[stop:])
yield ((self.indices[test_i, 1], self.indices[test_i, 2]),
(self.indices[train_i, 1], self.indices[train_i, 2]))
current = stop
def _label_roi(self):
""" Internal method to label region of interest image
"""
labeled, n = scipy.ndimage.label(self.roi)
labels = np.unique(labeled)
self.labels = labels[~np.in1d(labels, self.mask_values)]
self.n = self.labels.size
n_samples = (~np.in1d(self.roi, self.mask_values)).sum()
self.indices = np.zeros((n_samples, 3), dtype=np.int)
_start = 0
for l in self.labels:
_n = (labeled == l).sum()
_row, _col = np.where(labeled == l)
self.indices[_start:_start + _n, 0] = l
self.indices[_start:_start + _n, 1] = _row
self.indices[_start:_start + _n, 2] = _col
_start += _n
if self.shuffle:
self.rng.shuffle(self.labels)