import logging
import os
import numpy as np
import six
import sklearn.externals.joblib as joblib
import yaml
from . import algorithms
from .regression.packaged import find_packaged_regressor, packaged_regressions
logger = logging.getLogger('yatsm')
def parse_config_file(config_file):
    """ Parse YAML config file

    Args:
        config_file (str): path to YAML config file

    Returns:
        dict: dict of sub-dicts, each sub-dict containing configuration keys
            and values pertinent to a process or algorithm. Pickled
            estimators compatible with ``scikit-learn``
            (i.e., that follow :class:`sklearn.base.BaseEstimator`)
            models will be loaded and returned as an object within the dict

    Raises:
        KeyError: raise KeyError if configuration file is not specified
            correctly
    """
    with open(config_file) as f:
        cfg = expand_envvars(yaml.safe_load(f))

    # Ensure algorithm & prediction sections are specified
    if 'YATSM' not in cfg:
        raise KeyError('YATSM must be a section in configuration YAML file')
    required = (
        ('prediction', 'YATSM section does not declare a prediction method'),
        ('algorithm', 'YATSM section does not declare an algorithm'),
    )
    for key, message in required:
        if key not in cfg['YATSM']:
            raise KeyError(message)

    algo = cfg['YATSM']['algorithm']
    if algo not in cfg:
        raise KeyError('Algorithm specified (%s) is not parameterized in '
                       'configuration file' % algo)

    # Embed algorithm in YATSM key
    # TODO: broaden this concept to at least algo['change']
    if algo not in algorithms.available['change']:
        raise NotImplementedError('Algorithm specified (%s) is not currently '
                                  'available' % algo)
    algo_object = algorithms.available['change'][algo]
    cfg['YATSM']['algorithm_object'] = algo_object
    if not algo_object:
        raise KeyError('Could not find algorithm specified (%s) in '
                       '`yatsm.algorithms.available`' % algo)

    # Add in dummy phenology and classification dicts if not included
    cfg.setdefault('phenology', {'enable': False})
    cfg.setdefault('classification', {'training_image': None})

    return convert_config(cfg)
def convert_config(cfg):
    """ Convert some configuration values to different values

    Args:
        cfg (dict): dict of sub-dicts, each sub-dict containing configuration
            keys and values pertinent to a process or algorithm

    Returns:
        dict: configuration dict with some items converted to different objects

    Raises:
        KeyError: raise KeyError if configuration file is not specified
            correctly
    """
    # Run each section parser in turn ("dataset:" first, then "YATSM:")
    for section_parser in (_parse_dataset_config, _parse_YATSM_config):
        cfg = section_parser(cfg)
    return cfg
def _parse_dataset_config(cfg):
""" Parse "dataset:" configuration section
"""
# Expand min/max values to all bands
n_bands = cfg['dataset']['n_bands']
mins, maxes = cfg['dataset']['min_values'], cfg['dataset']['max_values']
if isinstance(mins, (float, int)):
cfg['dataset']['min_values'] = np.asarray([mins] * n_bands)
else:
if len(mins) != n_bands:
raise ValueError('Dataset minimum values must be specified for '
'"n_bands" (got %i values, needed %i)' %
(len(mins), n_bands))
cfg['dataset']['min_values'] = np.asarray(mins)
if isinstance(maxes, (float, int)):
cfg['dataset']['max_values'] = np.asarray([maxes] * n_bands)
else:
if len(maxes) != n_bands:
raise ValueError('Dataset maximum values must be specified for '
'"n_bands" (got %i values, needed %i)' %
(len(maxes), n_bands))
cfg['dataset']['max_values'] = np.asarray(maxes)
return cfg
def _parse_YATSM_config(cfg):
    """ Parse "YATSM:" configuration section

    Unpickles the main prediction estimator (and any "refit" estimators)
    and normalizes the "refit" sub-section so downstream code can rely on
    its keys existing.

    Args:
        cfg (dict): configuration dict containing a "YATSM" sub-dict

    Returns:
        dict: configuration dict with "estimator" and normalized "refit"
            entries added under "YATSM"

    Raises:
        KeyError: if refit predictions are not matched by output prefixes
    """
    yatsm = cfg['YATSM']

    # Unpickle main predictor and grab its fit options
    pred_method = yatsm['prediction']
    yatsm['estimator'] = {
        'prediction': pred_method,
        'object': _unpickle_predictor(_find_pickle(pred_method, cfg)),
        # Fit options may be absent or explicitly null in the YAML
        'fit': cfg.get(pred_method, {}).get('fit', {}) or {},
    }

    # Unpickle refit objects, if any are declared
    if yatsm.get('refit', {}).get('prediction', None):
        pickles = []
        fitopts = []
        for refit_method in yatsm['refit']['prediction']:
            pickles.append(
                _unpickle_predictor(_find_pickle(refit_method, cfg)))
            fitopts.append(cfg.get(refit_method, {}).get('fit', {}) or {})
        yatsm['refit']['prediction_object'] = pickles
        yatsm['refit']['fit'] = fitopts
    # Fill in as empty refit
    else:
        yatsm['refit'] = dict(prefix=[], prediction=[], prediction_object=[],
                              stay_regularized=[], fit=[])

    # Every refit prediction needs a matching output prefix
    n_refit = len(yatsm['refit']['prediction_object'])
    n_prefix = len(yatsm['refit']['prefix'])
    if n_refit != n_prefix:
        raise KeyError('Must supply a prefix for all refit predictions '
                       '(%i vs %i)' % (n_refit, n_prefix))

    # Fill in "stay_regularized" -- default True; a single bool applies to
    # all refits, while a list is passed through unchanged
    reg = yatsm['refit'].get('stay_regularized', None)
    if reg is None:
        yatsm['refit']['stay_regularized'] = [True] * n_refit
    elif isinstance(reg, bool):
        yatsm['refit']['stay_regularized'] = [reg] * n_refit

    return cfg
def _find_pickle(pickle, cfg):
    """ Return filename for pickle specified

    Pickle should either be from packaged estimators or specified as a section
    in the configuration file.
    """
    # Pre-packaged estimators take priority
    if pickle in packaged_regressions:
        pickle_path = find_packaged_regressor(pickle)
        logger.debug('Using pre-packaged prediction method "%s" from %s' %
                     (pickle, pickle_path))
        return pickle_path

    # Otherwise it must be declared as a config file section...
    if pickle not in cfg:
        raise KeyError('Prediction method "%s" is not a pre-packaged estimator'
                       ' nor is it specified as a section in config file' %
                       pickle)

    # ...with an explicit path under the "pickle" key
    if 'pickle' not in cfg[pickle]:
        raise KeyError('Prediction method "%s" in config file, but no '
                       'path is given in "pickle" key' % pickle)

    pickle_path = cfg[pickle]['pickle']
    logger.debug('Using prediction method "%s" from config file (%s)' %
                 (pickle, pickle_path))
    return pickle_path
def _unpickle_predictor(pickle):
    """ Load a pickled estimator and check it follows the sklearn API

    Args:
        pickle (str): path to a pickled estimator

    Returns:
        object: unpickled estimator exposing the ``scikit-learn`` interface
            (i.e., per :class:`sklearn.base.BaseEstimator`)

    Raises:
        AttributeError: if the unpickled object is missing any of the
            required sklearn methods
    """
    # Load sklearn objects
    reg = joblib.load(pickle)
    # Core sklearn estimator API that downstream code relies on
    sklearn_attrs = ['fit', 'predict', 'get_params', 'set_params']
    if all(hasattr(reg, attr) for attr in sklearn_attrs):
        return reg
    raise AttributeError('Cannot use prediction object from %s. Prediction'
                         ' objects must define the following attributes:\n'
                         '%s'
                         % (pickle, ', '.join(sklearn_attrs)))
def expand_envvars(d):
    """ Recursively expand values that look like environment vars in a dict

    This function assumes that environment variables are values that contain
    `$` and are evaluated with :func:`os.path.expandvars`. No exception
    will be raised if an environment variable is not set.

    Args:
        d (dict): expand environment variables used in the values of this
            dictionary

    Returns:
        dict: input dictionary with environment variables expanded
    """
    def check_envvar(k, v):
        """ Warn if value looks un-expanded """
        if '$' in v:
            logger.warning('Config key=value pair might still contain '
                           'environment variables: "%s=%s"' % (k, v))

    _d = d.copy()
    # NOTE: iterate the copy so we can rebind keys of _d while looping
    for k, v in _d.items():
        if isinstance(v, dict):
            _d[k] = expand_envvars(v)
        elif isinstance(v, str):
            _d[k] = os.path.expandvars(v)
            # Check the *expanded* value so successful expansions do not
            # trigger a spurious warning
            check_envvar(k, _d[k])
        elif isinstance(v, (list, tuple)):
            n_v = []
            for _v in v:
                if isinstance(_v, dict):
                    # Also recurse into dicts nested inside lists/tuples
                    _v = expand_envvars(_v)
                elif isinstance(_v, str):
                    _v = os.path.expandvars(_v)
                    check_envvar(k, _v)
                n_v.append(_v)
            _d[k] = n_v
    return _d