# Source code for yatsm.cli.classify

""" Command line interface for classifying YATSM algorithm output
"""
from __future__ import division, print_function

import logging
import os
import time

import click
import numpy as np
import numpy.lib.recfunctions as nprfn
import six
from sklearn.externals import joblib

from . import options
from ..config_parser import parse_config_file
from ..utils import distribute_jobs, get_output_name, csvfile_to_dataframe
from ..io import get_image_attribute

logger = logging.getLogger('yatsm')


@click.command(short_help='Classify entire images using trained algorithm')
@options.arg_config_file
@click.argument('algo', metavar='<trained algorithm>',
                type=click.Path(readable=True, resolve_path=True))
@options.arg_job_number
@options.arg_total_jobs
@click.option('--resume', is_flag=True,
              help="Resume classification (don't overwrite)")
@click.pass_context
def classify(ctx, config, algo, job_number, total_jobs, resume):
    # NOTE: documented with comments rather than a docstring on purpose;
    # click would turn a docstring into the command's `--help` text.
    #
    # Classify the stored result file of every line assigned to this job.
    #
    # Args:
    #     ctx: click context (injected by `click.pass_context`; unused here)
    #     config: configuration file, handed to `parse_config_file`
    #     algo: path of the trained classifier, loaded with `joblib.load`
    #     job_number, total_jobs: forwarded to `distribute_jobs` to pick the
    #         lines this invocation is responsible for
    #     resume: if True, skip lines for which `try_resume` reports that a
    #         classification is already stored
    cfg = parse_config_file(config)

    df = csvfile_to_dataframe(cfg['dataset']['input_file'],
                              cfg['dataset']['date_format'])
    # First element of the attributes of the first listed image; presumably
    # the number of rows -- TODO confirm against `get_image_attribute`
    nrow = get_image_attribute(df['filename'][0])[0]

    # SECURITY: `joblib.load` unpickles the file and can execute arbitrary
    # code; only point `algo` at classifier files from a trusted source.
    classifier = joblib.load(algo)

    # Split into lines and classify
    job_lines = distribute_jobs(job_number, total_jobs, nrow)
    logger.debug('Responsible for lines: {l}'.format(l=job_lines))

    start_time = time.time()
    logger.info('Starting to run lines')
    for job_line in job_lines:
        filename = get_output_name(cfg['dataset'], job_line)
        if not os.path.exists(filename):
            logger.warning('No model result found for line {l} '
                           '(file {f})'.format(l=job_line, f=filename))
            # Nothing to classify for this line: move on instead of letting
            # `classify_line` fail while loading a file that does not exist
            continue

        if resume and try_resume(filename):
            logger.debug('Already processed line {l}'.format(l=job_line))
            continue

        logger.debug('Classifying line {l}'.format(l=job_line))
        classify_line(filename, classifier)

    logger.debug('Completed {n} lines in {m} minutes'.format(
        n=len(job_lines),
        m=round((time.time() - start_time) / 60.0, 2))
    )


def try_resume(filename):
    """ Return True/False if dataset has already been classified

    Args:
        filename (str): filename of the result to be checked

    Returns:
        bool: True if `filename` can be loaded with `np.load`, contains a
            'record' array, and that array has a field named 'class'.
            False in every other case (file missing, unreadable, without a
            'record' entry, or not classified yet).

    """
    try:
        z = np.load(filename)
    except Exception:
        # Best-effort probe: any failure to open or parse the file means
        # "not classified yet". `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt and SystemExit propagate.
        return False

    try:
        names = z['record'].dtype.names
    except Exception:
        # No 'record' entry, or the entry could not be read
        return False
    finally:
        # `np.load` returns an object holding an open file for `npz`
        # archives; release it if the returned object can be closed
        close = getattr(z, 'close', None)
        if close is not None:
            close()

    # `names` is None when 'record' is not a structured array
    return names is not None and 'class' in names


def classify_line(filename, classifier):
    """ Use `classifier` to classify data stored in `filename`

    The 'record' array stored in `filename` gains (or has replaced) the
    fields 'class' and 'class_proba', and the file is rewritten in place
    with an additional 'classes' array taken from `classifier.classes_`.
    A file whose 'record' array is empty is left untouched.

    Args:
        filename (str): filename of stored results
        classifier (sklearn classifier): pre-trained classifier

    """
    z = np.load(filename)
    try:
        rec = z['record']

        if rec.shape[0] == 0:
            logger.debug('No records in {f}. Continuing'.format(f=filename))
            return

        # Copy every other stored array into memory now so the archive can
        # be closed before `filename` is overwritten below
        # (re-saving everything is the only way to append to an `npz`)
        out = {k: z[k] for k in z.files if k != 'record'}
    finally:
        z.close()

    # Rescale intercept term
    coef = rec['coef'].copy()  # copy so we don't transform npz coef
    midpoint = ((rec['start'] + rec['end']) / 2.0)[:, np.newaxis]
    coef[:, 0, :] = coef[:, 0, :] + coef[:, 1, :] * midpoint

    # Include RMSE for full X matrix
    newdim = (coef.shape[0], coef.shape[1] * coef.shape[2])
    X = np.hstack((coef.reshape(newdim), rec['rmse']))

    # Create output and classify
    classes = classifier.classes_
    classified = np.zeros(rec.shape[0], dtype=[
        ('class', 'u2'),
        # explicit tuple so the field is always 2D (n, n_classes), even
        # when there is a single class
        ('class_proba', 'float32', (classes.size, ))
    ])
    classified['class'] = classifier.predict(X)
    classified['class_proba'] = classifier.predict_proba(X)

    # Replace with new classification if exists, or add by merging
    fields = rec.dtype.names
    if ('class' in fields and 'class_proba' in fields and
            rec['class_proba'].ndim == 2 and
            rec['class_proba'].shape[1] == classes.size):
        rec['class'] = classified['class']
        rec['class_proba'] = classified['class_proba']
    else:
        # Drop incompatible classified results if needed
        # e.g., if the number of classes changed. Drop whichever of the two
        # fields is present so the merge never sees a duplicate field name.
        stale = [name for name in ('class', 'class_proba') if name in fields]
        if stale:
            rec = nprfn.drop_fields(rec, stale)
        rec = nprfn.merge_arrays((rec, classified), flatten=True)

    out['classes'] = classes
    out['record'] = rec

    np.savez(filename, **out)