# Source code for detectree.train_test_split

"""Split the set of images into training and testing sets."""

import glob
from os import path

import dask
import numpy as np
import pandas as pd
from dask import diagnostics
from sklearn import cluster, decomposition, metrics

from . import filters, image_descriptor, settings

__all__ = ["TrainingSelector"]


class TrainingSelector:
    """Select the images/tiles to be used to train the classifier(s)."""

    def __init__(
        self,
        *,
        img_filepaths=None,
        img_dir=None,
        img_filename_pattern=None,
        gabor_frequencies=None,
        gabor_num_orientations=None,
        response_bins_per_axis=None,
        num_color_bins=None,
    ):
        """
        Initialize the training selector.

        The arguments provided to the initialization method will determine how the
        image descriptors are computed. See the `background
        <https://bit.ly/2KlCICO>`_ example notebook for more details.

        Parameters
        ----------
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train
            the classifier.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images whose filename matches
            `img_filename_pattern` are to be located. Ignored if `img_filepaths` is
            provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images.
            If no value is provided, the value set in
            `settings.IMG_FILENAME_PATTERN` is used. Ignored if `img_filepaths` is
            provided.
        gabor_frequencies : tuple, optional
            Set of frequencies used to build the Gabor filter bank. If no value is
            provided, the value set in `settings.GIST_GABOR_FREQUENCIES` is used.
        gabor_num_orientations : int or tuple, optional
            Number of orientations used to build the Gabor filter bank. If an
            integer is provided, the corresponding number of orientations will be
            used for each scale (determined by `gabor_frequencies`). If a tuple (or
            any other sequence) is provided, each element will determine the number
            of orientations that must be used at its matching scale (determined by
            `gabor_frequencies`) - thus the tuple must match the length of
            `gabor_frequencies`. If no value is provided, the value set in
            `settings.GIST_GABOR_NUM_ORIENTATIONS` is used.
        response_bins_per_axis : int, optional
            Number of spatial bins per axis into which the responses to the Gabor
            filter bank will be aggregated. For example, a value of 2 will
            aggregate the responses into the four quadrants of the image (i.e.,
            2x2, 2 bins in each axis of the image). If no value is provided, the
            value set in `settings.GIST_RESPONSE_BINS_PER_AXIS` is used.
        num_color_bins : int, optional
            Number of bins in each dimension used to compute a joint color
            histogram in the L*a*b color space. If no value is provided, the value
            set in `settings.GIST_NUM_COLOR_BINS` is used.

        Raises
        ------
        TypeError
            If neither `img_filepaths` nor `img_dir` is provided.
        """
        super().__init__()

        # fail early with an explicit message instead of letting
        # `path.join(None, ...)` raise an unrelated-looking `TypeError`
        if img_filepaths is None and img_dir is None:
            raise TypeError(
                "Either `img_filepaths` or `img_dir` must be provided."
            )

        # get `None` keyword-arguments from settings
        if gabor_frequencies is None:
            gabor_frequencies = settings.GIST_GABOR_FREQUENCIES
        if gabor_num_orientations is None:
            gabor_num_orientations = settings.GIST_GABOR_NUM_ORIENTATIONS
        if response_bins_per_axis is None:
            response_bins_per_axis = settings.GIST_RESPONSE_BINS_PER_AXIS
        if num_color_bins is None:
            num_color_bins = settings.GIST_NUM_COLOR_BINS

        # now proceed
        if img_filepaths is None:
            # the pattern is only needed when the images are listed from `img_dir`
            if img_filename_pattern is None:
                img_filename_pattern = settings.IMG_FILENAME_PATTERN
            img_filepaths = glob.glob(path.join(img_dir, img_filename_pattern))
        self.img_filepaths = img_filepaths

        # TODO: boolean arg for equal tile size (and pass `block_shape` to
        # `get_gist_descriptor`)?
        self.gabor_frequencies = gabor_frequencies
        if np.ndim(gabor_num_orientations) == 0:
            # a single number: use the same number of orientations at each scale
            self.gabor_num_orientations = tuple(
                gabor_num_orientations for _ in gabor_frequencies
            )
        else:
            # a sequence (tuple, list, array...): one value per scale. Testing for
            # "not a scalar" rather than "is a tuple" prevents a list from being
            # replicated as if it were a single number.
            self.gabor_num_orientations = tuple(gabor_num_orientations)
        self.response_bins_per_axis = response_bins_per_axis
        self.num_color_bins = num_color_bins

    @property
    def descr_feature_matrix(self):
        """
        Compute matrix of descriptors (feature rows).

        The descriptor of each path in `img_filepaths` is computed (through dask)
        and the results are stacked with `np.vstack`. The matrix is computed on
        first access and cached in the instance for subsequent accesses.

        Raises
        ------
        ValueError
            If `img_filepaths` is empty.
        """
        cached = getattr(self, "_descr_feature_matrix", None)
        if cached is not None:
            return cached

        # use `len` rather than truthiness since `img_filepaths` may be any
        # list-like (e.g., a numpy array, whose truth value is ambiguous)
        if len(self.img_filepaths) == 0:
            raise ValueError(
                "Cannot compute image descriptors: `img_filepaths` is empty."
            )

        kernels = filters.get_gabor_filter_bank(
            frequencies=self.gabor_frequencies,
            num_orientations=self.gabor_num_orientations,
        )

        values = [
            dask.delayed(image_descriptor.compute_image_descriptor_from_filepath)(
                img_filepath,
                kernels,
                self.response_bins_per_axis,
                self.num_color_bins,
            )
            for img_filepath in self.img_filepaths
        ]

        with diagnostics.ProgressBar():
            feature_rows = dask.compute(*values)

        self._descr_feature_matrix = np.vstack(feature_rows)

        # TODO: cache as instance attribute (or even use property with and pass this
        # method's arguments to init), and then let people interactively choose the
        # number of PCA components until they're happy with the represented
        # variance? I vote yes.
        # TODO: cache this (via persistence): if `img_filepaths` and the technical
        # parameters coincide, load from a file instead of recomputing it
        # TODO: return copy?
        return self._descr_feature_matrix

    @staticmethod
    def _closest_to_centroids_mask(X_df, train_prop, kmeans_kwargs):
        """
        Flag the rows of `X_df` that are closest to the k-means centroids.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Feature rows to cluster.
        train_prop : float
            Proportion of rows used to determine the number of clusters.
        kmeans_kwargs : dict
            Keyword arguments passed to `sklearn.cluster.KMeans` (besides
            `n_clusters`).

        Returns
        -------
        mask : numpy.ndarray of bool
            Array aligned with `X_df.index`, `True` for the rows that are the
            closest to some centroid. If several centroids share the same closest
            row, fewer rows than clusters are flagged.
        """
        # use `ceil` to avoid zeros, which might completely ignore a significant
        # image cluster
        num_train = int(np.ceil(train_prop * len(X_df)))
        km = cluster.KMeans(n_clusters=num_train, **kmeans_kwargs).fit(X_df)
        closest, _ = metrics.pairwise_distances_argmin_min(
            km.cluster_centers_, X_df
        )
        train_idx = X_df.iloc[closest].index
        return X_df.index.isin(train_idx)

    def train_test_split(
        self,
        *,
        method="cluster-II",
        n_components=12,
        num_img_clusters=4,
        train_prop=0.01,
        return_evr=False,
        pca_kwargs=None,
        kmeans_kwargs=None,
    ):
        """
        Select the image/tiles to be used for training.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        method : {'cluster-I', 'cluster-II'}, optional (default 'cluster-II')
            Method used in the train/test split.
        n_components : int, default 12
            Number of principal components into which the image descriptors should
            be represented when applying the *k*-means clustering.
        num_img_clusters : int, optional (default 4)
            Number of first-level image clusters of the 'cluster-II' `method`.
            Ignored if `method` is 'cluster-I'.
        train_prop : float, optional
            Overall proportion of images/tiles that must be selected for training.
        return_evr : bool, optional (default False)
            Whether the explained variance ratio of the principal component
            analysis should be returned.
        pca_kwargs : dict, optional
            Keyword arguments to be passed to the `sklearn.decomposition.PCA` class
            constructor (except for `n_components`).
        kmeans_kwargs : dict, optional
            Keyword arguments to be passed to the `sklearn.cluster.KMeans` class
            constructor (except for `n_clusters`).

        Returns
        -------
        split_df : pandas.DataFrame
            The train/test split data frame.
        evr : numeric, optional
            Explained variance ratio of the principal component analysis (summed
            over the components). Only returned if `return_evr` is True.

        Raises
        ------
        ValueError
            If `method` is not one of 'cluster-I' or 'cluster-II'.
        """
        # validate before the (potentially expensive) descriptor computation so
        # that a typo in `method` does not silently fall back to 'cluster-II'
        if method not in ("cluster-I", "cluster-II"):
            raise ValueError(
                f"Unknown method {method!r}: must be 'cluster-I' or 'cluster-II'."
            )

        X = self.descr_feature_matrix

        # work on copies so that the caller's dicts are never mutated
        _pca_kwargs = {} if pca_kwargs is None else dict(pca_kwargs)
        # if `n_components` is provided in `pca_kwargs`, it will be ignored
        _pca_kwargs.pop("n_components", None)
        pca = decomposition.PCA(n_components=n_components, **_pca_kwargs).fit(X)

        X_pca = pca.transform(X)
        # take the number of columns from the transformed data rather than from
        # `n_components`, which PCA also accepts as a float, 'mle' or None
        X_cols = list(range(X_pca.shape[1]))
        df = pd.concat(
            (
                pd.Series(self.img_filepaths, name="img_filepath"),
                pd.DataFrame(X_pca, columns=X_cols),
            ),
            axis=1,
        )

        _kmeans_kwargs = {} if kmeans_kwargs is None else dict(kmeans_kwargs)
        # if `n_clusters` is provided in `kmeans_kwargs`, it will be ignored
        _kmeans_kwargs.pop("n_clusters", None)

        if method == "cluster-I":
            df["train"] = self._closest_to_centroids_mask(
                df[X_cols], train_prop, _kmeans_kwargs
            )
        else:

            def cluster_train_test_split(img_cluster_ser):
                # second-level clustering among the tiles of one image cluster
                X_cluster_df = df.loc[img_cluster_ser.index, X_cols]
                return self._closest_to_centroids_mask(
                    X_cluster_df, train_prop, _kmeans_kwargs
                ).tolist()

            df["img_cluster"] = cluster.KMeans(
                n_clusters=num_img_clusters, **_kmeans_kwargs
            ).fit_predict(X_pca)
            df["train"] = df.groupby("img_cluster")["img_cluster"].transform(
                cluster_train_test_split
            )

        split_df = df.drop(columns=X_cols)

        if return_evr:
            return split_df, pca.explained_variance_ratio_.sum()
        return split_df