# Source code for detectree.train_test_split

"""Split the set of images into training and testing sets."""

import glob
from os import path

import dask
import numpy as np
import pandas as pd
from dask import diagnostics
from sklearn import cluster, decomposition, metrics

from . import filters, image_descriptor, settings

__all__ = ["TrainingSelector"]



# [docs]
class TrainingSelector:
    """Select the images/tiles to be used to train the classifier(s)."""


    # [docs]
    def __init__(
        self,
        *,
        img_filepaths=None,
        img_dir=None,
        img_filename_pattern=None,
        gabor_frequencies=None,
        gabor_num_orientations=None,
        response_bins_per_axis=None,
        num_color_bins=None,
    ):
        """
        Initialize the training selector.

        The arguments provided to the initialization method will determine how the image
        descriptors are computed. See the `background <https://bit.ly/2KlCICO>`_ example
        notebook for more details.

        Parameters
        ----------
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train the
            classifier.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images whose filename matches
            `img_filename_pattern` are to be located. Ignored if `img_filepaths` is
            provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `img_filepaths` is provided.
        gabor_frequencies : tuple, optional
            Set of frequencies used to build the Gabor filter bank. If no value is
            provided, the value set in `settings.GIST_GABOR_FREQUENCIES` is used.
        gabor_num_orientations : int or tuple, optional
            Number of orientations used to build the Gabor filter bank. If an integer is
            provided, the corresponding number of orientations will be used for each
            scale (determined by `gabor_frequencies`). If a tuple is provided, each
            element will determine the number of orientations that must be used at its
            matching scale (determined by `gabor_frequencies`) - thus the tuple must
            match the length of `gabor_frequencies`. If no value is provided, the value
            set in `settings.GIST_GABOR_NUM_ORIENTATIONS` is used.
        response_bins_per_axis : int, optional
            Number of spatial bins per axis into which the responses to the Gabor filter
            bank will be aggreated. For example, a value of 2 will aggregate the
            responses into the four quadrants of the image (i.e., 2x2, 2 bins in each
            axis of the image). If no value is provided, the value set in
            `settings.GIST_RESPONSE_BINS_PER_AXIS` is used.
        num_color_bins : int, optional
            Number of bins in each dimension used to compute a joint color histogram in
            the L*a*b color space. If no value is provided, the value set in
            `settings.GIST_NUM_COLOR_BINS` is used.
        """
        super().__init__()

        # get `None` keyword-arguments from settings
        if img_filename_pattern is None:
            img_filename_pattern = settings.IMG_FILENAME_PATTERN
        if gabor_frequencies is None:
            gabor_frequencies = settings.GIST_GABOR_FREQUENCIES
        if gabor_num_orientations is None:
            gabor_num_orientations = settings.GIST_GABOR_NUM_ORIENTATIONS
        if response_bins_per_axis is None:
            response_bins_per_axis = settings.GIST_RESPONSE_BINS_PER_AXIS
        if num_color_bins is None:
            num_color_bins = settings.GIST_NUM_COLOR_BINS

        # now proceed
        if img_filepaths is None:
            img_filepaths = glob.glob(path.join(img_dir, img_filename_pattern))

        self.img_filepaths = img_filepaths

        # TODO: boolean arg for equal tile size (and pass `block_shape` to
        # `get_gist_descriptor`)?
        self.gabor_frequencies = gabor_frequencies

        if isinstance(gabor_num_orientations, tuple):
            self.gabor_num_orientations = gabor_num_orientations
        else:
            # `gabor_num_orientations` is an int
            self.gabor_num_orientations = tuple(
                gabor_num_orientations for _ in gabor_frequencies
            )

        self.response_bins_per_axis = response_bins_per_axis
        self.num_color_bins = num_color_bins


    @property
    def descr_feature_matrix(self):
        """Compute matrix of descriptors (feature rows)."""
        try:
            return self._descr_feature_matrix
        except AttributeError:
            kernels = filters.get_gabor_filter_bank(
                frequencies=self.gabor_frequencies,
                num_orientations=self.gabor_num_orientations,
            )

            # num_blocks = self.response_bins_per_axis**2

            # feature_rows = [
            #      TrainingSelector._get_image_descr(
            #          img_filepath, kernels, self.response_bins_per_axis,
            #          num_blocks, self.num_color_bins)
            #      for img_filepath in self.img_filepaths
            #  ]
            values = [
                dask.delayed(image_descriptor.compute_image_descriptor_from_filepath)(
                    img_filepath,
                    kernels,
                    self.response_bins_per_axis,
                    self.num_color_bins,
                )
                for img_filepath in self.img_filepaths
            ]

            with diagnostics.ProgressBar():
                feature_rows = dask.compute(*values)

            self._descr_feature_matrix = np.vstack(feature_rows)

            # TODO: cache as instance attribute (or even use property with and pass this
            # method's arguments to init), and then let people interactively choose the
            # number of PCA components until they're happy with the represented
            # variance? I vote yes.
            # TODO: cache this (via persistence): if `img_filepaths` and the technical
            # parameters coincide, load from a file instead of recomputing it
            # TODO: return copy?
            return self._descr_feature_matrix


    # [docs]
    def train_test_split(
        self,
        *,
        method="cluster-II",
        n_components=12,
        num_img_clusters=4,
        train_prop=0.01,
        return_evr=False,
        pca_kwargs=None,
        kmeans_kwargs=None,
    ):
        """
        Select the image/tiles to be used for traning.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook
        for more details.

        Parameters
        ----------
        method : {'cluster-I', 'cluster-II'}, optional (default 'cluster-II')
            Method used in the train/test split.
        n_components : int, default 12
            Number of principal components into which the image descriptors should be
            represented when applying the *k*-means clustering.
        num_img_clusters : int, optional (default 4)
            Number of first-level image clusters of the 'cluster-II' `method`.  Ignored
            if `method` is 'cluster-I'.
        train_prop : float, optional
            Overall proportion of images/tiles that must be selected for training.
        return_evr : bool, optional (default False)
            Whether the explained variance ratio of the principal component
            analysis should be returned
        pca_kwargs : dict, optional
            Keyword arguments to be passed to the `sklearn.decomposition.PCA` class
            constructor (except for `n_components`).
        kmeans_kwargs : dict, optional
            Keyword arguments to be passed to the `sklearn.cluster.KMeans` class
            constructor (except for `n_clusters`).

        Returns
        -------
        split_df : pandas.DataFrame
            The train/test split data frame.
        evr : numeric, optional
            Expected variance ratio of the principal component analysis.
        """
        X = self.descr_feature_matrix
        if pca_kwargs is None:
            _pca_kwargs = {}
        else:
            _pca_kwargs = pca_kwargs.copy()
            # if `n_components` is provided in `pca_kwargs`, it will be ignored
            _ = _pca_kwargs.pop("n_components", None)
        pca = decomposition.PCA(n_components=n_components, **_pca_kwargs).fit(X)

        X_pca = pca.transform(X)
        X_cols = range(n_components)
        df = pd.concat(
            (
                pd.Series(self.img_filepaths, name="img_filepath"),
                pd.DataFrame(X_pca, columns=X_cols),
            ),
            axis=1,
        )

        if kmeans_kwargs is None:
            _kmeans_kwargs = {}
        else:
            _kmeans_kwargs = kmeans_kwargs.copy()
            # if `n_clusters` is provided in `kmeans_kwargs`, it will be ignored
            _ = _kmeans_kwargs.pop("n_clusters", None)
        if method == "cluster-I":
            km = cluster.KMeans(
                n_clusters=int(np.ceil(train_prop * len(df))), **_kmeans_kwargs
            ).fit(X_pca)
            closest, _ = metrics.pairwise_distances_argmin_min(
                km.cluster_centers_, df[X_cols]
            )
            train_idx = df.iloc[closest].index

            df["train"] = [True if i in train_idx else False for i in df.index]
        else:

            def cluster_train_test_split(img_cluster_ser):
                X_cluster_df = df.loc[img_cluster_ser.index, X_cols]
                # use `ceil` to avoid zeros, which might completely ignore a significant
                # image cluster
                num_train = int(np.ceil(train_prop * len(X_cluster_df)))
                cluster_km = cluster.KMeans(n_clusters=num_train, **_kmeans_kwargs).fit(
                    X_cluster_df
                )
                closest, _ = metrics.pairwise_distances_argmin_min(
                    cluster_km.cluster_centers_, X_cluster_df
                )
                train_idx = X_cluster_df.iloc[closest].index
                return [True if i in train_idx else False for i in X_cluster_df.index]

            df["img_cluster"] = cluster.KMeans(
                n_clusters=num_img_clusters, **_kmeans_kwargs
            ).fit_predict(X_pca)
            df["train"] = df.groupby("img_cluster")["img_cluster"].transform(
                cluster_train_test_split
            )

        split_df = df.drop(X_cols, axis=1)

        if return_evr:
            return split_df, pca.explained_variance_ratio_.sum()
        else:
            return split_df