# Source code for detectree.classifier

"""Binary tree/non-tree classifier(s)."""

import glob
from os import path

import dask
import huggingface_hub as hf_hub
import maxflow as mf
import numpy as np
import rasterio as rio
import skops
from dask import diagnostics
from skops import io

from . import pixel_features, pixel_response, settings, utils

# names exported by `from detectree.classifier import *`
__all__ = ["ClassifierTrainer", "Classifier"]

# 3x3 structure array passed as `structure` to `add_grid_edges` in
# `Classifier._predict_img`. Only the right-hand neighbor and the three neighbors of
# the last row are flagged, i.e., half of the 8-connected (Moore) neighborhood.
# NOTE(review): presumably the other half is left out so that each pair of adjacent
# pixels is linked only once - confirm against the PyMaxflow `add_grid_edges` docs
MOORE_NEIGHBORHOOD_ARR = np.array([[0, 0, 0], [0, 0, 1], [1, 1, 1]])



# [docs]
class ClassifierTrainer:
    """Train binary tree/non-tree classifier(s) of the pixel features."""


# [docs]
    def __init__(
        self,
        *,
        sigmas=None,
        num_orientations=None,
        neighborhood=None,
        min_neighborhood_range=None,
        num_neighborhoods=None,
        tree_val=None,
        nontree_val=None,
        classifier_class=None,
        **classifier_kwargs,
    ):
        """
        Initialize the classifier.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for details.

        Parameters
        ----------
        sigmas : list-like, optional
            The list of scale parameters (sigmas) to build the Gaussian filter bank that
            will be used to compute the pixel-level features. The provided argument will
            be passed to the initialization method of the `PixelFeaturesBuilder` class.
            If no value is provided, the value set in `settings.GAUSS_SIGMAS` will be
            taken.
        num_orientations : int, optional
            The number of equally-distributed orientations to build the Gaussian filter
            bank that will be used to compute the pixel-level features. The provided
            argument will be passed to the initialization method of the
            `PixelFeaturesBuilder` class. If no value is provided, the value set in
            `settings.GAUSS_NUM_ORIENTATIONS` is used.
        neighborhood : array-like, optional
            The base neighborhood structure that will be used to compute the entropy
            features. Theprovided argument will be passed to the initialization method
            of the `PixelFeaturesBuilder` class. If no value is provided, a square with
            a side size of `2 * min_neighborhood_range + 1` is used.
        min_neighborhood_range : int, optional
            The range (i.e., the square radius) of the smallest neigbhorhood window that
            will be used to compute the entropy features. The provided argument will be
            passed to the initialization method of the `PixelFeaturesBuilder` class. If
            no value is provided, the value set in
            `settings.ENTROPY_MIN_NEIGHBORHOOD_RANGE` is used.
        num_neighborhoods : int, optional
            The number of neigbhorhood windows (whose size follows a geometric
            progression starting at `min_neighborhood_range`) that will be used to
            compute the entropy features. The provided argument will be passed to the
            initialization method of the `PixelFeaturesBuilder` class. If no value is
            provided, the value set in `settings.ENTROPY_NUM_NEIGHBORHOODS` is used.
        tree_val : int, optional
            The value that designates tree pixels in the response images. The provided
            argument will be passed to the initialization method of the
            `PixelResponseBuilder` class. If no value is provided, the value set in
            `settings.RESPONSE_TREE_VAL` is used.
        nontree_val : int, optional
            The value that designates non-tree pixels in the response images. The
            provided argument will be passed to the initialization method of the
            `PixelResponseBuilder` class. If no value is provided, the value set in
            `settings.RESPONSE_NONTREE_VAL` is used.
        classifier_class : class, optional
            The class of the classifier to be trained. It can be any scikit-learn
            compatible estimator that implements the `fit`, `predict` and
            `predict_proba` methods and that can be saved to and loaded from memory
            using skops. If no value is provided, the value set in `settings.CLF_CLASS`
            is used.
        classifier_kwargs : key-value pairings, optional
            Keyword arguments that will be passed to the initialization of
            `classifier_class`. If no value is provided, the value set in
            `settings.CLF_KWARGS` is used.
        """
        self.pixel_features_builder_kwargs = dict(
            sigmas=sigmas,
            num_orientations=num_orientations,
            neighborhood=neighborhood,
            min_neighborhood_range=min_neighborhood_range,
            num_neighborhoods=num_neighborhoods,
        )
        self.pixel_response_builder_kwargs = dict(
            tree_val=tree_val, nontree_val=nontree_val
        )
        if classifier_class is None:
            classifier_class = settings.CLF_CLASS
        self.classifier_class = classifier_class
        if classifier_kwargs == {}:
            classifier_kwargs = settings.CLF_KWARGS
        self.classifier_kwargs = classifier_kwargs



# [docs]
    def train_classifier(
        self,
        *,
        split_df=None,
        response_img_dir=None,
        img_filepaths=None,
        response_img_filepaths=None,
        img_dir=None,
        img_filename_pattern=None,
        method=None,
        img_cluster=None,
    ):
        """
        Train a classifier.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame, optional
            Data frame with the train/test split.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located. Required if
            providing `split_df`. Otherwise `response_img_dir` might either be ignored
            if providing `response_img_filepaths`, or be used as the directory where the
            images whose filename matches `img_filename_pattern` are to be located.
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train the
            classifier. Ignored if `split_df` is provided.
        response_img_filepaths : list-like, optional
            List of paths to the binary response tiles that will be used to train the
            classifier. Ignored if `split_df` is provided.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images whose filename matches
            `img_filename_pattern` are to be located. Ignored if `split_df` or
            `img_filepaths` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.
        method : {'cluster-I', 'cluster-II'}, optional
            Method used in the train/test split.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if `method` is 'cluster-II'.

        Returns
        -------
        clf : scikit-learn-like classifier
            The trained classifier.
        """
        if split_df is None and response_img_filepaths is None:
            # this is the only case that needs argument tweaking: otherwise, if we pass
            # `img_filepaths`/`img_dir` to `build_features` and `response_img_dir` to
            # `build_response`, the latter would build a response with all the image
            # files in `response_img_dir`. Instead, we need to build the response only
            # for the files speficied in `img_filepaths`/`img_dir`
            if img_filepaths is None:
                # TODO: this is copied from `build_features` - ideally, we should DRY it
                if img_filename_pattern is None:
                    img_filename_pattern = settings.IMG_FILENAME_PATTERN
                if img_dir is None:
                    raise ValueError(
                        "Either `split_df`, `img_filepaths` or `img_dir` must "
                        "be provided"
                    )
                img_filepaths = glob.glob(path.join(img_dir, img_filename_pattern))

            response_img_filepaths = [
                path.join(response_img_dir, path.basename(img_filepath))
                for img_filepath in img_filepaths
            ]

        X = pixel_features.PixelFeaturesBuilder(
            **self.pixel_features_builder_kwargs
        ).build_features(
            split_df=split_df,
            img_filepaths=img_filepaths,
            img_dir=img_dir,
            img_filename_pattern=img_filename_pattern,
            method=method,
            img_cluster=img_cluster,
        )

        y = pixel_response.PixelResponseBuilder(
            **self.pixel_response_builder_kwargs
        ).build_response(
            split_df=split_df,
            response_img_dir=response_img_dir,
            response_img_filepaths=response_img_filepaths,
            img_filename_pattern=img_filename_pattern,
            method=method,
            img_cluster=img_cluster,
        )

        clf = self.classifier_class(**self.classifier_kwargs)
        clf.fit(X, y)

        return clf



# [docs]
    def train_classifiers(self, split_df, response_img_dir):
        """
        Train a classifier for each first-level cluster in `split_df`.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame
            Data frame with the train/test split, which must have an `img_cluster`.
            column with the first-level cluster labels.
        response_img_dir : str representing path to a directory
            Path to the directory where the response tiles are located.

        Returns
        -------
        clf_dict : dictionary
            Dictionary mapping a scikit-learn-like classifier to each first-level
            cluster label.
        """
        if "img_cluster" not in split_df:
            raise ValueError(
                "`split_df` must have an 'img_cluster' column ('cluster-II'). "
                "For 'cluster-I', use `train_classifier`."
            )

        clfs_lazy = {}
        for img_cluster, _ in split_df.groupby("img_cluster"):
            clfs_lazy[img_cluster] = dask.delayed(self.train_classifier)(
                split_df=split_df,
                response_img_dir=response_img_dir,
                method="cluster-II",
                img_cluster=img_cluster,
            )

        with diagnostics.ProgressBar():
            clfs_dict = dask.compute(clfs_lazy)[0]

        return clfs_dict





# [docs]
class Classifier:
    """Use trained classifier(s) to predict tree pixels."""


# [docs]
    def __init__(
        self,
        *,
        clf=None,
        clf_dict=None,
        tree_val=None,
        nontree_val=None,
        refine=None,
        refine_beta=None,
        refine_int_rescale=None,
        **pixel_features_builder_kwargs,
    ):
        """
        Initialize the classifier instance.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        clf : scikit-learn-like classifier, optional
            Trained classifier. If no value is provided, the latest detectree
            pre-trained classifier is used. Ignored if `clf_dict` is provided.
        clf_dict : dictionary, optional
            Dictionary mapping a trained scikit-learn-like classifier to each
            first-level cluster label.
        tree_val : int, optional
            Label used to denote tree pixels in the predicted images. If no value is
            provided, the value set in `settings.CLF_TREE_VAL` is used.
        nontree_val : int, optional
            Label used to denote non-tree pixels in the predicted images. If no value is
            provided, the value set in `settings.CLF_NONTREE_VAL` is used.
        refine : bool, optional
            Whether the pixel-level classification should be refined by optimizing the
            consistence between neighboring pixels. If no value is provided, the value
            set in `settings.CLF_REFINE` is used.
        refine_beta : int, optional
            Parameter of the refinement procedure that controls the smoothness of the
            labelling. Larger values lead to smoother shapes.  If no value is provided,
            the value set in `settings.CLF_REFINE_BETA` is used.
        refine_int_rescale : int, optional
            Parameter of the refinement procedure that controls the precision of the
            transformation of float to integer edge weights, required for the employed
            graph cuts algorithm. Larger values lead to greater precision. If no value
            is provided, the value set in `settings.CLF_REFINE_INT_RESCALE` is used.
        pixel_features_builder_kwargs : dict, optional
            Keyword arguments that will be passed to `detectree.PixelFeaturesBuilder`,
            which customize how the pixel features are built.
        """
        super().__init__()

        if clf_dict is not None:
            self.clf_dict = clf_dict
        elif clf is not None:
            self.clf = clf
        else:
            self.clf = io.load(
                hf_hub.hf_hub_download(
                    repo_id=settings.HF_HUB_REPO_ID,
                    filename=settings.HF_HUB_FILENAME,
                    library_name="skops",
                    library_version=skops.__version__,
                ),
                trusted=settings.SKOPS_TRUSTED,
            )

        if tree_val is None:
            tree_val = settings.CLF_TREE_VAL
        if nontree_val is None:
            nontree_val = settings.CLF_NONTREE_VAL
        if refine is None:
            refine = settings.CLF_REFINE
        if refine_beta is None:
            refine_beta = settings.CLF_REFINE_BETA
        if refine_int_rescale is None:
            refine_int_rescale = settings.CLF_REFINE_INT_RESCALE

        self.tree_val = tree_val
        self.nontree_val = nontree_val
        self.refine = refine
        self.refine_beta = refine_beta
        self.refine_int_rescale = refine_int_rescale

        self.pixel_features_builder_kwargs = pixel_features_builder_kwargs


    def _predict_img(self, img_filepath, clf, *, output_filepath=None):
        """
        Predict the tree/non-tree pixels of a single image with a given classifier.

        Parameters
        ----------
        img_filepath : str, file object or pathlib.Path object
            Image to be classified. The value is passed to `rasterio.open` and to
            `PixelFeaturesBuilder.build_features_from_filepath`.
        clf : scikit-learn-like classifier
            Trained classifier. Its `predict` method is used when `self.refine` is
            falsy, otherwise its `predict_proba` method is used.
        output_filepath : str, file object or pathlib.Path object, optional
            If provided, the prediction is dumped there as a single-band `uint8`
            GeoTIFF with the CRS and transform of the input image.

        Returns
        -------
        y_pred : numpy ndarray
            Array of the shape of the input image. Without refinement it holds the
            labels returned by `clf.predict`; with refinement it holds
            `self.tree_val` and `self.nontree_val`.
        """
        # the context manager guarantees that the source dataset is closed even when
        # building the features, predicting or writing the output raises
        with rio.open(img_filepath) as src:
            img_shape = src.shape

            X = pixel_features.PixelFeaturesBuilder(
                **self.pixel_features_builder_kwargs
            ).build_features_from_filepath(img_filepath)

            if not self.refine:
                y_pred = clf.predict(X).reshape(img_shape)
            else:
                # NOTE(review): assumes that the first column of `predict_proba` is
                # the non-tree class and the second one is the tree class - confirm
                # against the `classes_` order of the trained classifier
                p_nontree, p_tree = np.hsplit(clf.predict_proba(X), 2)
                g = mf.Graph[int]()
                node_ids = g.add_grid_nodes(img_shape)
                P_nontree = p_nontree.reshape(img_shape)
                P_tree = p_tree.reshape(img_shape)

                # The classifier probabilities are floats between 0 and 1, and the
                # graph cuts algorithm requires an integer representation. Therefore,
                # we multiply the probabilities by an arbitrary large number and then
                # transform the result to integers. For instance, we could use a
                # `refine_int_rescale` of `100` so that the probabilities are rescaled
                # into integers between 0 and 100 (like percentages). The larger
                # `refine_int_rescale`, the greater the precision.
                # ACHTUNG: the data term when the pixel is a tree is
                # `log(1 - P_tree)`, i.e., `log(P_nontree)`, so the two lines below
                # are correct
                # NOTE(review): a probability of exactly 0 gives `-inf` before the
                # integer cast - consider clipping the probabilities
                D_tree = (self.refine_int_rescale * np.log(P_nontree)).astype(int)
                D_nontree = (self.refine_int_rescale * np.log(P_tree)).astype(int)
                # TODO: option to choose Moore/Von Neumann neighborhood?
                g.add_grid_edges(
                    node_ids, self.refine_beta, structure=MOORE_NEIGHBORHOOD_ARR
                )
                g.add_grid_tedges(node_ids, D_tree, D_nontree)
                g.maxflow()
                # transform boolean `g.get_grid_segments(node_ids)` to an array of
                # `self.tree_val` and `self.nontree_val`
                y_pred = np.full(img_shape, self.nontree_val)
                y_pred[g.get_grid_segments(node_ids)] = self.tree_val

            # TODO: make the profile of output rasters more customizable (e.g., via
            # the `settings` module)
            if output_filepath is not None:
                with rio.open(
                    output_filepath,
                    "w",
                    driver="GTiff",
                    width=y_pred.shape[1],
                    height=y_pred.shape[0],
                    count=1,
                    dtype=np.uint8,
                    nodata=self.nontree_val,
                    crs=src.crs,
                    transform=src.transform,
                ) as dst:
                    dst.write(y_pred.astype(np.uint8), 1)

        return y_pred

    def _predict_imgs(self, img_filepaths, clf, output_dir):
        """
        Predict several images in parallel and dump the results to `output_dir`.

        Parameters
        ----------
        img_filepaths : iterable of str
            Paths to the images to be classified.
        clf : scikit-learn-like classifier
            Trained classifier used for all the images.
        output_dir : str or pathlib.Path object
            Directory where each prediction is dumped, keeping the file name of its
            input image.

        Returns
        -------
        pred_img_filepaths : list
            File paths of the dumped predictions, in the order of `img_filepaths`.
        """
        # pair each input with its output path in a single pass over the inputs
        io_pairs = [
            (src_filepath, path.join(output_dir, path.basename(src_filepath)))
            for src_filepath in img_filepaths
        ]

        lazy_predictions = [
            dask.delayed(self._predict_img)(
                src_filepath, clf, output_filepath=dst_filepath
            )
            for src_filepath, dst_filepath in io_pairs
        ]

        # the predicted arrays are discarded, only the dumped files matter here
        with diagnostics.ProgressBar():
            dask.compute(*lazy_predictions)

        return [dst_filepath for _, dst_filepath in io_pairs]

    def predict_img(self, img_filepath, *, img_cluster=None, output_filepath=None):
        """
        Use a trained classifier to predict tree pixels in an image.

        Optionally dump the predicted tree/non-tree image to `output_filepath`.

        Parameters
        ----------
        img_filepath : str, file object or pathlib.Path object
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing the image to be classified. The value will be passed to
            `rasterio.open`.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if the `Classifier` instance
            was initialized with `clf_dict` (i.e., "cluster-II" method).
        output_filepath : str, file object or pathlib.Path object, optional
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing where the predicted image is to be dumped. The value
            will be passed to `rasterio.open` in 'write' mode.

        Returns
        -------
        y_pred : numpy ndarray
            Array with the pixel responses.
        """
        clf = getattr(self, "clf", None)
        if clf is None:
            if img_cluster is not None:
                try:
                    clf = self.clf_dict[img_cluster]
                except KeyError:
                    raise ValueError(
                        f"Classifier for cluster {img_cluster} not found in"
                        " `self.clf_dict`."
                    )
            else:
                raise ValueError(
                    "A valid `img_cluster` must be provided for classifiers"
                    " instantiated with `clf_dict`."
                )
        return self._predict_img(img_filepath, clf, output_filepath=output_filepath)

    def predict_imgs(self, split_df, output_dir):
        """
        Use trained classifier(s) to predict tree pixels in multiple images.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame, optional
            Data frame with the train/test split.
        output_dir : str or pathlib.Path object
            Path to the directory where the predicted images are to be dumped.

        Returns
        -------
        pred_imgs : list or dict
            File paths of the dumped tiles.
        """
        if hasattr(self, "clf"):
            return self._predict_imgs(
                split_df[~split_df["train"]]["img_filepath"], self.clf, output_dir
            )
        else:
            # `self.clf_dict` is not `None`
            pred_imgs = {}
            for img_cluster, _ in split_df.groupby("img_cluster"):
                try:
                    clf = self.clf_dict[img_cluster]
                except KeyError:
                    raise ValueError(
                        f"Classifier for cluster {img_cluster} not found in"
                        " `self.clf_dict`."
                    )
                pred_imgs[img_cluster] = self._predict_imgs(
                    utils.get_img_filepaths(split_df, img_cluster, False),
                    clf,
                    output_dir,
                )

            return pred_imgs