# Source code for detectree.classifier

"""Binary tree/non-tree classifier(s)."""

import warnings
from os import path

import dask
import huggingface_hub as hf_hub
import numpy as np
import rasterio as rio
import skops
from dask import diagnostics
from skops import io

from detectree import evaluate, pixel_features, pixel_response, settings, utils

__all__ = [
    "PixelDatasetTransformer",
    "ClassifierTrainer",
    "Classifier",
]

# Silence the scikit-learn "valid feature names" `UserWarning` raised when predicting
# with an `LGBMClassifier`; see https://github.com/microsoft/LightGBM/issues/6798
warnings.filterwarnings(
    action="ignore",
    message=(
        "X does not have valid feature names, but LGBMClassifier was fitted with "
        "feature names"
    ),
    category=UserWarning,
    module="sklearn.utils.validation",
)


class PixelDatasetTransformer:
    """Build pixel features and responses for training."""

    def __init__(
        self,
        *,
        sigmas=None,
        num_orientations=None,
        neighborhood=None,
        min_neighborhood_range=None,
        num_neighborhoods=None,
        tree_val=None,
        nontree_val=None,
        classifier_class=None,
        **classifier_kwargs,
    ):
        """
        Initialize the pixel dataset transformer.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for details.

        Parameters
        ----------
        sigmas : list-like, optional
            The list of scale parameters (sigmas) to build the Gaussian filter bank that
            will be used to compute the pixel-level features. The provided argument will
            be passed to the initialization method of the `PixelFeaturesBuilder` class.
            If no value is provided, the value set in `settings.GAUSS_SIGMAS` will be
            taken.
        num_orientations : int, optional
            The number of equally-distributed orientations to build the Gaussian filter
            bank that will be used to compute the pixel-level features. The provided
            argument will be passed to the initialization method of the
            `PixelFeaturesBuilder` class. If no value is provided, the value set in
            `settings.GAUSS_NUM_ORIENTATIONS` is used.
        neighborhood : array-like, optional
            The base neighborhood structure that will be used to compute the entropy
            features. The provided argument will be passed to the initialization method
            of the `PixelFeaturesBuilder` class. If no value is provided, a square with
            a side size of `2 * min_neighborhood_range + 1` is used.
        min_neighborhood_range : int, optional
            The range (i.e., the square radius) of the smallest neighborhood window that
            will be used to compute the entropy features. The provided argument will be
            passed to the initialization method of the `PixelFeaturesBuilder` class. If
            no value is provided, the value set in
            `settings.ENTROPY_MIN_NEIGHBORHOOD_RANGE` is used.
        num_neighborhoods : int, optional
            The number of neighborhood windows (whose size follows a geometric
            progression starting at `min_neighborhood_range`) that will be used to
            compute the entropy features. The provided argument will be passed to the
            initialization method of the `PixelFeaturesBuilder` class. If no value is
            provided, the value set in `settings.ENTROPY_NUM_NEIGHBORHOODS` is used.
        tree_val, nontree_val : int, optional
            The values that designate tree and non-tree pixels respectively in the
            response images. The provided arguments will be passed to the initialization
            method of the `PixelResponseBuilder` class. If no values are provided, the
            values set in `settings.TREE_VAL` and `settings.NONTREE_VAL` are
            respectively used.
        classifier_class : class, optional
            Class of the classifier. It is only stored in the `classifier_class`
            attribute, the methods of this class do not use it. If no value is
            provided, the value set in `settings.CLF_CLASS` is stored.
        classifier_kwargs : key-value pairings, optional
            Keyword arguments for `classifier_class`. They are only stored in the
            `classifier_kwargs` attribute, the methods of this class do not use them.
            If no value is provided, a copy of `settings.CLF_KWARGS` is stored.
        """
        self.pixel_features_builder_kwargs = dict(
            sigmas=sigmas,
            num_orientations=num_orientations,
            neighborhood=neighborhood,
            min_neighborhood_range=min_neighborhood_range,
            num_neighborhoods=num_neighborhoods,
        )
        self.pixel_response_builder_kwargs = dict(
            tree_val=tree_val, nontree_val=nontree_val
        )
        if classifier_class is None:
            classifier_class = settings.CLF_CLASS
        self.classifier_class = classifier_class
        if not classifier_kwargs:
            # copy the defaults so that editing `self.classifier_kwargs` never leaks
            # into the module-wide settings shared by every other instance
            classifier_kwargs = dict(settings.CLF_KWARGS)
        self.classifier_kwargs = classifier_kwargs
        self.pixel_features_builder = pixel_features.PixelFeaturesBuilder(
            **self.pixel_features_builder_kwargs
        )
        self.pixel_response_builder = pixel_response.PixelResponseBuilder(
            **self.pixel_response_builder_kwargs
        )

    def fit(self, X=None, y=None, **kwargs):  # noqa: ARG002
        """
        Do nothing and return the instance (sklearn compatibility).

        All the arguments are accepted and ignored.
        """
        return self

    def transform(
        self,
        *,
        split_df=None,
        img_dir=None,
        response_img_dir=None,
        img_filepaths=None,
        response_img_filepaths=None,
        img_filename_pattern=None,
    ):
        """
        Build the pixel features and responses of a set of images.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame, optional
            Data frame with the train/test split.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images from `split_df` or whose filename
            matches `img_filename_pattern` are located. Required if `split_df` is
            provided. Ignored if `img_filepaths` is provided.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located. Required if
            providing `split_df`. Otherwise `response_img_dir` might either be ignored
            if providing `response_img_filepaths`, or be used as the directory where the
            response of each input image (matched by file name) is located.
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train the
            classifier. Ignored if `split_df` is provided.
        response_img_filepaths : list-like, optional
            List of paths to the binary response tiles that will be used to train the
            classifier. Ignored if `split_df` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.

        Returns
        -------
        X : numpy ndarray
            Array with the pixel features.
        y : numpy ndarray
            Array with the pixel responses.

        Raises
        ------
        ValueError
            If neither `split_df` nor `response_img_filepaths` are provided and
            either the input images (`img_filepaths`/`img_dir`) or `response_img_dir`
            are missing.
        """
        if split_df is None and response_img_filepaths is None:
            # this is the only case that needs argument tweaking: otherwise, if we pass
            # `img_filepaths`/`img_dir` to `build_features` and `response_img_dir` to
            # `build_response`, the latter would build a response with all the image
            # files in `response_img_dir`. Instead, we need to build the response only
            # for the files specified in `img_filepaths`/`img_dir`
            if img_filepaths is None:
                if img_dir is None:
                    raise ValueError(
                        "Either `split_df`, `img_filepaths` or `img_dir` must be"
                        " provided"
                    )
                img_filepaths = utils.get_img_filepaths(
                    img_dir, img_filename_pattern=img_filename_pattern
                )

            if response_img_dir is None:
                raise ValueError(
                    "Either `split_df`, `response_img_filepaths` or "
                    "`response_img_dir` must be provided"
                )
            # each response tile shares the file name of its input tile
            response_img_filepaths = [
                path.join(response_img_dir, path.basename(img_filepath))
                for img_filepath in img_filepaths
            ]

        X = self.pixel_features_builder.build_features(
            split_df=split_df,
            img_filepaths=img_filepaths,
            img_dir=img_dir,
            img_filename_pattern=img_filename_pattern,
        )

        y = self.pixel_response_builder.build_response(
            split_df=split_df,
            response_img_dir=response_img_dir,
            response_img_filepaths=response_img_filepaths,
            img_filename_pattern=img_filename_pattern,
        )

        return X, y

    def fit_transform(self, *args, **kwargs):
        """
        Call `fit` and then `transform` with the same arguments.

        Provided for sklearn compatibility. Note that `transform` only accepts
        keyword arguments.
        """
        self.fit(*args, **kwargs)
        return self.transform(*args, **kwargs)


class ClassifierTrainer:
    """Train binary tree/non-tree classifier(s) of the pixel features."""

    def __init__(
        self,
        *,
        sigmas=None,
        num_orientations=None,
        neighborhood=None,
        min_neighborhood_range=None,
        num_neighborhoods=None,
        tree_val=None,
        nontree_val=None,
        classifier_class=None,
        **classifier_kwargs,
    ):
        """
        Initialize the classifier trainer.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for details.

        Parameters
        ----------
        sigmas : list-like, optional
            The list of scale parameters (sigmas) to build the Gaussian filter bank that
            will be used to compute the pixel-level features. The provided argument will
            be passed to the initialization method of the `PixelFeaturesBuilder` class.
            If no value is provided, the value set in `settings.GAUSS_SIGMAS` will be
            taken.
        num_orientations : int, optional
            The number of equally-distributed orientations to build the Gaussian filter
            bank that will be used to compute the pixel-level features. The provided
            argument will be passed to the initialization method of the
            `PixelFeaturesBuilder` class. If no value is provided, the value set in
            `settings.GAUSS_NUM_ORIENTATIONS` is used.
        neighborhood : array-like, optional
            The base neighborhood structure that will be used to compute the entropy
            features. The provided argument will be passed to the initialization method
            of the `PixelFeaturesBuilder` class. If no value is provided, a square with
            a side size of `2 * min_neighborhood_range + 1` is used.
        min_neighborhood_range : int, optional
            The range (i.e., the square radius) of the smallest neighborhood window that
            will be used to compute the entropy features. The provided argument will be
            passed to the initialization method of the `PixelFeaturesBuilder` class. If
            no value is provided, the value set in
            `settings.ENTROPY_MIN_NEIGHBORHOOD_RANGE` is used.
        num_neighborhoods : int, optional
            The number of neighborhood windows (whose size follows a geometric
            progression starting at `min_neighborhood_range`) that will be used to
            compute the entropy features. The provided argument will be passed to the
            initialization method of the `PixelFeaturesBuilder` class. If no value is
            provided, the value set in `settings.ENTROPY_NUM_NEIGHBORHOODS` is used.
        tree_val, nontree_val : int, optional
            The values that designate tree and non-tree pixels respectively in the
            response images. The provided arguments will be passed to the initialization
            method of the `PixelResponseBuilder` class. If no values are provided, the
            values set in `settings.TREE_VAL` and `settings.NONTREE_VAL` are
            respectively used.
        classifier_class : class, optional
            The class of the classifier to be trained. It can be any scikit-learn
            compatible estimator that implements the `fit`, `predict` and
            `predict_proba` methods and that can be saved to and loaded from memory
            using skops. If no value is provided, the value set in `settings.CLF_CLASS`
            is used.
        classifier_kwargs : key-value pairings, optional
            Keyword arguments that will be passed to the initialization of
            `classifier_class`. If no value is provided, a copy of the value set in
            `settings.CLF_KWARGS` is used.
        """
        self.pixel_features_builder_kwargs = dict(
            sigmas=sigmas,
            num_orientations=num_orientations,
            neighborhood=neighborhood,
            min_neighborhood_range=min_neighborhood_range,
            num_neighborhoods=num_neighborhoods,
        )
        self.pixel_response_builder_kwargs = dict(
            tree_val=tree_val, nontree_val=nontree_val
        )
        if classifier_class is None:
            classifier_class = settings.CLF_CLASS
        self.classifier_class = classifier_class
        if not classifier_kwargs:
            # copy the defaults so that editing `self.classifier_kwargs` never leaks
            # into the module-wide settings shared by every other trainer
            classifier_kwargs = dict(settings.CLF_KWARGS)
        self.classifier_kwargs = classifier_kwargs
        self.pixel_training_transformer = PixelDatasetTransformer(
            **self.pixel_features_builder_kwargs,
            **self.pixel_response_builder_kwargs,
        )

    def train_classifier(
        self,
        *,
        split_df=None,
        img_dir=None,
        response_img_dir=None,
        img_filepaths=None,
        response_img_filepaths=None,
        img_filename_pattern=None,
        method=None,
        img_cluster=None,
    ):
        """
        Train a classifier.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame, optional
            Data frame with the train/test split.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images from `split_df` or whose filename
            matches `img_filename_pattern` are located. Required if `split_df` is
            provided. Ignored if `img_filepaths` is provided.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located. Required if
            providing `split_df`. Otherwise `response_img_dir` might either be ignored
            if providing `response_img_filepaths`, or be used as the directory where the
            response of each input image (matched by file name) is located.
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train the
            classifier. Ignored if `split_df` is provided.
        response_img_filepaths : list-like, optional
            List of paths to the binary response tiles that will be used to train the
            classifier. Ignored if `split_df` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.
        method : {'cluster-I', 'cluster-II'}, optional
            Method used in the train/test split. If no value is provided and `split_df`
            is, 'cluster-II' is used when `split_df` has an 'img_cluster' column and
            'cluster-I' otherwise. Ignored if `split_df` is not provided.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if `method` is 'cluster-II'.

        Returns
        -------
        clf : scikit-learn-like classifier
            The trained classifier.

        Raises
        ------
        ValueError
            If `split_df` is provided, the method is 'cluster-II' and either
            `img_cluster` or `img_dir` is missing.
        """
        if split_df is not None and method is None:
            # infer the method from the presence of the cluster labels
            if "img_cluster" in split_df:
                method = "cluster-II"
            else:
                method = "cluster-I"

        if split_df is not None and method == "cluster-I" and "img_cluster" in split_df:
            # a single classifier is trained, so the cluster labels are discarded
            split_df = split_df.drop(columns=["img_cluster"])
        elif split_df is not None and method == "cluster-II":
            if img_cluster is None:
                raise ValueError(
                    "If `method` is 'cluster-II', `img_cluster` must be provided"
                )
            if img_dir is None:
                raise ValueError(
                    "If `split_df` is provided, `img_dir` must also be provided"
                )
            # NOTE(review): the third positional argument presumably selects the
            # training tiles of the cluster - confirm against `utils`
            img_filename_ser = utils.get_img_filename_ser(split_df, img_cluster, True)
            img_filepaths = img_filename_ser.apply(
                lambda img_filename: path.join(img_dir, img_filename)
            )
            # from here on the tiles are designated by `img_filepaths` only
            split_df = None

        X, y = self.pixel_training_transformer.fit_transform(
            split_df=split_df,
            img_dir=img_dir,
            response_img_dir=response_img_dir,
            img_filepaths=img_filepaths,
            response_img_filepaths=response_img_filepaths,
            img_filename_pattern=img_filename_pattern,
        )

        clf = self.classifier_class(**self.classifier_kwargs)
        clf.fit(X, y)

        return clf

    def train_classifiers(self, split_df, img_dir, response_img_dir):
        """
        Train a classifier for each first-level cluster in `split_df`.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame
            Data frame with the train/test split, which must have an `img_cluster`
            column with the first-level cluster labels.
        img_dir : str representing path to a directory
            Path to the directory where the images from `split_df` are located.
        response_img_dir : str representing path to a directory
            Path to the directory where the response tiles are located.

        Returns
        -------
        clf_dict : dictionary
            Dictionary mapping a scikit-learn-like classifier to each first-level
            cluster label.

        Raises
        ------
        ValueError
            If `split_df` does not have an 'img_cluster' column.
        """
        if "img_cluster" not in split_df:
            raise ValueError(
                "`split_df` must have an 'img_cluster' column ('cluster-II'). "
                "For 'cluster-I', use `train_classifier`."
            )

        # one delayed training task per cluster label, computed together below
        clfs_lazy = {}
        for img_cluster, _ in split_df.groupby("img_cluster"):
            clfs_lazy[img_cluster] = dask.delayed(self.train_classifier)(
                split_df=split_df,
                img_dir=img_dir,
                response_img_dir=response_img_dir,
                method="cluster-II",
                img_cluster=img_cluster,
            )

        with diagnostics.ProgressBar():
            # `dask.compute` returns a tuple with one item per positional argument
            clfs_dict = dask.compute(clfs_lazy)[0]

        return clfs_dict


class Classifier:
    """Use trained classifier(s) to predict tree pixels."""

    def __init__(
        self,
        *,
        clf=None,
        clf_dict=None,
        hf_hub_repo_id=None,
        hf_hub_clf_filename=None,
        hf_hub_download_kwargs=None,
        skops_trusted=None,
        tree_val=None,
        nontree_val=None,
        refine_method=None,
        refine_kwargs=None,
        return_proba=None,
        **pixel_features_builder_kwargs,
    ):
        """
        Initialize the classifier instance.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        clf : scikit-learn-like classifier, optional
            Trained classifier. If no value is provided, the latest detectree
            pre-trained classifier is used. Ignored if `clf_dict` is provided.
        clf_dict : dictionary, optional
            Dictionary mapping a trained scikit-learn-like classifier to each
            first-level cluster label.
        hf_hub_repo_id, hf_hub_clf_filename : str, optional
            HuggingFace Hub repository id (string with the user or organization and
            repository name separated by a `/`) and file name of the skops classifier
            respectively. If no value is provided, the values set in
            `settings.HF_HUB_REPO_ID` and `settings.HF_HUB_CLF_FILENAME` are used.
            Ignored if `clf` or `clf_dict` are provided.
        hf_hub_download_kwargs : dict, optional
            Additional keyword arguments (besides "repo_id", "filename", "library_name"
            and "library_version", which are discarded if present) to pass to
            `huggingface_hub.hf_hub_download`.
        skops_trusted : list, optional
            List of trusted object types to load the classifier from HuggingFace Hub,
            passed to `skops.io.load`. If no value is provided, the value from
            `settings.SKOPS_TRUSTED` is used. Ignored if `clf` or `clf_dict` are
            provided.
        tree_val, nontree_val : int, optional
            The values that designate tree and non-tree pixels respectively in the
            response images. If no values are provided, the values set in
            `settings.TREE_VAL` and `settings.NONTREE_VAL` are respectively used.
        refine_method : callable or bool, optional
            Method to refine the pixel-level classification, e.g., to optimize the
            consistence between neighboring pixels. If `False` is provided, no
            refinement is performed. If `None` is provided and `return_proba` is `None`
            or `False`, the value from `settings.CLF_REFINE_METHOD` is used.
        refine_kwargs : dict, optional
            Keyword arguments that will be passed to the `refine_method`. If no value
            is provided, the value set in `settings.CLF_REFINE_KWARGS` is used. Ignored
            if no refinement is performed (i.e., `refine_method` is `False` or
            `refine_method` is `None` and `return_proba` is `True`).
        return_proba : bool, optional
            If `True`, the classifier will return the probabilities of each pixel
            belonging to the tree class. If `False`, the classifier will return the
            predicted class labels. If `None` and no refinement is performed, the value
            from `settings.CLF_RETURN_PROBA` is used. Ignored if a valid
            `refine_method` is provided.
        pixel_features_builder_kwargs : dict, optional
            Keyword arguments that will be passed to `detectree.PixelFeaturesBuilder`,
            which customize how the pixel features are built.
        """
        super().__init__()

        # only one of `self.clf_dict` and `self.clf` is set, the prediction methods
        # rely on which attribute exists to choose between the two modes
        if clf_dict is not None:
            self.clf_dict = clf_dict
        elif clf is not None:
            self.clf = clf
        else:
            # no classifier provided: load the pre-trained one from HuggingFace Hub
            if hf_hub_repo_id is None:
                hf_hub_repo_id = settings.HF_HUB_REPO_ID
            if hf_hub_clf_filename is None:
                hf_hub_clf_filename = settings.HF_HUB_CLF_FILENAME
            if hf_hub_download_kwargs is None:
                _hf_hub_download_kwargs = {}
            else:
                # work on a copy so that the caller's dict is left untouched
                _hf_hub_download_kwargs = hf_hub_download_kwargs.copy()
                # these keys are set explicitly in the call below
                for key in [
                    "repo_id",
                    "filename",
                    "library_name",
                    "library_version",
                ]:
                    _ = _hf_hub_download_kwargs.pop(key, None)
            if skops_trusted is None:
                skops_trusted = settings.SKOPS_TRUSTED
            self.clf = io.load(
                hf_hub.hf_hub_download(
                    repo_id=hf_hub_repo_id,
                    filename=hf_hub_clf_filename,
                    library_name="skops",
                    library_version=skops.__version__,
                    **_hf_hub_download_kwargs,
                ),
                trusted=skops_trusted,
            )

        if tree_val is None:
            tree_val = settings.TREE_VAL
        if nontree_val is None:
            nontree_val = settings.NONTREE_VAL
        self.tree_val = tree_val
        self.nontree_val = nontree_val

        if refine_method is None and not return_proba:
            refine_method = settings.CLF_REFINE_METHOD

        if refine_method:
            if refine_kwargs is None:
                refine_kwargs = settings.CLF_REFINE_KWARGS
            self.refine_method = refine_method
            self.refine_kwargs = refine_kwargs
            self._predict_X = self._predict_X_refine
            self.dst_nodata = nontree_val
        else:
            if return_proba is None:
                # we will only get here if `refine_method` is `False`
                return_proba = settings.CLF_RETURN_PROBA
            if return_proba:
                # there is no refine method, return proba
                self._predict_X = self._predict_X_proba
                # TODO: how to manage this better?
                self.dst_nodata = -1
            else:
                # there is no refine method, return labels
                self._predict_X = self._predict_X_labels
                self.dst_nodata = nontree_val

        self.pixel_features_builder_kwargs = pixel_features_builder_kwargs
        self.pixel_training_transformer = PixelDatasetTransformer(
            tree_val=tree_val,
            nontree_val=nontree_val,
            **self.pixel_features_builder_kwargs,
        )
        self.pixel_features_builder = (
            self.pixel_training_transformer.pixel_features_builder
        )

    def _predict_X_refine(self, X, clf, img_shape):
        """Predict tree probabilities and refine them into a uint8 label image."""
        # TODO: properly manage the order classes in `clf`, i.e., are we sure that
        # "tree" is always the second class? If so, we could probably fully omit
        # `tree_val` and `nontree_val` and get them from `clf.classes_`
        p_tree_img = clf.predict_proba(X)[:, 1].reshape(img_shape)
        return self.refine_method(
            p_tree_img, self.tree_val, self.nontree_val, **self.refine_kwargs
        ).astype(np.uint8)

    def _predict_X_proba(self, X, clf, img_shape):
        """Return the second column of `clf.predict_proba(X)` as an image."""
        return clf.predict_proba(X)[:, 1].reshape(img_shape)

    def _predict_X_labels(self, X, clf, img_shape):
        """Return `clf.predict(X)` as a uint8 image."""
        return clf.predict(X).reshape(img_shape).astype(np.uint8)

    def _predict_img(self, img_filepath, clf, *, output_filepath=None):
        """
        Predict an image with `clf`, optionally dumping it to `output_filepath`.

        The output raster has a single band and takes its CRS and transform from
        the input image.
        """
        # the context manager closes the input raster even if building the features,
        # predicting or writing the output raises
        with rio.open(img_filepath) as src:
            img_shape = src.shape

            X = self.pixel_features_builder.build_features_from_filepath(img_filepath)
            y_pred = self._predict_X(X, clf, img_shape)

            # TODO: make the profile of output rasters more customizable (e.g., via
            # the `settings` module)
            if output_filepath is not None:
                with rio.open(
                    output_filepath,
                    "w",
                    # driver="GTiff",
                    width=y_pred.shape[1],
                    height=y_pred.shape[0],
                    count=1,
                    dtype=y_pred.dtype,
                    nodata=self.dst_nodata,
                    crs=src.crs,
                    transform=src.transform,
                ) as dst:
                    dst.write(y_pred, 1)

        return y_pred

    def _predict_imgs(self, img_filepaths, clf, output_dir):
        """
        Predict multiple images with `clf` and dump them to `output_dir`.

        Each prediction keeps the file name of its input image. Returns the list of
        paths to the dumped files.
        """
        pred_imgs_lazy = []
        pred_img_filepaths = []
        for img_filepath in img_filepaths:
            pred_img_filepath = path.join(output_dir, path.basename(img_filepath))
            pred_imgs_lazy.append(
                dask.delayed(self._predict_img)(
                    img_filepath, clf, output_filepath=pred_img_filepath
                )
            )
            pred_img_filepaths.append(pred_img_filepath)

        with diagnostics.ProgressBar():
            # the predicted arrays are discarded, only the dumped files are kept
            dask.compute(*pred_imgs_lazy)

        return pred_img_filepaths

    def predict_img(self, img_filepath, *, img_cluster=None, output_filepath=None):
        """
        Use a trained classifier to predict tree pixels in an image.

        Optionally dump the predicted tree/non-tree image to `output_filepath`.

        Parameters
        ----------
        img_filepath : str, file object or pathlib.Path object
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing the image to be classified. The value will be passed to
            `rasterio.open`.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if the `Classifier` instance
            was initialized with `clf_dict` (i.e., "cluster-II" method).
        output_filepath : str, file object or pathlib.Path object, optional
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing where the predicted image is to be dumped. The value
            will be passed to `rasterio.open` in 'write' mode.

        Returns
        -------
        y_pred : numpy ndarray
            Array with the pixel responses.

        Raises
        ------
        ValueError
            If the instance was initialized with `clf_dict` and `img_cluster` is
            either missing or not a key of `clf_dict`.
        """
        clf = getattr(self, "clf", None)
        if clf is None:
            # the instance holds one classifier per cluster
            if img_cluster is not None:
                try:
                    clf = self.clf_dict[img_cluster]
                except KeyError as exc:
                    raise ValueError(
                        f"Classifier for cluster {img_cluster} not found in"
                        " `self.clf_dict`."
                    ) from exc
            else:
                raise ValueError(
                    "A valid `img_cluster` must be provided for classifiers"
                    " instantiated with `clf_dict`."
                )
        return self._predict_img(img_filepath, clf, output_filepath=output_filepath)

    def predict_imgs(
        self,
        output_dir,
        *,
        split_df=None,
        img_dir=None,
        img_filepaths=None,
        img_filename_pattern=None,
    ):
        """
        Use trained classifier(s) to predict tree pixels in multiple images.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        output_dir : str or pathlib.Path object
            Path to the directory where the predicted images are to be dumped.
        split_df : pandas DataFrame, optional
            Data frame with the train/test split. Required if the instance was
            initialized with `clf_dict`, in which case it must have an 'img_cluster'
            column.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images from `split_df` are located.
            Required if `split_df` is provided. Ignored if `img_filepaths` is provided.
        img_filepaths : list-like, optional
            List of paths to the tiles that will be used for validation. Ignored if
            `split_df` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.

        Returns
        -------
        pred_imgs : list
            File paths of the dumped tiles.

        Raises
        ------
        ValueError
            If the images to predict cannot be determined from the arguments, if
            `split_df` is provided without `img_dir`, or if `split_df` has a cluster
            label without a classifier in `clf_dict`.
        """
        if split_df is not None and img_dir is None:
            # fail early instead of with a `TypeError` when joining the paths
            raise ValueError(
                "If `split_df` is provided, `img_dir` must also be provided"
            )

        if hasattr(self, "clf"):
            # predicting with a single classifier
            if split_df is None:
                if img_filepaths is None:
                    if img_dir is None:
                        raise ValueError(
                            "Either `split_df`, `img_filepaths` or `img_dir` must be"
                            " provided."
                        )
                    img_filepaths = utils.get_img_filepaths(
                        img_dir,
                        img_filename_pattern=img_filename_pattern,
                    )
            else:
                img_filepaths = split_df["img_filename"].apply(
                    lambda img_filename: path.join(img_dir, img_filename)
                )

            pred_imgs = self._predict_imgs(
                img_filepaths,
                self.clf,
                output_dir,
            )
        else:
            # `self.clf_dict` is not `None`
            # predicting with multiple classifiers
            if split_df is None:
                raise ValueError(
                    "`split_df` must be provided for classifiers instantiated with"
                    " `clf_dict`."
                )
            pred_imgs = []
            for img_cluster, img_filename_ser in split_df.groupby("img_cluster")[
                "img_filename"
            ]:
                try:
                    clf = self.clf_dict[img_cluster]
                except KeyError as exc:
                    raise ValueError(
                        f"Classifier for cluster {img_cluster} not found in"
                        " `self.clf_dict`."
                    ) from exc
                pred_imgs += self._predict_imgs(
                    img_filename_ser.apply(
                        lambda img_filename: path.join(img_dir, img_filename)
                    ),
                    clf,
                    output_dir,
                )

        return pred_imgs

    def compute_eval_metrics(
        self,
        metrics=None,
        metrics_kwargs=None,
        refine_method=None,
        refine_kwargs=None,
        split_df=None,
        img_dir=None,
        response_img_dir=None,
        img_filepaths=None,
        response_img_filepaths=None,
        img_filename_pattern=None,
    ):
        """
        Compute evaluation metrics for validation images.

        The computation is delegated to `evaluate.compute_eval_metrics`, with the
        classifier(s) of this instance.

        Parameters
        ----------
        metrics : str, func or list of str or func
            The metrics to compute, must be either a string with a function of the
            `sklearn.metrics`, a function that takes a `y_true` and `y_pred` positional
            arguments with the true and predicted labels respectively or a list-like of
            any of the two options. If no value is provided, the values set in
            `settings.EVAL_METRICS` are used.
        metrics_kwargs : dict or list of dict
            Additional keyword arguments to pass to each of the metric functions.
        refine_method : callable or bool, optional
            Method to refine the pixel-level classification. If `False` is provided, no
            refinement is performed. If any non-None value is provided, it overrides
            the `refine_method` argument provided at instantiation time. If `None` is
            provided, the value from `self.refine_method` is used if set, otherwise no
            refinement is performed.
        refine_kwargs : dict, optional
            Keyword arguments that will be passed to `refine_method`. If any non-None
            value is provided, it overrides the `refine_kwargs` argument provided at
            instantiation time. If `None` is provided, the value from
            `self.refine_kwargs` is used if set. Ignored if no refinement is performed.
        split_df : pandas DataFrame, optional
            Data frame with the validation images.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images from `split_df` are located.
            Required if `split_df` is provided. Ignored if `img_filepaths` is provided.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located. Ignored if
            providing `response_img_filepaths`. Only images with a matching response
            (by basename) are evaluated.
        img_filepaths : list-like, optional
            List of paths to the tiles that will be used for validation. Ignored if
            `split_df` is provided.
        response_img_filepaths : list-like, optional
            List of paths to the binary response tiles that will be used for
            evaluation. Ignored if `split_df` is provided. Only images with a matching
            response (by basename) are evaluated.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.

        Returns
        -------
        metric_dict : numeric, dict
            Values of the metrics computed for the validation images. If only one
            metric is provided, a single value is returned. If multiple metrics are
            provided, a dict with a key for each metric is returned. The metric values
            can be of different types depending on the metric function used, e.g.,
            `precision_score` returns a single float value, `precision_recall_curve`
            returns a tuple of arrays, and `confusion_matrix` returns a two-dimensional
            array.
        """
        # the refinement attributes only exist if the instance was set up to refine
        if refine_method is None:
            refine_method = getattr(self, "refine_method", None)
        if refine_kwargs is None:
            refine_kwargs = getattr(self, "refine_kwargs", None)

        return evaluate.compute_eval_metrics(
            metrics=metrics,
            metrics_kwargs=metrics_kwargs,
            clf=getattr(self, "clf", None),
            clf_dict=getattr(self, "clf_dict", None),
            refine_method=refine_method,
            refine_kwargs=refine_kwargs,
            split_df=split_df,
            img_dir=img_dir,
            response_img_dir=response_img_dir,
            img_filepaths=img_filepaths,
            response_img_filepaths=response_img_filepaths,
            img_filename_pattern=img_filename_pattern,
        )

    def eval_refine_params(
        self,
        refine_method=None,
        refine_params_list=None,
        metrics=None,
        metrics_kwargs=None,
        tree_val=None,
        nontree_val=None,
        split_df=None,
        img_dir=None,
        img_filepaths=None,
        img_filename_pattern=None,
        response_img_dir=None,
    ):
        """
        Evaluate a refinement procedure for different parameters.

        The computation is delegated to `evaluate.eval_refine_params`, with the
        classifier(s) of this instance.

        Parameters
        ----------
        refine_method : callable, optional
            Refinement method that takes a probability image as the first positional
            argument followed by tree and non-tree values, e.g.,
            `refine_method(p_tree_img, tree_val, nontree_val, **kwargs)`. If no value
            is provided, the value from `self.refine_method` is used if set, otherwise
            the value from `settings.CLF_REFINE_METHOD` is used.
        refine_params_list : list of dict, optional
            Parameters to evaluate for the refinement method, as a list of keyword
            arguments. The metrics will be computed for each item of this list. If no
            value is provided, the value from `settings.EVAL_REFINE_PARAMS` is used.
        metrics : str, func or list of str or func
            The metrics to compute, must be either a string with a function of the
            `sklearn.metrics`, a function that takes a `y_true` and `y_pred` positional
            arguments with the true and predicted labels respectively or a list-like of
            any of the two options. If no value is provided, the values set in
            `settings.EVAL_METRICS` are used.
        metrics_kwargs : dict or list of dict
            Additional keyword arguments to pass to each of the metric functions.
        tree_val, nontree_val : int, optional
            The values that designate tree and non-tree pixels respectively in the
            response images. If no values are provided, the values from this instance
            are used.
        split_df : pandas DataFrame, optional
            Data frame with the validation images.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images from `split_df` are located.
            Required if `split_df` is provided. Ignored if `img_filepaths` is provided.
        img_filepaths : list-like, optional
            List of paths to the tiles that will be used for validation. Ignored if
            `split_df` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located.

        Returns
        -------
        results : pandas DataFrame
            A DataFrame with the computed values for each metric (row) and each
            refinement keyword argument set (column, stringified).
        """
        if refine_method is None:
            # `None` is forwarded if this instance has no refine method either
            refine_method = getattr(self, "refine_method", None)
        if tree_val is None:
            tree_val = self.tree_val
        if nontree_val is None:
            nontree_val = self.nontree_val

        return evaluate.eval_refine_params(
            refine_method=refine_method,
            refine_params_list=refine_params_list,
            metrics=metrics,
            metrics_kwargs=metrics_kwargs,
            clf=getattr(self, "clf", None),
            clf_dict=getattr(self, "clf_dict", None),
            tree_val=tree_val,
            nontree_val=nontree_val,
            split_df=split_df,
            img_dir=img_dir,
            img_filepaths=img_filepaths,
            img_filename_pattern=img_filename_pattern,
            response_img_dir=response_img_dir,
        )