Source code for detectree.evaluate

"""Evaluation."""

import tempfile
from os import path

import numpy as np
import pandas as pd
import rasterio as rio
from sklearn import metrics as sklearn_metrics

from detectree import classifier, settings, utils


def _match_img_response_filepaths(
    img_filepaths, *, response_img_filepaths=None, response_img_dir=None
):
    # match images and response images returning only the intersection
    if response_img_filepaths is None and response_img_dir is None:
        raise ValueError(
            "Either `response_img_filepaths` or `response_img_dir` must be provided."
        )
    matched_img_filepaths = []
    matched_response_filepaths = []
    if response_img_filepaths is None:
        for img_filepath in img_filepaths:
            response_filepath = path.join(response_img_dir, path.basename(img_filepath))
            if path.exists(response_filepath):
                matched_img_filepaths.append(img_filepath)
                matched_response_filepaths.append(response_filepath)
    else:
        # # use the set intersection of the base names
        # matched_filenames = set(
        #     path.basename(img_filepath) for img_filepath in img_filepaths
        # ) & set(
        #     path.basename(response_filepath)
        #     for response_filepath in response_img_filepaths
        # )
        # matched_img_filepaths = [
        #     path.join(response_img_dir, path.basename(img_filename))
        #     for img_filename in matched_filenames
        # ]
        # matched_response_filepaths = [
        #     path.join(response_img_dir, path.basename(img_filename))
        #     for img_filename in matched_filenames
        # ]
        # use this instead of the set intersection approach above (which assumes that
        # all responses are in the same directory)
        response_filename_filepath_dict = {
            path.basename(response_filepath): response_filepath
            for response_filepath in response_img_filepaths
        }
        for img_filepath in img_filepaths:
            response_filepath = response_filename_filepath_dict.get(
                path.basename(img_filepath)
            )
            if response_filepath is not None:
                matched_img_filepaths.append(img_filepath)
                matched_response_filepaths.append(response_filepath)

    if not matched_img_filepaths:
        raise ValueError("No matching response images found for evaluation.")

    return matched_img_filepaths, matched_response_filepaths


def _compute_metrics_from_true_pred_arr(true_pred_arr, metrics, metrics_kwargs):
    metric_dict = {}
    for metric, kwargs in zip(metrics, metrics_kwargs):
        if isinstance(metric, str):
            metric_func = getattr(sklearn_metrics, metric)
            metric_label = metric
        elif callable(metric):
            metric_func = metric
            metric_label = metric.__name__
        else:
            raise TypeError(
                "Metrics must be either a string with a function of the "
                "`sklearn.metrics` module, a function that takes `y_true` and `y_pred` "
                "positional arguments or a list-like of any of the two options."
            )

        metric_dict[metric_label] = metric_func(
            true_pred_arr[0], true_pred_arr[1], **kwargs
        )

    return metric_dict


def _get_true_pred_arr(
    pred_img_filepaths, *, response_img_filepaths=None, response_img_dir=None
):
    """
    Stack the true and predicted pixel values of the matched images.

    Parameters
    ----------
    pred_img_filepaths : iterable of str
        Paths to the predicted images.
    response_img_filepaths : iterable of str, optional
        Paths to the response images, matched to the predictions by base name.
    response_img_dir : str, optional
        Directory where the response images are looked up by base name. Only used
        when `response_img_filepaths` is not provided.

    Returns
    -------
    true_pred_arr : numpy.ndarray
        Array with two rows: the flattened first band of every matched response
        image, and the flattened first band of the corresponding predicted image.
    """
    matched_pred_filepaths, matched_response_filepaths = (
        _match_img_response_filepaths(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )
    )

    def _read_first_band_flat(img_filepath):
        with rio.open(img_filepath) as src:
            return src.read(1).flatten()

    true_chunks, pred_chunks = [], []
    for pred_filepath, response_filepath in zip(
        matched_pred_filepaths, matched_response_filepaths
    ):
        # same read order as before: response first, then prediction
        true_chunks.append(_read_first_band_flat(response_filepath))
        pred_chunks.append(_read_first_band_flat(pred_filepath))

    true_row = np.concatenate(true_chunks, axis=0)
    pred_row = np.concatenate(pred_chunks, axis=0)
    return np.vstack([true_row, pred_row])


def get_true_pred_arr(
    *,
    pred_img_filepaths=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    refine_method=None,
    refine_kwargs=None,
    split_df=None,
    img_dir=None,
    response_img_dir=None,
    img_filepaths=None,
    response_img_filepaths=None,
    img_filename_pattern=None,
    **classifier_kwargs,
):
    """
    Get true and predicted values for the validation images.

    Parameters
    ----------
    pred_img_filepaths : list-like, optional
        Paths to precomputed predicted images. If provided, no classification is
        performed: the predictions are read from these files and only
        `response_img_filepaths` and `response_img_dir` are used besides this
        argument. Only predictions with a matching response image (by base name)
        are used.
    clf : scikit-learn-like classifier, optional
        Trained classifier, passed to `detectree.classifier.Classifier`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each
        first-level cluster label, passed to `detectree.classifier.Classifier`.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (user or organization and repository name
        separated by a `/`) and file name of the skops classifier respectively,
        passed to `detectree.classifier.Classifier`.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments for `huggingface_hub.hf_hub_download`, passed
        to `detectree.classifier.Classifier`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub,
        passed to `detectree.classifier.Classifier`.
    refine_method : callable or bool, optional
        Method to refine the pixel-level classification. If a value other than
        `None` is provided, it is passed to `detectree.classifier.Classifier`
        (overriding any `refine_method` key of `classifier_kwargs`). If `None`, the
        classifier's own default applies.
    refine_kwargs : dict, optional
        Keyword arguments for `refine_method`. Forwarded to the classifier under the
        same rule as `refine_method`.
    split_df : pandas DataFrame, optional
        Data frame with the validation images, listed in its "img_filename" column.
        If provided, it takes precedence over `img_filepaths`.
    img_dir : str representing path to a directory, optional
        Directory where the images are located. Required if `split_df` is provided.
        Otherwise it is only used (together with `img_filename_pattern`) to list the
        images when `img_filepaths` is not provided.
    response_img_dir : str representing path to a directory, optional
        Directory where the response tiles are looked up by base name. Only used
        when `response_img_filepaths` is not provided. Only images with a matching
        response are evaluated.
    img_filepaths : list-like, optional
        Paths to the tiles that will be used for validation. Ignored if `split_df`
        is provided.
    response_img_filepaths : list-like, optional
        Paths to the response tiles that will be used for evaluation. Takes
        precedence over `response_img_dir`. Only images with a matching response (by
        base name) are evaluated.
    img_filename_pattern : str representing a file-name pattern, optional
        File-name pattern forwarded to `utils.get_img_filepaths` (when the images
        are listed from `img_dir`) and to the classifier's `predict_imgs`.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier`.

    Returns
    -------
    true_pred : numpy.ndarray
        Array with two rows respectively containing the true and predicted values
        for the matched images.

    Raises
    ------
    ValueError
        If `split_df` is provided without `img_dir`, if none of `split_df`,
        `img_filepaths` and `img_dir` is provided, or if the images cannot be
        matched to any response image.
    """
    if pred_img_filepaths is not None:
        # precomputed predictions: nothing to classify, only pair them with the
        # responses (no temporary directory is needed in this case)
        return _get_true_pred_arr(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )

    # inference is needed: explicit refinement arguments override the entries of
    # `classifier_kwargs` with the same name
    init_kwargs = dict(classifier_kwargs)
    for key, value in (
        ("refine_method", refine_method),
        ("refine_kwargs", refine_kwargs),
    ):
        if value is not None:
            init_kwargs[key] = value

    tree_classifier = classifier.Classifier(
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        **init_kwargs,
    )

    # the predicted images only need to live until they are read back
    with tempfile.TemporaryDirectory() as tmp_dir:
        if split_df is None:
            if img_filepaths is None:
                if img_dir is None:
                    raise ValueError(
                        "Either `split_df`, `img_filepaths` or `img_dir` must be "
                        "provided."
                    )
                img_filepaths = utils.get_img_filepaths(
                    img_dir, img_filename_pattern=img_filename_pattern
                )
            # only classify the images that have a response to compare against
            img_filepaths, response_img_filepaths = _match_img_response_filepaths(
                img_filepaths,
                response_img_filepaths=response_img_filepaths,
                response_img_dir=response_img_dir,
            )
            pred_img_filepaths = tree_classifier.predict_imgs(
                tmp_dir,
                img_filepaths=img_filepaths,
                img_filename_pattern=img_filename_pattern,
            )
        else:
            if img_dir is None:
                raise ValueError(
                    "If `split_df` is provided, `img_dir` must also be provided."
                )
            split_img_filepaths = split_df["img_filename"].apply(
                lambda img_filename: path.join(img_dir, img_filename)
            )
            matched_img_filepaths, response_img_filepaths = (
                _match_img_response_filepaths(
                    split_img_filepaths,
                    response_img_filepaths=response_img_filepaths,
                    response_img_dir=response_img_dir,
                )
            )
            # keep only the rows of the images that have a matching response
            matched_filenames = [
                path.basename(img_filepath) for img_filepath in matched_img_filepaths
            ]
            split_df = split_df[split_df["img_filename"].isin(matched_filenames)]
            pred_img_filepaths = tree_classifier.predict_imgs(
                tmp_dir,
                split_df=split_df,
                img_dir=img_dir,
                img_filename_pattern=img_filename_pattern,
            )

        # this must run inside the context manager, i.e., before the temporary
        # directory with the predicted images is removed
        return _get_true_pred_arr(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )
def compute_eval_metrics(
    *,
    pred_img_filepaths=None,
    metrics=None,
    metrics_kwargs=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    refine_method=None,
    refine_kwargs=None,
    split_df=None,
    img_dir=None,
    response_img_dir=None,
    img_filepaths=None,
    response_img_filepaths=None,
    img_filename_pattern=None,
    **classifier_kwargs,
):
    """
    Compute evaluation metrics for the validation images.

    Parameters
    ----------
    pred_img_filepaths : list-like, optional
        Paths to precomputed predicted images. If provided, classification is
        skipped and metrics are computed directly from these files. Only predictions
        with a matching response image (by base name) are used. Requires
        `response_img_dir` or `response_img_filepaths`.
    metrics : str, func or list of str or func
        The metrics to compute: either a string naming a function of
        `sklearn.metrics`, a function that takes `y_true` and `y_pred` positional
        arguments with the true and predicted labels respectively, or a list-like of
        any of the two options. A single string or function is treated as a one-item
        list. If no value is provided, `settings.EVAL_METRICS` is used.
    metrics_kwargs : dict or list of dict
        Additional keyword arguments for the metric functions. A list is paired
        positionally with `metrics`; a single dict is passed to every metric. If no
        value is provided, no additional keyword arguments are passed.
    clf : scikit-learn-like classifier, optional
        Trained classifier, forwarded to `get_true_pred_arr`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each
        first-level cluster label, forwarded to `get_true_pred_arr`.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (user or organization and repository name
        separated by a `/`) and file name of the skops classifier respectively,
        forwarded to `get_true_pred_arr`.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments for `huggingface_hub.hf_hub_download`,
        forwarded to `get_true_pred_arr`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub,
        forwarded to `get_true_pred_arr`.
    refine_method : callable or bool, optional
        Method to refine the pixel-level classification, forwarded to
        `get_true_pred_arr`.
    refine_kwargs : dict, optional
        Keyword arguments that will be passed to `refine_method`, forwarded to
        `get_true_pred_arr`.
    split_df : pandas DataFrame, optional
        Data frame with the validation images.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images are located. Required if `split_df`
        is provided.
    response_img_dir : str representing path to a directory, optional
        Path to the directory where the response tiles are located. Only images with
        a matching response (by base name) are evaluated.
    img_filepaths : list-like, optional
        List of paths to the tiles that will be used for validation. Ignored if
        `split_df` is provided.
    response_img_filepaths : list-like, optional
        List of paths to the binary response tiles that will be used for evaluation.
        Only images with a matching response (by base name) are evaluated.
    img_filename_pattern : str representing a file-name pattern, optional
        File-name pattern to be matched in order to obtain the list of images.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier`.

    Returns
    -------
    metric_dict : numeric, dict
        Values of the metrics computed for the validation images. If a single metric
        value is computed, that value is returned directly; otherwise a dict with a
        key for each metric is returned. The metric values can be of different types
        depending on the metric function used, e.g., `precision_score` returns a
        single float value, `precision_recall_curve` returns a tuple of arrays, and
        `confusion_matrix` returns a two-dimensional array.
    """
    if metrics is None:
        metrics = settings.EVAL_METRICS
    # a lone metric is shorthand for a one-item list; calling `len` on a string
    # would count its characters and calling it on a callable would raise
    if isinstance(metrics, str) or callable(metrics):
        metrics = [metrics]
    else:
        metrics = list(metrics)

    if metrics_kwargs is None:
        metrics_kwargs = [{} for _ in metrics]
    elif isinstance(metrics_kwargs, dict):
        # a single dict of keyword arguments applies to each metric
        metrics_kwargs = [metrics_kwargs] * len(metrics)

    true_pred_arr = get_true_pred_arr(
        pred_img_filepaths=pred_img_filepaths,
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        refine_method=refine_method,
        refine_kwargs=refine_kwargs,
        split_df=split_df,
        img_dir=img_dir,
        img_filepaths=img_filepaths,
        response_img_filepaths=response_img_filepaths,
        img_filename_pattern=img_filename_pattern,
        response_img_dir=response_img_dir,
        **classifier_kwargs,
    )
    metric_dict = _compute_metrics_from_true_pred_arr(
        true_pred_arr, metrics, metrics_kwargs
    )

    # unwrap the value when there is a single result
    if len(metric_dict) == 1:
        return next(iter(metric_dict.values()))
    return metric_dict
def eval_refine_params(
    *,
    refine_method=None,
    refine_params_list=None,
    metrics=None,
    metrics_kwargs=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    tree_val=None,
    nontree_val=None,
    split_df=None,
    img_dir=None,
    img_filepaths=None,
    img_filename_pattern=None,
    response_img_dir=None,
    **classifier_kwargs,
):
    """
    Evaluate a refinement procedure for different parameters.

    The images are classified once with `return_proba=True`; each predicted image
    is then refined with every item of `refine_params_list` and compared against
    the response image of the same base name in `response_img_dir`.

    Parameters
    ----------
    refine_method : callable, optional
        Refinement method that takes a probability image as the first positional
        argument followed by tree and non-tree values, e.g.,
        `refine_method(p_tree_img, tree_val, nontree_val, **kwargs)`. If no value is
        provided, the value from `settings.CLF_REFINE_METHOD` is used.
    refine_params_list : list of dict, optional
        Parameters to evaluate for the refinement method, as a list of keyword
        arguments. The metrics will be computed for each item of this list. If no
        value is provided, the value from `settings.EVAL_REFINE_PARAMS` is used.
    metrics : str, func or list of str or func
        The metrics to compute: either a string naming a function of
        `sklearn.metrics`, a function that takes `y_true` and `y_pred` positional
        arguments with the true and predicted labels respectively, or a list-like of
        any of the two options. A single string or function is treated as a one-item
        list. If no value is provided, `settings.EVAL_METRICS` is used.
    metrics_kwargs : dict or list of dict
        Additional keyword arguments for the metric functions. A list is paired
        positionally with `metrics`; a single dict is passed to every metric. If no
        value is provided, no additional keyword arguments are passed.
    clf : scikit-learn-like classifier, optional
        Trained classifier, passed to `detectree.classifier.Classifier`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each
        first-level cluster label, passed to `detectree.classifier.Classifier`.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (user or organization and repository name
        separated by a `/`) and file name of the skops classifier respectively,
        passed to `detectree.classifier.Classifier`.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments for `huggingface_hub.hf_hub_download`, passed
        to `detectree.classifier.Classifier`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub,
        passed to `detectree.classifier.Classifier`.
    tree_val, nontree_val : int, optional
        The values that designate tree and non-tree pixels respectively. If no
        values are provided, the values set in `settings.TREE_VAL` and
        `settings.NONTREE_VAL` are respectively used.
    split_df : pandas DataFrame, optional
        Data frame with the validation images, passed to the classifier's
        `predict_imgs`.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images are located, passed to the
        classifier's `predict_imgs`.
    img_filepaths : list-like, optional
        List of paths to the tiles that will be used for validation, passed to the
        classifier's `predict_imgs`.
    img_filename_pattern : str representing a file-name pattern, optional
        File-name pattern to be matched in order to obtain the list of images,
        passed to the classifier's `predict_imgs`.
    response_img_dir : str representing path to a directory
        Path to the directory where the response tiles are located. Required: a
        response with the same base name is read for every predicted image.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier`. The "refine_method", "refine_kwargs" and
        "return_proba" keys are discarded because this function controls them.

    Returns
    -------
    results : pandas DataFrame
        A DataFrame with the computed values for each metric (row) and each
        refinement keyword argument set (column, stringified).

    Raises
    ------
    ValueError
        If `response_img_dir` is not provided.
    """
    # fail fast: the responses are located by joining this directory with the base
    # name of each prediction, so without it the evaluation can only crash after the
    # (expensive) inference has already run
    if response_img_dir is None:
        raise ValueError("`response_img_dir` must be provided.")

    if refine_method is None:
        refine_method = settings.CLF_REFINE_METHOD
    if refine_params_list is None:
        refine_params_list = settings.EVAL_REFINE_PARAMS
    # materialize because the parameter sets are iterated several times below
    refine_params_list = list(refine_params_list)

    if metrics is None:
        metrics = settings.EVAL_METRICS
    # a lone metric is shorthand for a one-item list; calling `len` on a string
    # would count its characters and calling it on a callable would raise
    if isinstance(metrics, str) or callable(metrics):
        metrics = [metrics]
    else:
        metrics = list(metrics)
    if metrics_kwargs is None:
        metrics_kwargs = [{} for _ in metrics]
    elif isinstance(metrics_kwargs, dict):
        # a single dict of keyword arguments applies to each metric
        metrics_kwargs = [metrics_kwargs] * len(metrics)

    if tree_val is None:
        tree_val = settings.TREE_VAL
    if nontree_val is None:
        nontree_val = settings.NONTREE_VAL

    # refinement is applied here (once per parameter set) on the probability
    # images, so the classifier-level refinement options are dropped
    _classifier_kwargs = classifier_kwargs.copy()
    for key in ("refine_method", "refine_kwargs", "return_proba"):
        _classifier_kwargs.pop(key, None)

    c = classifier.Classifier(
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        tree_val=tree_val,
        nontree_val=nontree_val,
        return_proba=True,
        **_classifier_kwargs,
    )

    true_arrs = []
    # one list of flattened refined predictions per refinement parameter set
    pred_refined_by_kwargs = [[] for _ in refine_params_list]
    # the predicted images must be read before the temporary directory is removed
    with tempfile.TemporaryDirectory() as tmp_dir:
        pred_img_filepaths = c.predict_imgs(
            tmp_dir,
            split_df=split_df,
            img_dir=img_dir,
            img_filepaths=img_filepaths,
            img_filename_pattern=img_filename_pattern,
        )
        for pred_img_filepath in pred_img_filepaths:
            with rio.open(
                path.join(response_img_dir, path.basename(pred_img_filepath))
            ) as src:
                true_arrs.append(src.read(1).flatten())
            with rio.open(pred_img_filepath) as src:
                pred_arr = src.read(1)
            for refined_arrs, refine_kwargs in zip(
                pred_refined_by_kwargs, refine_params_list
            ):
                refined_arrs.append(
                    refine_method(
                        pred_arr, tree_val, nontree_val, **refine_kwargs
                    ).flatten()
                )

    true_arr = np.concatenate(true_arrs)
    metric_dict_by_refine_kwargs = [
        _compute_metrics_from_true_pred_arr(
            np.vstack((true_arr, np.concatenate(refined_arrs, axis=0))),
            metrics,
            metrics_kwargs,
        )
        for refined_arrs in pred_refined_by_kwargs
    ]

    return pd.DataFrame(
        metric_dict_by_refine_kwargs,
        index=[str(kwargs) for kwargs in refine_params_list],
    ).transpose()