# Source code for detectree.evaluate

"""Evaluation."""

import tempfile
from os import path

import numpy as np
import pandas as pd
import rasterio as rio
from sklearn import metrics as sklearn_metrics

from detectree import classifier, settings, utils


def _match_img_response_filepaths(
    img_filepaths, *, response_img_filepaths=None, response_img_dir=None
):
    """
    Pair each image with its response image, keeping only images that have one.

    Images and responses are matched by file base name. If `response_img_filepaths`
    is provided, the responses are looked up among those paths (which do not need to
    share a directory) and `response_img_dir` is not used. Otherwise, an image is
    matched when a file with the same base name exists in `response_img_dir`.

    Parameters
    ----------
    img_filepaths : iterable of str
        Paths to the images to match.
    response_img_filepaths : iterable of str, optional
        Paths to the candidate response images.
    response_img_dir : str representing path to a directory, optional
        Directory where the response images are looked up. Only used if
        `response_img_filepaths` is not provided.

    Returns
    -------
    matched_img_filepaths, matched_response_filepaths : list
        Two lists of equal length with the matched images (in the order of
        `img_filepaths`) and their corresponding responses.

    Raises
    ------
    ValueError
        If neither `response_img_filepaths` nor `response_img_dir` is provided, or
        if no image has a matching response.
    """
    if response_img_filepaths is None and response_img_dir is None:
        raise ValueError(
            "Either `response_img_filepaths` or `response_img_dir` must be provided."
        )

    # build a "file name -> response path (or None)" lookup for the selected mode
    if response_img_filepaths is None:

        def find_response(img_filename):
            # the response only counts if the file actually exists in the directory
            candidate_filepath = path.join(response_img_dir, img_filename)
            return candidate_filepath if path.exists(candidate_filepath) else None

    else:
        # index the responses by base name rather than intersecting sets of names,
        # since the responses are not assumed to be in the same directory
        response_by_filename = {
            path.basename(filepath): filepath for filepath in response_img_filepaths
        }
        find_response = response_by_filename.get

    matched_pairs = []
    for img_filepath in img_filepaths:
        response_filepath = find_response(path.basename(img_filepath))
        if response_filepath is not None:
            matched_pairs.append((img_filepath, response_filepath))

    if not matched_pairs:
        raise ValueError("No matching response images found for evaluation.")

    matched_img_filepaths = [img_filepath for img_filepath, _ in matched_pairs]
    matched_response_filepaths = [
        response_filepath for _, response_filepath in matched_pairs
    ]
    return matched_img_filepaths, matched_response_filepaths


def _compute_metrics_from_true_pred_arr(true_pred_arr, metrics, metrics_kwargs):
    """
    Compute each metric on the true and predicted values.

    Parameters
    ----------
    true_pred_arr : array-like
        Indexable whose first and second items are the true and predicted values
        respectively.
    metrics : str, callable or list-like of str or callable
        Metrics to compute. A string is resolved as an attribute of
        `sklearn.metrics`, a callable is called as
        `metric(y_true, y_pred, **kwargs)`. A single string or callable is treated
        as a one-item list.
    metrics_kwargs : dict, list-like of dict or None
        Keyword arguments for the metrics. A list-like is paired positionally with
        `metrics`, a single dict is passed to every metric and `None` means no
        keyword arguments.

    Returns
    -------
    metric_dict : dict
        Value returned by each metric, keyed by the metric string or by the name of
        the callable.

    Raises
    ------
    TypeError
        If a metric is neither a string nor a callable.
    """
    # a single metric is a documented input: wrap it so that the loop below does
    # not iterate over the characters of a metric name
    if isinstance(metrics, str) or callable(metrics):
        metrics = [metrics]
    else:
        metrics = list(metrics)
    if metrics_kwargs is None:
        metrics_kwargs = [{} for _ in metrics]
    elif isinstance(metrics_kwargs, dict):
        # a single dict (rather than one per metric) applies to all the metrics
        metrics_kwargs = [metrics_kwargs] * len(metrics)

    y_true, y_pred = true_pred_arr[0], true_pred_arr[1]
    metric_dict = {}
    for metric, kwargs in zip(metrics, metrics_kwargs):
        if isinstance(metric, str):
            metric_func = getattr(sklearn_metrics, metric)
            metric_label = metric
        elif callable(metric):
            metric_func = metric
            # some callables (e.g., `functools.partial` objects) have no `__name__`
            metric_label = getattr(metric, "__name__", None) or repr(metric)
        else:
            raise TypeError(
                "Metrics must be either a string with a function of the "
                "`sklearn.metrics` module, a function that takes `y_true` and `y_pred` "
                "positional arguments or a list-like of any of the two options."
            )

        metric_dict[metric_label] = metric_func(y_true, y_pred, **kwargs)

    return metric_dict


def _get_true_pred_arr(
    pred_img_filepaths, *, response_img_filepaths=None, response_img_dir=None
):
    """
    Read the predicted images and their responses into a two-row array.

    Only the predicted images with a matching response (see
    `_match_img_response_filepaths`) are read. For each matched pair, band 1 of both
    rasters is read and flattened, and the values of all pairs are concatenated.

    Parameters
    ----------
    pred_img_filepaths : iterable of str
        Paths to the predicted images.
    response_img_filepaths : iterable of str, optional
        Paths to the candidate response images.
    response_img_dir : str representing path to a directory, optional
        Directory where the response images are looked up. Only used if
        `response_img_filepaths` is not provided.

    Returns
    -------
    true_pred : numpy.ndarray
        Array with two rows respectively containing the true and predicted values.
    """
    matched_pred_filepaths, matched_response_filepaths = (
        _match_img_response_filepaths(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )
    )

    def read_flat(img_filepath):
        # read band 1 of the raster as a one-dimensional array
        with rio.open(img_filepath) as src:
            return src.read(1).flatten()

    true_chunks = []
    pred_chunks = []
    for pred_filepath, response_filepath in zip(
        matched_pred_filepaths, matched_response_filepaths
    ):
        true_chunks.append(read_flat(response_filepath))
        pred_chunks.append(read_flat(pred_filepath))

    true_values = np.concatenate(true_chunks, axis=0)
    pred_values = np.concatenate(pred_chunks, axis=0)
    return np.vstack([true_values, pred_values])



# [docs]
def get_true_pred_arr(
    *,
    pred_img_filepaths=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    refine_method=None,
    refine_kwargs=None,
    split_df=None,
    img_dir=None,
    response_img_dir=None,
    img_filepaths=None,
    response_img_filepaths=None,
    img_filename_pattern=None,
    **classifier_kwargs,
):
    """
    Get true and predicted values for the validation images.

    Parameters
    ----------
    pred_img_filepaths : list-like, optional
        List of paths to precomputed predicted images. If provided, classification is
        skipped and predictions are read directly from these files. Only predictions
        with a matching response image (by basename) are used, and all arguments except
        `response_img_dir` or `response_img_filepaths` are ignored.
    clf : scikit-learn-like classifier, optional
        Trained classifier. If no value is provided, the classifier is loaded from
        HuggingFace Hub using the values provided in `hf_hub_repo_id` and
        `hf_hub_clf_filename`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each first-level
        cluster label.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (string with the user or organization and
        repository name separated by a `/`) and file name of the skops classifier
        respectively. If no value is provided, the values set in
        `settings.HF_HUB_REPO_ID` and `settings.HF_HUB_CLF_FILENAME` are used. Ignored
        if `clf` or `clf_dict` are provided.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments (besides "repo_id", "filename", "library_name" and
        "library_version") to pass to `huggingface_hub.hf_hub_download`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub, passed
        to `skops.io.load`. If no value is provided, the value from
        `settings.SKOPS_TRUSTED` is used. Ignored if `clf` or `clf_dict` are provided.
    refine_method : callable or bool, optional
        Method to refine the pixel-level classification. If `False` is provided, no
        refinement is performed. If `None` is provided, the default behavior of
        `detectree.classifier.Classifier` is used.
    refine_kwargs : dict, optional
        Keyword arguments that will be passed to `refine_method`. Ignored if no
        refinement is performed.
    split_df : pandas DataFrame, optional
        Data frame with the validation images. The image file names are read from its
        "img_filename" column.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images are located. Required if `split_df` is
        provided. Otherwise, it is ignored if `img_filepaths` is provided, and used to
        look up the images matching `img_filename_pattern` if it is not.
    response_img_dir : str representing path to a directory, optional
        Path to the directory where the response tiles are located. Required unless
        `response_img_filepaths` is provided, in which case it is ignored. Only images
        with a matching response (by basename) are evaluated.
    img_filepaths : list-like, optional
        List of paths to the tiles that will be used for validation. Ignored if
        `split_df` is provided.
    response_img_filepaths : list-like, optional
        List of paths to the binary response tiles that will be used for evaluation.
        Takes precedence over `response_img_dir`. Only images with a matching response
        (by basename) are evaluated.
    img_filename_pattern : str representing a file-name pattern, optional
        Filename pattern to be matched in order to obtain the list of images. If no
        value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
        Ignored if `split_df` or `img_filepaths` is provided.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier` class.

    Returns
    -------
    true_pred : numpy.ndarray
        Array with two rows respectively containing the true and predicted values for
        the provided images.

    Raises
    ------
    ValueError
        If neither `response_img_dir` nor `response_img_filepaths` is provided, if
        `split_df` is provided without `img_dir`, if none of `split_df`,
        `img_filepaths` and `img_dir` is provided, or if no image has a matching
        response.
    """
    if pred_img_filepaths is not None:
        # no need for inference
        return _get_true_pred_arr(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )

    # inference is needed: first resolve which images will be predicted, so that
    # invalid arguments fail before the classifier is built (which may involve loading
    # it from HuggingFace Hub)
    if split_df is not None:
        if img_dir is None:
            raise ValueError(
                "If `split_df` is provided, `img_dir` must also be provided."
            )
        img_filepaths = split_df["img_filename"].apply(
            lambda img_filename: path.join(img_dir, img_filename)
        )
        matched_img_filepaths, response_img_filepaths = _match_img_response_filepaths(
            img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )
        matched_filenames = [
            path.basename(img_filepath) for img_filepath in matched_img_filepaths
        ]
        # only predict the images that have a response to compare against
        predict_kwargs = {
            "split_df": split_df[split_df["img_filename"].isin(matched_filenames)],
            "img_dir": img_dir,
        }
    else:
        if img_filepaths is None:
            if img_dir is None:
                raise ValueError(
                    "Either `split_df`, `img_filepaths` or `img_dir` must be "
                    "provided."
                )
            img_filepaths = utils.get_img_filepaths(
                img_dir, img_filename_pattern=img_filename_pattern
            )
        img_filepaths, response_img_filepaths = _match_img_response_filepaths(
            img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )
        predict_kwargs = {"img_filepaths": img_filepaths}

    # only forward the refinement arguments that were explicitly set, so that the
    # defaults of `Classifier` apply otherwise
    _classifier_kwargs = classifier_kwargs.copy()
    if refine_method is not None:
        _classifier_kwargs["refine_method"] = refine_method
    if refine_kwargs is not None:
        _classifier_kwargs["refine_kwargs"] = refine_kwargs

    c = classifier.Classifier(
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        **_classifier_kwargs,
    )

    # the predicted images are written to a temporary directory, so they must be read
    # back before leaving the context manager
    with tempfile.TemporaryDirectory() as tmp_dir:
        pred_img_filepaths = c.predict_imgs(
            tmp_dir,
            img_filename_pattern=img_filename_pattern,
            **predict_kwargs,
        )
        return _get_true_pred_arr(
            pred_img_filepaths,
            response_img_filepaths=response_img_filepaths,
            response_img_dir=response_img_dir,
        )




# [docs]
def compute_eval_metrics(
    *,
    pred_img_filepaths=None,
    metrics=None,
    metrics_kwargs=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    refine_method=None,
    refine_kwargs=None,
    split_df=None,
    img_dir=None,
    response_img_dir=None,
    img_filepaths=None,
    response_img_filepaths=None,
    img_filename_pattern=None,
    **classifier_kwargs,
):
    """
    Compute evaluation metrics for the validation images.

    Parameters
    ----------
    pred_img_filepaths : list-like, optional
        List of paths to precomputed predicted images. If provided, classification is
        skipped and metrics are computed directly from these files. Only predictions
        with a matching response image (by basename) are used. Requires
        `response_img_dir` or `response_img_filepaths`.
    metrics : str, func or list of str or func, optional
        The metrics to compute, must be either a string with a function of the
        `sklearn.metrics`, a function that takes a `y_true` and `y_pred` positional
        arguments with the true and predicted labels respectively or a list-like of any
        of the two options. If no value is provided, the values set in
        `settings.EVAL_METRICS` are used.
    metrics_kwargs : dict or list of dict, optional
        Additional keyword arguments to pass to each of the metric functions. A list
        is paired positionally with `metrics`, whereas a single dict is passed to every
        metric. If no value is provided, no additional keyword arguments are passed.
    clf : scikit-learn-like classifier, optional
        Trained classifier. If no value is provided, the classifier is loaded from
        HuggingFace Hub using the values provided in `hf_hub_repo_id` and
        `hf_hub_clf_filename`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each first-level
        cluster label.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (string with the user or organization and
        repository name separated by a `/`) and file name of the skops classifier
        respectively. If no value is provided, the values set in
        `settings.HF_HUB_REPO_ID` and `settings.HF_HUB_CLF_FILENAME` are used. Ignored
        if `clf` or `clf_dict` are provided.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments (besides "repo_id", "filename", "library_name" and
        "library_version") to pass to `huggingface_hub.hf_hub_download`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub, passed
        to `skops.io.load`. If no value is provided, the value from
        `settings.SKOPS_TRUSTED` is used. Ignored if `clf` or `clf_dict` are provided.
    refine_method : callable or bool, optional
        Method to refine the pixel-level classification. If `False` is provided, no
        refinement is performed. If `None` is provided, the default behavior of
        `detectree.classifier.Classifier` is used.
    refine_kwargs : dict, optional
        Keyword arguments that will be passed to `refine_method`. Ignored if no
        refinement is performed.
    split_df : pandas DataFrame, optional
        Data frame with the validation images.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images from `split_df` are located. Required if
        `split_df` is provided. Ignored if `img_filepaths` is provided.
    response_img_dir : str representing path to a directory, optional
        Path to the directory where the response tiles are located. Ignored if providing
        `response_img_filepaths`. Only images with a matching response (by basename)
        are evaluated.
    img_filepaths : list-like, optional
        List of paths to the tiles that will be used for validation. Ignored if
        `split_df` is provided.
    response_img_filepaths : list-like, optional
        List of paths to the binary response tiles that will be used for evaluation.
        Takes precedence over `response_img_dir`. Only images with a matching response
        (by basename) are evaluated.
    img_filename_pattern : str representing a file-name pattern, optional
        Filename pattern to be matched in order to obtain the list of images. If no
        value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
        Ignored if `split_df` or `img_filepaths` is provided.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier` class.

    Returns
    -------
    metric_dict : numeric, dict
        Values of the metrics computed for the validation images. If only one metric is
        provided, a single value is returned. If multiple metrics are provided, a dict
        with a key for each metric is returned. The metric values can be of different
        types depending on the metric function used, e.g., `precision_score` returns a
        single float value, `precision_recall_curve` returns a tuple of arrays, and
        `confusion_matrix` returns a two-dimensional array.
    """
    if metrics is None:
        metrics = settings.EVAL_METRICS
    # a single metric (name or function) is accepted: wrap it in a list so that a
    # metric name is not treated as a sequence of characters
    if isinstance(metrics, str) or callable(metrics):
        metrics = [metrics]
    else:
        metrics = list(metrics)
    if metrics_kwargs is None:
        metrics_kwargs = [{} for _ in metrics]
    elif isinstance(metrics_kwargs, dict):
        # a single dict (rather than one per metric) applies to all the metrics
        metrics_kwargs = [metrics_kwargs] * len(metrics)

    true_pred_arr = get_true_pred_arr(
        pred_img_filepaths=pred_img_filepaths,
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        refine_method=refine_method,
        refine_kwargs=refine_kwargs,
        split_df=split_df,
        img_dir=img_dir,
        img_filepaths=img_filepaths,
        response_img_filepaths=response_img_filepaths,
        img_filename_pattern=img_filename_pattern,
        response_img_dir=response_img_dir,
        **classifier_kwargs,
    )

    metric_dict = _compute_metrics_from_true_pred_arr(
        true_pred_arr, metrics, metrics_kwargs
    )

    # unwrap the value when there is a single metric
    if len(metric_dict) == 1:
        return next(iter(metric_dict.values()))
    return metric_dict




# [docs]
def eval_refine_params(
    *,
    refine_method=None,
    refine_params_list=None,
    metrics=None,
    metrics_kwargs=None,
    clf=None,
    clf_dict=None,
    hf_hub_repo_id=None,
    hf_hub_clf_filename=None,
    hf_hub_download_kwargs=None,
    skops_trusted=None,
    tree_val=None,
    nontree_val=None,
    split_df=None,
    img_dir=None,
    img_filepaths=None,
    img_filename_pattern=None,
    response_img_dir=None,
    **classifier_kwargs,
):
    """
    Evaluate a refinement procedure for different parameters.

    Parameters
    ----------
    refine_method : callable, optional
        Refinement method that takes a probability image as the first positional
        argument followed by tree and non-tree values, e.g.,
        `refine_method(p_tree_img, tree_val, nontree_val, **kwargs)`. If no value is
        provided, the value from `settings.CLF_REFINE_METHOD` is used.
    refine_params_list : list of dict, optional
        Parameters to evaluate for the refinement method, as a list of keyword
        arguments. The metrics will be computed for each item of this list. If no value
        is provided, the value from `settings.EVAL_REFINE_PARAMS` is used.
    metrics : str, func or list of str or func, optional
        The metrics to compute, must be either a string with a function of the
        `sklearn.metrics`, a function that takes a `y_true` and `y_pred` positional
        arguments with the true and predicted labels respectively or a list-like of any
        of the two options. If no value is provided, the values set in
        `settings.EVAL_METRICS` are used.
    metrics_kwargs : dict or list of dict, optional
        Additional keyword arguments to pass to each of the metric functions. A list
        is paired positionally with `metrics`, whereas a single dict is passed to every
        metric. If no value is provided, no additional keyword arguments are passed.
    clf : scikit-learn-like classifier, optional
        Trained classifier. If no value is provided, the classifier is loaded from
        HuggingFace Hub using the values provided in `hf_hub_repo_id` and
        `hf_hub_clf_filename`.
    clf_dict : dictionary, optional
        Dictionary mapping a trained scikit-learn-like classifier to each first-level
        cluster label.
    hf_hub_repo_id, hf_hub_clf_filename : str, optional
        HuggingFace Hub repository id (string with the user or organization and
        repository name separated by a `/`) and file name of the skops classifier
        respectively. If no value is provided, the values set in
        `settings.HF_HUB_REPO_ID` and `settings.HF_HUB_CLF_FILENAME` are used. Ignored
        if `clf` or `clf_dict` are provided.
    hf_hub_download_kwargs : dict, optional
        Additional keyword arguments (besides "repo_id", "filename", "library_name" and
        "library_version") to pass to `huggingface_hub.hf_hub_download`.
    skops_trusted : list, optional
        List of trusted object types to load the classifier from HuggingFace Hub, passed
        to `skops.io.load`. If no value is provided, the value from
        `settings.SKOPS_TRUSTED` is used. Ignored if `clf` or `clf_dict` are provided.
    tree_val, nontree_val : int, optional
        The values that designate tree and non-tree pixels respectively in the response
        images. If no values are provided, the values set in `settings.TREE_VAL` and
        `settings.NONTREE_VAL` are respectively used.
    split_df : pandas DataFrame, optional
        Data frame with the validation images.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images from `split_df` are located. Required if
        `split_df` is provided. Ignored if `img_filepaths` is provided.
    img_filepaths : list-like, optional
        List of paths to the tiles that will be used for validation. Ignored if
        `split_df` is provided.
    img_filename_pattern : str representing a file-name pattern, optional
        Filename pattern to be matched in order to obtain the list of images. If no
        value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
        Ignored if `split_df` or `img_filepaths` is provided.
    response_img_dir : str representing path to a directory
        Path to the directory where the response tiles are located. Required: each
        predicted image is compared against the response with the same base name in
        this directory.
    classifier_kwargs : dict, optional
        Additional keyword arguments to pass to the initialization of
        `detectree.classifier.Classifier` class. The "refine_method", "refine_kwargs"
        and "return_proba" keys are discarded, since the classifier is only used to
        predict probabilities and the refinement is applied by this function.

    Returns
    -------
    results : pandas DataFrame
        A DataFrame with the computed values for each metric (row) and each refinement
        keyword argument set (column, stringified).

    Raises
    ------
    ValueError
        If `response_img_dir` is not provided or if there are no predicted images to
        evaluate.
    """
    # fail before running any inference rather than when joining the response paths
    if response_img_dir is None:
        raise ValueError("`response_img_dir` must be provided.")

    if refine_method is None:
        refine_method = settings.CLF_REFINE_METHOD
    if refine_params_list is None:
        refine_params_list = settings.EVAL_REFINE_PARAMS
    # materialize because the parameter sets are iterated several times below
    refine_params_list = list(refine_params_list)

    if metrics is None:
        metrics = settings.EVAL_METRICS
    # a single metric (name or function) is accepted: wrap it in a list so that a
    # metric name is not treated as a sequence of characters
    if isinstance(metrics, str) or callable(metrics):
        metrics = [metrics]
    else:
        metrics = list(metrics)
    if metrics_kwargs is None:
        metrics_kwargs = [{} for _ in metrics]
    elif isinstance(metrics_kwargs, dict):
        # a single dict (rather than one per metric) applies to all the metrics
        metrics_kwargs = [metrics_kwargs] * len(metrics)

    if tree_val is None:
        tree_val = settings.TREE_VAL
    if nontree_val is None:
        nontree_val = settings.NONTREE_VAL

    # these are set explicitly below, so drop them to avoid duplicate keyword arguments
    _classifier_kwargs = classifier_kwargs.copy()
    for key in ("refine_method", "refine_kwargs", "return_proba"):
        _classifier_kwargs.pop(key, None)

    c = classifier.Classifier(
        clf=clf,
        clf_dict=clf_dict,
        hf_hub_repo_id=hf_hub_repo_id,
        hf_hub_clf_filename=hf_hub_clf_filename,
        hf_hub_download_kwargs=hf_hub_download_kwargs,
        skops_trusted=skops_trusted,
        tree_val=tree_val,
        nontree_val=nontree_val,
        return_proba=True,
        **_classifier_kwargs,
    )
    # the predicted images are written to a temporary directory, so they must be read
    # (and refined) before leaving the context manager
    with tempfile.TemporaryDirectory() as tmp_dir:
        pred_img_filepaths = c.predict_imgs(
            tmp_dir,
            split_df=split_df,
            img_dir=img_dir,
            img_filepaths=img_filepaths,
            img_filename_pattern=img_filename_pattern,
        )
        true_arrs = []
        # one list of refined predictions per set of refinement parameters
        pred_refined_by_kwargs = [[] for _ in refine_params_list]
        for pred_img_filepath in pred_img_filepaths:
            with rio.open(
                path.join(response_img_dir, path.basename(pred_img_filepath))
            ) as src:
                true_arrs.append(src.read(1).flatten())
            # keep the two-dimensional shape, since refinement operates on the image
            with rio.open(pred_img_filepath) as src:
                pred_arr = src.read(1)
            for idx, refine_kwargs in enumerate(refine_params_list):
                pred_refined_by_kwargs[idx].append(
                    refine_method(
                        pred_arr, tree_val, nontree_val, **refine_kwargs
                    ).flatten()
                )

    if not true_arrs:
        raise ValueError("No predicted images to evaluate.")

    true_arr = np.concatenate(true_arrs)
    pred_refined_arrs = [
        np.concatenate(arrs, axis=0) for arrs in pred_refined_by_kwargs
    ]

    metric_dict_by_refine_kwargs = []
    for refined_arr in pred_refined_arrs:
        true_pred_arr = np.vstack((true_arr, refined_arr))
        metric_dict_by_refine_kwargs.append(
            _compute_metrics_from_true_pred_arr(true_pred_arr, metrics, metrics_kwargs)
        )

    # one row per parameter set, transposed so that metrics are rows
    return pd.DataFrame(
        metric_dict_by_refine_kwargs,
        index=[str(kwargs) for kwargs in refine_params_list],
    ).transpose()