"""Binary tree/non-tree classifier(s)."""
import glob
from os import path
import dask
import huggingface_hub as hf_hub
import maxflow as mf
import numpy as np
import rasterio as rio
import skops
from dask import diagnostics
from skops import io
from . import pixel_features, pixel_response, settings, utils
# Names exported by `from <this module> import *`.
__all__ = ["ClassifierTrainer", "Classifier"]
# 3x3 `structure` array passed to `add_grid_edges` when refining predictions in
# `Classifier._predict_img`. Relative to the center cell (row 1, column 1), it marks the
# right-hand neighbor and the three neighbors in the row below, i.e., four of the eight
# Moore neighbors.
# NOTE(review): presumably the other four directions are covered by the reverse edges
# that the graph library adds for each marked neighbor - confirm against the PyMaxflow
# documentation.
MOORE_NEIGHBORHOOD_ARR = np.array([[0, 0, 0], [0, 0, 1], [1, 1, 1]])
# [docs] (Sphinx source-link marker left over from the HTML extraction; not code)
class ClassifierTrainer:
    """Train binary tree/non-tree classifier(s) of the pixel features."""

    def __init__(
        self,
        *,
        sigmas=None,
        num_orientations=None,
        neighborhood=None,
        min_neighborhood_range=None,
        num_neighborhoods=None,
        tree_val=None,
        nontree_val=None,
        classifier_class=None,
        **classifier_kwargs,
    ):
        """
        Initialize the classifier trainer.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for details.

        Parameters
        ----------
        sigmas : list-like, optional
            The list of scale parameters (sigmas) to build the Gaussian filter bank that
            will be used to compute the pixel-level features. The provided argument will
            be passed to the initialization method of the `PixelFeaturesBuilder` class.
            If no value is provided, the value set in `settings.GAUSS_SIGMAS` will be
            taken.
        num_orientations : int, optional
            The number of equally-distributed orientations to build the Gaussian filter
            bank that will be used to compute the pixel-level features. The provided
            argument will be passed to the initialization method of the
            `PixelFeaturesBuilder` class. If no value is provided, the value set in
            `settings.GAUSS_NUM_ORIENTATIONS` is used.
        neighborhood : array-like, optional
            The base neighborhood structure that will be used to compute the entropy
            features. The provided argument will be passed to the initialization method
            of the `PixelFeaturesBuilder` class. If no value is provided, a square with
            a side size of `2 * min_neighborhood_range + 1` is used.
        min_neighborhood_range : int, optional
            The range (i.e., the square radius) of the smallest neighborhood window that
            will be used to compute the entropy features. The provided argument will be
            passed to the initialization method of the `PixelFeaturesBuilder` class. If
            no value is provided, the value set in
            `settings.ENTROPY_MIN_NEIGHBORHOOD_RANGE` is used.
        num_neighborhoods : int, optional
            The number of neighborhood windows (whose size follows a geometric
            progression starting at `min_neighborhood_range`) that will be used to
            compute the entropy features. The provided argument will be passed to the
            initialization method of the `PixelFeaturesBuilder` class. If no value is
            provided, the value set in `settings.ENTROPY_NUM_NEIGHBORHOODS` is used.
        tree_val : int, optional
            The value that designates tree pixels in the response images. The provided
            argument will be passed to the initialization method of the
            `PixelResponseBuilder` class. If no value is provided, the value set in
            `settings.RESPONSE_TREE_VAL` is used.
        nontree_val : int, optional
            The value that designates non-tree pixels in the response images. The
            provided argument will be passed to the initialization method of the
            `PixelResponseBuilder` class. If no value is provided, the value set in
            `settings.RESPONSE_NONTREE_VAL` is used.
        classifier_class : class, optional
            The class of the classifier to be trained. It can be any scikit-learn
            compatible estimator that implements the `fit`, `predict` and
            `predict_proba` methods and that can be saved to and loaded from memory
            using skops. If no value is provided, the value set in `settings.CLF_CLASS`
            is used.
        classifier_kwargs : key-value pairings, optional
            Keyword arguments that will be passed to the initialization of
            `classifier_class`. If no value is provided, the value set in
            `settings.CLF_KWARGS` is used.
        """
        # the builder arguments are stored as-is (including `None` values) and only
        # forwarded to the respective builders at training time
        self.pixel_features_builder_kwargs = dict(
            sigmas=sigmas,
            num_orientations=num_orientations,
            neighborhood=neighborhood,
            min_neighborhood_range=min_neighborhood_range,
            num_neighborhoods=num_neighborhoods,
        )
        self.pixel_response_builder_kwargs = dict(
            tree_val=tree_val, nontree_val=nontree_val
        )

        if classifier_class is None:
            classifier_class = settings.CLF_CLASS
        self.classifier_class = classifier_class

        # an empty `**classifier_kwargs` means that no keyword argument was provided
        if not classifier_kwargs:
            classifier_kwargs = settings.CLF_KWARGS
        self.classifier_kwargs = classifier_kwargs

    def train_classifier(
        self,
        *,
        split_df=None,
        response_img_dir=None,
        img_filepaths=None,
        response_img_filepaths=None,
        img_dir=None,
        img_filename_pattern=None,
        method=None,
        img_cluster=None,
    ):
        """
        Train a classifier.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame, optional
            Data frame with the train/test split.
        response_img_dir : str representing path to a directory, optional
            Path to the directory where the response tiles are located. Required if
            providing `split_df`. Otherwise `response_img_dir` might either be ignored
            if providing `response_img_filepaths`, or be used as the directory where the
            images whose filename matches `img_filename_pattern` are to be located.
        img_filepaths : list-like, optional
            List of paths to the input tiles whose features will be used to train the
            classifier. Ignored if `split_df` is provided.
        response_img_filepaths : list-like, optional
            List of paths to the binary response tiles that will be used to train the
            classifier. Ignored if `split_df` is provided.
        img_dir : str representing path to a directory, optional
            Path to the directory where the images whose filename matches
            `img_filename_pattern` are to be located. Ignored if `split_df` or
            `img_filepaths` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of images. If no
            value is provided, the value set in `settings.IMG_FILENAME_PATTERN` is used.
            Ignored if `split_df` or `img_filepaths` is provided.
        method : {'cluster-I', 'cluster-II'}, optional
            Method used in the train/test split.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if `method` is 'cluster-II'.

        Returns
        -------
        clf : scikit-learn-like classifier
            The trained classifier.

        Raises
        ------
        ValueError
            If neither `split_df`, `img_filepaths` nor `img_dir` is provided, or if the
            response file paths have to be derived from the image file paths but
            `response_img_dir` is not provided.
        """
        if split_df is None and response_img_filepaths is None:
            # this is the only case that needs argument tweaking: otherwise, if we pass
            # `img_filepaths`/`img_dir` to `build_features` and `response_img_dir` to
            # `build_response`, the latter would build a response with all the image
            # files in `response_img_dir`. Instead, we need to build the response only
            # for the files specified in `img_filepaths`/`img_dir`
            if img_filepaths is None:
                # TODO: this is copied from `build_features` - ideally, we should DRY it
                if img_filename_pattern is None:
                    img_filename_pattern = settings.IMG_FILENAME_PATTERN
                if img_dir is None:
                    raise ValueError(
                        "Either `split_df`, `img_filepaths` or `img_dir` must "
                        "be provided"
                    )
                img_filepaths = glob.glob(path.join(img_dir, img_filename_pattern))

            # fail with an explicit message rather than with the `TypeError` that
            # `path.join(None, ...)` would raise below
            if response_img_dir is None:
                raise ValueError(
                    "Either `split_df`, `response_img_filepaths` or "
                    "`response_img_dir` must be provided"
                )
            # each response tile is looked up by the file name of its input tile
            response_img_filepaths = [
                path.join(response_img_dir, path.basename(img_filepath))
                for img_filepath in img_filepaths
            ]

        features_builder = pixel_features.PixelFeaturesBuilder(
            **self.pixel_features_builder_kwargs
        )
        X = features_builder.build_features(
            split_df=split_df,
            img_filepaths=img_filepaths,
            img_dir=img_dir,
            img_filename_pattern=img_filename_pattern,
            method=method,
            img_cluster=img_cluster,
        )

        response_builder = pixel_response.PixelResponseBuilder(
            **self.pixel_response_builder_kwargs
        )
        y = response_builder.build_response(
            split_df=split_df,
            response_img_dir=response_img_dir,
            response_img_filepaths=response_img_filepaths,
            img_filename_pattern=img_filename_pattern,
            method=method,
            img_cluster=img_cluster,
        )

        clf = self.classifier_class(**self.classifier_kwargs)
        clf.fit(X, y)
        return clf

    def train_classifiers(self, split_df, response_img_dir):
        """
        Train a classifier for each first-level cluster in `split_df`.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame
            Data frame with the train/test split, which must have an `img_cluster`
            column with the first-level cluster labels.
        response_img_dir : str representing path to a directory
            Path to the directory where the response tiles are located.

        Returns
        -------
        clf_dict : dictionary
            Dictionary mapping a scikit-learn-like classifier to each first-level
            cluster label.

        Raises
        ------
        ValueError
            If `split_df` does not have an 'img_cluster' column.
        """
        if "img_cluster" not in split_df:
            raise ValueError(
                "`split_df` must have an 'img_cluster' column ('cluster-II'). "
                "For 'cluster-I', use `train_classifier`."
            )

        # one delayed training task per first-level cluster label
        clfs_lazy = {
            img_cluster: dask.delayed(self.train_classifier)(
                split_df=split_df,
                response_img_dir=response_img_dir,
                method="cluster-II",
                img_cluster=img_cluster,
            )
            for img_cluster, _ in split_df.groupby("img_cluster")
        }

        with diagnostics.ProgressBar():
            # `dask.compute` returns a tuple with one item per positional argument
            clfs_dict = dask.compute(clfs_lazy)[0]

        return clfs_dict
# [docs] (Sphinx source-link marker left over from the HTML extraction; not code)
class Classifier:
    """Use trained classifier(s) to predict tree pixels."""

    def __init__(
        self,
        *,
        clf=None,
        clf_dict=None,
        tree_val=None,
        nontree_val=None,
        refine=None,
        refine_beta=None,
        refine_int_rescale=None,
        **pixel_features_builder_kwargs,
    ):
        """
        Initialize the classifier instance.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        clf : scikit-learn-like classifier, optional
            Trained classifier. If no value is provided, the latest detectree
            pre-trained classifier is used. Ignored if `clf_dict` is provided.
        clf_dict : dictionary, optional
            Dictionary mapping a trained scikit-learn-like classifier to each
            first-level cluster label.
        tree_val : int, optional
            Label used to denote tree pixels in the predicted images. If no value is
            provided, the value set in `settings.CLF_TREE_VAL` is used.
        nontree_val : int, optional
            Label used to denote non-tree pixels in the predicted images. If no value is
            provided, the value set in `settings.CLF_NONTREE_VAL` is used.
        refine : bool, optional
            Whether the pixel-level classification should be refined by optimizing the
            consistency between neighboring pixels. If no value is provided, the value
            set in `settings.CLF_REFINE` is used.
        refine_beta : int, optional
            Parameter of the refinement procedure that controls the smoothness of the
            labelling. Larger values lead to smoother shapes. If no value is provided,
            the value set in `settings.CLF_REFINE_BETA` is used.
        refine_int_rescale : int, optional
            Parameter of the refinement procedure that controls the precision of the
            transformation of float to integer edge weights, required for the employed
            graph cuts algorithm. Larger values lead to greater precision. If no value
            is provided, the value set in `settings.CLF_REFINE_INT_RESCALE` is used.
        pixel_features_builder_kwargs : dict, optional
            Keyword arguments that will be passed to `detectree.PixelFeaturesBuilder`,
            which customize how the pixel features are built.
        """
        super().__init__()

        # only one of `self.clf_dict`/`self.clf` is set: the prediction methods rely on
        # the presence of the `clf` attribute to tell the two modes apart
        if clf_dict is not None:
            self.clf_dict = clf_dict
        elif clf is not None:
            self.clf = clf
        else:
            # no classifier provided: download the file configured in `settings` from
            # the Hugging Face hub and load it with skops
            self.clf = io.load(
                hf_hub.hf_hub_download(
                    repo_id=settings.HF_HUB_REPO_ID,
                    filename=settings.HF_HUB_FILENAME,
                    library_name="skops",
                    library_version=skops.__version__,
                ),
                trusted=settings.SKOPS_TRUSTED,
            )

        if tree_val is None:
            tree_val = settings.CLF_TREE_VAL
        if nontree_val is None:
            nontree_val = settings.CLF_NONTREE_VAL
        if refine is None:
            refine = settings.CLF_REFINE
        if refine_beta is None:
            refine_beta = settings.CLF_REFINE_BETA
        if refine_int_rescale is None:
            refine_int_rescale = settings.CLF_REFINE_INT_RESCALE

        self.tree_val = tree_val
        self.nontree_val = nontree_val
        self.refine = refine
        self.refine_beta = refine_beta
        self.refine_int_rescale = refine_int_rescale
        self.pixel_features_builder_kwargs = pixel_features_builder_kwargs

    def _refine_prediction(self, clf, X, img_shape):
        """Label the pixels with a graph cut over the classifier probabilities.

        Returns an array of shape `img_shape` filled with `self.tree_val` and
        `self.nontree_val`.
        """
        p_nontree, p_tree = np.hsplit(clf.predict_proba(X), 2)
        g = mf.Graph[int]()
        node_ids = g.add_grid_nodes(img_shape)
        P_nontree = p_nontree.reshape(img_shape)
        P_tree = p_tree.reshape(img_shape)

        # The classifier probabilities are floats between 0 and 1, and the graph cuts
        # algorithm requires an integer representation. Therefore, we multiply the
        # probabilities by an arbitrary large number and then transform the result to
        # integers. For instance, we could use a `refine_int_rescale` of `100` so that
        # the probabilities are rescaled into integers between 0 and 100 (like
        # percentages). The larger `refine_int_rescale`, the greater the precision.
        # ACHTUNG: the data term when the pixel is a tree is `log(1 - P_tree)`, i.e.,
        # `log(P_nontree)`, so the two lines below are correct
        # NOTE(review): a probability of exactly 0 yields `log(0) = -inf` before the
        # integer cast - confirm that this is acceptable for the graph capacities
        D_tree = (self.refine_int_rescale * np.log(P_nontree)).astype(int)
        D_nontree = (self.refine_int_rescale * np.log(P_tree)).astype(int)

        # TODO: option to choose Moore/Von Neumann neighborhood?
        g.add_grid_edges(node_ids, self.refine_beta, structure=MOORE_NEIGHBORHOOD_ARR)
        g.add_grid_tedges(node_ids, D_tree, D_nontree)
        g.maxflow()

        # transform boolean `g.get_grid_segments(node_ids)` to an array of
        # `self.tree_val` and `self.nontree_val`
        y_pred = np.full(img_shape, self.nontree_val)
        y_pred[g.get_grid_segments(node_ids)] = self.tree_val
        return y_pred

    def _predict_img(self, img_filepath, clf, *, output_filepath=None):
        """Predict the pixels of a single image with `clf`.

        If `output_filepath` is provided, the prediction is also dumped there as a
        single-band uint8 GeoTIFF with the CRS and transform of the input image.
        Returns the predicted array.
        """
        # the context manager ensures that the input dataset is closed even if building
        # the features, predicting or writing the output raises
        with rio.open(img_filepath) as src:
            img_shape = src.shape

            X = pixel_features.PixelFeaturesBuilder(
                **self.pixel_features_builder_kwargs
            ).build_features_from_filepath(img_filepath)

            if self.refine:
                y_pred = self._refine_prediction(clf, X, img_shape)
            else:
                y_pred = clf.predict(X).reshape(img_shape)

            # TODO: make the profile of output rasters more customizable (e.g., via the
            # `settings` module)
            if output_filepath is not None:
                with rio.open(
                    output_filepath,
                    "w",
                    driver="GTiff",
                    width=y_pred.shape[1],
                    height=y_pred.shape[0],
                    count=1,
                    dtype=np.uint8,
                    nodata=self.nontree_val,
                    crs=src.crs,
                    transform=src.transform,
                ) as dst:
                    dst.write(y_pred.astype(np.uint8), 1)

        return y_pred

    def _predict_imgs(self, img_filepaths, clf, output_dir):
        """Predict several images with `clf`, dumping each prediction to `output_dir`.

        Each predicted image keeps the file name of its input image. Returns the list
        of file paths of the dumped images.
        """
        pred_imgs_lazy = []
        pred_img_filepaths = []
        for img_filepath in img_filepaths:
            pred_img_filepath = path.join(output_dir, path.basename(img_filepath))
            pred_imgs_lazy.append(
                dask.delayed(self._predict_img)(
                    img_filepath, clf, output_filepath=pred_img_filepath
                )
            )
            pred_img_filepaths.append(pred_img_filepath)

        # the predicted arrays are discarded: only the dumped files are of interest
        with diagnostics.ProgressBar():
            dask.compute(*pred_imgs_lazy)

        return pred_img_filepaths

    def predict_img(self, img_filepath, *, img_cluster=None, output_filepath=None):
        """
        Use a trained classifier to predict tree pixels in an image.

        Optionally dump the predicted tree/non-tree image to `output_filepath`.

        Parameters
        ----------
        img_filepath : str, file object or pathlib.Path object
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing the image to be classified. The value will be passed to
            `rasterio.open`.
        img_cluster : int, optional
            The label of the cluster of tiles. Only used if the `Classifier` instance
            was initialized with `clf_dict` (i.e., "cluster-II" method).
        output_filepath : str, file object or pathlib.Path object, optional
            Path to a file, URI, file object opened in binary ('rb') mode, or a Path
            object representing where the predicted image is to be dumped. The value
            will be passed to `rasterio.open` in 'write' mode.

        Returns
        -------
        y_pred : numpy ndarray
            Array with the pixel responses.

        Raises
        ------
        ValueError
            If the instance was initialized with `clf_dict` and `img_cluster` is either
            not provided or not a key of `clf_dict`.
        """
        clf = getattr(self, "clf", None)
        if clf is None:
            # no single classifier: pick the one of the requested cluster
            if img_cluster is None:
                raise ValueError(
                    "A valid `img_cluster` must be provided for classifiers"
                    " instantiated with `clf_dict`."
                )
            try:
                clf = self.clf_dict[img_cluster]
            except KeyError as err:
                raise ValueError(
                    f"Classifier for cluster {img_cluster} not found in"
                    " `self.clf_dict`."
                ) from err

        return self._predict_img(img_filepath, clf, output_filepath=output_filepath)

    def predict_imgs(self, split_df, output_dir):
        """
        Use trained classifier(s) to predict tree pixels in multiple images.

        See the `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame
            Data frame with the train/test split.
        output_dir : str or pathlib.Path object
            Path to the directory where the predicted images are to be dumped.

        Returns
        -------
        pred_imgs : list or dict
            File paths of the dumped tiles: a list if the instance holds a single
            classifier, or a dictionary mapping each cluster label to its list if the
            instance was initialized with `clf_dict`.

        Raises
        ------
        ValueError
            If the instance was initialized with `clf_dict` and a cluster label of
            `split_df` is not a key of `clf_dict`.
        """
        if hasattr(self, "clf"):
            # single classifier: predict all the tiles that are not used for training
            return self._predict_imgs(
                split_df[~split_df["train"]]["img_filepath"], self.clf, output_dir
            )

        # otherwise the instance was initialized with `clf_dict`
        pred_imgs = {}
        for img_cluster, _ in split_df.groupby("img_cluster"):
            try:
                clf = self.clf_dict[img_cluster]
            except KeyError as err:
                raise ValueError(
                    f"Classifier for cluster {img_cluster} not found in"
                    " `self.clf_dict`."
                ) from err
            pred_imgs[img_cluster] = self._predict_imgs(
                utils.get_img_filepaths(split_df, img_cluster, False),
                clf,
                output_dir,
            )
        return pred_imgs