|
2 | 2 | import scipy.sparse as sparse |
3 | 3 | import h5py |
4 | 4 | from scipy import ndimage |
| 5 | +from scipy.optimize import linear_sum_assignment |
| 6 | +from collections import namedtuple |
| 7 | +from skimage.segmentation import relabel_sequential |
| 8 | + |
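| | +# registry of matching criteria: maps a criterion name ('iou', 'iot', 'iop', defined below) |
| | +# to a function that turns a label-overlap matrix into pairwise match scores |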
| 9 | +matching_criteria = dict() |
5 | 10 |
|
6 | 11 | __all__ = [ |
7 | 12 | 'get_binary_jaccard', |
| 13 | + 'adapted_rand', |
| 14 | + 'instance_matching' |
8 | 15 | ] |
9 | 16 |
|
10 | 17 |
|
@@ -478,3 +485,288 @@ def convert_dtype(data): |
478 | 485 | print("\tdistance to proposal : " + str(false_negative_stats)) |
479 | 486 |
|
480 | 487 | return false_positive_stats['mean'], false_negative_stats['mean'] |
| 488 | + |
| 489 | + |
| 490 | +# Code modified from https://github.com/stardist/stardist |
| 491 | + |
| 492 | +# Copied from https://github.com/CSBDeep/CSBDeep/blob/master/csbdeep/utils/utils.py |
| 493 | +def _raise(e): |
| 494 | + if isinstance(e, BaseException): |
| 495 | + raise e |
| 496 | + else: |
| 497 | + raise ValueError(e) |
| 498 | + |
| 499 | +def label_are_sequential(y): |
| 500 | +    """Return True if y contains only the sequential labels 1..N (background 0 is ignored).""" |
| 501 | + labels = np.unique(y) |
| 502 | + return (set(labels)-{0}) == set(range(1,1+labels.max())) |
| 503 | + |
| 504 | + |
| 505 | +def is_array_of_integers(y): |
| 506 | + return isinstance(y,np.ndarray) and np.issubdtype(y.dtype, np.integer) |
| 507 | + |
| 508 | + |
| 509 | +def _check_label_array(y, name=None, check_sequential=False): |
| 510 | + err = ValueError("{label} must be an array of {integers}.".format( |
| 511 | + label = 'labels' if name is None else name, |
| 512 | + integers = ('sequential ' if check_sequential else '') + 'non-negative integers', |
| 513 | + )) |
| 514 | + is_array_of_integers(y) or _raise(err) |
| 515 | + if len(y) == 0: |
| 516 | + return True |
| 517 | + if check_sequential: |
| 518 | + label_are_sequential(y) or _raise(err) |
| 519 | + else: |
| 520 | + y.min() >= 0 or _raise(err) |
| 521 | + return True |
| 522 | + |
| 523 | + |
| 524 | +def label_overlap(x, y, check=True): |
| 525 | + if check: |
| 526 | + _check_label_array(x,'x',True) |
| 527 | + _check_label_array(y,'y',True) |
| 528 | + x.shape == y.shape or _raise(ValueError("x and y must have the same shape")) |
| 529 | + return _label_overlap(x, y) |
| 530 | + |
| 531 | +def _label_overlap(x, y): |
| 532 | +    x = x.ravel() |
| 533 | +    y = y.ravel() |
| 534 | +    # joint histogram of label pairs: overlap[i,j] = number of pixels labeled i in x and j in y |
| 535 | +    overlap = np.zeros((1+x.max(),1+y.max()), dtype=np.uint) |
| 536 | +    np.add.at(overlap, (x, y), 1)  # unbuffered in-place add; equivalent to, but much faster than, a per-pixel Python loop |
| 537 | +    return overlap |
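| | +# Example (sketch): for label arrays x = [0, 1, 1] and y = [0, 1, 2], label_overlap(x, y) gives |
| | +#     [[1, 0, 0],     <- the single background pixel |
| | +#      [0, 1, 1]]     <- object 1 of x overlaps objects 1 and 2 of y by one pixel each |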
| 538 | + |
| 539 | +def _safe_divide(x,y, eps=1e-10): |
| 540 | +    """Divide x by y (elementwise for arrays), returning 0 wherever |y| is below eps.""" |
| 541 | + if np.isscalar(x) and np.isscalar(y): |
| 542 | + return x/y if np.abs(y)>eps else 0.0 |
| 543 | + else: |
| 544 | + out = np.zeros(np.broadcast(x,y).shape, np.float32) |
| 545 | + np.divide(x,y, out=out, where=np.abs(y)>eps) |
| 546 | + return out |
| 547 | + |
| 548 | + |
| 549 | +def intersection_over_union(overlap): |
| 550 | + _check_label_array(overlap,'overlap') |
| 551 | + if np.sum(overlap) == 0: |
| 552 | + return overlap |
| 553 | + n_pixels_pred = np.sum(overlap, axis=0, keepdims=True) |
| 554 | + n_pixels_true = np.sum(overlap, axis=1, keepdims=True) |
| 555 | + return _safe_divide(overlap, (n_pixels_pred + n_pixels_true - overlap)) |
| 556 | + |
| 557 | +matching_criteria['iou'] = intersection_over_union |
| 558 | + |
| 559 | + |
| 560 | +def intersection_over_true(overlap): |
| 561 | + _check_label_array(overlap,'overlap') |
| 562 | + if np.sum(overlap) == 0: |
| 563 | + return overlap |
| 564 | + n_pixels_true = np.sum(overlap, axis=1, keepdims=True) |
| 565 | + return _safe_divide(overlap, n_pixels_true) |
| 566 | + |
| 567 | +matching_criteria['iot'] = intersection_over_true |
| 568 | + |
| 569 | + |
| 570 | +def intersection_over_pred(overlap): |
| 571 | + _check_label_array(overlap,'overlap') |
| 572 | + if np.sum(overlap) == 0: |
| 573 | + return overlap |
| 574 | + n_pixels_pred = np.sum(overlap, axis=0, keepdims=True) |
| 575 | + return _safe_divide(overlap, n_pixels_pred) |
| 576 | + |
| 577 | +matching_criteria['iop'] = intersection_over_pred |
| 578 | + |
| 579 | + |
| 580 | +def precision(tp,fp,fn): |
| 581 | + return tp/(tp+fp) if tp > 0 else 0 |
| 582 | +def recall(tp,fp,fn): |
| 583 | + return tp/(tp+fn) if tp > 0 else 0 |
| 584 | +def accuracy(tp,fp,fn): |
| 585 | +    # called "average precision" in some instance segmentation challenges (differs from the PR-curve definition of AP) |
| 586 | +    # -> https://www.kaggle.com/c/data-science-bowl-2018#evaluation |
| 587 | + return tp/(tp+fp+fn) if tp > 0 else 0 |
| 588 | +def f1(tp,fp,fn): |
| 589 | + # also known as "dice coefficient" |
| 590 | + return (2*tp)/(2*tp+fp+fn) if tp > 0 else 0 |
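| | + |
| | +# Worked example: tp=8, fp=2, fn=4 gives precision = 8/10 = 0.8, recall = 8/12 ~ 0.667, |
| | +# accuracy = 8/14 ~ 0.571, and f1 = 16/22 ~ 0.727. |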
| 591 | + |
| 592 | + |
| 593 | +def instance_matching(y_true, y_pred, thresh=0.5, criterion='iou', report_matches=False): |
| 594 | + """Calculate detection/instance segmentation metrics between ground truth and predicted label images. |
| 595 | +
|
| 596 | + Currently, the following metrics are implemented: |
| 597 | +
|
| 598 | + 'fp', 'tp', 'fn', 'precision', 'recall', 'accuracy', 'f1', 'criterion', 'thresh', 'n_true', 'n_pred', 'mean_true_score', 'mean_matched_score', 'panoptic_quality' |
| 599 | +
|
| 600 | +    Corresponding objects of y_true and y_pred are counted as true positives (tp), false positives (fp), and false negatives (fn) |
| 601 | +    depending on whether their intersection over union (IoU) is >= thresh (for the default criterion='iou'; other criteria can be chosen) |
| 602 | +
|
| 603 | +    * mean_matched_score is the mean IoU of the matched true positives |
| 604 | +
|
| 605 | +    * mean_true_score is the sum of IoUs of the matched true positives, normalized by the total number of ground-truth objects |
| 606 | +
|
| 607 | +    * panoptic_quality is defined as in Eq. 1 of Kirillov et al., "Panoptic Segmentation", CVPR 2019 |
| 608 | +
|
| 609 | + Parameters |
| 610 | + ---------- |
| 611 | + y_true: ndarray |
| 612 | + ground truth label image (integer valued) |
| 613 | + y_pred: ndarray |
| 614 | + predicted label image (integer valued) |
| 615 | + thresh: float |
| 616 | +        threshold for the matching criterion; a scalar or a sequence of thresholds (default 0.5) |
| 617 | + criterion: string |
| 618 | + matching criterion (default IoU) |
| 619 | + report_matches: bool |
| 620 | +        if True, additionally compute matched_pairs, matched_scores, and matched_tps (note that this also returns gt-pred pairs whose scores are below 'thresh') |
| 621 | +
|
| 622 | + Returns |
| 623 | + ------- |
| 624 | +    Dictionary of the metrics listed above (a tuple of such dictionaries if 'thresh' is a sequence of thresholds) |
| 625 | +
|
| 626 | + Examples |
| 627 | + -------- |
| 628 | + >>> y_true = np.zeros((100,100), np.uint16) |
| 629 | + >>> y_true[10:20,10:20] = 1 |
| 630 | + >>> y_pred = np.roll(y_true,5,axis = 0) |
| 631 | +
|
| 632 | + >>> stats = instance_matching(y_true, y_pred) |
| 633 | + >>> print(stats) |
| 634 | +    {'criterion': 'iou', 'thresh': 0.5, 'fp': 1, 'tp': 0, 'fn': 1, 'precision': 0, 'recall': 0, 'accuracy': 0, 'f1': 0, 'n_true': 1, 'n_pred': 1, 'mean_true_score': 0.0, 'mean_matched_score': 0.0, 'panoptic_quality': 0.0} |
| 635 | +
|
| 636 | + """ |
| 637 | + _check_label_array(y_true,'y_true') |
| 638 | + _check_label_array(y_pred,'y_pred') |
| 639 | + y_true.shape == y_pred.shape or _raise(ValueError("y_true ({y_true.shape}) and y_pred ({y_pred.shape}) have different shapes".format(y_true=y_true, y_pred=y_pred))) |
| 640 | + criterion in matching_criteria or _raise(ValueError("Matching criterion '%s' not supported." % criterion)) |
| 641 | + if thresh is None: thresh = 0 |
| 642 | +    thresh = float(thresh) if np.isscalar(thresh) else tuple(map(float,thresh))  # materialize so the thresholds can be iterated more than once |
| 643 | + |
| 644 | + y_true, _, map_rev_true = relabel_sequential(y_true) |
| 645 | + y_pred, _, map_rev_pred = relabel_sequential(y_pred) |
| 646 | + |
| 647 | + map_rev_true = np.array(map_rev_true) |
| 648 | + map_rev_pred = np.array(map_rev_pred) |
| 649 | + |
| 650 | + overlap = label_overlap(y_true, y_pred, check=False) |
| 651 | + scores = matching_criteria[criterion](overlap) |
| 652 | + assert 0 <= np.min(scores) <= np.max(scores) <= 1 |
| 653 | + |
| 654 | + # ignoring background |
| 655 | + scores = scores[1:,1:] |
| 656 | + n_true, n_pred = scores.shape |
| 657 | + n_matched = min(n_true, n_pred) |
| 658 | + |
| 659 | + def _single(thr): |
| 660 | + not_trivial = n_matched > 0 and np.any(scores >= thr) |
| 661 | + if not_trivial: |
| 662 | + # compute optimal matching with scores as tie-breaker |
| 663 | + costs = -(scores >= thr).astype(float) - scores / (2*n_matched) |
| 664 | + true_ind, pred_ind = linear_sum_assignment(costs) |
| 665 | + assert n_matched == len(true_ind) == len(pred_ind) |
| 666 | + match_ok = scores[true_ind,pred_ind] >= thr |
| 667 | + tp = np.count_nonzero(match_ok) |
| 668 | + else: |
| 669 | + tp = 0 |
| 670 | + fp = n_pred - tp |
| 671 | + fn = n_true - tp |
| 672 | + # assert tp+fp == n_pred |
| 673 | + # assert tp+fn == n_true |
| 674 | + |
| 675 | + # the score sum over all matched objects (tp) |
| 676 | + sum_matched_score = np.sum(scores[true_ind,pred_ind][match_ok]) if not_trivial else 0.0 |
| 677 | + |
| 678 | + # the score average over all matched objects (tp) |
| 679 | + mean_matched_score = _safe_divide(sum_matched_score, tp) |
| 680 | + # the score average over all gt/true objects |
| 681 | + mean_true_score = _safe_divide(sum_matched_score, n_true) |
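| | +        # panoptic quality factorizes as mean_matched_score ("segmentation quality") * f1 ("recognition quality") |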
| 682 | + panoptic_quality = _safe_divide(sum_matched_score, tp+fp/2+fn/2) |
| 683 | + |
| 684 | + stats_dict = dict ( |
| 685 | + criterion = criterion, |
| 686 | + thresh = thr, |
| 687 | + fp = fp, |
| 688 | + tp = tp, |
| 689 | + fn = fn, |
| 690 | + precision = precision(tp,fp,fn), |
| 691 | + recall = recall(tp,fp,fn), |
| 692 | + accuracy = accuracy(tp,fp,fn), |
| 693 | + f1 = f1(tp,fp,fn), |
| 694 | + n_true = n_true, |
| 695 | + n_pred = n_pred, |
| 696 | + mean_true_score = mean_true_score, |
| 697 | + mean_matched_score = mean_matched_score, |
| 698 | + panoptic_quality = panoptic_quality, |
| 699 | + ) |
| 700 | + if bool(report_matches): |
| 701 | + if not_trivial: |
| 702 | + stats_dict.update ( |
| 703 | + # int() to be json serializable |
| 704 | + matched_pairs = tuple((int(map_rev_true[i]),int(map_rev_pred[j])) for i,j in zip(1+true_ind,1+pred_ind)), |
| 705 | + matched_scores = tuple(scores[true_ind,pred_ind]), |
| 706 | + matched_tps = tuple(map(int,np.flatnonzero(match_ok))), |
| 707 | + pred_ids = tuple(map_rev_pred), |
| 708 | + gt_ids = tuple(map_rev_true), |
| 709 | + ) |
| 710 | + else: |
| 711 | + stats_dict.update ( |
| 712 | + matched_pairs = (), |
| 713 | + matched_scores = (), |
| 714 | + matched_tps = (), |
| 715 | + pred_ids = (), |
| 716 | + gt_ids = (), |
| 717 | + ) |
| 718 | + return stats_dict |
| 719 | + |
| 720 | + return _single(thresh) if np.isscalar(thresh) else tuple(map(_single,thresh)) |
| 721 | + |
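| | +# Usage sketch (illustrative; 'gt_labels' and 'pred_labels' are hypothetical label arrays): |
| | +# passing a sequence of thresholds returns one stats dict per threshold, e.g. |
| | +# |
| | +#     for s in instance_matching(gt_labels, pred_labels, thresh=(0.5, 0.75, 0.9)): |
| | +#         print(s['thresh'], s['f1'], s['panoptic_quality']) |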
| 722 | + |
| 723 | +def wrapper_matching_dataset_lazy(stats_all, thresh, criterion='iou', by_image=False): |
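| | +    """Aggregate per-image 'instance_matching' results over a dataset, separately for each threshold. |
| | + |
| | +    'stats_all' is a sequence with one entry per image, each itself a sequence of |
| | +    stats dicts (one per threshold in 'thresh'). If by_image=True, the derived |
| | +    metrics are averaged over images; otherwise tp/fp/fn and matched scores are |
| | +    pooled over the whole dataset before the metrics are recomputed. Returns a |
| | +    'DatasetMatching' namedtuple per threshold (unwrapped if there is only one). |
| | +    """ |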
| 724 | + |
| 725 | +    expected_keys = set(('fp', 'tp', 'fn', 'precision', 'recall', 'accuracy', 'f1', 'criterion', 'thresh', 'n_true', 'n_pred', 'mean_true_score', 'mean_matched_score', 'panoptic_quality'))  # keys each per-image stats dict is expected to contain |
| 726 | + |
| 727 | + # accumulate results over all images for each threshold separately |
| 728 | + n_images, n_threshs = len(stats_all), len(thresh) |
| 729 | +    single_thresh = (n_threshs == 1) |
| 730 | + accumulate = [{} for _ in range(n_threshs)] |
| 731 | + for stats in stats_all: |
| 732 | + for i, s in enumerate(stats): |
| 733 | + acc = accumulate[i] |
| 734 | +            for k, v in s.items(): |
| 736 | + if k == 'mean_true_score' and not bool(by_image): |
| 737 | + # convert mean_true_score to "sum_matched_score" |
| 738 | + acc[k] = acc.setdefault(k,0) + v * s['n_true'] |
| 739 | + else: |
| 740 | + try: |
| 741 | + acc[k] = acc.setdefault(k,0) + v |
| 742 | + except TypeError: |
| 743 | +                        pass  # non-numeric values (e.g. the 'criterion' string) are set explicitly below |
| 744 | + |
| 745 | + # normalize/compute 'precision', 'recall', 'accuracy', 'f1' |
| 746 | + for thr,acc in zip(thresh,accumulate): |
| 747 | + acc['criterion'] = criterion |
| 748 | + acc['thresh'] = thr |
| 749 | + acc['by_image'] = bool(by_image) |
| 750 | + if bool(by_image): |
| 751 | + for k in ('precision', 'recall', 'accuracy', 'f1', 'mean_true_score', 'mean_matched_score', 'panoptic_quality'): |
| 752 | + acc[k] /= n_images |
| 753 | + else: |
| 754 | + tp, fp, fn, n_true = acc['tp'], acc['fp'], acc['fn'], acc['n_true'] |
| 755 | + sum_matched_score = acc['mean_true_score'] |
| 756 | + |
| 757 | + mean_matched_score = _safe_divide(sum_matched_score, tp) |
| 758 | + mean_true_score = _safe_divide(sum_matched_score, n_true) |
| 759 | + panoptic_quality = _safe_divide(sum_matched_score, tp+fp/2+fn/2) |
| 760 | + |
| 761 | + acc.update( |
| 762 | + precision = precision(tp,fp,fn), |
| 763 | + recall = recall(tp,fp,fn), |
| 764 | + accuracy = accuracy(tp,fp,fn), |
| 765 | + f1 = f1(tp,fp,fn), |
| 766 | + mean_true_score = mean_true_score, |
| 767 | + mean_matched_score = mean_matched_score, |
| 768 | + panoptic_quality = panoptic_quality, |
| 769 | + ) |
| 770 | + |
| 771 | + accumulate = tuple(namedtuple('DatasetMatching',acc.keys())(*acc.values()) for acc in accumulate) |
| 772 | + return accumulate[0] if single_thresh else accumulate |
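| | + |
| | + |
| | +# Usage sketch (illustrative; 'gt_images' and 'pred_images' are hypothetical lists of label arrays): |
| | +# |
| | +#     threshs = (0.5, 0.75) |
| | +#     stats_all = [instance_matching(gt, pred, thresh=threshs) |
| | +#                  for gt, pred in zip(gt_images, pred_images)] |
| | +#     dataset_stats = wrapper_matching_dataset_lazy(stats_all, threshs) |
| | +#     print(dataset_stats[0].f1, dataset_stats[0].panoptic_quality) |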