Spaces:

mdocekal
/

multi_label_precision_recall_accuracy_fscore

Runtime error

App Files Files Community

mdocekal committed on Oct 29, 2024

Commit

5ae6761

1 Parent(s): 728e0e5

Initial commit containing an implementation of the example-based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014), plus a multiset variant.

Browse files

Files changed (4) hide show

README.md +39 -6
multi_label_precision_recall_accuracy_fscore.py +125 -43
requirements.txt +2 -1
tests.py +333 -17

README.md CHANGED Viewed

@@ -13,16 +13,49 @@ pinned: false
 ---
 # Metric Card for Multi Label Precision Recall Accuracy Fscore
-***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
-## Metric Description
-*Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
-## How to Use
-*Give general statement of how to use the metric*
-*Provide simplest possible example for using the metric*
 ### Inputs
 *List all input arguments in the format below*

 ---
 # Metric Card for Multi Label Precision Recall Accuracy Fscore
+Implementation of the example-based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014).
+## How to Use
+    >>> multi_label_precision_recall_accuracy_fscore = evaluate.load("mdocekal/multi_label_precision_recall_accuracy_fscore")
+    >>> results = multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ],
+                references=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ]
+            )
+    >>> print(results)
+    {
+        "precision": 1.0,
+        "recall": 1.0,
+        "accuracy": 1.0,
+        "fscore": 1.0
+    }
+There is also a multiset configuration (selected with `config_name="multiset"`), which allows the metrics to be calculated for multi-label classification with repeated labels.
+It uses the same definitions as the previous case, but it works with multisets of labels, so the multiset versions of intersection, union, and cardinality are used instead.
+    >>> results = multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1, 1]
+                ],
+                references=[
+                    [1, 0, 1, 1, 0, 0],
+                ]
+            )
+    >>> print(results)
+    {
+        "precision": 1.0,
+        "recall": 0.5,
+        "accuracy": 0.5,
+        "fscore": 0.6666666666666666
+    }
 ### Inputs
 *List all input arguments in the format below*

multi_label_precision_recall_accuracy_fscore.py CHANGED Viewed

@@ -11,58 +11,79 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
 import evaluate
 import datasets
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
 }
 """
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
 Args:
     predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
-    {'accuracy': 1.0}
 """
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class MultiLabelPrecisionRecallAccuracyFscore(evaluate.Metric):
-    """TODO: Short description of my evaluation module."""
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
@@ -70,26 +91,87 @@ class MultiLabelPrecisionRecallAccuracyFscore(evaluate.Metric):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
         return {
             "accuracy": accuracy,
-        }

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections import Counter
+from typing import Optional, Union
 import evaluate
 import datasets
 _CITATION = """\
+@article{Zhang2014ARO,
+  title={A Review on Multi-Label Learning Algorithms},
+  author={Min-Ling Zhang and Zhi-Hua Zhou},
+  journal={IEEE Transactions on Knowledge and Data Engineering},
+  year={2014},
+  volume={26},
+  pages={1819-1837},
+  url={https://api.semanticscholar.org/CorpusID:1008003}
 }
 """
 _DESCRIPTION = """\
+Implementation of example based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014).
 """
 _KWARGS_DESCRIPTION = """
+Implementation of example based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014).
 Args:
     predictions: list of predictions to score. Each predictions
+        should be a list of predicted labels
     references: list of reference for each prediction. Each
+        reference should be a list of reference labels
 Returns:
+    precision
+    recall
+    accuracy
+    fscore
 Examples:
+    >>> multi_label_precision_recall_accuracy_fscore = evaluate.load("mdocekal/multi_label_precision_recall_accuracy_fscore")
+    >>> results = multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ],
+                references=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ]
+            )
     >>> print(results)
+    {
+        "precision": 1.0,
+        "recall": 1.0,
+        "accuracy": 1.0,
+        "fscore": 1.0
+    }
 """
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class MultiLabelPrecisionRecallAccuracyFscore(evaluate.Metric):
+    """
+    Implementation of example based evaluation metrics for multi-label classification presented in Zhang and Zhou (2014).
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Initializes the metric and stores its F-score and multiset settings.
+        All positional and keyword arguments are forwarded unchanged to the parent initializer.
+        An optional ``beta`` keyword argument becomes the default beta for the F-score (1.0 when absent).
+        The multiset variant is enabled when the configuration name equals "multiset".
+        """
+        super().__init__(*args, **kwargs)
+        # config_name is read only after the parent initializer has run
+        # (presumably that is where it gets populated - confirm against evaluate.Metric).
+        self.use_multiset = self.config_name == "multiset"
+        self.beta = kwargs["beta"] if "beta" in kwargs else 1.0
     def _info(self):
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
+            features=[
+                datasets.Features({
+                    'predictions': datasets.Sequence(datasets.Value('int64')),
+                    'references': datasets.Sequence(datasets.Value('int64')),
+                }),
+                datasets.Features({
+                    'predictions': datasets.Sequence(datasets.Value('string')),
+                    'references': datasets.Sequence(datasets.Value('string')),
+                }),
+            ]
         )
+    def eval_example(self, prediction, reference):
+        """
+        Scores a single example.
+        Args:
+            prediction: predicted labels of one example
+            reference: reference labels of one example
+        Returns: tuple (precision, recall, accuracy), where
+            precision is |prediction ∩ reference| / |prediction|,
+            recall is |prediction ∩ reference| / |reference|,
+            accuracy is |prediction ∩ reference| / |prediction ∪ reference|.
+            A score whose denominator is zero is reported as 0.
+            With use_multiset the labels are treated as multisets (repeated labels count),
+            otherwise as plain sets.
+        """
+        to_collection = Counter if self.use_multiset else set
+        predicted = to_collection(prediction)
+        expected = to_collection(reference)
+        def cardinality(collection):
+            # A Counter's size is the sum of its counts; a set's size is its length.
+            return sum(collection.values()) if self.use_multiset else len(collection)
+        # For Counters, & keeps the minimum count per label and | keeps the maximum count.
+        n_common = cardinality(predicted & expected)
+        n_union = cardinality(predicted | expected)
+        n_predicted = cardinality(predicted)
+        n_expected = cardinality(expected)
+        precision = n_common / n_predicted if n_predicted > 0 else 0
+        recall = n_common / n_expected if n_expected > 0 else 0
+        accuracy = n_common / n_union if n_union > 0 else 0
+        return precision, recall, accuracy
+    def _compute(self, predictions: list[list[Union[int, str]]], references: list[list[Union[int, str]]],
+                 beta: Optional[float] = None) -> dict[str, float]:
+        """
+        Computes metrics for a list of predictions and references
+        Precision, recall and accuracy are computed per example and averaged over all examples;
+        the F-score is then computed from the averaged precision and recall.
+        Args:
+            predictions: list of predictions to score. Each predictions
+                should be a list of predicted labels
+            references: list of reference for each prediction. Each
+                reference should be a list of reference labels
+            beta: beta value for F-score calculation
+                if None the beta is set to default value
+        Returns: dict with
+            precision
+            recall
+            accuracy
+            fscore
+            All four values are 0.0 when there are no examples to average over.
+        """
+        assert len(predictions) == len(references), "Predictions and references must have the same length"
+        if beta is None:
+            beta = self.beta
+        num_examples = len(predictions)
+        if num_examples == 0:
+            # No examples: averaging would divide by zero, so report zeros
+            # (the same convention eval_example uses for empty label collections).
+            return {
+                "precision": 0.0,
+                "recall": 0.0,
+                "accuracy": 0.0,
+                "fscore": 0.0
+            }
+        precision, recall, accuracy = 0, 0, 0
+        for prediction, reference in zip(predictions, references):
+            p, r, a = self.eval_example(prediction, reference)
+            precision += p
+            recall += r
+            accuracy += a
+        precision /= num_examples
+        recall /= num_examples
+        accuracy /= num_examples
+        if precision + recall == 0:
+            # Both averages are zero, so the F-score formula would be 0/0.
+            fscore = 0.0
+        else:
+            fscore = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
         return {
+            "precision": precision,
+            "recall": recall,
             "accuracy": accuracy,
+            "fscore": fscore
+        }

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- git+https://github.com/huggingface/evaluate@main


1	+ evaluate
2	+ datasets

tests.py CHANGED Viewed

@@ -1,17 +1,333 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]

+from unittest import TestCase
+from multi_label_precision_recall_accuracy_fscore import MultiLabelPrecisionRecallAccuracyFscore
+class MultiLabelPrecisionRecallAccuracyFscoreTest(TestCase):
+    """
+    All of these tests are also used for multiset configuration. So please mind this and write the test in a way that
+    it is valid for both configurations (do not use same label multiple times).
+    """
+    def setUp(self):
+        """Creates a fresh metric instance with the default configuration for every test."""
+        self.multi_label_precision_recall_accuracy_fscore = MultiLabelPrecisionRecallAccuracyFscore()
+    def test_eok(self):
+        """Integer predictions identical to the references score 1.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 1.0,
+                "recall": 1.0,
+                "accuracy": 1.0,
+                "fscore": 1.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [1, 2],
+                    [0, 1, 2],
+                ],
+                references=[
+                    [0, 1],
+                    [1, 2],
+                    [0, 1, 2],
+                ]
+            )
+        )
+    def test_eok_string(self):
+        """String predictions identical to the references score 1.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 1.0,
+                "recall": 1.0,
+                "accuracy": 1.0,
+                "fscore": 1.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ],
+                references=[
+                    ["0", "1"],
+                    ["1", "2"],
+                    ["0", "1", "2"],
+                ]
+            )
+        )
+    def test_empty(self):
+        """Empty predictions paired with empty references are expected to score 0.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "accuracy": 0.0,
+                "fscore": 0.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [],
+                    [],
+                    [],
+                ],
+                references=[
+                    [],
+                    [],
+                    [],
+                ]
+            )
+        )
+    def test_empty_reference(self):
+        """Non-empty predictions against empty references are expected to score 0.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "accuracy": 0.0,
+                "fscore": 0.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [1, 2],
+                    [0, 1, 2],
+                ],
+                references=[
+                    [],
+                    [],
+                    [],
+                ]
+            )
+        )
+    def test_empty_prediction(self):
+        """Empty predictions against non-empty references are expected to score 0.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "accuracy": 0.0,
+                "fscore": 0.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [],
+                    [],
+                    [],
+                ],
+                references=[
+                    [0, 1],
+                    [1, 2],
+                    [0, 1, 2],
+                ]
+            )
+        )
+    def test_completely_different(self):
+        """Predictions sharing no label with their references are expected to score 0.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "accuracy": 0.0,
+                "fscore": 0.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [1, 2],
+                    [0, 1, 2],
+                ],
+                references=[
+                    [3, 4],
+                    [5, 6],
+                    [7, 8, 9],
+                ]
+            )
+        )
+    def test_max_precision(self):
+        """A prediction that is a strict subset of the reference has precision 1.0 and recall 0.5 here."""
+        self.assertDictEqual(
+            {
+                "precision": 1.0,
+                "recall": 0.5,
+                "accuracy": 0.5,
+                "fscore": 2/3
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1]
+                ],
+                references=[
+                    [0, 1, 2, 3]
+                ]
+            )
+        )
+    def test_max_recall(self):
+        """A prediction that is a strict superset of the reference has recall 1.0 and precision 0.5 here."""
+        self.assertDictEqual(
+            {
+                "precision": 0.5,
+                "recall": 1.0,
+                "accuracy": 0.5,
+                "fscore": 2/3
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1, 2, 3]
+                ],
+                references=[
+                    [0, 1]
+                ]
+            )
+        )
+    def test_partial_match(self):
+        """One shared label out of two on each side: precision and recall 0.5, accuracy 1/3."""
+        self.assertDictEqual(
+            {
+                "precision": 0.5,
+                "recall": 0.5,
+                "accuracy": 1/3,
+                "fscore": 0.5
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1]
+                ],
+                references=[
+                    [0, 2]
+                ]
+            )
+        )
+    def test_partial_match_multi_sample(self):
+        """Per-example scores are averaged over three examples; the F-score uses the averaged precision and recall."""
+        self.assertDictEqual(
+            {
+                "precision": 2.5/3,
+                "recall": 2/3,
+                "accuracy": 0.5,
+                "fscore": 2*(2.5/3 * 2/3) / (2.5/3 + 2/3)
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [0, 1],
+                    [2, 3]
+                ],
+                references=[
+                    [0, 1, 2, 3],
+                    [0, 1, 2, 3],
+                    [2]
+                ]
+            )
+        )
+    def test_beta(self):
+        """Checks the F-score with beta taken from the instance attribute (2) and from the compute argument (3)."""
+        # First assertion: default beta is overridden on the instance.
+        self.multi_label_precision_recall_accuracy_fscore.beta = 2
+        self.assertDictEqual(
+            {
+                "precision": 2.5/3,
+                "recall": 2/3,
+                "accuracy": 0.5,
+                "fscore": 5*(2.5/3 * 2/3) / (4*2.5/3 + 2/3)
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [0, 1],
+                    [2, 3]
+                ],
+                references=[
+                    [0, 1, 2, 3],
+                    [0, 1, 2, 3],
+                    [2]
+                ]
+            )
+        )
+        # Second assertion: beta passed to compute takes precedence over the instance attribute.
+        self.assertDictEqual(
+            {
+                "precision": 2.5 / 3,
+                "recall": 2 / 3,
+                "accuracy": 0.5,
+                "fscore": 10 * (2.5 / 3 * 2 / 3) / (9 * 2.5 / 3 + 2 / 3)
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1],
+                    [0, 1],
+                    [2, 3]
+                ],
+                references=[
+                    [0, 1, 2, 3],
+                    [0, 1, 2, 3],
+                    [2]
+                ],
+                beta=3
+            )
+        )
+class MultiLabelPrecisionRecallAccuracyFscoreTestMultiset(MultiLabelPrecisionRecallAccuracyFscoreTest):
+    """
+    Runs every inherited test against the "multiset" configuration and adds tests that use repeated labels.
+    """
+    def setUp(self):
+        """Creates a fresh metric instance with the multiset configuration for every test."""
+        self.multi_label_precision_recall_accuracy_fscore = MultiLabelPrecisionRecallAccuracyFscore(config_name="multiset")
+    def test_multiset_eok(self):
+        """Predictions equal to the references as multisets (label order differs) score 1.0 on every metric."""
+        self.assertDictEqual(
+            {
+                "precision": 1.0,
+                "recall": 1.0,
+                "accuracy": 1.0,
+                "fscore": 1.0
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1, 1],
+                    [1, 2, 2],
+                    [0, 1, 2, 1],
+                ],
+                references=[
+                    [1, 0, 1],
+                    [1, 2, 2],
+                    [0, 1, 1, 2],
+                ]
+            )
+        )
+    def test_multiset_partial_match(self):
+        """All three predicted labels occur in the six-label reference: precision 1.0, recall and accuracy 0.5."""
+        self.assertDictEqual(
+            {
+                "precision": 1.0,
+                "recall": 0.5,
+                "accuracy": 0.5,
+                "fscore": 2/3
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1, 1]
+                ],
+                references=[
+                    [1, 0, 1, 1, 0, 0],
+                ]
+            )
+        )
+    def test_multiset_partial_match_multi_sample(self):
+        """Averages multiset scores over two examples; the F-score uses the averaged precision and recall."""
+        # Expected per-example precision is 1 and 2/3, recall is 3/4 and 1.
+        p = (1+2/3) / 2
+        r = (3/4 + 1) / 2
+        self.assertDictEqual(
+            {
+                "precision": p,
+                "recall": r,
+                "accuracy": (3/4 + 2/3) / 2,
+                "fscore": 2*p*r / (p + r)
+            },
+            self.multi_label_precision_recall_accuracy_fscore.compute(
+                predictions=[
+                    [0, 1, 1],
+                    [1, 2, 2]
+                ],
+                references=[
+                    [1, 0, 1, 1],
+                    [1, 2],
+                ]
+            )
+        )