Spaces:
Running
Running
Update Space (evaluate main: 0b7ed95a)
Browse files
README.md
CHANGED
|
@@ -38,12 +38,8 @@ At minimum, this metric takes as input a list of predictions and a list of refer
|
|
| 38 |
>>> references = ["hello there", "general kenobi"]
|
| 39 |
>>> results = rouge.compute(predictions=predictions,
|
| 40 |
... references=references)
|
| 41 |
-
>>> print(list(results.keys()))
|
| 42 |
-
|
| 43 |
-
>>> print(results["rouge1"])
|
| 44 |
-
AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
|
| 45 |
-
>>> print(results["rouge1"].mid.fmeasure)
|
| 46 |
-
1.0
|
| 47 |
```
|
| 48 |
|
| 49 |
### Inputs
|
|
@@ -62,18 +58,18 @@ AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(pre
|
|
| 62 |
- **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
|
| 63 |
|
| 64 |
### Output Values
|
| 65 |
-
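The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of `Score` objects, with one score for each sentence. Each `Score` object includes the `precision`, `recall`, and `fmeasure`. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is: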
|
| 66 |
|
| 67 |
```python
|
| 68 |
-
{'rouge1': [
|
| 69 |
```
|
| 70 |
|
| 71 |
If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
|
| 72 |
```python
|
| 73 |
-
{'rouge1':
|
| 74 |
```
|
| 75 |
|
| 76 |
-
The `precision`, `recall`, and `fmeasure` values all have a range of 0 to 1.
|
| 77 |
|
| 78 |
|
| 79 |
#### Values from Popular Papers
|
|
@@ -86,11 +82,12 @@ An example without aggregation:
|
|
| 86 |
>>> predictions = ["hello goodbye", "ankh morpork"]
|
| 87 |
>>> references = ["goodbye", "general kenobi"]
|
| 88 |
>>> results = rouge.compute(predictions=predictions,
|
| 89 |
-
... references=references
|
|
|
|
| 90 |
>>> print(list(results.keys()))
|
| 91 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
| 92 |
>>> print(results["rouge1"])
|
| 93 |
-
[
|
| 94 |
```
|
| 95 |
|
| 96 |
The same example, but with aggregation:
|
|
@@ -104,7 +101,7 @@ The same example, but with aggregation:
|
|
| 104 |
>>> print(list(results.keys()))
|
| 105 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
| 106 |
>>> print(results["rouge1"])
|
| 107 |
-
|
| 108 |
```
|
| 109 |
|
| 110 |
The same example, but only calculating `rouge_1`:
|
|
@@ -119,7 +116,7 @@ The same example, but only calculating `rouge_1`:
|
|
| 119 |
>>> print(list(results.keys()))
|
| 120 |
['rouge1']
|
| 121 |
>>> print(results["rouge1"])
|
| 122 |
-
|
| 123 |
```
|
| 124 |
|
| 125 |
## Limitations and Bias
|
|
|
|
| 38 |
>>> references = ["hello there", "general kenobi"]
|
| 39 |
>>> results = rouge.compute(predictions=predictions,
|
| 40 |
... references=references)
|
| 41 |
+
>>> print(results)
|
| 42 |
+
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
```
|
| 44 |
|
| 45 |
### Inputs
|
|
|
|
| 58 |
- **use_stemmer** (`boolean`): If `True`, uses Porter stemmer to strip word suffixes. Defaults to `False`.
|
| 59 |
|
| 60 |
### Output Values
|
| 61 |
+
The output is a dictionary with one entry for each rouge type in the input list `rouge_types`. If `use_aggregator=False`, each dictionary entry is a list of scores, with one score for each sentence. E.g. if `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=False`, the output is:
|
| 62 |
|
| 63 |
```python
|
| 64 |
+
{'rouge1': [0.6666666666666666, 1.0], 'rouge2': [0.0, 1.0]}
|
| 65 |
```
|
| 66 |
|
| 67 |
If `rouge_types=['rouge1', 'rouge2']` and `use_aggregator=True`, the output is of the following format:
|
| 68 |
```python
|
| 69 |
+
{'rouge1': 1.0, 'rouge2': 1.0}
|
| 70 |
```
|
| 71 |
|
| 72 |
+
The ROUGE values are in the range of 0 to 1.
|
| 73 |
|
| 74 |
|
| 75 |
#### Values from Popular Papers
|
|
|
|
| 82 |
>>> predictions = ["hello goodbye", "ankh morpork"]
|
| 83 |
>>> references = ["goodbye", "general kenobi"]
|
| 84 |
>>> results = rouge.compute(predictions=predictions,
|
| 85 |
+
... references=references,
|
| 86 |
+
... use_aggregator=False)
|
| 87 |
>>> print(list(results.keys()))
|
| 88 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
| 89 |
>>> print(results["rouge1"])
|
| 90 |
+
[0.5, 0.0]
|
| 91 |
```
|
| 92 |
|
| 93 |
The same example, but with aggregation:
|
|
|
|
| 101 |
>>> print(list(results.keys()))
|
| 102 |
['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
|
| 103 |
>>> print(results["rouge1"])
|
| 104 |
+
0.25
|
| 105 |
```
|
| 106 |
|
| 107 |
The same example, but only calculating `rouge_1`:
|
|
|
|
| 116 |
>>> print(list(results.keys()))
|
| 117 |
['rouge1']
|
| 118 |
>>> print(results["rouge1"])
|
| 119 |
+
0.25
|
| 120 |
```
|
| 121 |
|
| 122 |
## Limitations and Bias
|
rouge.py
CHANGED
|
@@ -65,22 +65,18 @@ Args:
|
|
| 65 |
use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
|
| 66 |
use_aggregator: Return aggregates if this is set to True
|
| 67 |
Returns:
|
| 68 |
-
rouge1: rouge_1 (precision, recall, f1),
|
| 69 |
-
rouge2: rouge_2 (precision, recall, f1),
|
| 70 |
-
rougeL: rouge_l (precision, recall, f1),
|
| 71 |
-
rougeLsum: rouge_lsum (precision, recall, f1)
|
| 72 |
Examples:
|
| 73 |
|
| 74 |
>>> rouge = evaluate.load('rouge')
|
| 75 |
>>> predictions = ["hello there", "general kenobi"]
|
| 76 |
>>> references = ["hello there", "general kenobi"]
|
| 77 |
>>> results = rouge.compute(predictions=predictions, references=references)
|
| 78 |
-
>>> print(list(results.keys()))
|
| 79 |
-
|
| 80 |
-
>>> print(results["rouge1"])
|
| 81 |
-
AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))
|
| 82 |
-
>>> print(results["rouge1"].mid.fmeasure)
|
| 83 |
-
1.0
|
| 84 |
"""
|
| 85 |
|
| 86 |
|
|
@@ -123,9 +119,12 @@ class Rouge(evaluate.EvaluationModule):
|
|
| 123 |
|
| 124 |
if use_aggregator:
|
| 125 |
result = aggregator.aggregate()
|
|
|
|
|
|
|
|
|
|
| 126 |
else:
|
| 127 |
result = {}
|
| 128 |
for key in scores[0]:
|
| 129 |
-
result[key] = list(score[key] for score in scores)
|
| 130 |
|
| 131 |
return result
|
|
|
|
| 65 |
use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
|
| 66 |
use_aggregator: Return aggregates if this is set to True
|
| 67 |
Returns:
|
| 68 |
+
rouge1: rouge_1 (f1),
|
| 69 |
+
rouge2: rouge_2 (f1),
|
| 70 |
+
rougeL: rouge_l (f1),
|
| 71 |
+
rougeLsum: rouge_lsum (f1)
|
| 72 |
Examples:
|
| 73 |
|
| 74 |
>>> rouge = evaluate.load('rouge')
|
| 75 |
>>> predictions = ["hello there", "general kenobi"]
|
| 76 |
>>> references = ["hello there", "general kenobi"]
|
| 77 |
>>> results = rouge.compute(predictions=predictions, references=references)
|
| 78 |
+
>>> print(results)
|
| 79 |
+
{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""
|
| 81 |
|
| 82 |
|
|
|
|
| 119 |
|
| 120 |
if use_aggregator:
|
| 121 |
result = aggregator.aggregate()
|
| 122 |
+
for key in result:
|
| 123 |
+
result[key] = result[key].mid.fmeasure
|
| 124 |
+
|
| 125 |
else:
|
| 126 |
result = {}
|
| 127 |
for key in scores[0]:
|
| 128 |
+
result[key] = list(score[key].fmeasure for score in scores)
|
| 129 |
|
| 130 |
return result
|