Spaces:
Runtime error
Runtime error
Updating module
Browse files- levenshtein_distance.py +35 -4
levenshtein_distance.py
CHANGED
@@ -16,6 +16,7 @@
|
|
16 |
import evaluate
|
17 |
import datasets
|
18 |
|
|
|
19 |
|
20 |
# TODO: Add BibTeX citation
|
21 |
_CITATION = """\
|
@@ -56,6 +57,29 @@ Examples:
|
|
56 |
# TODO: Define external resources urls if needed
|
57 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
61 |
class LevenshteinDistance(evaluate.Comparison):
|
@@ -86,10 +110,17 @@ class LevenshteinDistance(evaluate.Comparison):
|
|
86 |
# TODO: Download external resources if needed
|
87 |
pass
|
88 |
|
89 |
-
def _compute(self, predictions, references):
|
90 |
"""Returns the scores"""
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return {
|
94 |
-
"
|
95 |
}
|
|
|
16 |
import evaluate
|
17 |
import datasets
|
18 |
|
19 |
+
import numpy as np
|
20 |
|
21 |
# TODO: Add BibTeX citation
|
22 |
_CITATION = """\
|
|
|
57 |
# TODO: Define external resources urls if needed
|
58 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
59 |
|
60 |
+
# This code was taken from https://gist.github.com/kylebgorman/1081951/bce3de986e4b05fc0b63d4d9e0cfa4bde6664365
|
61 |
+
def _dist(A, B, insertion, deletion, substitution):
|
62 |
+
D = np.zeros((len(A) + 1, len(B) + 1))
|
63 |
+
for i in range(len(A)):
|
64 |
+
D[i + 1][0] = D[i][0] + deletion
|
65 |
+
for j in range(len(B)):
|
66 |
+
D[0][j + 1] = D[0][j] + insertion
|
67 |
+
for i in range(len(A)): # fill out middle of matrix
|
68 |
+
for j in range(len(B)):
|
69 |
+
if A[i] == B[j]:
|
70 |
+
D[i + 1][j + 1] = D[i][j] # aka, it's free.
|
71 |
+
else:
|
72 |
+
D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
|
73 |
+
D[i][j + 1] + deletion,
|
74 |
+
D[i][j] + substitution)
|
75 |
+
return D
|
76 |
+
|
77 |
+
def levenshtein_distance(l1, l2, normalize=False):
    """Compute the unit-cost Levenshtein distance between two sequences.

    Args:
        l1: First sequence (e.g. a string or a list of tokens).
        l2: Second sequence.
        normalize: When false (the default), return the raw edit distance.
            When true, return ``1 - distance / max(len(l1), len(l2))``.
            Note that this normalized value is a similarity: it is 1.0 for
            identical sequences and 0.0 when every position must be edited.

    Returns:
        The edit distance, or the normalized similarity when ``normalize``
        is true.
    """
    # Insertion, deletion and substitution all cost 1.
    dist = _dist(l1, l2, 1, 1, 1)[-1][-1]
    if not normalize:
        return dist

    longest = max(len(l1), len(l2))
    if longest == 0:
        # Two empty sequences are identical; returning 1.0 avoids 0 / 0 -> nan.
        return 1.0
    return 1. - dist / longest
|
83 |
|
84 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
85 |
class LevenshteinDistance(evaluate.Comparison):
|
|
|
110 |
# TODO: Download external resources if needed
|
111 |
pass
|
112 |
|
113 |
+
def _compute(self, predictions, references, tokenizer=None, normalize=False):
    """Return the mean Levenshtein score over prediction/reference pairs.

    Args:
        predictions: Iterable of predicted sequences.
        references: Iterable of reference sequences, paired with
            ``predictions`` by position via ``zip``.
        tokenizer: Optional callable applied to each prediction and
            reference before comparison. When ``None``, the inputs are
            compared as given (strings are compared item by item).
        normalize: Forwarded to ``levenshtein_distance``.

    Returns:
        A dict with the single key ``"levenshtein_distance"`` holding the
        ``np.mean`` of the per-pair scores.
    """
    dists = []
    for prediction, reference in zip(predictions, references):
        # The default tokenizer is None, so it must not be called blindly.
        if tokenizer is not None:
            prediction = tokenizer(prediction)
            reference = tokenizer(reference)
        dists.append(levenshtein_distance(prediction, reference, normalize=normalize))

    avg_dist = np.mean(dists)

    return {
        "levenshtein_distance": avg_dist,
    }
|