ncoop57 committed on
Commit
5b3f71a
·
1 Parent(s): 7307644

Updating module

Browse files
Files changed (1) hide show
  1. levenshtein_distance.py +35 -4
levenshtein_distance.py CHANGED
@@ -16,6 +16,7 @@
16
  import evaluate
17
  import datasets
18
 
 
19
 
20
  # TODO: Add BibTeX citation
21
  _CITATION = """\
@@ -56,6 +57,29 @@ Examples:
56
  # TODO: Define external resources urls if needed
57
  BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class LevenshteinDistance(evaluate.Comparison):
@@ -86,10 +110,17 @@ class LevenshteinDistance(evaluate.Comparison):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
90
  """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
 
 
 
 
 
 
93
  return {
94
- "accuracy": accuracy,
95
  }
 
16
  import evaluate
17
  import datasets
18
 
19
+ import numpy as np
20
 
21
  # TODO: Add BibTeX citation
22
  _CITATION = """\
 
57
  # TODO: Define external resources urls if needed
58
  BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
59
 
60
+ # This code was taken from https://gist.github.com/kylebgorman/1081951/bce3de986e4b05fc0b63d4d9e0cfa4bde6664365
61
+ def _dist(A, B, insertion, deletion, substitution):
62
+ D = np.zeros((len(A) + 1, len(B) + 1))
63
+ for i in range(len(A)):
64
+ D[i + 1][0] = D[i][0] + deletion
65
+ for j in range(len(B)):
66
+ D[0][j + 1] = D[0][j] + insertion
67
+ for i in range(len(A)): # fill out middle of matrix
68
+ for j in range(len(B)):
69
+ if A[i] == B[j]:
70
+ D[i + 1][j + 1] = D[i][j] # aka, it's free.
71
+ else:
72
+ D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
73
+ D[i][j + 1] + deletion,
74
+ D[i][j] + substitution)
75
+ return D
76
+
77
def levenshtein_distance(l1, l2, normalize=False):
    """Return the unit-cost Levenshtein distance between two sequences.

    Args:
        l1: First sequence (string or list of tokens).
        l2: Second sequence.
        normalize: When ``False`` (default) return the raw edit distance.
            When ``True`` return ``1 - distance / max(len(l1), len(l2))``,
            so identical inputs score 1.0.

    Returns:
        The raw distance, or the normalized score when ``normalize`` is set.
    """
    longest = max(len(l1), len(l2))
    if normalize and longest == 0:
        # Two empty sequences are identical; dividing by the zero length
        # would otherwise yield NaN.
        return 1.0

    # Insertion, deletion and substitution all cost 1; the bottom-right
    # cell of the table holds the full distance.
    dist = _dist(l1, l2, 1, 1, 1)[-1][-1]
    if normalize:
        return 1. - dist / longest
    return dist
83
 
84
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
85
  class LevenshteinDistance(evaluate.Comparison):
 
110
  # TODO: Download external resources if needed
111
  pass
112
 
113
def _compute(self, predictions, references, tokenizer=None, normalize=False):
    """Return the mean Levenshtein score over prediction/reference pairs.

    Args:
        predictions: Iterable of predicted items.
        references: Iterable of reference items, paired with
            ``predictions`` by position via ``zip``.
        tokenizer: Optional callable applied to every prediction and
            reference before comparison. When ``None`` the items are
            compared as given (element by element, i.e. per character
            for strings).
        normalize: Forwarded to ``levenshtein_distance``.

    Returns:
        A dict with the single key ``"levenshtein_distance"`` holding the
        ``np.mean`` of the per-pair scores.
    """
    if tokenizer is None:
        # The default used to be called directly, which raised
        # "TypeError: 'NoneType' object is not callable".
        def tokenizer(sequence):
            return sequence

    dists = [
        levenshtein_distance(tokenizer(prediction), tokenizer(reference), normalize=normalize)
        for prediction, reference in zip(predictions, references)
    ]

    avg_dist = np.mean(dists)

    return {
        "levenshtein_distance": avg_dist,
    }