Better handling of bad characters
app.py
CHANGED
@@ -34,19 +34,23 @@ def get_windows_batched(
 
 BAD_CHAR = chr(0xfffd)
 
-def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False):
+def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False, bad_token_replacement=BAD_CHAR):
     cur_ids = []
     result = []
+    bad_ids = [
+        _id for _id in tokenizer.convert_tokens_to_ids([BAD_CHAR, " " + BAD_CHAR])
+        if _id != tokenizer.unk_token_id
+    ]
     for idx in ids:
         cur_ids.append(idx)
         decoded = tokenizer.decode(cur_ids)
-        if BAD_CHAR not in decoded:
+        if BAD_CHAR not in decoded or any(_id in cur_ids for _id in bad_ids):
             if strip_whitespace:
                 decoded = decoded.strip()
             result.append(decoded)
             del cur_ids[:]
         else:
-            result.append(
+            result.append(bad_token_replacement)
     return result
 
 def nll_score(logprobs, labels):
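In short: ids_to_readable_tokens decodes ids one at a time, buffering them while the partial decode contains U+FFFD (the Unicode replacement character, which byte-level tokenizers produce when a multi-byte UTF-8 character is split across tokens). The commit makes two changes: the string appended for ids that cannot (yet) be decoded on their own is now configurable via bad_token_replacement, and tokens whose vocabulary entry genuinely decodes to U+FFFD (i.e., whose id is a real vocabulary id rather than unk_token_id) now flush the buffer instead of stalling it.

A minimal usage sketch, assuming ids_to_readable_tokens from app.py is in scope and a Hugging Face transformers tokenizer is available; the model name and example text are illustrative, not taken from the Space:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# An emoji spans several UTF-8 bytes, so a byte-level BPE tokenizer will
# usually split it across tokens; each such token alone decodes to U+FFFD.
ids = tokenizer.encode("hello 🙂")
tokens = ids_to_readable_tokens(tokenizer, ids, bad_token_replacement="□")

# One output string per input id: ids that only partially decode a
# character come out as "□", and the id that completes the character
# carries the merged, readable text.
assert len(tokens) == len(ids)
print(tokens)

Keeping the output aligned one-to-one with the input ids is presumably what lets the Space pair each readable token with a per-token score such as the one computed by nll_score below.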