cifkao committed on
Commit
6c2e238
·
1 Parent(s): 3033d56

Better handling of bad characters

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -34,19 +34,23 @@ def get_windows_batched(
34
 
35
  BAD_CHAR = chr(0xfffd)
36
 
37
- def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False):
38
  cur_ids = []
39
  result = []
 
 
 
 
40
  for idx in ids:
41
  cur_ids.append(idx)
42
  decoded = tokenizer.decode(cur_ids)
43
- if BAD_CHAR not in decoded:
44
  if strip_whitespace:
45
  decoded = decoded.strip()
46
  result.append(decoded)
47
  del cur_ids[:]
48
  else:
49
- result.append("")
50
  return result
51
 
52
  def nll_score(logprobs, labels):
 
34
 
35
  BAD_CHAR = chr(0xfffd)
36
 
37
+ def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False, bad_token_replacement=BAD_CHAR):
38
  cur_ids = []
39
  result = []
40
+ bad_ids = [
41
+ _id for _id in tokenizer.convert_tokens_to_ids([BAD_CHAR, " " + BAD_CHAR])
42
+ if _id != tokenizer.unk_token_id
43
+ ]
44
  for idx in ids:
45
  cur_ids.append(idx)
46
  decoded = tokenizer.decode(cur_ids)
47
+ if BAD_CHAR not in decoded or any(_id in cur_ids for _id in bad_ids):
48
  if strip_whitespace:
49
  decoded = decoded.strip()
50
  result.append(decoded)
51
  del cur_ids[:]
52
  else:
53
+ result.append(bad_token_replacement)
54
  return result
55
 
56
  def nll_score(logprobs, labels):