Spaces:
Runtime error
Runtime error
Commit
·
b840e20
1
Parent(s):
2e85755
Fix to allow masked token after 512th token
Browse files. Sequences longer than 510 tokens are now truncated around the masked token for xlm-roberta-base, regardless of mask location.
app.py
CHANGED
@@ -31,8 +31,35 @@ xlmr_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', max_length=51
|
|
31 |
xlmr_p = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
32 |
|
33 |
def xlmr_base_fn(text):
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
pred_dict = {}
|
37 |
for pred in preds:
|
38 |
pred_dict[pred['token_str']] = pred['score']
|
|
|
xlmr_p = pipeline("fill-mask", model=model, tokenizer=tokenizer)

def xlmr_base_fn(text):
    """Run fill-mask prediction with xlm-roberta-base on *text*.

    Inputs longer than the model window are truncated to a window of
    subword tokens centred on the mask token, so the mask survives
    truncation regardless of where it appears in the input.

    Args:
        text: Input string containing exactly one mask token
            (``xlmr_tokenizer.mask_token``). Behaviour with zero masks is an
            IndexError; with several, the first is used as the centre.
    """
    # Locate the mask among the subword tokens. `in x` (not `== x`) because
    # SentencePiece may glue the word-boundary marker onto the mask piece.
    tokens = xlmr_tokenizer.tokenize(text)
    mask_token_idx = [i for i, x in enumerate(tokens) if xlmr_tokenizer.mask_token in x][0]

    # Usable window = model_max_length minus the two special tokens (<s>, </s>).
    # Some tokenizers report a huge sentinel for model_max_length, hence the
    # sanity clamp to 510 when the value doesn't look like a real 512-ish window.
    max_len = tokenizer.model_max_length
    max_len = max_len - 2 if max_len % 512 == 0 and max_len < 4096 else 510

    # Smart truncation for long sequences: keep a window centred on the mask.
    if not len(tokens) < max_len:
        # Symmetric bounds around the mask token (rbound is exclusive).
        lbound = max(0, mask_token_idx - (max_len // 2))
        rbound = min(len(tokens), mask_token_idx + (max_len // 2))

        # If one side hit an edge, expand the window in the other direction so
        # we still use the full budget.
        # FIX: rbound is an exclusive slice bound, so "reached the end" is
        # rbound == len(tokens); the original compared against len(tokens)-1.
        if lbound == 0 and rbound != len(tokens):
            rbound = min(len(tokens), max_len)
        elif rbound == len(tokens) and lbound != 0:
            lbound = max(0, len(tokens) - max_len)

        # Rejoin the kept subword tokens into plain text.
        truncated_text = ''.join(tokens[lbound:rbound])

        # SentencePiece marks word boundaries with U+2581 (LOWER ONE EIGHTH
        # BLOCK); map it back to an ordinary space.
        # FIX: the original iterated an undefined name `result` here, raising
        # NameError on every long input (the Space's "Runtime error").
        truncated_text = ''.join([x if ord(x) != 9601 else ' ' for x in truncated_text])
    else:
        truncated_text = text

    preds = xlmr_p(truncated_text)
    pred_dict = {}
    for pred in preds:
        pred_dict[pred['token_str']] = pred['score']
    # NOTE(review): the diff view is truncated here; the function presumably
    # returns pred_dict below this point — confirm against the full app.py.