NuExtract-1.5

Running on Zero

liamcripwell committed on Oct 15, 2024

Commit

13e3b2f

1 Parent(s): b17b8f7

allow non-json template

Files changed (1) hide show

app.py CHANGED Viewed

@@ -92,11 +92,31 @@ def sliding_window_prediction(template, text, model, tokenizer, window_size=4000
         pred = handle_broken_output(pred, prev)
         # create highlighted text
-        highlighted_pred = highlight_words(text, json.loads(pred))
         # Sync empty fields
-        synced_pred = sync_empty_fields(json.loads(pred), json.loads(template))
-        synced_pred = json.dumps(synced_pred, indent=4, ensure_ascii=False)
         # Return progress, current prediction, and updated HTML
         yield f"Processed chunk {i+1}/{len(chunks)}", synced_pred, highlighted_pred
@@ -118,13 +138,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=auth_token)
 model.eval()
 def gradio_interface_function(template, text, is_example):
-    # reject invalid JSON
-    try:
-        template_json = json.loads(template)
-    except:
-        yield "", "Invalid JSON template", ""
-        return  # End the function since there was an error
     if len(tokenizer.tokenize(text)) > MAX_INPUT_SIZE:
         yield "", "Input text too long for space. Download model to use unrestricted.", ""
         return  # End the function since there was an error

         pred = handle_broken_output(pred, prev)
         # create highlighted text
+        try:
+            highlighted_pred = highlight_words(text, json.loads(pred))
+        except:
+            highlighted_pred = text
+        # attempt json parsing
+        template_dict = None
+        pred_dict = None
+        try:
+            template_dict = json.loads(template)
+        except:
+            pass
+        try:
+            pred_dict = json.loads(pred)
+        except:
+            pass
         # Sync empty fields
+        if template_dict and pred_dict:
+            synced_pred = sync_empty_fields(pred_dict, template_dict)
+            synced_pred = json.dumps(synced_pred, indent=4, ensure_ascii=False)
+        elif pred_dict:
+            synced_pred = json.dumps(pred_dict, indent=4, ensure_ascii=False)
+        else:
+            synced_pred = pred
         # Return progress, current prediction, and updated HTML
         yield f"Processed chunk {i+1}/{len(chunks)}", synced_pred, highlighted_pred
 model.eval()
 def gradio_interface_function(template, text, is_example):
     if len(tokenizer.tokenize(text)) > MAX_INPUT_SIZE:
         yield "", "Input text too long for space. Download model to use unrestricted.", ""
         return  # End the function since there was an error