Spaces:

Tonic
/

GOT-OCR

Running

Tonic committed on Sep 14, 2024

Commit

3497964

unverified ·

1 Parent(s): 63a03ad

return formatted res

Files changed (1) hide show

app.py CHANGED Viewed

@@ -132,6 +132,36 @@ def update_inputs(task):
             gr.update(visible=True)
         ]
 def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
     res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)
@@ -139,9 +169,7 @@ def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
         return res, None
     res = res.replace("\\title", "\\title ")
-    lines = re.split(r'\\\\', res)  # Split on double backslashes
-    formatted_lines = [f"$$ {line.strip()} $$" for line in lines if line.strip()]
-    formatted_res = "\n".join(formatted_lines)
     if html_content:
         encoded_html = base64.b64encode(html_content.encode('utf-8')).decode('utf-8')

             gr.update(visible=True)
         ]
def parse_latex_output(res):
    """Wrap the LaTeX-looking lines of *res* in ``$$ ... $$`` delimiters.

    The text is processed line by line; blank lines are dropped and every
    kept line is stripped of surrounding whitespace.

    * Lines that belong to a ``tabular`` environment (from a line starting
      with ``\\begin{tabular}`` up to the line containing the matching
      ``\\end{tabular}``) are each wrapped in ``$$ ... $$``.
    * Lines starting with ``\\title``, ``\\author`` or ``\\section`` are
      passed through unchanged.
    * Any other line containing one of ``\\ { } $ _ ^`` is wrapped in
      ``$$ ... $$``; remaining lines are passed through unchanged.
    * A line that is already delimited by ``$$ ... $$`` is never wrapped a
      second time.

    Args:
        res: The raw text to format (a ``str``).

    Returns:
        The processed lines joined with ``'\\n'``.
    """
    tabular_begin = '\\begin{tabular}'
    tabular_end = '\\end{tabular}'

    def as_display_math(text):
        # Avoid producing "$$ $$ ... $$ $$" for lines that already carry
        # their own display-math delimiters.
        if len(text) >= 4 and text.startswith('$$') and text.endswith('$$'):
            return text
        return f'$$ {text} $$'

    parsed_lines = []
    in_tabular = False
    for raw_line in res.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if in_tabular or line.startswith((tabular_begin, tabular_end)):
            parsed_lines.append(as_display_math(line))
            # Decide the state from the *last* marker on the line so that a
            # one-line "\begin{tabular}...\end{tabular}" or a row ending in
            # "\end{tabular}" closes the environment instead of leaving
            # every following line treated as table content.
            last_begin = line.rfind(tabular_begin)
            last_end = line.rfind(tabular_end)
            if last_begin != -1 or last_end != -1:
                in_tabular = last_begin > last_end
            continue

        if line.startswith(('\\title', '\\author', '\\section')):
            parsed_lines.append(line)
            continue

        # A backslash in the class already covers \hline, \begin and \end,
        # so no separate keyword scan is needed.
        if re.search(r'[\\{}$_^]', line):
            parsed_lines.append(as_display_math(line))
        else:
            parsed_lines.append(line)
    return '\n'.join(parsed_lines)
 def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
     res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)
         return res, None
     res = res.replace("\\title", "\\title ")
+    formatted_res = parse_latex_output(res)
     if html_content:
         encoded_html = base64.b64encode(html_content.encode('utf-8')).decode('utf-8')