Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

ce01472

verified ·

1 Parent(s): 5e96fa0

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -5

app.py CHANGED Viewed

@@ -66,14 +66,14 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
-            with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
                         "text": text,
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
-                    json.dump(json_data, tmp, ensure_ascii=False, indent=4)  # Ensure ASCII compatibility
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
@@ -84,7 +84,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
-                    tmp.write(markdown_text.encode('utf-8'))
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
@@ -95,7 +95,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
-                    tmp.write(html_text.encode('utf-8'))
                 download_path = tmp.name
             return text, download_path
@@ -117,4 +117,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch()  # Temporarily disable sharing for debugging

                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
+            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
                         "text": text,
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
+                    json.dump(json_data, tmp, ensure_ascii=False, indent=4)
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
+                    tmp.write(markdown_text)
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
+                    tmp.write(html_text)
                 download_path = tmp.name
             return text, download_path
 )
 if __name__ == "__main__":
+    iface.launch()  # Temporarily disable sharing for debugging