Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

4d96b5c

verified ·

1 Parent(s): 893b405

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -6

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ def parse_pdf(pdf_file, output_format):
         text = ""
         tables = []
         images = []
-        markdown_text = ""  # Initialize markdown_text outside conditional blocks
         for page in pages:
             for element in page:
@@ -25,14 +25,15 @@ def parse_pdf(pdf_file, output_format):
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
         # ...
-    # Convert extracted data to desired format
     if output_format == "JSON":
         json_data = {
             "text": text,
             "tables": tables,  # Implement table conversion to JSON
             "images": images  # Implement image conversion to JSON (e.g., base64)
         }
-        return json_data
     elif output_format == "Markdown":
         markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
         # Implement table conversion to Markdown
@@ -40,7 +41,8 @@ def parse_pdf(pdf_file, output_format):
         markdown_text += "\n# Images\n"
         # Implement image conversion to Markdown (e.g., embedding images)
         # ...
-        return markdown_text
     elif output_format == "HTML":
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
         # Implement table conversion to HTML
@@ -48,7 +50,9 @@ def parse_pdf(pdf_file, output_format):
         html_text += "<h2>Images</h2>\n"
         # Implement image conversion to HTML (e.g., embedding images)
         # ...
-        return html_text
 iface = gr.Interface(
     fn=parse_pdf,
@@ -62,4 +66,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch()

         text = ""
         tables = []
         images = []
+        download_data = None  # Initialize an empty variable for download data
         for page in pages:
             for element in page:
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
         # ...
+    # Convert extracted data to desired format and populate download_data
     if output_format == "JSON":
         json_data = {
             "text": text,
             "tables": tables,  # Implement table conversion to JSON
             "images": images  # Implement image conversion to JSON (e.g., base64)
         }
+        download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
     elif output_format == "Markdown":
         markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
         # Implement table conversion to Markdown
         markdown_text += "\n# Images\n"
         # Implement image conversion to Markdown (e.g., embedding images)
         # ...
+        download_data = markdown_text.encode("utf-8")  # Encode Markdown for download
     elif output_format == "HTML":
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
         # Implement table conversion to HTML
         html_text += "<h2>Images</h2>\n"
         # Implement image conversion to HTML (e.g., embedding images)
         # ...
+        download_data = html_text.encode("utf-8")  # Encode HTML for download
+    return text, download_data
 iface = gr.Interface(
     fn=parse_pdf,
 )
 if __name__ == "__main__":
+    iface.launch(share=True)  # Set share=True to create a public link