Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Community

broadfield-dev committed on Jun 2

Commit

0e0f376

verified ·

1 Parent(s): 0dd31f7

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -17

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import PyPDF2
 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
 import os
 from huggingface_hub import HfApi, create_repo
 import re
@@ -47,9 +48,15 @@ def upload_image_to_hf(image, filename):
     except Exception as e:
         return f"Error uploading image: {str(e)}"
-def extract_text_from_pdf(pdf_file):
-    """Extract text from PDF using PyPDF2."""
     try:
         reader = PyPDF2.PdfReader(pdf_file)
         text = ""
         for page in reader.pages:
@@ -59,14 +66,15 @@ def extract_text_from_pdf(pdf_file):
     except Exception as e:
         return f"Error extracting text: {str(e)}"
-def extract_images_from_pdf(pdf_file):
-    """Extract images from PDF and convert to PIL images."""
     try:
-        if isinstance(pdf_file, str):  # URL case
-            response = requests.get(pdf_file, stream=True)
             images = convert_from_bytes(response.content)
         else:  # File upload case
-            images = convert_from_path(pdf_file.name)
         return images
     except Exception as e:
         return f"Error extracting images: {str(e)}"
@@ -116,18 +124,15 @@ def process_pdf(pdf_input, pdf_url):
         pdf_url = urllib.parse.unquote(pdf_url)
         try:
             response = requests.head(pdf_url, allow_redirects=True)
-            if response.status_code != 200:
-                return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
-            pdf_file = pdf_url
         except requests.RequestException as e:
             return f"Error accessing URL: {str(e)}"
-    elif pdf_input:
-        pdf_file = pdf_input
-    else:
         return "Error: Please provide a PDF file or URL."
-    text = extract_text_from_pdf(pdf_file)
-    images = extract_images_from_pdf(pdf_file)
     if isinstance(text, str) and text.startswith("Error"):
         return text
@@ -142,11 +147,11 @@ iface = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
-        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":

 from pdf2image import convert_from_path, convert_from_bytes
 import pytesseract
 from PIL import Image
+import io
 import os
 from huggingface_hub import HfApi, create_repo
 import re
     except Exception as e:
         return f"Error uploading image: {str(e)}"
+def extract_text_from_pdf(pdf_input):
+    """Extract text from PDF (URL or file) using PyPDF2."""
     try:
+        if isinstance(pdf_input, str):  # URL case
+            response = requests.get(pdf_input, stream=True)
+            response.raise_for_status()
+            pdf_file = io.BytesIO(response.content)
+        else:  # File upload case
+            pdf_file = pdf_input
         reader = PyPDF2.PdfReader(pdf_file)
         text = ""
         for page in reader.pages:
     except Exception as e:
         return f"Error extracting text: {str(e)}"
+def extract_images_from_pdf(pdf_input):
+    """Convert a PDF (URL, local path, or uploaded file object) to PIL images.
+
+    Args:
+        pdf_input: One of
+            * an ``http://`` / ``https://`` URL string, which is downloaded;
+            * a local filesystem path (``str`` or ``os.PathLike``) -- this is
+              what ``gr.File(type="filepath")`` passes for an upload;
+            * an object exposing the path as ``.name`` (older upload style).
+
+    Returns:
+        The image list produced by pdf2image, or a string starting with
+        ``"Error extracting images: "`` if anything fails.
+    """
+    try:
+        # A str is not necessarily a URL: file uploads also arrive as a str
+        # path, so decide by scheme rather than by type alone.
+        is_url = isinstance(pdf_input, str) and pdf_input.lower().startswith(
+            ("http://", "https://")
+        )
+        if is_url:  # URL case
+            # The whole body is needed at once, so no stream=True; the
+            # timeout keeps a stalled server from hanging the request forever.
+            response = requests.get(pdf_input, timeout=30)
+            response.raise_for_status()
+            images = convert_from_bytes(response.content)
+        else:  # File upload case
+            if isinstance(pdf_input, (str, os.PathLike)):
+                pdf_path = pdf_input
+            else:
+                pdf_path = pdf_input.name
+            images = convert_from_path(pdf_path)
+        return images
+    except Exception as e:
+        # Callers detect failure by checking for a string result.
+        return f"Error extracting images: {str(e)}"
         pdf_url = urllib.parse.unquote(pdf_url)
         try:
             response = requests.head(pdf_url, allow_redirects=True)
+            response.raise_for_status()
+            pdf_input = pdf_url
         except requests.RequestException as e:
             return f"Error accessing URL: {str(e)}"
+    elif not pdf_input:
         return "Error: Please provide a PDF file or URL."
+    text = extract_text_from_pdf(pdf_input)
+    images = extract_images_from_pdf(pdf_input)
     if isinstance(text, str) and text.startswith("Error"):
         return text
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF File", type="filepath"),
+        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":