Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

77541b8

verified ·

1 Parent(s): 7a57213

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -15

app.py CHANGED Viewed

@@ -8,13 +8,12 @@ import io
 import os
 from huggingface_hub import HfApi, create_repo
 import re
-import markdown
 from datetime import datetime
 # Initialize Hugging Face API
 hf_api = HfApi()
-HF_TOKEN = os.getenv("HF_TOKEN")  # Set your Hugging Face API token as an environment variable
-REPO_NAME = "pdf-images-extracted"  # Hugging Face repo name
 def ensure_hf_repo():
     """Create or get Hugging Face repository."""
@@ -32,7 +31,7 @@ def upload_image_to_hf(image, filename):
     try:
         # Save image temporarily
-        temp_path = f"temp_{filename}.png"
         image.save(temp_path, format="PNG")
         # Upload to Hugging Face
@@ -92,7 +91,6 @@ def format_to_markdown(text, images):
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
-            # Perform OCR on image to include any text (e.g., in charts)
             ocr_text = pytesseract.image_to_string(image).strip()
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
@@ -109,19 +107,19 @@ def format_to_markdown(text, images):
 def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
     if pdf_url and pdf_url.strip():
-        # Process PDF from URL
         response = requests.head(pdf_url)
         if response.status_code != 200:
             return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
         pdf_file = pdf_url
     elif pdf_input:
-        # Process uploaded PDF
         pdf_file = pdf_input
     else:
         return "Error: Please provide a PDF file or URL."
-    # Extract text and images
     text = extract_text_from_pdf(pdf_file)
     images = extract_images_from_pdf(pdf_file)
@@ -130,7 +128,6 @@ def process_pdf(pdf_input, pdf_url):
     if isinstance(images, str) and images.startswith("Error"):
         return images
-    # Generate Markdown
     markdown_output = format_to_markdown(text, images)
     return markdown_output
@@ -143,12 +140,8 @@ iface = gr.Interface(
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
-    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
 )
 if __name__ == "__main__":
-    # Ensure Hugging Face token is set
-    if not HF_TOKEN:
-        print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")
-    else:
-        iface.launch()

 import os
 from huggingface_hub import HfApi, create_repo
 import re
 from datetime import datetime
 # Initialize Hugging Face API
+HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
+REPO_NAME = "pdf-images-extracted"  # Hugging Face repo for images
 hf_api = HfApi()
 def ensure_hf_repo():
     """Create or get Hugging Face repository."""
     try:
         # Save image temporarily
+        temp_path = f"/tmp/temp_{filename}.png"
         image.save(temp_path, format="PNG")
         # Upload to Hugging Face
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
             ocr_text = pytesseract.image_to_string(image).strip()
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
 def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
+    if not HF_TOKEN:
+        return "Error: HF_TOKEN not set in Spaces Secrets."
     if pdf_url and pdf_url.strip():
         response = requests.head(pdf_url)
         if response.status_code != 200:
             return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
         pdf_file = pdf_url
     elif pdf_input:
         pdf_file = pdf_input
     else:
         return "Error: Please provide a PDF file or URL."
     text = extract_text_from_pdf(pdf_file)
     images = extract_images_from_pdf(pdf_file)
     if isinstance(images, str) and images.startswith("Error"):
         return images
     markdown_output = format_to_markdown(text, images)
     return markdown_output
     ],
     outputs=gr.Markdown(label="Markdown Output"),
     title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
 )
 if __name__ == "__main__":
+    iface.launch()