Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
|
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
|
|
7 |
import os
|
8 |
from huggingface_hub import HfApi, create_repo
|
9 |
import re
|
@@ -47,9 +48,15 @@ def upload_image_to_hf(image, filename):
|
|
47 |
except Exception as e:
|
48 |
return f"Error uploading image: {str(e)}"
|
49 |
|
50 |
-
def extract_text_from_pdf(
|
51 |
-
"""Extract text from PDF using PyPDF2."""
|
52 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
reader = PyPDF2.PdfReader(pdf_file)
|
54 |
text = ""
|
55 |
for page in reader.pages:
|
@@ -59,14 +66,15 @@ def extract_text_from_pdf(pdf_file):
|
|
59 |
except Exception as e:
|
60 |
return f"Error extracting text: {str(e)}"
|
61 |
|
62 |
-
def extract_images_from_pdf(
|
63 |
-
"""Extract images from PDF and convert to PIL images."""
|
64 |
try:
|
65 |
-
if isinstance(
|
66 |
-
response = requests.get(
|
|
|
67 |
images = convert_from_bytes(response.content)
|
68 |
else: # File upload case
|
69 |
-
images = convert_from_path(
|
70 |
return images
|
71 |
except Exception as e:
|
72 |
return f"Error extracting images: {str(e)}"
|
@@ -116,18 +124,15 @@ def process_pdf(pdf_input, pdf_url):
|
|
116 |
pdf_url = urllib.parse.unquote(pdf_url)
|
117 |
try:
|
118 |
response = requests.head(pdf_url, allow_redirects=True)
|
119 |
-
|
120 |
-
|
121 |
-
pdf_file = pdf_url
|
122 |
except requests.RequestException as e:
|
123 |
return f"Error accessing URL: {str(e)}"
|
124 |
-
elif pdf_input:
|
125 |
-
pdf_file = pdf_input
|
126 |
-
else:
|
127 |
return "Error: Please provide a PDF file or URL."
|
128 |
|
129 |
-
text = extract_text_from_pdf(
|
130 |
-
images = extract_images_from_pdf(
|
131 |
|
132 |
if isinstance(text, str) and text.startswith("Error"):
|
133 |
return text
|
@@ -142,11 +147,11 @@ iface = gr.Interface(
|
|
142 |
fn=process_pdf,
|
143 |
inputs=[
|
144 |
gr.File(label="Upload PDF File", type="filepath"),
|
145 |
-
gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
|
146 |
],
|
147 |
outputs=gr.Markdown(label="Markdown Output"),
|
148 |
title="PDF to Markdown Converter",
|
149 |
-
description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
|
150 |
)
|
151 |
|
152 |
if __name__ == "__main__":
|
|
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
7 |
+
import io
|
8 |
import os
|
9 |
from huggingface_hub import HfApi, create_repo
|
10 |
import re
|
|
|
48 |
except Exception as e:
|
49 |
return f"Error uploading image: {str(e)}"
|
50 |
|
51 |
+
def extract_text_from_pdf(pdf_input):
|
52 |
+
"""Extract text from PDF (URL or file) using PyPDF2."""
|
53 |
try:
|
54 |
+
if isinstance(pdf_input, str): # URL case
|
55 |
+
response = requests.get(pdf_input, stream=True)
|
56 |
+
response.raise_for_status()
|
57 |
+
pdf_file = io.BytesIO(response.content)
|
58 |
+
else: # File upload case
|
59 |
+
pdf_file = pdf_input
|
60 |
reader = PyPDF2.PdfReader(pdf_file)
|
61 |
text = ""
|
62 |
for page in reader.pages:
|
|
|
66 |
except Exception as e:
|
67 |
return f"Error extracting text: {str(e)}"
|
68 |
|
69 |
+
def extract_images_from_pdf(pdf_input):
    """Render a PDF given as a URL, a local path, or a file object to images.

    Args:
        pdf_input: One of
            * an ``http://`` / ``https://`` URL string, which is downloaded
              and rendered from memory;
            * a local file path (``str`` or ``os.PathLike``) -- the form a
              ``gr.File(type="filepath")`` upload arrives in;
            * an object exposing its path through a ``.name`` attribute
              (e.g. a temp-file wrapper).

    Returns:
        Whatever ``convert_from_bytes`` / ``convert_from_path`` return on
        success, or a string starting with ``"Error extracting images: "``
        if any step fails.
    """
    # Local import keeps this function self-contained regardless of how the
    # module imported urllib at the top of the file.
    from urllib.parse import urlparse

    try:
        # A plain isinstance(..., str) check cannot tell a URL from an
        # uploaded file's path (both are strings), so look at the scheme.
        is_url = (
            isinstance(pdf_input, str)
            and urlparse(pdf_input).scheme in ("http", "https")
        )
        if is_url:
            # .content reads the whole body anyway, so stream=True bought
            # nothing; the timeout stops a dead server from hanging the app.
            response = requests.get(pdf_input, timeout=30)
            response.raise_for_status()
            return convert_from_bytes(response.content)

        # File upload case: accept a path directly, or fall back to the
        # ``.name`` attribute of a file-like wrapper.
        if isinstance(pdf_input, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_input)
        else:
            pdf_path = pdf_input.name
        return convert_from_path(pdf_path)
    except Exception as e:
        # Deliberately broad: failures are reported to the caller as an
        # "Error ..." string rather than raised, matching the other helpers.
        return f"Error extracting images: {str(e)}"
|
|
|
124 |
pdf_url = urllib.parse.unquote(pdf_url)
|
125 |
try:
|
126 |
response = requests.head(pdf_url, allow_redirects=True)
|
127 |
+
response.raise_for_status()
|
128 |
+
pdf_input = pdf_url
|
|
|
129 |
except requests.RequestException as e:
|
130 |
return f"Error accessing URL: {str(e)}"
|
131 |
+
elif not pdf_input:
|
|
|
|
|
132 |
return "Error: Please provide a PDF file or URL."
|
133 |
|
134 |
+
text = extract_text_from_pdf(pdf_input)
|
135 |
+
images = extract_images_from_pdf(pdf_input)
|
136 |
|
137 |
if isinstance(text, str) and text.startswith("Error"):
|
138 |
return text
|
|
|
147 |
fn=process_pdf,
|
148 |
inputs=[
|
149 |
gr.File(label="Upload PDF File", type="filepath"),
|
150 |
+
gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
|
151 |
],
|
152 |
outputs=gr.Markdown(label="Markdown Output"),
|
153 |
title="PDF to Markdown Converter",
|
154 |
+
description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
|
155 |
)
|
156 |
|
157 |
if __name__ == "__main__":
|