Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
-
import
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
|
|
19 |
|
20 |
# Initialize Hugging Face API
|
21 |
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
22 |
-
REPO_NAME = "
|
23 |
hf_api = HfApi()
|
24 |
|
25 |
def check_poppler():
|
@@ -36,11 +36,11 @@ def ensure_hf_dataset():
|
|
36 |
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
38 |
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
39 |
-
logger.info(f"
|
40 |
return repo_id
|
41 |
except Exception as e:
|
42 |
-
logger.error(f"
|
43 |
-
return f"Error
|
44 |
|
45 |
def upload_image_to_hf(image, filename):
|
46 |
"""Upload an image to Hugging Face dataset and return its URL."""
|
@@ -68,8 +68,9 @@ def upload_image_to_hf(image, filename):
|
|
68 |
logger.error(f"Error uploading image: {str(e)}")
|
69 |
return f"Error uploading image: {str(e)}"
|
70 |
|
71 |
-
def extract_text_from_pdf(pdf_input):
|
72 |
-
"""Extract text from PDF
|
|
|
73 |
try:
|
74 |
if isinstance(pdf_input, str): # URL case
|
75 |
response = requests.get(pdf_input, stream=True)
|
@@ -77,18 +78,22 @@ def extract_text_from_pdf(pdf_input):
|
|
77 |
pdf_file = io.BytesIO(response.content)
|
78 |
else: # File upload case
|
79 |
pdf_file = pdf_input
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
85 |
return text
|
86 |
except Exception as e:
|
87 |
logger.error(f"Error extracting text: {str(e)}")
|
88 |
return f"Error extracting text: {str(e)}"
|
89 |
|
90 |
-
def extract_images_from_pdf(pdf_input):
|
91 |
-
"""Extract images from PDF
|
|
|
92 |
if not check_poppler():
|
93 |
return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
|
94 |
|
@@ -106,15 +111,16 @@ def extract_images_from_pdf(pdf_input):
|
|
106 |
logger.error(f"Error extracting images: {str(e)}")
|
107 |
return f"Error extracting images: {str(e)}"
|
108 |
|
109 |
-
def format_to_markdown(text, images):
|
110 |
"""Convert extracted text and images to Markdown format."""
|
|
|
111 |
markdown_output = "# Extracted PDF Content\n\n"
|
112 |
|
113 |
# Clean and format text
|
114 |
text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
|
115 |
lines = text.split("\n")
|
116 |
for line in lines:
|
117 |
-
# Detect headings (
|
118 |
if line.isupper() and len(line) > 5:
|
119 |
markdown_output += f"## {line}\n\n"
|
120 |
# Detect lists (lines starting with numbers or bullets)
|
@@ -127,6 +133,7 @@ def format_to_markdown(text, images):
|
|
127 |
if isinstance(images, list) and images:
|
128 |
markdown_output += "## Extracted Images\n\n"
|
129 |
for i, image in enumerate(images):
|
|
|
130 |
ocr_text = pytesseract.image_to_string(image).strip()
|
131 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
132 |
filename = f"image_{i}_{timestamp}"
|
@@ -141,11 +148,14 @@ def format_to_markdown(text, images):
|
|
141 |
|
142 |
return markdown_output
|
143 |
|
144 |
-
def process_pdf(pdf_input, pdf_url):
|
145 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
146 |
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
|
|
|
|
147 |
if not HF_TOKEN:
|
148 |
-
|
|
|
149 |
|
150 |
# Log poppler status
|
151 |
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
@@ -154,39 +164,55 @@ def process_pdf(pdf_input, pdf_url):
|
|
154 |
if pdf_url and pdf_url.strip():
|
155 |
pdf_url = urllib.parse.unquote(pdf_url)
|
156 |
logger.info(f"Decoded URL: {pdf_url}")
|
|
|
157 |
try:
|
158 |
response = requests.head(pdf_url, allow_redirects=True)
|
159 |
response.raise_for_status()
|
160 |
pdf_input = pdf_url
|
161 |
except requests.RequestException as e:
|
162 |
logger.error(f"Error accessing URL: {str(e)}")
|
163 |
-
|
|
|
164 |
elif not pdf_input:
|
165 |
-
|
|
|
166 |
|
167 |
-
text = extract_text_from_pdf(pdf_input)
|
168 |
-
images = extract_images_from_pdf(pdf_input)
|
169 |
|
170 |
if isinstance(text, str) and text.startswith("Error"):
|
171 |
-
|
|
|
172 |
if isinstance(images, str) and images.startswith("Error"):
|
173 |
-
|
|
|
174 |
|
175 |
-
markdown_output = format_to_markdown(text, images)
|
176 |
-
|
|
|
177 |
|
178 |
# Gradio Interface
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
if __name__ == "__main__":
|
192 |
# In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
+
import pdfplumber
|
4 |
from pdf2image import convert_from_path, convert_from_bytes
|
5 |
import pytesseract
|
6 |
from PIL import Image
|
|
|
19 |
|
20 |
# Initialize Hugging Face API
|
21 |
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
22 |
+
REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
|
23 |
hf_api = HfApi()
|
24 |
|
25 |
def check_poppler():
|
|
|
36 |
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
38 |
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
39 |
+
logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
|
40 |
return repo_id
|
41 |
except Exception as e:
|
42 |
+
logger.error(f"Failed to create/access dataset repo: {str(e)}")
|
43 |
+
return f"Error: Failed to create/access dataset repo: {str(e)}"
|
44 |
|
45 |
def upload_image_to_hf(image, filename):
|
46 |
"""Upload an image to Hugging Face dataset and return its URL."""
|
|
|
68 |
logger.error(f"Error uploading image: {str(e)}")
|
69 |
return f"Error uploading image: {str(e)}"
|
70 |
|
71 |
+
def extract_text_from_pdf(pdf_input, status_callback):
|
72 |
+
"""Extract text from PDF using pdfplumber."""
|
73 |
+
status_callback("Extracting text from PDF...")
|
74 |
try:
|
75 |
if isinstance(pdf_input, str): # URL case
|
76 |
response = requests.get(pdf_input, stream=True)
|
|
|
78 |
pdf_file = io.BytesIO(response.content)
|
79 |
else: # File upload case
|
80 |
pdf_file = pdf_input
|
81 |
+
with pdfplumber.open(pdf_file) as pdf:
|
82 |
+
text = ""
|
83 |
+
for page in pdf.pages:
|
84 |
+
page_text = page.extract_text() or ""
|
85 |
+
text += page_text + "\n\n"
|
86 |
+
tables = page.extract_tables()
|
87 |
+
for table in tables:
|
88 |
+
text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
|
89 |
return text
|
90 |
except Exception as e:
|
91 |
logger.error(f"Error extracting text: {str(e)}")
|
92 |
return f"Error extracting text: {str(e)}"
|
93 |
|
94 |
+
def extract_images_from_pdf(pdf_input, status_callback):
|
95 |
+
"""Extract images from PDF and convert to PIL images."""
|
96 |
+
status_callback("Extracting images from PDF...")
|
97 |
if not check_poppler():
|
98 |
return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
|
99 |
|
|
|
111 |
logger.error(f"Error extracting images: {str(e)}")
|
112 |
return f"Error extracting images: {str(e)}"
|
113 |
|
114 |
+
def format_to_markdown(text, images, status_callback):
|
115 |
"""Convert extracted text and images to Markdown format."""
|
116 |
+
status_callback("Formatting output as Markdown...")
|
117 |
markdown_output = "# Extracted PDF Content\n\n"
|
118 |
|
119 |
# Clean and format text
|
120 |
text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
|
121 |
lines = text.split("\n")
|
122 |
for line in lines:
|
123 |
+
# Detect headings (heuristic: all caps or specific keywords)
|
124 |
if line.isupper() and len(line) > 5:
|
125 |
markdown_output += f"## {line}\n\n"
|
126 |
# Detect lists (lines starting with numbers or bullets)
|
|
|
133 |
if isinstance(images, list) and images:
|
134 |
markdown_output += "## Extracted Images\n\n"
|
135 |
for i, image in enumerate(images):
|
136 |
+
status_callback(f"Uploading image {i+1}...")
|
137 |
ocr_text = pytesseract.image_to_string(image).strip()
|
138 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
139 |
filename = f"image_{i}_{timestamp}"
|
|
|
148 |
|
149 |
return markdown_output
|
150 |
|
151 |
+
def process_pdf(pdf_input, pdf_url, status_callback):
|
152 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
153 |
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
154 |
+
status_callback("Starting PDF processing...")
|
155 |
+
|
156 |
if not HF_TOKEN:
|
157 |
+
status_callback("Error: HF_TOKEN not set.")
|
158 |
+
return "Error: HF_TOKEN not set in Spaces Secrets.", ""
|
159 |
|
160 |
# Log poppler status
|
161 |
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
|
|
164 |
if pdf_url and pdf_url.strip():
|
165 |
pdf_url = urllib.parse.unquote(pdf_url)
|
166 |
logger.info(f"Decoded URL: {pdf_url}")
|
167 |
+
status_callback(f"Downloading PDF from URL: {pdf_url}")
|
168 |
try:
|
169 |
response = requests.head(pdf_url, allow_redirects=True)
|
170 |
response.raise_for_status()
|
171 |
pdf_input = pdf_url
|
172 |
except requests.RequestException as e:
|
173 |
logger.error(f"Error accessing URL: {str(e)}")
|
174 |
+
status_callback(f"Error accessing URL: {str(e)}")
|
175 |
+
return f"Error accessing URL: {str(e)}", ""
|
176 |
elif not pdf_input:
|
177 |
+
status_callback("Error: No PDF provided.")
|
178 |
+
return "Error: Please provide a PDF file or URL.", ""
|
179 |
|
180 |
+
text = extract_text_from_pdf(pdf_input, status_callback)
|
181 |
+
images = extract_images_from_pdf(pdf_input, status_callback)
|
182 |
|
183 |
if isinstance(text, str) and text.startswith("Error"):
|
184 |
+
status_callback("Text extraction failed.")
|
185 |
+
return text, ""
|
186 |
if isinstance(images, str) and images.startswith("Error"):
|
187 |
+
status_callback("Image extraction failed.")
|
188 |
+
return images, ""
|
189 |
|
190 |
+
markdown_output = format_to_markdown(text, images, status_callback)
|
191 |
+
status_callback("Processing complete.")
|
192 |
+
return markdown_output, ""
|
193 |
|
194 |
# Gradio Interface
|
195 |
+
with gr.Blocks() as iface:
    # Static page header and usage description.
    gr.Markdown("# PDF to Markdown Converter")
    gr.Markdown("Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.")

    # The two alternative PDF sources sit side by side.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF File", type="filepath")
        pdf_url = gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF")

    status = gr.Textbox(label="Processing Status", interactive=False)
    output = gr.Markdown(label="Markdown Output")

    submit_btn = gr.Button("Process PDF")

    def update_status(message):
        """Return the status message unchanged (identity pass-through)."""
        return message

    def run_process_pdf(pdf_file, url):
        """Run ``process_pdf`` and surface its status messages in the UI.

        Event ``inputs`` must be Gradio components, so a plain Python
        callback cannot be listed there. This wrapper supplies the callback
        itself, records every message ``process_pdf`` reports, and returns
        the most recent one for the status textbox.

        Args:
            pdf_file: Value of the file-upload component.
            url: Text entered in the PDF URL textbox.

        Returns:
            A ``(markdown, status_text)`` pair matching ``outputs`` below.
        """
        messages = []

        def record(message):
            messages.append(update_status(message))

        # The visible return statements of process_pdf all yield a 2-tuple
        # whose second element is an empty placeholder; only the first
        # element (Markdown or an error string) is used here.
        markdown, _ = process_pdf(pdf_file, url, record)
        latest_status = messages[-1] if messages else ""
        return markdown, latest_status

    submit_btn.click(
        fn=run_process_pdf,
        inputs=[pdf_input, pdf_url],
        outputs=[output, status],
    )
|
216 |
|
217 |
if __name__ == "__main__":
|
218 |
# In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
|