Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,16 +14,15 @@ import logging
|
|
14 |
import subprocess
|
15 |
|
16 |
# Set up logging
|
17 |
-
logging.basicConfig(level=logging.INFO)
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
# Initialize Hugging Face API
|
21 |
-
HF_TOKEN = os.getenv("HF_TOKEN")
|
22 |
-
REPO_NAME = "pdf-images-extracted"
|
23 |
hf_api = HfApi()
|
24 |
|
25 |
def check_poppler():
|
26 |
-
"""Check if poppler-utils is installed."""
|
27 |
try:
|
28 |
result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
|
29 |
logger.info(f"Poppler version: {result.stdout}")
|
@@ -33,29 +32,23 @@ def check_poppler():
|
|
33 |
return False
|
34 |
|
35 |
def ensure_hf_dataset():
|
36 |
-
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
38 |
if not HF_TOKEN:
|
39 |
raise ValueError("HF_TOKEN is not set")
|
40 |
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
41 |
-
logger.info(f"
|
42 |
return repo_id
|
43 |
except Exception as e:
|
44 |
-
logger.error(f"
|
45 |
-
return f"Error: Failed to
|
46 |
|
47 |
def upload_image_to_hf(image, filename):
|
48 |
-
"""Upload an image to Hugging Face dataset and return its URL."""
|
49 |
repo_id = ensure_hf_dataset()
|
50 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
51 |
return repo_id
|
52 |
-
|
53 |
try:
|
54 |
-
# Save image temporarily
|
55 |
temp_path = f"/tmp/temp_{filename}.png"
|
56 |
image.save(temp_path, format="PNG")
|
57 |
-
|
58 |
-
# Upload to Hugging Face dataset
|
59 |
file_url = hf_api.upload_file(
|
60 |
path_or_fileobj=temp_path,
|
61 |
path_in_repo=f"images/{filename}.png",
|
@@ -64,20 +57,19 @@ def upload_image_to_hf(image, filename):
|
|
64 |
token=HF_TOKEN
|
65 |
)
|
66 |
os.remove(temp_path)
|
67 |
-
logger.info(f"Uploaded image
|
68 |
return file_url
|
69 |
except Exception as e:
|
70 |
-
logger.error(f"
|
71 |
return f"Error uploading image: {str(e)}"
|
72 |
|
73 |
def extract_text_from_pdf(pdf_input):
|
74 |
-
"""Extract text from PDF using pdfplumber."""
|
75 |
try:
|
76 |
-
if isinstance(pdf_input, str):
|
77 |
-
response = requests.get(pdf_input, stream=True)
|
78 |
response.raise_for_status()
|
79 |
pdf_file = io.BytesIO(response.content)
|
80 |
-
else:
|
81 |
pdf_file = pdf_input
|
82 |
with pdfplumber.open(pdf_file) as pdf:
|
83 |
text = ""
|
@@ -89,46 +81,35 @@ def extract_text_from_pdf(pdf_input):
|
|
89 |
text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
|
90 |
return text
|
91 |
except Exception as e:
|
92 |
-
logger.error(f"
|
93 |
return f"Error extracting text: {str(e)}"
|
94 |
|
95 |
def extract_images_from_pdf(pdf_input):
|
96 |
-
"""Extract images from PDF and convert to PIL images."""
|
97 |
if not check_poppler():
|
98 |
-
return "Error: poppler-utils not found.
|
99 |
-
|
100 |
try:
|
101 |
-
if isinstance(pdf_input, str):
|
102 |
-
|
103 |
-
response = requests.get(pdf_input, stream=True)
|
104 |
response.raise_for_status()
|
105 |
images = convert_from_bytes(response.content)
|
106 |
-
else:
|
107 |
-
logger.info(f"Processing uploaded PDF: {pdf_input.name}")
|
108 |
images = convert_from_path(pdf_input.name)
|
109 |
return images
|
110 |
except Exception as e:
|
111 |
-
logger.error(f"
|
112 |
return f"Error extracting images: {str(e)}"
|
113 |
|
114 |
def format_to_markdown(text, images):
|
115 |
-
"""Convert extracted text and images to Markdown format."""
|
116 |
markdown_output = "# Extracted PDF Content\n\n"
|
117 |
-
|
118 |
-
# Clean and format text
|
119 |
-
text = re.sub(r'\n\s*\n+', '\n\n', text.strip()) # Normalize newlines
|
120 |
lines = text.split("\n")
|
121 |
for line in lines:
|
122 |
-
# Detect headings (heuristic: all caps or specific keywords)
|
123 |
if line.isupper() and len(line) > 5:
|
124 |
markdown_output += f"## {line}\n\n"
|
125 |
-
# Detect lists (lines starting with numbers or bullets)
|
126 |
elif re.match(r'^\s*[\d\-*+]\.\s+', line):
|
127 |
markdown_output += f"- {line.strip()[2:]}\n"
|
128 |
else:
|
129 |
markdown_output += f"{line}\n\n"
|
130 |
-
|
131 |
-
# Add images with Hugging Face dataset URLs
|
132 |
if isinstance(images, list) and images:
|
133 |
markdown_output += "## Extracted Images\n\n"
|
134 |
for i, image in enumerate(images):
|
@@ -136,82 +117,79 @@ def format_to_markdown(text, images):
|
|
136 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
137 |
filename = f"image_{i}_{timestamp}"
|
138 |
image_url = upload_image_to_hf(image, filename)
|
139 |
-
|
140 |
if not image_url.startswith("Error"):
|
141 |
markdown_output += f"\n"
|
142 |
if ocr_text:
|
143 |
markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
|
144 |
else:
|
145 |
markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
|
146 |
-
|
147 |
return markdown_output
|
148 |
|
149 |
def process_pdf(pdf_input, pdf_url):
|
150 |
-
|
151 |
-
status
|
152 |
-
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
153 |
-
|
154 |
-
def update_status(message):
|
155 |
-
status[0] = message
|
156 |
-
return status[0]
|
157 |
-
|
158 |
if not HF_TOKEN:
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
# Log poppler status
|
163 |
-
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
164 |
-
update_status("Checking poppler-utils...")
|
165 |
-
|
166 |
-
# Decode URL-encoded string if provided
|
167 |
if pdf_url and pdf_url.strip():
|
168 |
pdf_url = urllib.parse.unquote(pdf_url)
|
169 |
-
|
170 |
-
|
171 |
try:
|
172 |
-
response = requests.head(pdf_url, allow_redirects=True)
|
173 |
response.raise_for_status()
|
174 |
pdf_input = pdf_url
|
175 |
except requests.RequestException as e:
|
176 |
-
|
177 |
-
|
178 |
-
return
|
179 |
elif not pdf_input:
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
184 |
text = extract_text_from_pdf(pdf_input)
|
185 |
-
update_status("Extracting images from PDF...")
|
186 |
-
images = extract_images_from_pdf(pdf_input)
|
187 |
-
|
188 |
if isinstance(text, str) and text.startswith("Error"):
|
189 |
-
|
190 |
-
|
|
|
|
|
|
|
|
|
191 |
if isinstance(images, str) and images.startswith("Error"):
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
|
|
196 |
markdown_output = format_to_markdown(text, images)
|
197 |
-
|
198 |
-
|
|
|
199 |
|
200 |
# Gradio Interface
|
201 |
iface = gr.Interface(
|
202 |
fn=process_pdf,
|
203 |
inputs=[
|
204 |
-
gr.File(label="Upload PDF File",
|
205 |
-
gr.Textbox(label="PDF URL", placeholder="Enter
|
206 |
],
|
207 |
outputs=[
|
208 |
gr.Markdown(label="Markdown Output"),
|
209 |
gr.Textbox(label="Processing Status", interactive=False),
|
210 |
],
|
211 |
title="PDF to Markdown Converter",
|
212 |
-
description="
|
213 |
allow_flagging="never"
|
214 |
)
|
215 |
|
216 |
if __name__ == "__main__":
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
import subprocess
|
15 |
|
16 |
# Set up logging
|
17 |
+
# Timestamped, level-tagged log lines for the whole process.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Hugging Face configuration: the token is read from the environment (None when
# unset), REPO_NAME is the dataset repository used for uploaded images, and
# hf_api is the shared client used for uploads.
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_NAME = "pdf-images-extracted"
hf_api = HfApi()
|
24 |
|
25 |
def check_poppler():
|
|
|
26 |
try:
|
27 |
result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
|
28 |
logger.info(f"Poppler version: {result.stdout}")
|
|
|
32 |
return False
|
33 |
|
34 |
def ensure_hf_dataset():
    """Create the Hugging Face dataset repository if needed and return its id.

    Returns:
        The value returned by ``create_repo`` on success. On any failure
        (including a missing ``HF_TOKEN``) a string starting with
        ``"Error"`` is returned instead of raising; callers test for that
        prefix.
    """
    try:
        if HF_TOKEN:
            dataset_repo = create_repo(
                repo_id=REPO_NAME,
                token=HF_TOKEN,
                repo_type="dataset",
                exist_ok=True,  # reuse the repository when it already exists
            )
            logger.info(f"Dataset repo: {dataset_repo}")
            return dataset_repo
        # Raised inside the try so it is logged and reported like any other failure.
        raise ValueError("HF_TOKEN is not set")
    except Exception as err:
        logger.error(f"Dataset error: {str(err)}")
        return f"Error: Failed to access dataset: {str(err)}"
|
44 |
|
45 |
def upload_image_to_hf(image, filename):
|
|
|
46 |
repo_id = ensure_hf_dataset()
|
47 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
48 |
return repo_id
|
|
|
49 |
try:
|
|
|
50 |
temp_path = f"/tmp/temp_{filename}.png"
|
51 |
image.save(temp_path, format="PNG")
|
|
|
|
|
52 |
file_url = hf_api.upload_file(
|
53 |
path_or_fileobj=temp_path,
|
54 |
path_in_repo=f"images/{filename}.png",
|
|
|
57 |
token=HF_TOKEN
|
58 |
)
|
59 |
os.remove(temp_path)
|
60 |
+
logger.info(f"Uploaded image: {file_url}")
|
61 |
return file_url
|
62 |
except Exception as e:
|
63 |
+
logger.error(f"Image upload error: {str(e)}")
|
64 |
return f"Error uploading image: {str(e)}"
|
65 |
|
66 |
def extract_text_from_pdf(pdf_input):
|
|
|
67 |
try:
|
68 |
+
if isinstance(pdf_input, str):
|
69 |
+
response = requests.get(pdf_input, stream=True, timeout=10)
|
70 |
response.raise_for_status()
|
71 |
pdf_file = io.BytesIO(response.content)
|
72 |
+
else:
|
73 |
pdf_file = pdf_input
|
74 |
with pdfplumber.open(pdf_file) as pdf:
|
75 |
text = ""
|
|
|
81 |
text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
|
82 |
return text
|
83 |
except Exception as e:
|
84 |
+
logger.error(f"Text extraction error: {str(e)}")
|
85 |
return f"Error extracting text: {str(e)}"
|
86 |
|
87 |
def extract_images_from_pdf(pdf_input):
    """Render the pages of a PDF as images.

    Args:
        pdf_input: Either a string, which is fetched over HTTP as a URL, or
            an object whose ``name`` attribute is the path of a PDF on disk
            (presumably the Gradio upload object -- confirm against the
            installed Gradio version).

    Returns:
        Whatever ``convert_from_bytes`` / ``convert_from_path`` returns on
        success, or a string starting with ``"Error"`` when poppler is
        unavailable or any step fails.
    """
    # The conversion relies on poppler; report its absence up front.
    if not check_poppler():
        return "Error: poppler-utils not found."

    try:
        if not isinstance(pdf_input, str):
            return convert_from_path(pdf_input.name)

        reply = requests.get(pdf_input, stream=True, timeout=10)
        reply.raise_for_status()
        return convert_from_bytes(reply.content)
    except Exception as err:
        logger.error(f"Image extraction error: {str(err)}")
        return f"Error extracting images: {str(err)}"
|
101 |
|
102 |
def format_to_markdown(text, images):
|
|
|
103 |
markdown_output = "# Extracted PDF Content\n\n"
|
104 |
+
text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
|
|
|
|
|
105 |
lines = text.split("\n")
|
106 |
for line in lines:
|
|
|
107 |
if line.isupper() and len(line) > 5:
|
108 |
markdown_output += f"## {line}\n\n"
|
|
|
109 |
elif re.match(r'^\s*[\d\-*+]\.\s+', line):
|
110 |
markdown_output += f"- {line.strip()[2:]}\n"
|
111 |
else:
|
112 |
markdown_output += f"{line}\n\n"
|
|
|
|
|
113 |
if isinstance(images, list) and images:
|
114 |
markdown_output += "## Extracted Images\n\n"
|
115 |
for i, image in enumerate(images):
|
|
|
117 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
118 |
filename = f"image_{i}_{timestamp}"
|
119 |
image_url = upload_image_to_hf(image, filename)
|
|
|
120 |
if not image_url.startswith("Error"):
|
121 |
markdown_output += f"\n"
|
122 |
if ocr_text:
|
123 |
markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
|
124 |
else:
|
125 |
markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
|
|
|
126 |
return markdown_output
|
127 |
|
128 |
def process_pdf(pdf_input, pdf_url):
    """Convert a PDF, given as an upload or a URL, to Markdown.

    Args:
        pdf_input: The uploaded file from the file input, or a falsy value
            when nothing was uploaded.
        pdf_url: Optional URL of a PDF; may be URL-encoded and may carry
            surrounding whitespace. When non-blank it takes precedence over
            ``pdf_input``.

    Returns:
        A ``(markdown, status)`` tuple matching the two interface outputs.
        On failure the first element is an error message rather than
        Markdown and the second describes which step failed.
    """
    status = "Starting PDF processing..."
    logger.info(status)

    if not HF_TOKEN:
        status = "Error: HF_TOKEN not set."
        logger.error(status)
        return status, status

    if pdf_url and pdf_url.strip():
        # Strip BEFORE decoding: pasted URLs often carry stray whitespace or
        # a newline, and the guard above already ignores it. Decoding first
        # would make an intentional trailing "%20" indistinguishable from
        # that stray whitespace.
        pdf_url = urllib.parse.unquote(pdf_url.strip())
        status = f"Downloading PDF from URL: {pdf_url}"
        logger.info(status)
        try:
            # Cheap reachability probe before the extractors download the body.
            response = requests.head(pdf_url, allow_redirects=True, timeout=5)
            response.raise_for_status()
            pdf_input = pdf_url
        except requests.RequestException as e:
            status = f"Error accessing URL: {str(e)}"
            logger.error(status)
            return status, status
    elif not pdf_input:
        status = "Error: No PDF provided."
        logger.error(status)
        return status, status

    status = "Extracting text..."
    logger.info(status)
    text = extract_text_from_pdf(pdf_input)
    # The extractors signal failure by returning a string starting with "Error".
    if isinstance(text, str) and text.startswith("Error"):
        status = "Text extraction failed."
        logger.error(status)
        return text, status

    status = "Extracting images..."
    logger.info(status)
    images = extract_images_from_pdf(pdf_input)
    if isinstance(images, str) and images.startswith("Error"):
        status = "Image extraction failed."
        logger.error(status)
        return images, status

    status = "Formatting output..."
    logger.info(status)
    markdown_output = format_to_markdown(text, images)

    status = "Processing complete."
    logger.info(status)
    return markdown_output, status
|
171 |
|
172 |
# Gradio Interface
|
173 |
# Web UI: two alternative PDF sources in, rendered Markdown plus a status line out.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", file_types=[".pdf"]),
        gr.Textbox(
            label="PDF URL",
            placeholder="Enter PDF URL (e.g., https://example.com/file.pdf)",
        ),
    ],
    outputs=[
        gr.Markdown(label="Markdown Output"),
        gr.Textbox(label="Processing Status", interactive=False),
    ],
    title="PDF to Markdown Converter",
    description=(
        "Convert a PDF file or URL to Markdown. "
        "Extracts text, images, and tables, with images uploaded to a Hugging Face dataset. "
        "Supports URL-encoded strings. "
        "Requires HF_TOKEN in Spaces Secrets."
    ),
    allow_flagging="never",
)
|
187 |
|
188 |
if __name__ == "__main__":
    logger.info("Starting Gradio app...")
    try:
        # Launch without blocking so the success message below is logged once
        # the server is actually up.
        iface.launch(server_name="0.0.0.0", server_port=7860, prevent_thread_lock=True)
        logger.info("Gradio app started successfully.")
        # With prevent_thread_lock=True, launch() returns immediately; without
        # this call the script would end here and the server would go down
        # with it. Block the main thread until the process is interrupted.
        iface.block_thread()
    except Exception as e:
        logger.error(f"Failed to start Gradio app: {str(e)}")
        raise
|