broadfield-dev committed on
Commit
aec5733
·
verified ·
1 Parent(s): 5010ab5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -37
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import requests
3
- import PyPDF2
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
19
 
20
  # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
22
- REPO_NAME = "broadfield-dev/pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
@@ -36,11 +36,11 @@ def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
- logger.info(f"Using dataset repo: {repo_id}")
40
  return repo_id
41
  except Exception as e:
42
- logger.error(f"Error creating dataset repo: {str(e)}")
43
- return f"Error creating dataset repo: {str(e)}"
44
 
45
  def upload_image_to_hf(image, filename):
46
  """Upload an image to Hugging Face dataset and return its URL."""
@@ -68,8 +68,9 @@ def upload_image_to_hf(image, filename):
68
  logger.error(f"Error uploading image: {str(e)}")
69
  return f"Error uploading image: {str(e)}"
70
 
71
- def extract_text_from_pdf(pdf_input):
72
- """Extract text from PDF (URL or file) using PyPDF2."""
 
73
  try:
74
  if isinstance(pdf_input, str): # URL case
75
  response = requests.get(pdf_input, stream=True)
@@ -77,18 +78,22 @@ def extract_text_from_pdf(pdf_input):
77
  pdf_file = io.BytesIO(response.content)
78
  else: # File upload case
79
  pdf_file = pdf_input
80
- reader = PyPDF2.PdfReader(pdf_file)
81
- text = ""
82
- for page in reader.pages:
83
- page_text = page.extract_text() or ""
84
- text += page_text + "\n\n"
 
 
 
85
  return text
86
  except Exception as e:
87
  logger.error(f"Error extracting text: {str(e)}")
88
  return f"Error extracting text: {str(e)}"
89
 
90
- def extract_images_from_pdf(pdf_input):
91
- """Extract images from PDF (URL or file) and convert to PIL images."""
 
92
  if not check_poppler():
93
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
94
 
@@ -106,15 +111,16 @@ def extract_images_from_pdf(pdf_input):
106
  logger.error(f"Error extracting images: {str(e)}")
107
  return f"Error extracting images: {str(e)}"
108
 
109
- def format_to_markdown(text, images):
110
  """Convert extracted text and images to Markdown format."""
 
111
  markdown_output = "# Extracted PDF Content\n\n"
112
 
113
  # Clean and format text
114
  text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
115
  lines = text.split("\n")
116
  for line in lines:
117
- # Detect headings (simple heuristic: all caps or specific keywords)
118
  if line.isupper() and len(line) > 5:
119
  markdown_output += f"## {line}\n\n"
120
  # Detect lists (lines starting with numbers or bullets)
@@ -127,6 +133,7 @@ def format_to_markdown(text, images):
127
  if isinstance(images, list) and images:
128
  markdown_output += "## Extracted Images\n\n"
129
  for i, image in enumerate(images):
 
130
  ocr_text = pytesseract.image_to_string(image).strip()
131
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
132
  filename = f"image_{i}_{timestamp}"
@@ -141,11 +148,14 @@ def format_to_markdown(text, images):
141
 
142
  return markdown_output
143
 
144
- def process_pdf(pdf_input, pdf_url):
145
  """Main function to process PDF input (file or URL) and generate Markdown."""
146
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
 
 
147
  if not HF_TOKEN:
148
- return "Error: HF_TOKEN not set in Spaces Secrets."
 
149
 
150
  # Log poppler status
151
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
@@ -154,39 +164,55 @@ def process_pdf(pdf_input, pdf_url):
154
  if pdf_url and pdf_url.strip():
155
  pdf_url = urllib.parse.unquote(pdf_url)
156
  logger.info(f"Decoded URL: {pdf_url}")
 
157
  try:
158
  response = requests.head(pdf_url, allow_redirects=True)
159
  response.raise_for_status()
160
  pdf_input = pdf_url
161
  except requests.RequestException as e:
162
  logger.error(f"Error accessing URL: {str(e)}")
163
- return f"Error accessing URL: {str(e)}"
 
164
  elif not pdf_input:
165
- return "Error: Please provide a PDF file or URL."
 
166
 
167
- text = extract_text_from_pdf(pdf_input)
168
- images = extract_images_from_pdf(pdf_input)
169
 
170
  if isinstance(text, str) and text.startswith("Error"):
171
- return text
 
172
  if isinstance(images, str) and images.startswith("Error"):
173
- return images
 
174
 
175
- markdown_output = format_to_markdown(text, images)
176
- return markdown_output
 
177
 
178
  # Gradio Interface
179
- iface = gr.Interface(
180
- fn=process_pdf,
181
- inputs=[
182
- gr.File(label="Upload PDF File", type="filepath"),
183
- gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
184
- ],
185
- outputs=gr.Markdown(label="Markdown Output"),
186
- title="PDF to Markdown Converter",
187
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets. Uses Docker to ensure poppler-utils and tesseract-ocr are installed.",
188
- flagging_dir="/tmp/flagged" # Set writable flagging directory
189
- )
 
 
 
 
 
 
 
 
 
 
190
 
191
  if __name__ == "__main__":
192
  # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
 
1
  import gradio as gr
2
  import requests
3
+ import pdfplumber
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
 
19
 
20
  # Initialize Hugging Face API
21
  HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
22
+ REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
 
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
+ logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
40
  return repo_id
41
  except Exception as e:
42
+ logger.error(f"Failed to create/access dataset repo: {str(e)}")
43
+ return f"Error: Failed to create/access dataset repo: {str(e)}"
44
 
45
  def upload_image_to_hf(image, filename):
46
  """Upload an image to Hugging Face dataset and return its URL."""
 
68
  logger.error(f"Error uploading image: {str(e)}")
69
  return f"Error uploading image: {str(e)}"
70
 
71
+ def extract_text_from_pdf(pdf_input, status_callback):
72
+ """Extract text from PDF using pdfplumber."""
73
+ status_callback("Extracting text from PDF...")
74
  try:
75
  if isinstance(pdf_input, str): # URL case
76
  response = requests.get(pdf_input, stream=True)
 
78
  pdf_file = io.BytesIO(response.content)
79
  else: # File upload case
80
  pdf_file = pdf_input
81
+ with pdfplumber.open(pdf_file) as pdf:
82
+ text = ""
83
+ for page in pdf.pages:
84
+ page_text = page.extract_text() or ""
85
+ text += page_text + "\n\n"
86
+ tables = page.extract_tables()
87
+ for table in tables:
88
+ text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
89
  return text
90
  except Exception as e:
91
  logger.error(f"Error extracting text: {str(e)}")
92
  return f"Error extracting text: {str(e)}"
93
 
94
+ def extract_images_from_pdf(pdf_input, status_callback):
95
+ """Extract images from PDF and convert to PIL images."""
96
+ status_callback("Extracting images from PDF...")
97
  if not check_poppler():
98
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
99
 
 
111
  logger.error(f"Error extracting images: {str(e)}")
112
  return f"Error extracting images: {str(e)}"
113
 
114
+ def format_to_markdown(text, images, status_callback):
115
  """Convert extracted text and images to Markdown format."""
116
+ status_callback("Formatting output as Markdown...")
117
  markdown_output = "# Extracted PDF Content\n\n"
118
 
119
  # Clean and format text
120
  text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
121
  lines = text.split("\n")
122
  for line in lines:
123
+ # Detect headings (heuristic: all caps or specific keywords)
124
  if line.isupper() and len(line) > 5:
125
  markdown_output += f"## {line}\n\n"
126
  # Detect lists (lines starting with numbers or bullets)
 
133
  if isinstance(images, list) and images:
134
  markdown_output += "## Extracted Images\n\n"
135
  for i, image in enumerate(images):
136
+ status_callback(f"Uploading image {i+1}...")
137
  ocr_text = pytesseract.image_to_string(image).strip()
138
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
139
  filename = f"image_{i}_{timestamp}"
 
148
 
149
  return markdown_output
150
 
151
+ def process_pdf(pdf_input, pdf_url, status_callback):
152
  """Main function to process PDF input (file or URL) and generate Markdown."""
153
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
154
+ status_callback("Starting PDF processing...")
155
+
156
  if not HF_TOKEN:
157
+ status_callback("Error: HF_TOKEN not set.")
158
+ return "Error: HF_TOKEN not set in Spaces Secrets.", ""
159
 
160
  # Log poppler status
161
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
 
164
  if pdf_url and pdf_url.strip():
165
  pdf_url = urllib.parse.unquote(pdf_url)
166
  logger.info(f"Decoded URL: {pdf_url}")
167
+ status_callback(f"Downloading PDF from URL: {pdf_url}")
168
  try:
169
  response = requests.head(pdf_url, allow_redirects=True)
170
  response.raise_for_status()
171
  pdf_input = pdf_url
172
  except requests.RequestException as e:
173
  logger.error(f"Error accessing URL: {str(e)}")
174
+ status_callback(f"Error accessing URL: {str(e)}")
175
+ return f"Error accessing URL: {str(e)}", ""
176
  elif not pdf_input:
177
+ status_callback("Error: No PDF provided.")
178
+ return "Error: Please provide a PDF file or URL.", ""
179
 
180
+ text = extract_text_from_pdf(pdf_input, status_callback)
181
+ images = extract_images_from_pdf(pdf_input, status_callback)
182
 
183
  if isinstance(text, str) and text.startswith("Error"):
184
+ status_callback("Text extraction failed.")
185
+ return text, ""
186
  if isinstance(images, str) and images.startswith("Error"):
187
+ status_callback("Image extraction failed.")
188
+ return images, ""
189
 
190
+ markdown_output = format_to_markdown(text, images, status_callback)
191
+ status_callback("Processing complete.")
192
+ return markdown_output, ""
193
 
194
  # Gradio Interface
195
with gr.Blocks() as iface:
    gr.Markdown("# PDF to Markdown Converter")
    gr.Markdown("Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF File", type="filepath")
        pdf_url = gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF")

    status = gr.Textbox(label="Processing Status", interactive=False)
    output = gr.Markdown(label="Markdown Output")

    submit_btn = gr.Button("Process PDF")

    def _run_process(pdf_file, url):
        """Adapter between the Gradio click event and process_pdf.

        Gradio `inputs` must be UI components, so the `status_callback`
        parameter of process_pdf cannot be wired as an input component
        (passing a plain function there raises at startup). Instead,
        collect the status messages emitted during processing and surface
        the most recent one in the status textbox.

        Returns a (markdown, status_message) tuple matching `outputs`.
        """
        messages = []
        markdown, _ = process_pdf(pdf_file, url, messages.append)
        return markdown, (messages[-1] if messages else "")

    submit_btn.click(
        fn=_run_process,
        # Components only — the status callback is handled inside the adapter.
        inputs=[pdf_input, pdf_url],
        outputs=[output, status],
    )
216
 
217
  if __name__ == "__main__":
218
  # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server