broadfield-dev committed on
Commit
0e0f376
·
verified ·
1 Parent(s): 0dd31f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -17
app.py CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
 
7
  import os
8
  from huggingface_hub import HfApi, create_repo
9
  import re
@@ -47,9 +48,15 @@ def upload_image_to_hf(image, filename):
47
  except Exception as e:
48
  return f"Error uploading image: {str(e)}"
49
 
50
- def extract_text_from_pdf(pdf_file):
51
- """Extract text from PDF using PyPDF2."""
52
  try:
 
 
 
 
 
 
53
  reader = PyPDF2.PdfReader(pdf_file)
54
  text = ""
55
  for page in reader.pages:
@@ -59,14 +66,15 @@ def extract_text_from_pdf(pdf_file):
59
  except Exception as e:
60
  return f"Error extracting text: {str(e)}"
61
 
62
- def extract_images_from_pdf(pdf_file):
63
- """Extract images from PDF and convert to PIL images."""
64
  try:
65
- if isinstance(pdf_file, str): # URL case
66
- response = requests.get(pdf_file, stream=True)
 
67
  images = convert_from_bytes(response.content)
68
  else: # File upload case
69
- images = convert_from_path(pdf_file.name)
70
  return images
71
  except Exception as e:
72
  return f"Error extracting images: {str(e)}"
@@ -116,18 +124,15 @@ def process_pdf(pdf_input, pdf_url):
116
  pdf_url = urllib.parse.unquote(pdf_url)
117
  try:
118
  response = requests.head(pdf_url, allow_redirects=True)
119
- if response.status_code != 200:
120
- return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
121
- pdf_file = pdf_url
122
  except requests.RequestException as e:
123
  return f"Error accessing URL: {str(e)}"
124
- elif pdf_input:
125
- pdf_file = pdf_input
126
- else:
127
  return "Error: Please provide a PDF file or URL."
128
 
129
- text = extract_text_from_pdf(pdf_file)
130
- images = extract_images_from_pdf(pdf_file)
131
 
132
  if isinstance(text, str) and text.startswith("Error"):
133
  return text
@@ -142,11 +147,11 @@ iface = gr.Interface(
142
  fn=process_pdf,
143
  inputs=[
144
  gr.File(label="Upload PDF File", type="filepath"),
145
- gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings)"),
146
  ],
147
  outputs=gr.Markdown(label="Markdown Output"),
148
  title="PDF to Markdown Converter",
149
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
150
  )
151
 
152
  if __name__ == "__main__":
 
4
  from pdf2image import convert_from_path, convert_from_bytes
5
  import pytesseract
6
  from PIL import Image
7
+ import io
8
  import os
9
  from huggingface_hub import HfApi, create_repo
10
  import re
 
48
  except Exception as e:
49
  return f"Error uploading image: {str(e)}"
50
 
51
+ def extract_text_from_pdf(pdf_input):
52
+ """Extract text from PDF (URL or file) using PyPDF2."""
53
  try:
54
+ if isinstance(pdf_input, str): # URL case
55
+ response = requests.get(pdf_input, stream=True)
56
+ response.raise_for_status()
57
+ pdf_file = io.BytesIO(response.content)
58
+ else: # File upload case
59
+ pdf_file = pdf_input
60
  reader = PyPDF2.PdfReader(pdf_file)
61
  text = ""
62
  for page in reader.pages:
 
66
  except Exception as e:
67
  return f"Error extracting text: {str(e)}"
68
 
69
+ def extract_images_from_pdf(pdf_input):
70
+ """Extract images from PDF (URL or file) and convert to PIL images."""
71
  try:
72
+ if isinstance(pdf_input, str): # URL case
73
+ response = requests.get(pdf_input, stream=True)
74
+ response.raise_for_status()
75
  images = convert_from_bytes(response.content)
76
  else: # File upload case
77
+ images = convert_from_path(pdf_input.name)
78
  return images
79
  except Exception as e:
80
  return f"Error extracting images: {str(e)}"
 
124
  pdf_url = urllib.parse.unquote(pdf_url)
125
  try:
126
  response = requests.head(pdf_url, allow_redirects=True)
127
+ response.raise_for_status()
128
+ pdf_input = pdf_url
 
129
  except requests.RequestException as e:
130
  return f"Error accessing URL: {str(e)}"
131
+ elif not pdf_input:
 
 
132
  return "Error: Please provide a PDF file or URL."
133
 
134
+ text = extract_text_from_pdf(pdf_input)
135
+ images = extract_images_from_pdf(pdf_input)
136
 
137
  if isinstance(text, str) and text.startswith("Error"):
138
  return text
 
147
  fn=process_pdf,
148
  inputs=[
149
  gr.File(label="Upload PDF File", type="filepath"),
150
+ gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF (supports URL-encoded strings with spaces)"),
151
  ],
152
  outputs=gr.Markdown(label="Markdown Output"),
153
  title="PDF to Markdown Converter",
154
+ description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
155
  )
156
 
157
  if __name__ == "__main__":