broadfield-dev committed on
Commit
81314aa
·
verified ·
1 Parent(s): 57afa22

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -80
app.py CHANGED
@@ -14,16 +14,15 @@ import logging
14
  import subprocess
15
 
16
  # Set up logging
17
- logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
  # Initialize Hugging Face API
21
- HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
22
- REPO_NAME = "pdf-images-extracted" # Hugging Face dataset repo
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
26
- """Check if poppler-utils is installed."""
27
  try:
28
  result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
29
  logger.info(f"Poppler version: {result.stdout}")
@@ -33,29 +32,23 @@ def check_poppler():
33
  return False
34
 
35
  def ensure_hf_dataset():
36
- """Create or get Hugging Face dataset repository."""
37
  try:
38
  if not HF_TOKEN:
39
  raise ValueError("HF_TOKEN is not set")
40
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
41
- logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
42
  return repo_id
43
  except Exception as e:
44
- logger.error(f"Failed to create/access dataset repo: {str(e)}")
45
- return f"Error: Failed to create/access dataset repo: {str(e)}"
46
 
47
  def upload_image_to_hf(image, filename):
48
- """Upload an image to Hugging Face dataset and return its URL."""
49
  repo_id = ensure_hf_dataset()
50
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
51
  return repo_id
52
-
53
  try:
54
- # Save image temporarily
55
  temp_path = f"/tmp/temp_{filename}.png"
56
  image.save(temp_path, format="PNG")
57
-
58
- # Upload to Hugging Face dataset
59
  file_url = hf_api.upload_file(
60
  path_or_fileobj=temp_path,
61
  path_in_repo=f"images/{filename}.png",
@@ -64,20 +57,19 @@ def upload_image_to_hf(image, filename):
64
  token=HF_TOKEN
65
  )
66
  os.remove(temp_path)
67
- logger.info(f"Uploaded image to: {file_url}")
68
  return file_url
69
  except Exception as e:
70
- logger.error(f"Error uploading image: {str(e)}")
71
  return f"Error uploading image: {str(e)}"
72
 
73
  def extract_text_from_pdf(pdf_input):
74
- """Extract text from PDF using pdfplumber."""
75
  try:
76
- if isinstance(pdf_input, str): # URL case
77
- response = requests.get(pdf_input, stream=True)
78
  response.raise_for_status()
79
  pdf_file = io.BytesIO(response.content)
80
- else: # File upload case
81
  pdf_file = pdf_input
82
  with pdfplumber.open(pdf_file) as pdf:
83
  text = ""
@@ -89,46 +81,35 @@ def extract_text_from_pdf(pdf_input):
89
  text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
90
  return text
91
  except Exception as e:
92
- logger.error(f"Error extracting text: {str(e)}")
93
  return f"Error extracting text: {str(e)}"
94
 
95
  def extract_images_from_pdf(pdf_input):
96
- """Extract images from PDF and convert to PIL images."""
97
  if not check_poppler():
98
- return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
99
-
100
  try:
101
- if isinstance(pdf_input, str): # URL case
102
- logger.info(f"Downloading PDF from URL: {pdf_input}")
103
- response = requests.get(pdf_input, stream=True)
104
  response.raise_for_status()
105
  images = convert_from_bytes(response.content)
106
- else: # File upload case
107
- logger.info(f"Processing uploaded PDF: {pdf_input.name}")
108
  images = convert_from_path(pdf_input.name)
109
  return images
110
  except Exception as e:
111
- logger.error(f"Error extracting images: {str(e)}")
112
  return f"Error extracting images: {str(e)}"
113
 
114
  def format_to_markdown(text, images):
115
- """Convert extracted text and images to Markdown format."""
116
  markdown_output = "# Extracted PDF Content\n\n"
117
-
118
- # Clean and format text
119
- text = re.sub(r'\n\s*\n+', '\n\n', text.strip()) # Normalize newlines
120
  lines = text.split("\n")
121
  for line in lines:
122
- # Detect headings (heuristic: all caps or specific keywords)
123
  if line.isupper() and len(line) > 5:
124
  markdown_output += f"## {line}\n\n"
125
- # Detect lists (lines starting with numbers or bullets)
126
  elif re.match(r'^\s*[\d\-*+]\.\s+', line):
127
  markdown_output += f"- {line.strip()[2:]}\n"
128
  else:
129
  markdown_output += f"{line}\n\n"
130
-
131
- # Add images with Hugging Face dataset URLs
132
  if isinstance(images, list) and images:
133
  markdown_output += "## Extracted Images\n\n"
134
  for i, image in enumerate(images):
@@ -136,82 +117,79 @@ def format_to_markdown(text, images):
136
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
137
  filename = f"image_{i}_{timestamp}"
138
  image_url = upload_image_to_hf(image, filename)
139
-
140
  if not image_url.startswith("Error"):
141
  markdown_output += f"![Image {i+1}]({image_url})\n"
142
  if ocr_text:
143
  markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
144
  else:
145
  markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
146
-
147
  return markdown_output
148
 
149
  def process_pdf(pdf_input, pdf_url):
150
- """Main function to process PDF input (file or URL) and generate Markdown."""
151
- status = ["Starting PDF processing..."]
152
- logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
153
-
154
- def update_status(message):
155
- status[0] = message
156
- return status[0]
157
-
158
  if not HF_TOKEN:
159
- update_status("Error: HF_TOKEN not set.")
160
- return "Error: HF_TOKEN not set in Spaces Secrets.", status[0]
161
-
162
- # Log poppler status
163
- logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
164
- update_status("Checking poppler-utils...")
165
-
166
- # Decode URL-encoded string if provided
167
  if pdf_url and pdf_url.strip():
168
  pdf_url = urllib.parse.unquote(pdf_url)
169
- logger.info(f"Decoded URL: {pdf_url}")
170
- update_status(f"Downloading PDF from URL: {pdf_url}")
171
  try:
172
- response = requests.head(pdf_url, allow_redirects=True)
173
  response.raise_for_status()
174
  pdf_input = pdf_url
175
  except requests.RequestException as e:
176
- logger.error(f"Error accessing URL: {str(e)}")
177
- update_status(f"Error accessing URL: {str(e)}")
178
- return f"Error accessing URL: {str(e)}", status[0]
179
  elif not pdf_input:
180
- update_status("Error: No PDF provided.")
181
- return "Error: Please provide a PDF file or URL.", status[0]
182
-
183
- update_status("Extracting text from PDF...")
 
184
  text = extract_text_from_pdf(pdf_input)
185
- update_status("Extracting images from PDF...")
186
- images = extract_images_from_pdf(pdf_input)
187
-
188
  if isinstance(text, str) and text.startswith("Error"):
189
- update_status("Text extraction failed.")
190
- return text, status[0]
 
 
 
 
191
  if isinstance(images, str) and images.startswith("Error"):
192
- update_status("Image extraction failed.")
193
- return images, status[0]
194
-
195
- update_status("Formatting output as Markdown...")
 
196
  markdown_output = format_to_markdown(text, images)
197
- update_status("Processing complete.")
198
- return markdown_output, status[0]
 
199
 
200
  # Gradio Interface
201
  iface = gr.Interface(
202
  fn=process_pdf,
203
  inputs=[
204
- gr.File(label="Upload PDF File", type="filepath"),
205
- gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
206
  ],
207
  outputs=[
208
  gr.Markdown(label="Markdown Output"),
209
  gr.Textbox(label="Processing Status", interactive=False),
210
  ],
211
  title="PDF to Markdown Converter",
212
- description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
213
  allow_flagging="never"
214
  )
215
 
216
  if __name__ == "__main__":
217
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
14
  import subprocess
15
 
16
  # Set up logging
17
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
18
  logger = logging.getLogger(__name__)
19
 
20
  # Initialize Hugging Face API
21
+ HF_TOKEN = os.getenv("HF_TOKEN")
22
+ REPO_NAME = "pdf-images-extracted"
23
  hf_api = HfApi()
24
 
25
  def check_poppler():
 
26
  try:
27
  result = subprocess.run(["pdftoppm", "-v"], capture_output=True, text=True)
28
  logger.info(f"Poppler version: {result.stdout}")
 
32
  return False
33
 
34
  def ensure_hf_dataset():
 
35
  try:
36
  if not HF_TOKEN:
37
  raise ValueError("HF_TOKEN is not set")
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
+ logger.info(f"Dataset repo: {repo_id}")
40
  return repo_id
41
  except Exception as e:
42
+ logger.error(f"Dataset error: {str(e)}")
43
+ return f"Error: Failed to access dataset: {str(e)}"
44
 
45
  def upload_image_to_hf(image, filename):
 
46
  repo_id = ensure_hf_dataset()
47
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
48
  return repo_id
 
49
  try:
 
50
  temp_path = f"/tmp/temp_{filename}.png"
51
  image.save(temp_path, format="PNG")
 
 
52
  file_url = hf_api.upload_file(
53
  path_or_fileobj=temp_path,
54
  path_in_repo=f"images/{filename}.png",
 
57
  token=HF_TOKEN
58
  )
59
  os.remove(temp_path)
60
+ logger.info(f"Uploaded image: {file_url}")
61
  return file_url
62
  except Exception as e:
63
+ logger.error(f"Image upload error: {str(e)}")
64
  return f"Error uploading image: {str(e)}"
65
 
66
  def extract_text_from_pdf(pdf_input):
 
67
  try:
68
+ if isinstance(pdf_input, str):
69
+ response = requests.get(pdf_input, stream=True, timeout=10)
70
  response.raise_for_status()
71
  pdf_file = io.BytesIO(response.content)
72
+ else:
73
  pdf_file = pdf_input
74
  with pdfplumber.open(pdf_file) as pdf:
75
  text = ""
 
81
  text += "**Table:**\n" + "\n".join([" | ".join(str(cell) for cell in row) for row in table]) + "\n\n"
82
  return text
83
  except Exception as e:
84
+ logger.error(f"Text extraction error: {str(e)}")
85
  return f"Error extracting text: {str(e)}"
86
 
87
  def extract_images_from_pdf(pdf_input):
 
88
  if not check_poppler():
89
+ return "Error: poppler-utils not found."
 
90
  try:
91
+ if isinstance(pdf_input, str):
92
+ response = requests.get(pdf_input, stream=True, timeout=10)
 
93
  response.raise_for_status()
94
  images = convert_from_bytes(response.content)
95
+ else:
 
96
  images = convert_from_path(pdf_input.name)
97
  return images
98
  except Exception as e:
99
+ logger.error(f"Image extraction error: {str(e)}")
100
  return f"Error extracting images: {str(e)}"
101
 
102
  def format_to_markdown(text, images):
 
103
  markdown_output = "# Extracted PDF Content\n\n"
104
+ text = re.sub(r'\n\s*\n+', '\n\n', text.strip())
 
 
105
  lines = text.split("\n")
106
  for line in lines:
 
107
  if line.isupper() and len(line) > 5:
108
  markdown_output += f"## {line}\n\n"
 
109
  elif re.match(r'^\s*[\d\-*+]\.\s+', line):
110
  markdown_output += f"- {line.strip()[2:]}\n"
111
  else:
112
  markdown_output += f"{line}\n\n"
 
 
113
  if isinstance(images, list) and images:
114
  markdown_output += "## Extracted Images\n\n"
115
  for i, image in enumerate(images):
 
117
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
118
  filename = f"image_{i}_{timestamp}"
119
  image_url = upload_image_to_hf(image, filename)
 
120
  if not image_url.startswith("Error"):
121
  markdown_output += f"![Image {i+1}]({image_url})\n"
122
  if ocr_text:
123
  markdown_output += f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
124
  else:
125
  markdown_output += f"**Image {i+1} Error:** {image_url}\n\n"
 
126
  return markdown_output
127
 
128
  def process_pdf(pdf_input, pdf_url):
129
+ status = "Starting PDF processing..."
130
+ logger.info(status)
 
 
 
 
 
 
131
  if not HF_TOKEN:
132
+ status = "Error: HF_TOKEN not set."
133
+ logger.error(status)
134
+ return status, status
 
 
 
 
 
135
  if pdf_url and pdf_url.strip():
136
  pdf_url = urllib.parse.unquote(pdf_url)
137
+ status = f"Downloading PDF from URL: {pdf_url}"
138
+ logger.info(status)
139
  try:
140
+ response = requests.head(pdf_url, allow_redirects=True, timeout=5)
141
  response.raise_for_status()
142
  pdf_input = pdf_url
143
  except requests.RequestException as e:
144
+ status = f"Error accessing URL: {str(e)}"
145
+ logger.error(status)
146
+ return status, status
147
  elif not pdf_input:
148
+ status = "Error: No PDF provided."
149
+ logger.error(status)
150
+ return status, status
151
+ status = "Extracting text..."
152
+ logger.info(status)
153
  text = extract_text_from_pdf(pdf_input)
 
 
 
154
  if isinstance(text, str) and text.startswith("Error"):
155
+ status = "Text extraction failed."
156
+ logger.error(status)
157
+ return text, status
158
+ status = "Extracting images..."
159
+ logger.info(status)
160
+ images = extract_images_from_pdf(pdf_input)
161
  if isinstance(images, str) and images.startswith("Error"):
162
+ status = "Image extraction failed."
163
+ logger.error(status)
164
+ return images, status
165
+ status = "Formatting output..."
166
+ logger.info(status)
167
  markdown_output = format_to_markdown(text, images)
168
+ status = "Processing complete."
169
+ logger.info(status)
170
+ return markdown_output, status
171
 
172
  # Gradio Interface
173
  iface = gr.Interface(
174
  fn=process_pdf,
175
  inputs=[
176
+ gr.File(label="Upload PDF File", file_types=[".pdf"]),
177
+ gr.Textbox(label="PDF URL", placeholder="Enter PDF URL (e.g., https://example.com/file.pdf)"),
178
  ],
179
  outputs=[
180
  gr.Markdown(label="Markdown Output"),
181
  gr.Textbox(label="Processing Status", interactive=False),
182
  ],
183
  title="PDF to Markdown Converter",
184
+ description="Convert a PDF file or URL to Markdown. Extracts text, images, and tables, with images uploaded to a Hugging Face dataset. Supports URL-encoded strings. Requires HF_TOKEN in Spaces Secrets.",
185
  allow_flagging="never"
186
  )
187
 
188
  if __name__ == "__main__":
189
+ logger.info("Starting Gradio app...")
190
+ try:
191
+ iface.launch(server_name="0.0.0.0", server_port=7860, prevent_thread_lock=True)
192
+ logger.info("Gradio app started successfully.")
193
+ except Exception as e:
194
+ logger.error(f"Failed to start Gradio app: {str(e)}")
195
+ raise