broadfield-dev committed on
Commit
77541b8
·
verified ·
1 Parent(s): 7a57213

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -15
app.py CHANGED
@@ -8,13 +8,12 @@ import io
8
  import os
9
  from huggingface_hub import HfApi, create_repo
10
  import re
11
- import markdown
12
  from datetime import datetime
13
 
14
  # Initialize Hugging Face API
 
 
15
  hf_api = HfApi()
16
- HF_TOKEN = os.getenv("HF_TOKEN") # Set your Hugging Face API token as an environment variable
17
- REPO_NAME = "pdf-images-extracted" # Hugging Face repo name
18
 
19
  def ensure_hf_repo():
20
  """Create or get Hugging Face repository."""
@@ -32,7 +31,7 @@ def upload_image_to_hf(image, filename):
32
 
33
  try:
34
  # Save image temporarily
35
- temp_path = f"temp_{filename}.png"
36
  image.save(temp_path, format="PNG")
37
 
38
  # Upload to Hugging Face
@@ -92,7 +91,6 @@ def format_to_markdown(text, images):
92
  if isinstance(images, list) and images:
93
  markdown_output += "## Extracted Images\n\n"
94
  for i, image in enumerate(images):
95
- # Perform OCR on image to include any text (e.g., in charts)
96
  ocr_text = pytesseract.image_to_string(image).strip()
97
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
98
  filename = f"image_{i}_{timestamp}"
@@ -109,19 +107,19 @@ def format_to_markdown(text, images):
109
 
110
  def process_pdf(pdf_input, pdf_url):
111
  """Main function to process PDF input (file or URL) and generate Markdown."""
 
 
 
112
  if pdf_url and pdf_url.strip():
113
- # Process PDF from URL
114
  response = requests.head(pdf_url)
115
  if response.status_code != 200:
116
  return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
117
  pdf_file = pdf_url
118
  elif pdf_input:
119
- # Process uploaded PDF
120
  pdf_file = pdf_input
121
  else:
122
  return "Error: Please provide a PDF file or URL."
123
 
124
- # Extract text and images
125
  text = extract_text_from_pdf(pdf_file)
126
  images = extract_images_from_pdf(pdf_file)
127
 
@@ -130,7 +128,6 @@ def process_pdf(pdf_input, pdf_url):
130
  if isinstance(images, str) and images.startswith("Error"):
131
  return images
132
 
133
- # Generate Markdown
134
  markdown_output = format_to_markdown(text, images)
135
  return markdown_output
136
 
@@ -143,12 +140,8 @@ iface = gr.Interface(
143
  ],
144
  outputs=gr.Markdown(label="Markdown Output"),
145
  title="PDF to Markdown Converter",
146
- description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
147
  )
148
 
149
  if __name__ == "__main__":
150
- # Ensure Hugging Face token is set
151
- if not HF_TOKEN:
152
- print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")
153
- else:
154
- iface.launch()
 
8
  import os
9
  from huggingface_hub import HfApi, create_repo
10
  import re
 
11
  from datetime import datetime
12
 
13
  # Initialize Hugging Face API
14
+ HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
15
+ REPO_NAME = "pdf-images-extracted" # Hugging Face repo for images
16
  hf_api = HfApi()
 
 
17
 
18
  def ensure_hf_repo():
19
  """Create or get Hugging Face repository."""
 
31
 
32
  try:
33
  # Save image temporarily
34
+ temp_path = f"/tmp/temp_{filename}.png"
35
  image.save(temp_path, format="PNG")
36
 
37
  # Upload to Hugging Face
 
91
  if isinstance(images, list) and images:
92
  markdown_output += "## Extracted Images\n\n"
93
  for i, image in enumerate(images):
 
94
  ocr_text = pytesseract.image_to_string(image).strip()
95
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
96
  filename = f"image_{i}_{timestamp}"
 
107
 
108
  def process_pdf(pdf_input, pdf_url):
109
  """Main function to process PDF input (file or URL) and generate Markdown."""
110
+ if not HF_TOKEN:
111
+ return "Error: HF_TOKEN not set in Spaces Secrets."
112
+
113
  if pdf_url and pdf_url.strip():
 
114
  response = requests.head(pdf_url)
115
  if response.status_code != 200:
116
  return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
117
  pdf_file = pdf_url
118
  elif pdf_input:
 
119
  pdf_file = pdf_input
120
  else:
121
  return "Error: Please provide a PDF file or URL."
122
 
 
123
  text = extract_text_from_pdf(pdf_file)
124
  images = extract_images_from_pdf(pdf_file)
125
 
 
128
  if isinstance(images, str) and images.startswith("Error"):
129
  return images
130
 
 
131
  markdown_output = format_to_markdown(text, images)
132
  return markdown_output
133
 
 
140
  ],
141
  outputs=gr.Markdown(label="Markdown Output"),
142
  title="PDF to Markdown Converter",
143
+ description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
144
  )
145
 
146
  if __name__ == "__main__":
147
+ iface.launch()