Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,13 +8,12 @@ import io
|
|
8 |
import os
|
9 |
from huggingface_hub import HfApi, create_repo
|
10 |
import re
|
11 |
-
import markdown
|
12 |
from datetime import datetime
|
13 |
|
14 |
# Initialize Hugging Face API
|
|
|
|
|
15 |
hf_api = HfApi()
|
16 |
-
HF_TOKEN = os.getenv("HF_TOKEN") # Set your Hugging Face API token as an environment variable
|
17 |
-
REPO_NAME = "pdf-images-extracted" # Hugging Face repo name
|
18 |
|
19 |
def ensure_hf_repo():
|
20 |
"""Create or get Hugging Face repository."""
|
@@ -32,7 +31,7 @@ def upload_image_to_hf(image, filename):
|
|
32 |
|
33 |
try:
|
34 |
# Save image temporarily
|
35 |
-
temp_path = f"temp_{filename}.png"
|
36 |
image.save(temp_path, format="PNG")
|
37 |
|
38 |
# Upload to Hugging Face
|
@@ -92,7 +91,6 @@ def format_to_markdown(text, images):
|
|
92 |
if isinstance(images, list) and images:
|
93 |
markdown_output += "## Extracted Images\n\n"
|
94 |
for i, image in enumerate(images):
|
95 |
-
# Perform OCR on image to include any text (e.g., in charts)
|
96 |
ocr_text = pytesseract.image_to_string(image).strip()
|
97 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
98 |
filename = f"image_{i}_{timestamp}"
|
@@ -109,19 +107,19 @@ def format_to_markdown(text, images):
|
|
109 |
|
110 |
def process_pdf(pdf_input, pdf_url):
|
111 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
|
|
|
|
|
|
112 |
if pdf_url and pdf_url.strip():
|
113 |
-
# Process PDF from URL
|
114 |
response = requests.head(pdf_url)
|
115 |
if response.status_code != 200:
|
116 |
return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
|
117 |
pdf_file = pdf_url
|
118 |
elif pdf_input:
|
119 |
-
# Process uploaded PDF
|
120 |
pdf_file = pdf_input
|
121 |
else:
|
122 |
return "Error: Please provide a PDF file or URL."
|
123 |
|
124 |
-
# Extract text and images
|
125 |
text = extract_text_from_pdf(pdf_file)
|
126 |
images = extract_images_from_pdf(pdf_file)
|
127 |
|
@@ -130,7 +128,6 @@ def process_pdf(pdf_input, pdf_url):
|
|
130 |
if isinstance(images, str) and images.startswith("Error"):
|
131 |
return images
|
132 |
|
133 |
-
# Generate Markdown
|
134 |
markdown_output = format_to_markdown(text, images)
|
135 |
return markdown_output
|
136 |
|
@@ -143,12 +140,8 @@ iface = gr.Interface(
|
|
143 |
],
|
144 |
outputs=gr.Markdown(label="Markdown Output"),
|
145 |
title="PDF to Markdown Converter",
|
146 |
-
description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
|
147 |
)
|
148 |
|
149 |
if __name__ == "__main__":
|
150 |
-
|
151 |
-
if not HF_TOKEN:
|
152 |
-
print("Error: Please set HF_TOKEN environment variable with your Hugging Face API token.")
|
153 |
-
else:
|
154 |
-
iface.launch()
|
|
|
8 |
import os
|
9 |
from huggingface_hub import HfApi, create_repo
|
10 |
import re
|
|
|
11 |
from datetime import datetime
|
12 |
|
13 |
# Initialize Hugging Face API
|
14 |
+
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
|
15 |
+
REPO_NAME = "pdf-images-extracted" # Hugging Face repo for images
|
16 |
hf_api = HfApi()
|
|
|
|
|
17 |
|
18 |
def ensure_hf_repo():
|
19 |
"""Create or get Hugging Face repository."""
|
|
|
31 |
|
32 |
try:
|
33 |
# Save image temporarily
|
34 |
+
temp_path = f"/tmp/temp_{filename}.png"
|
35 |
image.save(temp_path, format="PNG")
|
36 |
|
37 |
# Upload to Hugging Face
|
|
|
91 |
if isinstance(images, list) and images:
|
92 |
markdown_output += "## Extracted Images\n\n"
|
93 |
for i, image in enumerate(images):
|
|
|
94 |
ocr_text = pytesseract.image_to_string(image).strip()
|
95 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
96 |
filename = f"image_{i}_{timestamp}"
|
|
|
107 |
|
108 |
def process_pdf(pdf_input, pdf_url):
|
109 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
110 |
+
if not HF_TOKEN:
|
111 |
+
return "Error: HF_TOKEN not set in Spaces Secrets."
|
112 |
+
|
113 |
if pdf_url and pdf_url.strip():
|
|
|
114 |
response = requests.head(pdf_url)
|
115 |
if response.status_code != 200:
|
116 |
return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
|
117 |
pdf_file = pdf_url
|
118 |
elif pdf_input:
|
|
|
119 |
pdf_file = pdf_input
|
120 |
else:
|
121 |
return "Error: Please provide a PDF file or URL."
|
122 |
|
|
|
123 |
text = extract_text_from_pdf(pdf_file)
|
124 |
images = extract_images_from_pdf(pdf_file)
|
125 |
|
|
|
128 |
if isinstance(images, str) and images.startswith("Error"):
|
129 |
return images
|
130 |
|
|
|
131 |
markdown_output = format_to_markdown(text, images)
|
132 |
return markdown_output
|
133 |
|
|
|
140 |
],
|
141 |
outputs=gr.Markdown(label="Markdown Output"),
|
142 |
title="PDF to Markdown Converter",
|
143 |
+
description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
|
144 |
)
|
145 |
|
146 |
if __name__ == "__main__":
|
147 |
+
iface.launch()
|
|
|
|
|
|
|
|