broadfield-dev committed on
Commit
f86ad35
·
verified ·
1 Parent(s): ba5d90f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import PyPDF2
4
+ from pdf2image import convert_from_path, convert_from_bytes
5
+ import pytesseract
6
+ from PIL import Image
7
+ import io
8
+ import os
9
+ from huggingface_hub import HfApi, create_repo
10
+ import re
11
+ import markdown
12
+ from datetime import datetime
13
+
14
# Hugging Face Hub client shared by every upload in this module.
hf_api = HfApi()
# API token read from the environment so it never has to live in the source.
HF_TOKEN = os.getenv("HF_TOKEN")
# Repository that receives the uploaded page images. Can be overridden with the
# HF_REPO_NAME environment variable; an unset or empty value keeps the
# original default name.
REPO_NAME = os.getenv("HF_REPO_NAME") or "pdf-images-extracted"
18
+
19
def ensure_hf_repo():
    """Create the target Hugging Face repository if needed and return its id.

    Returns:
        The repository id on success, or a string starting with ``"Error"``
        describing the failure. Callers in this file detect failure through
        that prefix, so exceptions are converted to strings rather than raised.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:
        return f"Error creating repo: {str(e)}"
    # create_repo returns a RepoUrl: a str holding the full https URL, with the
    # "namespace/name" id exposed as `.repo_id`. upload_file() needs the id,
    # not the URL. Fall back to the raw value for releases that return a plain
    # string without that attribute.
    return getattr(repo_url, "repo_id", repo_url)
26
+
27
def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face Hub as a PNG and return its URL.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.
        filename: Base name without extension; the file is stored in the
            repository as ``images/<filename>.png``.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success, or a string
        starting with ``"Error"`` if the repository could not be prepared or
        the upload failed.
    """
    repo_id = ensure_hf_repo()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id

    try:
        # Encode the PNG in memory instead of writing temp_<name>.png to the
        # working directory: nothing is left behind when the upload raises,
        # and concurrent requests cannot clobber each other's temp files.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")

        file_url = hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            token=HF_TOKEN,
        )
        # NOTE(review): upload_file appears to return a ".../blob/..." page URL,
        # which may not render as an inline Markdown image - confirm whether a
        # ".../resolve/..." URL is needed by the caller.
        return file_url
    except Exception as e:
        return f"Error uploading image: {str(e)}"
49
+
50
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or any object
            ``PyPDF2.PdfReader`` accepts directly.

    Returns:
        The text of all pages, each followed by a blank line, or a string
        starting with ``"Error"`` if downloading or parsing failed.
    """
    try:
        if isinstance(pdf_file, str) and pdf_file.lower().startswith(
            ("http://", "https://")
        ):
            # PdfReader would treat a URL as a local path, so download the
            # document first and hand it over as an in-memory stream.
            response = requests.get(pdf_file, timeout=30)
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)

        reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may yield None/"" for pages without a text layer.
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
61
+
62
def extract_images_from_pdf(pdf_file):
    """Convert a PDF into PIL images via pdf2image.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path given as ``str``,
            or an upload object exposing the path as ``.name``.

    Returns:
        The list produced by ``convert_from_bytes`` / ``convert_from_path``,
        or a string starting with ``"Error"`` on failure.
    """
    try:
        if isinstance(pdf_file, str):
            if pdf_file.lower().startswith(("http://", "https://")):
                response = requests.get(pdf_file, timeout=30)
                # Fail loudly on 4xx/5xx instead of feeding an HTML error page
                # to the PDF renderer.
                response.raise_for_status()
                return convert_from_bytes(response.content)
            # A plain string that is not a URL is a local path; this is what
            # gr.File(type="filepath") passes for uploads.
            return convert_from_path(pdf_file)
        # Upload object case: the path lives on `.name`.
        return convert_from_path(pdf_file.name)
    except Exception as e:
        return f"Error extracting images: {str(e)}"
73
+
74
# A list item: "1." / "12)" style numbering or a -, *, + or bullet marker,
# followed by whitespace and the item text (captured in group 1).
_LIST_ITEM_RE = re.compile(r"^(?:\d+[.)]|[-*+\u2022])\s+(.*)$")


def format_to_markdown(text, images):
    """Convert extracted text and page images to a Markdown document.

    Text handling is heuristic:

    * blank lines are dropped (every emitted paragraph already ends with one);
    * an all-uppercase line longer than 5 characters becomes a ``##`` heading;
    * numbered or bulleted lines become ``-`` list items;
    * every other line becomes its own paragraph.

    Args:
        text: Raw text extracted from the PDF.
        images: List of PIL images; any other value (or an empty list)
            produces no image section.

    Returns:
        The Markdown document as a single string. Each image is uploaded with
        ``upload_image_to_hf``; failures are reported inline instead of
        aborting the whole conversion.
    """
    parts = ["# Extracted PDF Content\n\n"]

    # Collapse runs of blank / whitespace-only lines before splitting.
    text = re.sub(r"\n\s*\n", "\n\n", text.strip())
    in_list = False
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        item = None
        if line.isupper() and len(line) > 5:
            rendered = f"## {line}\n\n"
        else:
            item = _LIST_ITEM_RE.match(line)
            rendered = f"- {item.group(1)}\n" if item else f"{line}\n\n"

        # Close a list with a blank line so the next block is not parsed as a
        # continuation of the last item.
        if in_list and item is None:
            parts.append("\n")
        parts.append(rendered)
        in_list = item is not None
    if in_list:
        parts.append("\n")

    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        # One timestamp per document keeps all of its images grouped by name.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for i, image in enumerate(images):
            filename = f"image_{i}_{timestamp}"
            image_url = upload_image_to_hf(image, filename)

            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue

            parts.append(f"![Image {i+1}]({image_url})\n")
            # OCR is only worth its cost once the image is actually linked.
            ocr_text = pytesseract.image_to_string(image).strip()
            if ocr_text:
                parts.append(
                    f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
                )
            else:
                # Keep consecutive images in separate paragraphs.
                parts.append("\n")

    return "".join(parts)
109
+
110
def process_pdf(pdf_input, pdf_url):
    """Process a PDF given as an upload or a URL and return Markdown.

    Args:
        pdf_input: Uploaded PDF as delivered by the Gradio file input; used
            only when no URL is given.
        pdf_url: URL of a PDF; takes precedence over ``pdf_input`` when it is
            non-blank.

    Returns:
        The generated Markdown, or a string starting with ``"Error"`` when no
        input was given, the URL is unusable, or extraction failed.
    """
    url = (pdf_url or "").strip()
    if url:
        # NOTE(review): this fetches a user-supplied URL from the server;
        # consider restricting target hosts if the app is exposed publicly.
        if not url.lower().startswith(("http://", "https://")):
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        try:
            # Follow redirects (HEAD does not by default) and bound the wait so
            # a slow or unreachable host cannot hang the request.
            response = requests.head(url, allow_redirects=True, timeout=15)
        except requests.RequestException:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        if response.status_code != 200:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        pdf_file = url
    elif pdf_input:
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    # Check the text result first so a broken PDF is reported before the
    # more expensive page rendering runs.
    text = extract_text_from_pdf(pdf_file)
    if isinstance(text, str) and text.startswith("Error"):
        return text

    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str) and images.startswith("Error"):
        return images

    return format_to_markdown(text, images)
136
+
137
+ # Gradio Interface
138
# Two inputs feed process_pdf: an uploaded file (delivered as a local path
# because of type="filepath") and an optional URL. The result is rendered as
# Markdown.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        # Restrict the picker to PDFs so other file types are rejected in the
        # UI instead of failing later during parsing.
        gr.File(label="Upload PDF File", type="filepath", file_types=[".pdf"]),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible.",
)
148
+
149
if __name__ == "__main__":
    # Image uploads need a token, so refuse to start without one. SystemExit
    # with a message writes it to stderr and exits with a non-zero status,
    # letting a supervisor see the failed start.
    if not HF_TOKEN:
        raise SystemExit(
            "Error: Please set HF_TOKEN environment variable with your Hugging Face API token."
        )
    iface.launch()