Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,40 +1,82 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import pytesseract
|
3 |
-
|
4 |
import tempfile
|
5 |
import os
|
6 |
-
import shutil
|
7 |
|
8 |
-
def
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
extracted_text = ""
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
text = pytesseract.image_to_string(image)
|
18 |
-
extracted_text += f"
|
19 |
-
|
20 |
-
|
21 |
-
output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
|
22 |
-
with open(output_txt_path, "w") as f:
|
23 |
-
f.write(extracted_text)
|
24 |
-
|
25 |
-
# Create a persistent file to serve for download
|
26 |
-
final_output_path = "/tmp/extracted_text.txt"
|
27 |
-
shutil.copy(output_txt_path, final_output_path) # Copy to a persistent location
|
28 |
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
# Gradio
|
32 |
iface = gr.Interface(
|
33 |
-
fn=
|
34 |
-
inputs=gr.File(label="Upload PDF
|
35 |
-
outputs=
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
)
|
38 |
|
|
|
39 |
if __name__ == "__main__":
|
40 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from PIL import Image
|
4 |
import pytesseract
|
5 |
+
import io
|
6 |
import tempfile
|
7 |
import os
|
|
|
8 |
|
9 |
+
def _load_pdf_document(pdf_file):
    """Open the uploaded PDF as a PyMuPDF document.

    Accepts the shapes an upload may arrive in: a tempfile-style wrapper
    exposing ``.file``, a filesystem path, an object carrying its path in
    ``.name``, or any readable binary stream.

    Raises:
        TypeError: If ``pdf_file`` matches none of the supported shapes.
    """
    inner = getattr(pdf_file, "file", None)
    if inner is not None and hasattr(inner, "read"):
        # Wrapper object with an underlying stream (the only shape the
        # previous implementation supported).
        return fitz.open(stream=inner.read(), filetype="pdf")
    if isinstance(pdf_file, (str, os.PathLike)):
        # NOTE(review): newer Gradio releases appear to pass a path string by
        # default -- confirm against the installed version.
        return fitz.open(os.fspath(pdf_file))
    path = getattr(pdf_file, "name", None)
    if isinstance(path, str):
        return fitz.open(path)
    if hasattr(pdf_file, "read"):
        return fitz.open(stream=pdf_file.read(), filetype="pdf")
    raise TypeError(
        f"Unsupported upload object of type {type(pdf_file).__name__!r}"
    )


def pdf_to_text_ocr(pdf_file):
    """Extract text from a PDF via OCR and prepare it for display and download.

    Each page is rendered to a 300-DPI image, passed through Tesseract, and
    the per-page results are concatenated under ``--- Page N ---`` headers.
    The combined text is also written to a temporary ``.txt`` file.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input
            (wrapper object, path, or readable stream), or ``None`` when
            nothing was uploaded.

    Returns:
        tuple[str, str | None]: ``(extracted_text, txt_filepath)`` on success,
        or ``(message, None)`` when no file was given, no page produced any
        text, or an error occurred.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", None

    try:
        pdf_document = _load_pdf_document(pdf_file)
        page_texts = []
        try:
            for page in pdf_document:
                # Render at a higher DPI for better OCR quality.
                pix = page.get_pixmap(dpi=300)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                page_texts.append(pytesseract.image_to_string(image))
        finally:
            # Release the document even if rendering or OCR fails midway.
            pdf_document.close()

        # Decide on the raw OCR output: once the page headers are added the
        # combined string is never blank, which would hide an empty result.
        if not any(text.strip() for text in page_texts):
            return "No text could be extracted from the PDF.", None

        extracted_text = "".join(
            f"--- Page {page_num} ---\n{text}\n\n"
            for page_num, text in enumerate(page_texts, start=1)
        )

        # delete=False keeps the file on disk after the handle closes, so it
        # can still be read via the returned path.
        with tempfile.NamedTemporaryFile(
            delete=False, mode="w", suffix=".txt", encoding="utf-8"
        ) as temp_file:
            temp_file.write(extracted_text)
            temp_filepath = temp_file.name

        # Text for the textbox, file path for the download component.
        return extracted_text, temp_filepath

    except Exception as e:
        # UI boundary: report the failure in the textbox instead of raising.
        return f"An error occurred: {str(e)}", None
|
65 |
|
66 |
+
# Define the Gradio interface with two output components: a textbox showing
# the extracted text and a file component offering the .txt download.
iface = gr.Interface(
    fn=pdf_to_text_ocr,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(
            label="Extracted Text (Scrollable)",
            lines=20,
            placeholder="Text from your PDF will appear here...",
        ),
        gr.File(label="Download Extracted Text"),
    ],
    title="PDF OCR Extractor with Download",
    description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
    article="Powered by PyMuPDF, Tesseract, and Gradio.",
    # Only offer the example when sample.pdf is actually present, so a missing
    # optional file cannot interfere with building the interface.
    examples=[["sample.pdf"]] if os.path.isfile("sample.pdf") else None,
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|