drewThomasson committed on
Commit
a4ef596
·
verified ·
1 Parent(s): b2c785b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -27
app.py CHANGED
@@ -1,40 +1,82 @@
1
  import gradio as gr
 
 
2
  import pytesseract
3
- from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
- import shutil
7
 
8
- def ocr_pdf(file_path):
9
- # Temporary directory for processing
10
- with tempfile.TemporaryDirectory() as temp_dir:
11
- # Convert PDF to images
12
- images = convert_from_path(file_path, output_folder=temp_dir)
13
-
14
- # Extract text from each page image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  extracted_text = ""
16
- for i, image in enumerate(images):
 
 
 
 
 
 
 
 
 
 
 
 
17
  text = pytesseract.image_to_string(image)
18
- extracted_text += f"\n{text}\n\n"
19
-
20
- # Save the extracted text to a .txt file in a persistent location
21
- output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
22
- with open(output_txt_path, "w") as f:
23
- f.write(extracted_text)
24
-
25
- # Create a persistent file to serve for download
26
- final_output_path = "/tmp/extracted_text.txt"
27
- shutil.copy(output_txt_path, final_output_path) # Copy to a persistent location
28
 
29
- return final_output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Gradio Interface
32
  iface = gr.Interface(
33
- fn=lambda file: ocr_pdf(file.name), # Pass file path instead of file object
34
- inputs=gr.File(label="Upload PDF File"),
35
- outputs=gr.File(label="Download Extracted Text (.txt)"), # Outputs a downloadable .txt file
36
- title="PDF to Text OCR"
 
 
 
 
 
 
37
  )
38
 
 
39
  if __name__ == "__main__":
40
- iface.launch()
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ from PIL import Image
4
  import pytesseract
5
+ import io
6
  import tempfile
7
  import os
 
8
 
9
+ def pdf_to_text_ocr(pdf_file):
10
+ """
11
+ Extracts text from a PDF file using OCR, displays it, and provides a download link.
12
+
13
+ This function takes an uploaded PDF, converts each page to an image, uses
14
+ Tesseract OCR to extract text, and then returns both the concatenated text
15
+ for display and a path to a temporary .txt file for download.
16
+
17
+ Args:
18
+ pdf_file (gradio.File): The uploaded PDF file object from Gradio.
19
+
20
+ Returns:
21
+ tuple[str, str | None]: A tuple containing the extracted text and the
22
+ filepath for the downloadable text file.
23
+ Returns (error_message, None) on failure.
24
+ """
25
+ if pdf_file is None:
26
+ return "Please upload a PDF file.", None
27
+
28
+ try:
29
+ # Open the PDF file from the uploaded file's temporary path
30
+ pdf_document = fitz.open(stream=pdf_file.file.read(), filetype="pdf")
31
  extracted_text = ""
32
+
33
+ # Iterate through each page of the PDF
34
+ for page_num in range(len(pdf_document)):
35
+ page = pdf_document.load_page(page_num)
36
+
37
+ # Convert the page to an image (pixmap)
38
+ pix = page.get_pixmap(dpi=300) # Increase DPI for better OCR quality
39
+
40
+ # Convert the pixmap to a PIL Image
41
+ img_data = pix.tobytes("png")
42
+ image = Image.open(io.BytesIO(img_data))
43
+
44
+ # Use Tesseract to do OCR on the image
45
  text = pytesseract.image_to_string(image)
46
+ extracted_text += f"--- Page {page_num + 1} ---\n{text}\n\n"
47
+
48
+ pdf_document.close()
 
 
 
 
 
 
 
49
 
50
+ if not extracted_text.strip():
51
+ return "No text could be extracted from the PDF.", None
52
+
53
+ # Create a temporary file to store the extracted text
54
+ # delete=False is important so Gradio can access the file
55
+ with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt", encoding="utf-8") as temp_file:
56
+ temp_file.write(extracted_text)
57
+ temp_filepath = temp_file.name
58
+
59
+ # Return the text for the textbox and the filepath for the download button
60
+ return extracted_text, temp_filepath
61
+
62
+ except Exception as e:
63
+ # Return the error message to the textbox and None for the file output
64
+ return f"An error occurred: {str(e)}", None
65
 
66
+ # Define the Gradio interface with two output components
67
  iface = gr.Interface(
68
+ fn=pdf_to_text_ocr,
69
+ inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
70
+ outputs=[
71
+ gr.Textbox(label="Extracted Text (Scrollable)", lines=20, placeholder="Text from your PDF will appear here..."),
72
+ gr.File(label="Download Extracted Text")
73
+ ],
74
+ title="PDF OCR Extractor with Download",
75
+ description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
76
+ article="Powered by PyMuPDF, Tesseract, and Gradio.",
77
+ examples=[["sample.pdf"]] # You can add a sample pdf file in the same directory
78
  )
79
 
80
+ # Launch the app
81
  if __name__ == "__main__":
82
+ iface.launch()