drewThomasson committed on
Commit
83d2c4f
·
verified ·
1 Parent(s): e19d42c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -72
app.py CHANGED
@@ -1,82 +1,62 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
- from PIL import Image
4
- import pytesseract
5
- import io
6
- import tempfile
7
- import os
8
 
9
- def pdf_to_text_ocr(pdf_file):
10
- """
11
- Extracts text from a PDF file using OCR, displays it, and provides a download link.
12
-
13
- This function takes an uploaded PDF, converts each page to an image, uses
14
- Tesseract OCR to extract text, and then returns both the concatenated text
15
- for display and a path to a temporary .txt file for download.
16
-
17
- Args:
18
- pdf_file (gradio.File): The uploaded PDF file object from Gradio.
19
-
20
- Returns:
21
- tuple[str, str | None]: A tuple containing the extracted text and the
22
- filepath for the downloadable text file.
23
- Returns (error_message, None) on failure.
24
- """
25
- if pdf_file is None:
26
- return "Please upload a PDF file.", None
27
 
 
28
  try:
29
- # Open the PDF file from the uploaded file's temporary path
30
- pdf_document = fitz.open(stream=pdf_file.file.read(), filetype="pdf")
31
- extracted_text = ""
32
-
33
- # Iterate through each page of the PDF
34
- for page_num in range(len(pdf_document)):
35
- page = pdf_document.load_page(page_num)
36
-
37
- # Convert the page to an image (pixmap)
38
- pix = page.get_pixmap(dpi=300) # Increase DPI for better OCR quality
39
-
40
- # Convert the pixmap to a PIL Image
41
- img_data = pix.tobytes("png")
42
- image = Image.open(io.BytesIO(img_data))
43
-
44
- # Use Tesseract to do OCR on the image
45
- text = pytesseract.image_to_string(image)
46
- extracted_text += f"--- Page {page_num + 1} ---\n{text}\n\n"
47
-
48
- pdf_document.close()
49
-
50
- if not extracted_text.strip():
51
- return "No text could be extracted from the PDF.", None
52
-
53
- # Create a temporary file to store the extracted text
54
- # delete=False is important so Gradio can access the file
55
- with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt", encoding="utf-8") as temp_file:
56
- temp_file.write(extracted_text)
57
- temp_filepath = temp_file.name
58
-
59
- # Return the text for the textbox and the filepath for the download button
60
- return extracted_text, temp_filepath
61
-
62
  except Exception as e:
63
- # Return the error message to the textbox and None for the file output
64
- return f"An error occurred: {str(e)}", None
65
 
66
- # Define the Gradio interface with two output components
67
- iface = gr.Interface(
68
- fn=pdf_to_text_ocr,
69
- inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  outputs=[
71
- gr.Textbox(label="Extracted Text (Scrollable)", lines=20, placeholder="Text from your PDF will appear here..."),
72
- gr.File(label="Download Extracted Text")
73
  ],
74
- title="PDF OCR Extractor with Download",
75
- description="Upload a PDF to extract its text. The text will be displayed below and a download link for a .txt file will be provided.",
76
- article="Powered by PyMuPDF, Tesseract, and Gradio.",
77
- examples=[["sample.pdf"]] # You can add a sample pdf file in the same directory
78
  )
79
 
80
- # Launch the app
81
- if __name__ == "__main__":
82
- iface.launch()
 
1
  import gradio as gr
2
+ import tempfile, os
3
+ from pdf2image import convert_from_path
4
+ import pytesseract, pdfplumber, camelot
5
+ from PIL import Image, ImageOps
 
 
6
 
7
+ # ✅ Must be named "file" for Gradio API to detect correctly
8
+ def extract_text_from_pdf(file):
9
+ extracted = []
10
+ pdf_path = file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ # 1. Extract using pdfplumber
13
  try:
14
+ with pdfplumber.open(pdf_path) as pdf:
15
+ for page in pdf.pages:
16
+ text = page.extract_text(layout=True)
17
+ if text:
18
+ extracted.append(text)
19
+ tables = page.extract_tables()
20
+ for table in tables:
21
+ extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  except Exception as e:
23
+ print("pdfplumber error:", e)
 
24
 
25
+ # 2. Table extraction with Camelot
26
+ try:
27
+ tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
28
+ for table in tables:
29
+ extracted.append("CAMELOT TABLE:\n" + table.df.to_csv(index=False))
30
+ except Exception as e:
31
+ print("Camelot error:", e)
32
+
33
+ # 3. OCR fallback if text is too short
34
+ combined = "\n".join(extracted).strip()
35
+ if len(combined) < 100:
36
+ images = convert_from_path(pdf_path, dpi=300)
37
+ for img in images:
38
+ img = img.convert("L")
39
+ img = ImageOps.invert(img)
40
+ combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"
41
+
42
+ # Save output
43
+ output_path = os.path.join(tempfile.gettempdir(), "extracted_text.txt")
44
+ with open(output_path, "w", encoding="utf-8") as f:
45
+ f.write(combined)
46
+
47
+ return combined, output_path
48
+
49
+ # ✅ Use gr.Interface (NOT Blocks) with correct api_name
50
+ app = gr.Interface(
51
+ fn=extract_text_from_pdf,
52
+ inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
53
  outputs=[
54
+ gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
55
+ gr.File(label="📥 Download .txt")
56
  ],
57
+ title="Advanced PDF Extractor",
58
+ description="Extract text + tables + OCR from scanned/digital PDFs.",
59
+ allow_flagging="never",
 
60
  )
61
 
62
+ app.launch()