Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,82 +1,62 @@
|
|
1 |
import gradio as gr
|
2 |
-
import
|
3 |
-
from
|
4 |
-
import pytesseract
|
5 |
-
import
|
6 |
-
import tempfile
|
7 |
-
import os
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
This function takes an uploaded PDF, converts each page to an image, uses
|
14 |
-
Tesseract OCR to extract text, and then returns both the concatenated text
|
15 |
-
for display and a path to a temporary .txt file for download.
|
16 |
-
|
17 |
-
Args:
|
18 |
-
pdf_file (gradio.File): The uploaded PDF file object from Gradio.
|
19 |
-
|
20 |
-
Returns:
|
21 |
-
tuple[str, str | None]: A tuple containing the extracted text and the
|
22 |
-
filepath for the downloadable text file.
|
23 |
-
Returns (error_message, None) on failure.
|
24 |
-
"""
|
25 |
-
if pdf_file is None:
|
26 |
-
return "Please upload a PDF file.", None
|
27 |
|
|
|
28 |
try:
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
# Convert the page to an image (pixmap)
|
38 |
-
pix = page.get_pixmap(dpi=300) # Increase DPI for better OCR quality
|
39 |
-
|
40 |
-
# Convert the pixmap to a PIL Image
|
41 |
-
img_data = pix.tobytes("png")
|
42 |
-
image = Image.open(io.BytesIO(img_data))
|
43 |
-
|
44 |
-
# Use Tesseract to do OCR on the image
|
45 |
-
text = pytesseract.image_to_string(image)
|
46 |
-
extracted_text += f"--- Page {page_num + 1} ---\n{text}\n\n"
|
47 |
-
|
48 |
-
pdf_document.close()
|
49 |
-
|
50 |
-
if not extracted_text.strip():
|
51 |
-
return "No text could be extracted from the PDF.", None
|
52 |
-
|
53 |
-
# Create a temporary file to store the extracted text
|
54 |
-
# delete=False is important so Gradio can access the file
|
55 |
-
with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt", encoding="utf-8") as temp_file:
|
56 |
-
temp_file.write(extracted_text)
|
57 |
-
temp_filepath = temp_file.name
|
58 |
-
|
59 |
-
# Return the text for the textbox and the filepath for the download button
|
60 |
-
return extracted_text, temp_filepath
|
61 |
-
|
62 |
except Exception as e:
|
63 |
-
|
64 |
-
return f"An error occurred: {str(e)}", None
|
65 |
|
66 |
-
#
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
outputs=[
|
71 |
-
gr.Textbox(label="Extracted Text
|
72 |
-
gr.File(label="Download
|
73 |
],
|
74 |
-
title="PDF
|
75 |
-
description="
|
76 |
-
|
77 |
-
examples=[["sample.pdf"]] # You can add a sample pdf file in the same directory
|
78 |
)
|
79 |
|
80 |
-
|
81 |
-
if __name__ == "__main__":
|
82 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import tempfile, os
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
import pytesseract, pdfplumber, camelot
|
5 |
+
from PIL import Image, ImageOps
|
|
|
|
|
6 |
|
7 |
+
# ✅ Must be named "file" for Gradio API to detect correctly
|
8 |
+
def extract_text_from_pdf(file):
    """Extract text and tables from an uploaded PDF, with an OCR fallback.

    Three stages run in order:

    1. pdfplumber: layout-preserving page text plus any tables it finds.
    2. Camelot (``lattice`` flavor): tables, appended as CSV.
    3. OCR: only when stages 1-2 produced fewer than 100 characters, every
       page is rendered at 300 DPI and passed to Tesseract.

    Failures in stages 1 and 2 are printed and skipped so the later stages
    still run. Errors raised by the OCR stage, or while writing the output
    file, propagate to the caller.

    Args:
        file: The uploaded PDF, given either as a filesystem path or as an
            object exposing that path through ``.name``. ``None`` means
            nothing was uploaded.

    Returns:
        tuple[str, str | None]: The extracted text and the path of a
        UTF-8 ``extracted_text.txt`` file holding the same text. When
        ``file`` is ``None`` the result is
        ``("Please upload a PDF file.", None)``.
    """
    if file is None:
        return "Please upload a PDF file.", None

    # Accept both a plain path and a file-like wrapper carrying ``.name``.
    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name
    extracted = []

    # 1. Extract using pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(text)
                for table in page.extract_tables():
                    # Cells can be None; str.join would raise TypeError and
                    # the except below would then drop every remaining page.
                    rows = (
                        ", ".join("" if cell is None else str(cell) for cell in row)
                        for row in table
                    )
                    extracted.append("TABLE:\n" + "\n".join(rows))
    except Exception as e:
        print("pdfplumber error:", e)

    # 2. Table extraction with Camelot
    try:
        for table in camelot.read_pdf(pdf_path, pages="all", flavor="lattice"):
            extracted.append("CAMELOT TABLE:\n" + table.df.to_csv(index=False))
    except Exception as e:
        print("Camelot error:", e)

    # 3. OCR fallback if text is too short
    combined = "\n".join(extracted).strip()
    if len(combined) < 100:
        ocr_parts = [combined]
        for img in convert_from_path(pdf_path, dpi=300):
            # NOTE(review): the grayscale page is inverted before OCR, which
            # turns dark-on-light scans into light-on-dark. Tesseract's
            # guidance prefers dark text on a light background; confirm the
            # inversion is intended.
            img = ImageOps.invert(img.convert("L"))
            ocr_parts.append(
                pytesseract.image_to_string(img, config="--psm 6") + "\n"
            )
        combined = "".join(ocr_parts)

    # Save output in a fresh directory per call so concurrent requests do not
    # overwrite each other's download, while keeping the same file name.
    output_path = os.path.join(tempfile.mkdtemp(), "extracted_text.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(combined)

    return combined, output_path
|
48 |
+
|
49 |
+
# ✅ Use gr.Interface (NOT Blocks) with correct api_name
|
50 |
+
# Single-function UI: one PDF upload in; the extracted text and a
# downloadable .txt file out.
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
    outputs=[
        # NOTE(review): the emoji on this label was restored from a
        # mis-decoded byte sequence whose last byte was lost; confirm the
        # intended icon.
        gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="📥 Download .txt"),
    ],
    title="Advanced PDF Extractor",
    description="Extract text + tables + OCR from scanned/digital PDFs.",
    allow_flagging="never",
)

# Start the server only when run as a script, so importing this module
# (for example to reuse extract_text_from_pdf) does not block on launch().
if __name__ == "__main__":
    app.launch()
|
|
|
|