drewThomasson committed on
Commit
52020a7
·
verified ·
1 Parent(s): be572f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -73
app.py CHANGED
@@ -3,80 +3,38 @@ import pytesseract
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
-
7
- def ocr_pdf(pdf_file):
8
- """
9
- Performs OCR on a given PDF file.
10
-
11
- Args:
12
- pdf_file: An uploaded file object from Gradio.
13
-
14
- Returns:
15
- A tuple containing:
16
- - The path to the generated .txt file.
17
- - The extracted text content as a string.
18
- """
19
- if pdf_file is None:
20
- return None, "Please upload a PDF file first."
21
-
22
- file_path = pdf_file.name
23
-
24
- try:
25
- # Use a temporary directory that is automatically cleaned up
26
- with tempfile.TemporaryDirectory() as temp_dir:
27
- # Convert PDF pages to images
28
- try:
29
- images = convert_from_path(file_path, output_folder=temp_dir)
30
- except Exception as e:
31
- error_message = (
32
- "Failed to convert PDF. Please ensure Poppler is installed and in your system's PATH.\n"
33
- f"Details: {e}"
34
- )
35
- return None, error_message
36
-
37
- # Extract text from each page image
38
- full_extracted_text = ""
39
- for i, image in enumerate(images):
40
- text = pytesseract.image_to_string(image, lang='eng')
41
- full_extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
42
-
43
- if not full_extracted_text.strip():
44
- full_extracted_text = "No text could be extracted. The PDF might contain only images without text or be empty."
45
-
46
- # Create a temporary file for the extracted text. Gradio will handle serving it.
47
- # We use a NamedTemporaryFile to ensure it has a path and is cleaned up.
48
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt", encoding="utf-8") as f:
49
- f.write(full_extracted_text)
50
- output_txt_path = f.name
51
-
52
- # Return both the file path for download and the text for display
53
- return output_txt_path, full_extracted_text
54
-
55
- except Exception as e:
56
- return None, f"An unexpected error occurred: {e}"
57
-
58
- # --- Gradio Interface Definition ---
59
- # We define the user interface for our application.
60
  iface = gr.Interface(
61
- fn=ocr_pdf,
62
- inputs=gr.File(label="Upload PDF File", type="file"),
63
- outputs=[
64
- gr.File(label="Download Extracted Text (.txt)"),
65
- gr.Textbox(label="Extracted Text Content", lines=20, interactive=False)
66
- ],
67
- title="📄 PDF to Text Extractor (OCR)",
68
- description=(
69
- "Upload a PDF file to extract its text. The content will be displayed below, "
70
- "and you'll get a link to download it as a .txt file. "
71
- "**Note:** This tool relies on OCR and may not be 100% accurate, especially with complex layouts or poor quality scans."
72
- ),
73
- allow_flagging="never",
74
- examples=[
75
- # You can place paths to example PDFs on the server here if you have any.
76
- # ["path/to/your/example.pdf"]
77
- ]
78
  )
79
 
80
- # --- Launch the Application ---
81
  if __name__ == "__main__":
82
- iface.launch()
 
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
+ import shutil
7
+
8
+ def ocr_pdf(file_path):
9
+ # Temporary directory for processing
10
+ with tempfile.TemporaryDirectory() as temp_dir:
11
+ # Convert PDF to images
12
+ images = convert_from_path(file_path, output_folder=temp_dir)
13
+
14
+ # Extract text from each page image
15
+ extracted_text = ""
16
+ for i, image in enumerate(images):
17
+ text = pytesseract.image_to_string(image)
18
+ extracted_text += f"\n{text}\n\n"
19
+
20
+ # Save the extracted text to a .txt file in a persistent location
21
+ output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
22
+ with open(output_txt_path, "w") as f:
23
+ f.write(extracted_text)
24
+
25
+ # Create a persistent file to serve for download
26
+ final_output_path = "/tmp/extracted_text.txt"
27
+ shutil.copy(output_txt_path, final_output_path) # Copy to a persistent location
28
+
29
+ return final_output_path
30
+
31
+ # Gradio Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  iface = gr.Interface(
33
+ fn=lambda file: ocr_pdf(file.name), # Pass file path instead of file object
34
+ inputs=gr.File(label="Upload PDF File"),
35
+ outputs=gr.File(label="Download Extracted Text (.txt)"), # Outputs a downloadable .txt file
36
+ title="PDF to Text OCR"
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  )
38
 
 
39
  if __name__ == "__main__":
40
+ iface.launch()