Spaces:

wwydmanski
/

tesseract-ocr

Runtime error

Witold Wydmański committed on Mar 12, 2023

Commit

1506ae7

1 Parent(s): 02d986d

feat: replace pytesseract with tesserocr

Files changed (3) hide show

Dockerfile CHANGED Viewed

@@ -2,7 +2,7 @@ FROM python:3.10-slim
 WORKDIR /code
 # Install tesseract
-RUN apt-get update && apt-get install -y tesseract-ocr
 # Install python dependencies
 COPY requirements.txt .

 WORKDIR /code
 # Install tesseract
+RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
 # Install python dependencies
 COPY requirements.txt .

app.py CHANGED Viewed

@@ -1,13 +1,19 @@
 import gradio as gr
 import tempfile
-import pytesseract
 import os
 import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
 from PIL import Image
 import logging
 logging.basicConfig(level=logging.INFO)
 def pdf_to_image(pdf_file, path, progress, max_pages):
     # Convert the PDF to a PNG image using pdf2image
     doc = fitz.open(pdf_file.name)  # open document
@@ -25,6 +31,8 @@ def pdf_to_image(pdf_file, path, progress, max_pages):
     return fnames
 def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
     # Run OCR on the image using Tesseract
     with tempfile.TemporaryDirectory() as path:
         images = pdf_to_image(image, path, progress, max_pages)
@@ -33,7 +41,8 @@ def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
             with open(img, 'rb') as f:
                 img = Image.open(f)
                 img.load()
-                text = pytesseract.image_to_string(img, lang=language)
                 text_res.append(text)
     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
@@ -43,7 +52,6 @@ def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
 if __name__ == "__main__":
     logging.info("Starting Tesseract OCR")
-    os.environ["TESSDATA_PREFIX"] = "./tessdata"
     iface = gr.Interface(
         fn=tesseract_ocr,
         inputs=[

 import gradio as gr
 import tempfile
+import tesserocr
 import os
 import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
 from PIL import Image
 import logging
+from multiprocessing.pool import Pool
 logging.basicConfig(level=logging.INFO)
+APIs = {
+    "pol": tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
+    "eng": tesserocr.PyTessBaseAPI(),
+}
 def pdf_to_image(pdf_file, path, progress, max_pages):
     # Convert the PDF to a PNG image using pdf2image
     doc = fitz.open(pdf_file.name)  # open document
     return fnames
 def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
+    api = APIs[language]
     # Run OCR on the image using Tesseract
     with tempfile.TemporaryDirectory() as path:
         images = pdf_to_image(image, path, progress, max_pages)
             with open(img, 'rb') as f:
                 img = Image.open(f)
                 img.load()
+                api.SetImage(img)
+                text = api.GetUTF8Text()
                 text_res.append(text)
     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
 if __name__ == "__main__":
     logging.info("Starting Tesseract OCR")
     iface = gr.Interface(
         fn=tesseract_ocr,
         inputs=[

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ pytesseract
 pymupdf
 gradio
 pillow
-tqdm

 pymupdf
 gradio
 pillow
+tqdm
+tesserocr