Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

tahirsher committed on Jan 28

Commit

2be5258

verified ·

1 Parent(s): cbb084d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,15 +1,16 @@
 import streamlit as st
-import PyPDF2
 import docx2txt
 from transformers import pipeline
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
-# Load translation models
 def load_translation_models():
     try:
-        translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
         translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
         return translator_en, translator_ur
     except Exception as e:
@@ -22,11 +23,10 @@ def extract_text_from_pdf_with_ocr(file_path):
     """Extract text from image-based PDF using OCR."""
     text = ""
     try:
-        # Convert PDF to images
-        pages = convert_from_path(file_path, 300)
         for page in pages:
-            image = Image.fromarray(page)
-            text += pytesseract.image_to_string(image) + "\n"
     except Exception as e:
         st.error(f"Error during OCR extraction: {e}")
     return text

 import streamlit as st
 import docx2txt
 from transformers import pipeline
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
+import os
+# Initialize translation models
 def load_translation_models():
+    """Load translation models."""
     try:
+        translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
         translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
         return translator_en, translator_ur
     except Exception as e:
     """Extract text from image-based PDF using OCR."""
     text = ""
     try:
+        # Convert PDF to images with Poppler support
+        pages = convert_from_path(file_path, 300, poppler_path="/path-to-poppler-bin")  # Update poppler_path on Windows if necessary
         for page in pages:
+            text += pytesseract.image_to_string(page) + "\n"
     except Exception as e:
         st.error(f"Error during OCR extraction: {e}")
     return text