Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 28

Commit

f53330e

verified ·

1 Parent(s): 589a63e

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -23

app.py CHANGED Viewed

@@ -1,46 +1,70 @@
 import streamlit as st
-import docx2txt
-from transformers import pipeline
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
-import os
-# Initialize translation models
 def load_translation_models():
-    """Load translation models."""
-    try:
-        translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
-        translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
-        return translator_en, translator_ur
-    except Exception as e:
-        st.error(f"Error initializing translation models: {e}")
-        return None, None
 translator_en, translator_ur = load_translation_models()
 def extract_text_from_pdf_with_ocr(file_path):
-    """Extract text from image-based PDF using OCR."""
     text = ""
     try:
-        # Convert PDF to images with Poppler support
-        pages = convert_from_path(file_path, 300, poppler_path="/path-to-poppler-bin")  # Update poppler_path on Windows if necessary
-        for page in pages:
-            text += pytesseract.image_to_string(page) + "\n"
     except Exception as e:
         st.error(f"Error during OCR extraction: {e}")
     return text
-# Streamlit UI for document translation
-st.title("📚 Multilingual Document Translator with OCR Support")
-uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
 target_language = st.radio("Select target language for translation", ["English", "Urdu"])
 if uploaded_file:
     file_path = f"/mnt/data/{uploaded_file.name}"
-    # OCR-based text extraction for PDFs
-    text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_from_word(uploaded_file)
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")

 import streamlit as st
+from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
 import pytesseract
 from pdf2image import convert_from_path
 from PIL import Image
+import torch
+# Image-to-Text Model (TrOCR)
+def load_image_to_text_model():
+    tokenizer = AutoTokenizer.from_pretrained("microsoft/trocr-large-printed")
+    model = AutoModelForImageTextToText.from_pretrained("microsoft/trocr-large-printed")
+    return tokenizer, model
+def extract_text_with_trocr(image, tokenizer, model):
+    """Extract text from an image using TrOCR."""
+    pixel_values = tokenizer(image, return_tensors="pt").pixel_values
+    outputs = model.generate(pixel_values)
+    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+# Multilingual Translation Models
 def load_translation_models():
+    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
+    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
+    return translator_en, translator_ur
+# Instantiate the models at script start. `tokenizer` and `trocr_model` are
+# read as module-level names by extract_text_from_pdf_with_ocr and by the UI
+# code further down.
 translator_en, translator_ur = load_translation_models()
+tokenizer, trocr_model = load_image_to_text_model()
 def extract_text_from_pdf_with_ocr(file_path):
+    """Extract text from image-based PDF using TrOCR."""
     text = ""
     try:
+        pages = convert_from_path(file_path, 300)
+        for page_image in pages:
+            text += extract_text_with_trocr(page_image, tokenizer, trocr_model) + "\n"
     except Exception as e:
         st.error(f"Error during OCR extraction: {e}")
     return text
+# Translation Function
+def translate_text(text, translator):
+    """Translate text into the selected language."""
+    max_chunk_size = 512
+    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
+    translations = []
+    for chunk in text_chunks:
+        try:
+            result = translator(chunk)
+            translations.append(result[0]['translation_text'])
+        except Exception as e:
+            st.error(f"Error during translation: {e}")
+            return ""
+    return " ".join(translations)
+# Streamlit UI
+st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
+st.write("Translate image-based PDF or image files using advanced models.")
+uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
 target_language = st.radio("Select target language for translation", ["English", "Urdu"])
 if uploaded_file:
     file_path = f"/mnt/data/{uploaded_file.name}"
+    # Image-based PDF processing using TrOCR
+    text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")