Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

File size: 2,452 Bytes

ade99c4
 
 
 
ded567d
 
 
ade99c4
59f49e8
04f9dd5
 
 
59f49e8
04f9dd5
 
 
 
 
 
ade99c4
ded567d
 
ade99c4
464541c
ded567d
 
 
 
 
464541c
ded567d
ade99c4
 
ded567d
 
ade99c4
464541c
ade99c4
 
ded567d
 
 
 
 
ade99c4
 
 
 
 
 
04f9dd5
464541c
04f9dd5
464541c
04f9dd5
 
 
464541c
ade99c4

import os
import tempfile

import docx2txt
import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline

# Load translation models
@st.cache_resource
def _build_translation_pipelines():
    """Create the two Hugging Face translation pipelines.

    Wrapped in ``st.cache_resource`` so the models are built once per server
    process rather than on every Streamlit script rerun. An exception raised
    here propagates to the caller and is not cached, so a failed load is
    retried on the next run.

    Returns:
        A tuple ``(translator_en, translator_ur)``: the pipeline built from
        ``Helsinki-NLP/opus-mt-mul-en`` and the one built from
        ``Helsinki-NLP/opus-mt-en-ur``.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


def load_translation_models():
    """Return the translation pipelines, reporting load failures in the UI.

    Returns:
        ``(translator_en, translator_ur)`` on success. If either pipeline
        cannot be created, the error is shown with ``st.error`` and
        ``(None, None)`` is returned.
    """
    try:
        return _build_translation_pipelines()
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing.
        st.error(f"Error initializing translation models: {e}")
        return None, None

# Load both pipelines when the script runs. load_translation_models() returns
# (None, None) on failure, so each name must be checked before it is used.
translator_en, translator_ur = load_translation_models()

def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF by running OCR on each page.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The recognised text of every processed page, each followed by a
        newline. If an error occurs it is reported with ``st.error`` and the
        text recognised before the failure (possibly ``""``) is returned.
    """
    page_texts = []
    try:
        # Render each PDF page to an image at 300 DPI.
        pages = convert_from_path(file_path, dpi=300)
        for page in pages:
            # pdf2image already yields PIL images, which pytesseract accepts
            # directly; the former Image.fromarray() round-trip only copied
            # the pixel data.
            page_texts.append(pytesseract.image_to_string(page) + "\n")
    except Exception as e:
        # Best effort: keep whatever was recognised before the failure.
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)

def extract_text_from_word(file):
    """Extract plain text from a Word (.docx) document.

    Args:
        file: Path or binary file-like object holding the .docx data, such as
            the object returned by ``st.file_uploader``.

    Returns:
        The extracted text, or ``""`` if extraction failed (the error is
        shown with ``st.error``).
    """
    try:
        return docx2txt.process(file)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        return ""


def _split_into_chunks(text, max_chars):
    """Split text on whitespace into pieces of at most ``max_chars`` characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single token longer than the limit (e.g. text without spaces)
        # is cut into fixed-size slices.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, translator, max_chars=400):
    """Translate ``text`` with a Hugging Face translation pipeline.

    The text is translated in chunks because the models accept only a limited
    input length per call; line breaks in the input are not preserved.

    Args:
        text: The text to translate.
        translator: A callable translation pipeline returning a list of dicts
            with a ``"translation_text"`` key.
        max_chars: Maximum number of characters sent to the model per call.

    Returns:
        The translated chunks joined by single spaces (``""`` for empty or
        whitespace-only input).
    """
    translated_chunks = [
        translator(chunk)[0]["translation_text"]
        for chunk in _split_into_chunks(text, max_chars)
    ]
    return " ".join(translated_chunks)


# Streamlit UI for document translation
st.title("📚 Multilingual Document Translator with OCR Support")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # OCR-based text extraction for PDFs, docx2txt for Word documents.
    if uploaded_file.name.lower().endswith(".pdf"):
        # Streamlit keeps the upload in memory, but the OCR helper needs a
        # real file on disk, so write the bytes to a temporary location that
        # is removed again once extraction is done.
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, "upload.pdf")
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(uploaded_file.getvalue())
            text_content = extract_text_from_pdf_with_ocr(file_path)
    else:
        text_content = extract_text_from_word(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if not text_content:
            st.warning("No text found to translate. Please upload a valid document.")
        else:
            translators = {"English": translator_en, "Urdu": translator_ur}
            translator = translators.get(target_language)
            if translator is None:
                # Model loading failed earlier; there is nothing to display.
                st.warning("Translation model not loaded successfully.")
            else:
                translated_text = translate_text(text_content, translator)
                st.subheader(f"Translated Text ({target_language})")
                st.text_area("Translation Output", translated_text, height=300)