File size: 2,452 Bytes
ade99c4
 
 
 
ded567d
 
 
ade99c4
59f49e8
04f9dd5
 
 
59f49e8
04f9dd5
 
 
 
 
 
ade99c4
ded567d
 
ade99c4
464541c
ded567d
 
 
 
 
464541c
ded567d
ade99c4
 
ded567d
 
ade99c4
464541c
ade99c4
 
ded567d
 
 
 
 
ade99c4
 
 
 
 
 
04f9dd5
464541c
04f9dd5
464541c
04f9dd5
 
 
464541c
ade99c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import tempfile
import textwrap
from pathlib import Path

import docx2txt
import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline

# Load translation models
def _build_translation_pipelines():
    """Construct the two Hugging Face translation pipelines.

    Kept free of error handling on purpose: a failure propagates as an
    exception instead of a ``(None, None)`` return value, so a failed load is
    never stored in the cache and a later rerun can try again.

    Returns:
        A ``(translator_en, translator_ur)`` tuple of pipelines.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


def load_translation_models():
    """Return the translation pipelines, building them at most once.

    Streamlit re-executes the whole script on every widget interaction, so
    building the pipelines directly would reload both models on each rerun.
    The builder is therefore run through ``st.cache_resource``.

    Returns:
        ``(translator_en, translator_ur)`` on success, or ``(None, None)``
        after reporting the error with ``st.error`` if initialisation fails.
    """
    try:
        # Applied at call time rather than as a module-level decorator; the
        # cache lives in the Streamlit server process, not in this wrapper.
        return st.cache_resource(_build_translation_pipelines)()
    except Exception as e:
        st.error(f"Error initializing translation models: {e}")
        return None, None

# Load both pipelines when the script runs. load_translation_models() returns
# (None, None) on failure, so either name may be None; the UI code below
# checks them before use.
translator_en, translator_ur = load_translation_models()

def extract_text_from_pdf_with_ocr(file_path, dpi=300):
    """Extract text from an image-based PDF using OCR.

    Args:
        file_path: Path of the PDF file on disk.
        dpi: Resolution used when rendering the PDF pages to images.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        The recognised text of every page, each page followed by a newline.
        If rendering or OCR raises, the error is reported with ``st.error``
        and the text recognised before the failure (possibly ``""``) is
        returned.
    """
    page_texts = []
    try:
        # convert_from_path already returns PIL images, which pytesseract
        # accepts directly, so no Image.fromarray round-trip is needed.
        for page in convert_from_path(file_path, dpi=dpi):
            page_texts.append(pytesseract.image_to_string(page) + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)

# Streamlit UI for document translation

# Maximum number of characters sent to a translation pipeline per input.
# Keeps each input short so long documents are translated piece by piece
# instead of as one oversized sequence.
_MAX_CHUNK_CHARS = 400


def _extract_uploaded_text(uploaded_file):
    """Return the text of an uploaded PDF (via OCR) or Word document.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``.

    Returns:
        The extracted text, or ``""`` if the Word document could not be read
        (the error is reported with ``st.error``).
    """
    # Compare case-insensitively so "SCAN.PDF" is still treated as a PDF.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The upload only exists in memory, while the OCR helper needs a path
        # on disk. A fixed name inside a private temp directory also avoids
        # building a filesystem path from the user-supplied file name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "upload.pdf"
            pdf_path.write_bytes(uploaded_file.getvalue())
            return extract_text_from_pdf_with_ocr(str(pdf_path))

    try:
        # docx2txt opens its argument with zipfile, which accepts the
        # in-memory upload directly.
        return docx2txt.process(uploaded_file)
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""


def _translate_in_chunks(text, translator):
    """Translate *text* with *translator*, one bounded chunk at a time.

    Args:
        text: The text to translate.
        translator: A translation pipeline; called with a list of strings and
            expected to return one ``{"translation_text": ...}`` dict per
            input.

    Returns:
        The translated chunks joined by newlines, or ``""`` when *text*
        contains nothing but whitespace.
    """
    # textwrap.wrap collapses whitespace and never yields empty chunks.
    chunks = textwrap.wrap(text, width=_MAX_CHUNK_CHARS)
    if not chunks:
        return ""
    results = translator(chunks)
    return "\n".join(item["translation_text"] for item in results)


st.title("📚 Multilingual Document Translator with OCR Support")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    text_content = _extract_uploaded_text(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        translator = translator_en if target_language == "English" else translator_ur
        if not text_content or not text_content.strip():
            st.warning("No text found to translate. Please upload a valid document.")
        elif translator is None:
            st.warning("Translation model not loaded successfully.")
        else:
            st.subheader(f"Translated Text ({target_language})")
            try:
                translated_text = _translate_in_chunks(text_content, translator)
            except Exception as e:
                st.error(f"Error during translation: {e}")
            else:
                # Only rendered when a translation exists, so the output
                # widget never reads an unbound variable.
                st.text_area("Translation Output", translated_text, height=300)