File size: 3,188 Bytes
ade99c4
 
 
 
04f9dd5
ade99c4
59f49e8
04f9dd5
59f49e8
04f9dd5
 
59f49e8
04f9dd5
 
 
 
 
 
ade99c4
 
04f9dd5
ade99c4
464541c
 
 
04f9dd5
464541c
 
ade99c4
 
 
04f9dd5
464541c
 
 
 
 
ade99c4
464541c
04f9dd5
 
464541c
 
 
 
 
 
 
 
 
 
 
ade99c4
 
464541c
 
ade99c4
 
464541c
ade99c4
 
464541c
ade99c4
 
 
 
 
 
 
 
 
464541c
ade99c4
 
 
04f9dd5
464541c
04f9dd5
464541c
04f9dd5
 
 
464541c
ade99c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import streamlit as st
import PyPDF2
import docx2txt
from transformers import pipeline
import sentencepiece

# Load translation models
@st.cache_resource
def _build_translation_pipelines():
    """Construct the two translation pipelines once and reuse them.

    Streamlit re-executes this script on every widget interaction; wrapping
    the construction in ``st.cache_resource`` keeps the pipelines in memory
    instead of rebuilding them on each rerun. Exceptions are deliberately
    not handled here so the (None, None) fallback is produced by the
    uncached wrapper and a failed load is attempted again on the next rerun
    (relies on Streamlit not storing a result when the function raises).
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur

def load_translation_models():
    """Load translation models for English and Urdu.

    Returns:
        A ``(translator_en, translator_ur)`` tuple of pipelines, or
        ``(None, None)`` if either pipeline could not be initialised. In the
        failure case the error is also reported in the UI via ``st.error``.
    """
    try:
        return _build_translation_pipelines()
    except Exception as e:
        # Top-level boundary: surface the problem in the UI instead of
        # crashing the app; callers check for None before translating.
        st.error(f"Error initializing translation models: {e}")
        return None, None

translator_en, translator_ur = load_translation_models()

def extract_text_from_pdf(file):
    """Return the text of all pages of a PDF, concatenated in page order.

    ``file`` is handed directly to ``PyPDF2.PdfReader``. Pages for which
    ``extract_text()`` yields nothing contribute an empty string. If reading
    fails part-way, the error is shown with ``st.error`` and whatever text
    was gathered before the failure is still returned.
    """
    pieces = []
    try:
        for pdf_page in PyPDF2.PdfReader(file).pages:
            page_text = pdf_page.extract_text()
            pieces.append(page_text if page_text else "")
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    # Join outside the try so partial results survive an extraction error.
    return "".join(pieces)

def extract_text_from_word(file):
    """Return the text of a Word document, or "" if extraction fails.

    ``file`` is passed straight to ``docx2txt.process``. Any exception is
    reported in the UI through ``st.error`` and an empty string is returned
    in its place.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        return ""
    return document_text

def _split_into_chunks(text, max_chunk_size):
    """Split text into stripped pieces of at most max_chunk_size characters.

    Cuts are moved back to the nearest preceding whitespace so words are not
    split in half; a single run of non-whitespace longer than the limit is
    hard-split because no better boundary exists. Pieces that are empty
    after stripping are dropped.
    """
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_chunk_size, total)
        # Only back off when the cut would land inside a word.
        if end < total and not text[end].isspace():
            boundary = end
            while boundary > start and not text[boundary - 1].isspace():
                boundary -= 1
            # boundary == start means the window holds no whitespace at all;
            # keep the hard cut so the loop always makes progress.
            if boundary > start:
                end = boundary
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks

def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    Args:
        text: The text to translate. Empty, None or whitespace-only input
            yields "" without calling the translator.
        translator: A callable taking a string and returning a sequence whose
            first item is a mapping with a 'translation_text' key.
        max_chunk_size: Maximum number of characters per chunk sent to the
            translator (a character count, not a token count).

    Returns:
        The translated chunks joined with single spaces, or "" if the input
        was blank or any chunk failed to translate (the failure is reported
        with ``st.error``).

    Raises:
        ValueError: If max_chunk_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text or not text.strip():
        return ""

    translations = []
    for chunk in _split_into_chunks(text, max_chunk_size):
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            # Abort the whole translation: a partial result would silently
            # drop part of the document.
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)

# Streamlit UI
st.title("📚 Multilingual Document Translator")
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")

uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Extract text from the uploaded file. The extension is compared in
    # lower case so a name such as "REPORT.PDF" is not sent to the Word
    # extractor.
    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf(uploaded_file)
    else:
        text_content = extract_text_from_word(uploaded_file)

    # Show extracted text preview
    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    # Perform translation when the user clicks the button
    if st.button("Translate"):
        if text_content:
            # Either translator is None when model loading failed.
            translator = {"English": translator_en, "Urdu": translator_ur}.get(target_language)
            if not translator:
                st.warning("Translation model not loaded successfully.")
            else:
                # Render the output only on this branch: previously the text
                # area was drawn even when no translation had run, reading
                # a variable that was never assigned.
                st.subheader(f"Translated Text ({target_language})")
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
        else:
            st.warning("No text found to translate. Please upload a valid document.")