File size: 2,858 Bytes
ade99c4
 
 
 
 
464541c
 
 
 
 
 
ade99c4
 
464541c
ade99c4
464541c
 
 
 
 
 
ade99c4
 
 
464541c
 
 
 
 
 
ade99c4
464541c
 
 
 
 
 
 
 
 
 
 
 
 
 
ade99c4
 
464541c
 
ade99c4
 
464541c
ade99c4
 
464541c
ade99c4
 
 
 
 
 
 
 
 
464541c
ade99c4
 
 
464541c
 
 
 
 
 
ade99c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import textwrap

import docx2txt
import PyPDF2
import streamlit as st
from transformers import pipeline

# Initialize Hugging Face translation pipelines (force the PyTorch backend).
@st.cache_resource
def _load_translators():
    """Build the multilingual->English and multilingual->Urdu pipelines.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the constructed pipelines alive across those
    reruns so the models are not rebuilt each time.

    Returns:
        A ``(to_english, to_urdu)`` tuple of translation pipelines.
    """
    to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur", framework="pt")
    return to_english, to_urdu


try:
    translator_en, translator_ur = _load_translators()
except Exception as e:
    st.error(f"Failed to initialize translation models. Error: {e}")
    # The translators are undefined at this point; stop this script run so the
    # code below does not fail with a NameError when it references them.
    st.stop()

def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF source handed to ``PyPDF2.PdfReader`` (the UI passes the
            Streamlit upload object).

    Returns:
        The non-empty page texts joined by newlines. If reading fails part-way
        through, an error is shown via ``st.error`` and the text collected so
        far is returned (an empty string when nothing was read).
    """
    page_texts = []
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Pages without a text layer (e.g. scanned images) may yield
            # nothing; skip them instead of concatenating an empty/None value.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts)

def extract_text_from_word(file):
    """Return the text of a Word document, or "" if it cannot be read.

    The document is handed to ``docx2txt.process``; any exception is reported
    to the user through ``st.error`` rather than propagated.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as exc:
        st.error(f"Error extracting text from Word document: {exc}")
        document_text = ""
    return document_text

def translate_text(text, translator, max_chunk_size=512):
    """Translate ``text`` chunk by chunk using the given translator.

    The text is split on whitespace into chunks of at most ``max_chunk_size``
    characters so that words are never cut in half between two translator
    calls. Only a single word longer than the limit is split mid-word. Runs
    of whitespace (including newlines) are collapsed to single spaces by the
    chunking.

    Args:
        text: The text to translate.
        translator: A callable taking a string and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk; must be > 0.

    Returns:
        The translated chunks joined by single spaces. Returns "" when
        ``text`` is empty or whitespace-only, or when any chunk fails to
        translate (the failure is reported through ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive (from ``textwrap``).
    """
    # textwrap breaks on whitespace; hyphen breaking is disabled so hyphenated
    # words also stay intact.
    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)

# Streamlit UI: upload a document, preview the extracted text, and translate
# it into the selected language on demand.
st.title("📚 Multilingual Document Translator")
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")

uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Choose the extractor from the file extension. The comparison ignores
    # case so that a name such as "REPORT.PDF" is still handled as a PDF
    # instead of being passed to the Word extractor.
    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf(uploaded_file)
    else:
        text_content = extract_text_from_word(uploaded_file)

    # Show only the first 500 characters as a preview.
    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    # Perform translation when the user clicks the button.
    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            if target_language == "English":
                translated_text = translate_text(text_content, translator_en)
            else:
                translated_text = translate_text(text_content, translator_ur)

            st.text_area("Translation Output", translated_text, height=300)
        else:
            st.warning("No text found to translate. Please upload a valid document.")