|
import streamlit as st |
|
import PyPDF2 |
|
import docx2txt |
|
from transformers import pipeline |
|
import sentencepiece |
|
|
|
|
|
# NOTE: st.cache_resource requires Streamlit >= 1.18.
@st.cache_resource(show_spinner=False)
def _load_translation_pipelines():
    """Build the two translation pipelines.

    Streamlit re-executes the whole script on every widget interaction, so
    this helper is cached: the pipelines are constructed once and the same
    objects are handed back on later reruns. Exceptions propagate to the
    caller (and are therefore not cached), so a failed load is retried on the
    next rerun.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


def load_translation_models():
    """Load translation models for English and Urdu.

    Returns:
        A ``(translator_en, translator_ur)`` tuple of pipelines, or
        ``(None, None)`` if either pipeline could not be initialised. In the
        failure case the error is also shown in the app via ``st.error``.
    """
    try:
        return _load_translation_pipelines()
    except Exception as e:
        # Top-level boundary for the UI: report the problem and let the page
        # render without translators instead of crashing the script.
        st.error(f"Error initializing translation models: {e}")
        return None, None
|
|
|
# Fetch both translators once per script run; each name is None when loading
# failed (load_translation_models returns (None, None) in that case).
_loaded_translators = load_translation_models()
translator_en, translator_ur = _loaded_translators
|
|
|
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts (the app passes the
            uploaded file object).

    Returns:
        The page texts joined with no separator. If reading fails part-way,
        the error is shown via ``st.error`` and the text gathered from the
        pages read so far is returned (possibly an empty string).
    """
    page_texts = []
    try:
        reader = PyPDF2.PdfReader(file)
        for pdf_page in reader.pages:
            # extract_text() may yield nothing for a page; treat that as "".
            page_texts.append(pdf_page.extract_text() or "")
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    # Joined outside the try block so pages read before a failure are kept.
    return "".join(page_texts)
|
|
|
def extract_text_from_word(file):
    """Return the text of a Word document.

    Args:
        file: Whatever ``docx2txt.process`` accepts (the app passes the
            uploaded file object).

    Returns:
        The result of ``docx2txt.process(file)``, or an empty string if it
        raised; in that case the error is shown via ``st.error``.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        document_text = ""
    return document_text
|
|
|
def _split_into_chunks(text, max_chunk_size):
    """Split *text* on whitespace into chunks of at most *max_chunk_size* characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single token longer than the limit (for example text written
        # without spaces) cannot be kept whole, so slice it at fixed width.
        while len(word) > max_chunk_size:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chunk_size])
            word = word[max_chunk_size:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chunk_size:
            current = candidate
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    The text is split at whitespace so that no word is cut in half; slicing
    at a fixed character offset would split words, and re-joining the
    translated pieces with spaces would then leave a stray space inside them.

    Args:
        text: The text to translate. Runs of whitespace (including newlines)
            are treated as single separators.
        translator: Callable taking one string and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters passed to ``translator``
            per call. Must be at least 1.

    Returns:
        The translated chunks joined with single spaces. Returns ``""`` for
        empty input, or if any chunk fails to translate (the error is shown
        via ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text:
        return ""

    translations = []
    for chunk in _split_into_chunks(text, max_chunk_size):
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            # A partial translation would be misleading, so report the
            # failure and return nothing.
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
|
|
|
|
|
# ---------------------------------------------------------------------------
# Page layout: upload a document, preview its text, translate it on demand.
# ---------------------------------------------------------------------------
st.title("π Multilingual Document Translator")
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")

uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Compare the extension case-insensitively so a file named "REPORT.PDF"
    # is not handed to the Word extractor.
    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf(uploaded_file)
    else:
        text_content = extract_text_from_word(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            # Pick the pipeline for the chosen language; it is None when the
            # models failed to load.
            translator = {"English": translator_en, "Urdu": translator_ur}.get(target_language)
            if translator:
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Without a translator there is nothing to display; rendering
                # the output box here would reference an unassigned variable.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")
|
|