File size: 3,188 Bytes
ade99c4 04f9dd5 ade99c4 59f49e8 04f9dd5 59f49e8 04f9dd5 59f49e8 04f9dd5 ade99c4 04f9dd5 ade99c4 464541c 04f9dd5 464541c ade99c4 04f9dd5 464541c ade99c4 464541c 04f9dd5 464541c ade99c4 464541c ade99c4 464541c ade99c4 464541c ade99c4 464541c ade99c4 04f9dd5 464541c 04f9dd5 464541c 04f9dd5 464541c ade99c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import streamlit as st
import PyPDF2
import docx2txt
from transformers import pipeline
import sentencepiece
# Load translation models
def _cache_resource_if_available(func):
    """Wrap *func* with ``st.cache_resource`` when this Streamlit provides it."""
    decorator = getattr(st, "cache_resource", None)
    return func if decorator is None else decorator(func)


@_cache_resource_if_available
def _build_translation_pipelines():
    """Construct the two translation pipelines.

    Exceptions propagate to the caller on purpose: a call that raises is not
    stored by the cache, so a failed load is retried on the next rerun
    instead of being remembered.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


def load_translation_models():
    """Load translation models for English and Urdu.

    Streamlit re-executes the whole script on every widget interaction, so
    the pipelines are built through a cached helper rather than on each call.

    Returns:
        A ``(translator_en, translator_ur)`` tuple. If either pipeline cannot
        be created, the error is shown with ``st.error`` and ``(None, None)``
        is returned.
    """
    try:
        return _build_translation_pipelines()
    except Exception as e:
        st.error(f"Error initializing translation models: {e}")
        return None, None
# Load both pipelines when the script runs; both names are None if loading failed.
translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string. If anything raises, the error is shown with ``st.error`` and the
    text gathered before the failure (possibly ``""``) is returned.
    """
    page_texts = []
    try:
        for pdf_page in PyPDF2.PdfReader(file).pages:
            page_texts.append(pdf_page.extract_text() or "")
    except Exception as exc:
        st.error(f"Error extracting text from PDF: {exc}")
    return "".join(page_texts)
def extract_text_from_word(file):
    """Return the text that ``docx2txt.process`` extracts from a Word file.

    On any exception the error is shown with ``st.error`` and an empty
    string is returned instead.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as exc:
        st.error(f"Error extracting text from Word document: {exc}")
        document_text = ""
    return document_text
def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    The text is split at whitespace so that no word is cut in half at a chunk
    boundary; only a single word longer than ``max_chunk_size`` is split.
    Line breaks and tabs in ``text`` are treated as plain spaces.

    Args:
        text: The text to translate. Empty or whitespace-only text yields "".
        translator: Callable taking a string and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum chunk length in characters (not model tokens).

    Returns:
        The translated chunks joined by single spaces, or "" if any chunk
        fails (the error is shown with ``st.error`` and the run is aborted).
    """
    # Function-scope import so this block does not depend on the file's import header.
    import textwrap

    if not text:
        return ""

    # break_on_hyphens=False keeps "well-known" in one chunk; otherwise the
    # final join would turn it into "well- known".
    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )

    translations = []
    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
# Streamlit UI: upload a document, preview its text, translate on demand.
st.title("📚 Multilingual Document Translator")
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")

uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Extract text from the uploaded file. The extension is compared
    # case-insensitively so that "REPORT.PDF" is not sent to the Word extractor.
    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf(uploaded_file)
    else:
        text_content = extract_text_from_word(uploaded_file)

    # Show extracted text preview (first 500 characters)
    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    # Perform translation when the user clicks the button
    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            # Pick the pipeline for the chosen language; it is None when
            # model loading failed.
            selected_translator = {
                "English": translator_en,
                "Urdu": translator_ur,
            }.get(target_language)
            if selected_translator:
                translated_text = translate_text(text_content, selected_translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Only warn here: there is no translation to display, and
                # reading translated_text on this path would raise NameError.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")
|