File size: 2,452 Bytes
ade99c4 ded567d ade99c4 59f49e8 04f9dd5 59f49e8 04f9dd5 ade99c4 ded567d ade99c4 464541c ded567d 464541c ded567d ade99c4 ded567d ade99c4 464541c ade99c4 ded567d ade99c4 04f9dd5 464541c 04f9dd5 464541c 04f9dd5 464541c ade99c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os
import tempfile

import docx2txt
import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline
@st.cache_resource(show_spinner=False)
def _build_translation_pipelines():
    """Construct the two Hugging Face translation pipelines.

    Wrapped in ``st.cache_resource`` so the models are built once per server
    process instead of on every Streamlit rerun (each widget interaction
    re-executes the whole script). Exceptions propagate to the caller, so a
    failed attempt is not stored in the cache and can be retried on the next
    rerun.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


def load_translation_models():
    """Load the translation models used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple of pipelines built from
        "Helsinki-NLP/opus-mt-mul-en" and "Helsinki-NLP/opus-mt-en-ur". If
        loading fails, the error is shown with ``st.error`` and
        ``(None, None)`` is returned.
    """
    try:
        return _build_translation_pipelines()
    except Exception as e:
        # Report in the UI here, outside the cached helper, so the failure
        # itself is never cached.
        st.error(f"Error initializing translation models: {e}")
        return None, None
# Load both pipelines at module level; load_translation_models() returns
# (None, None) on failure, so either name may be None further down.
translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF using OCR.

    Every page is rendered to an image at 300 DPI and passed to Tesseract.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The recognised text with a newline appended after each page. If
        rendering or OCR raises, the error is reported with ``st.error`` and
        whatever text was collected before the failure (possibly ``""``) is
        returned.
    """
    page_texts = []
    try:
        # convert_from_path already yields PIL images, which pytesseract
        # accepts directly; no intermediate Image.fromarray conversion needed.
        for page in convert_from_path(file_path, 300):
            page_texts.append(pytesseract.image_to_string(page))
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(f"{page_text}\n" for page_text in page_texts)
# Streamlit UI for document translation: upload a file, preview the extracted
# text, then translate it into the selected language on demand.
st.title("📚 Multilingual Document Translator with OCR Support")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Compare case-insensitively so e.g. "REPORT.PDF" still takes the OCR path.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The upload exists only in memory, while the OCR helper needs a path
        # on disk. Write the bytes to a throwaway directory under a fixed file
        # name (never the user-supplied one) and clean up afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, "upload.pdf")
            with open(file_path, "wb") as pdf_file:
                pdf_file.write(uploaded_file.getvalue())
            text_content = extract_text_from_pdf_with_ocr(file_path)
    else:
        # NOTE(review): extract_text_from_word is not defined in the visible
        # part of this file; confirm it exists before relying on this branch.
        text_content = extract_text_from_word(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            # Pick the pipeline for the chosen language; it is None when the
            # models failed to load.
            translator = {"English": translator_en, "Urdu": translator_ur}.get(target_language)
            if translator:
                # NOTE(review): translate_text is not defined in the visible
                # part of this file; confirm it exists.
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Only warn here: there is no translated text to display, so
                # the output area must not be rendered.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")
|