import os

import streamlit as st
from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
from pdf2image import convert_from_path
from PIL import Image
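
# NOTE: pdf2image requires the Poppler utilities to be installed on the system,
# and the Hugging Face models below are downloaded and cached on first use.
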
@st.cache_resource
def load_image_to_text_model():
    """Load the TrOCR processor and model once and cache them across Streamlit reruns."""
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")
    return processor, model


def extract_text_with_trocr(image, processor, model):
    """Extract text from an image using TrOCR."""
    # TrOCR expects pixel values from its image processor, not a text tokenizer.
    pixel_values = processor(images=image.convert("RGB"), return_tensors="pt").pixel_values
    outputs = model.generate(pixel_values)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


@st.cache_resource
def load_translation_models():
    """Load the Helsinki-NLP translation pipelines once and cache them."""
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


translator_en, translator_ur = load_translation_models()
processor, trocr_model = load_image_to_text_model()


def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF by rendering each page and running TrOCR on it."""
    # TrOCR is trained on single text-line crops, so accuracy on full pages can be limited.
    text = ""
    try:
        # Render the PDF pages to images at 300 DPI before OCR.
        pages = convert_from_path(file_path, 300)
        for page_image in pages:
            text += extract_text_with_trocr(page_image, processor, trocr_model) + "\n"
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return text


def translate_text(text, translator):
    """Translate text into the selected language."""
    # Split the text into fixed-size character chunks to stay within the model's input limit.
    max_chunk_size = 512
    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)


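# Streamlit user interface: upload a file, preview the extracted text, and translate it.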
st.title("Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "jpeg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # pdf2image needs a real file on disk, so persist the upload before processing it.
    file_path = f"/mnt/data/{uploaded_file.name}"
    os.makedirs("/mnt/data", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf_with_ocr(file_path)
    else:
        text_content = extract_text_with_trocr(Image.open(uploaded_file), processor, trocr_model)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            translated_text = ""
            if target_language == "English" and translator_en:
                translated_text = translate_text(text_content, translator_en)
            elif target_language == "Urdu" and translator_ur:
                translated_text = translate_text(text_content, translator_ur)
            else:
                st.warning("Translation model not loaded successfully.")

            if translated_text:
                st.text_area("Translation Output", translated_text, height=300)
        else:
            st.warning("No text found to translate. Please upload a valid document.")