Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

Multilingual_Translator-English-Urdu / app.py

tahirsher

Update app.py

1a131d7 verified 5 months ago

raw

history blame

2.68 kB

	import io
	import os
	import tempfile
	import textwrap

	import fitz  # PyMuPDF
	import pytesseract
	import streamlit as st
	from PIL import Image
	from transformers import pipeline

	# Set up the translation pipelines with error handling.
	# Streamlit re-executes this script on every user interaction, so the
	# pipelines are built through a cached loader instead of being constructed
	# directly at module level on each rerun.
	@st.cache_resource
	def _load_translation_pipeline(model_name):
	    """Build the Hugging Face translation pipeline for *model_name*."""
	    return pipeline("translation", model=model_name)


	try:
	    translator_to_english = _load_translation_pipeline("Helsinki-NLP/opus-mt-mul-en")
	except Exception as e:  # app start-up boundary: report the failure, keep the UI alive
	    st.error(f"Failed to load English translation model: {e}")
	    translator_to_english = None

	try:
	    translator_to_urdu = _load_translation_pipeline("Helsinki-NLP/opus-mt-en-ur")
	except Exception as e:  # app start-up boundary: report the failure, keep the UI alive
	    st.error(f"Failed to load Urdu translation model: {e}")
	    translator_to_urdu = None

	# Function to extract text from an image using OCR
	def extract_text_from_image(image):
	    """Run Tesseract OCR on *image* and return the recognized text.

	    The English and Urdu language packs are requested together
	    (``lang='eng+urd'``).
	    """
	    return pytesseract.image_to_string(image, lang='eng+urd')

	# Function to extract images and text from a PDF
	def extract_from_pdf(pdf_path):
	    """Return the OCR and text-layer content of the PDF at *pdf_path*.

	    For each page, every embedded image is passed through
	    ``extract_text_from_image`` first, then the page's own text layer is
	    appended. Every piece is terminated by a newline.

	    Args:
	        pdf_path: Path of the PDF file, handed to ``fitz.open``.

	    Returns:
	        All extracted pieces concatenated into a single string.
	    """
	    pieces = []
	    # The context manager closes the document even if OCR or extraction
	    # raises, so the file handle is released before the caller deletes the file.
	    with fitz.open(pdf_path) as doc:
	        for page in doc:
	            for image_info in page.get_images(full=True):
	                xref = image_info[0]  # first entry of an image record is its xref
	                image_bytes = doc.extract_image(xref)["image"]
	                image = Image.open(io.BytesIO(image_bytes))
	                pieces.append(extract_text_from_image(image))
	            pieces.append(page.get_text())
	    # Join once at the end instead of growing a string with repeated `+=`.
	    return "".join(f"{piece}\n" for piece in pieces)

	# Function to translate text to English and Urdu
	# Character budget per translation request. This is a heuristic meant to keep
	# each chunk within the models' input limit; `truncation=True` below is the
	# safety net for chunks that still tokenize too long.
	_MAX_CHUNK_CHARS = 500


	def _translate_in_chunks(translator, text):
	    """Translate *text* piece by piece with *translator* and rejoin the results."""
	    # textwrap.wrap collapses whitespace and splits on word boundaries.
	    chunks = textwrap.wrap(text, width=_MAX_CHUNK_CHARS)
	    return " ".join(
	        translator(chunk, max_length=400, truncation=True)[0]['translation_text']
	        for chunk in chunks
	    )


	def translate_text(text):
	    """Translate *text* to English and to Urdu.

	    The text is split into bounded chunks so that long documents are not sent
	    to the models in a single over-long request.

	    Args:
	        text: The text to translate. Empty or whitespace-only input yields
	            empty translations without invoking any model.

	    Returns:
	        A ``(english_translation, urdu_translation)`` tuple. An entry is ``""``
	        when the corresponding pipeline failed to load (is ``None``).
	    """
	    if not text or not text.strip():
	        return "", ""

	    english_translation = ""
	    urdu_translation = ""
	    if translator_to_english is not None:
	        english_translation = _translate_in_chunks(translator_to_english, text)
	    # NOTE(review): the en->ur model receives the original text, not the
	    # English translation; confirm this is intended for non-English input.
	    if translator_to_urdu is not None:
	        urdu_translation = _translate_in_chunks(translator_to_urdu, text)
	    return english_translation, urdu_translation

	# Streamlit UI
	st.title("PDF Document Translator")
	uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

	if uploaded_file is not None:
	    # Write the upload to a uniquely named temporary file: a fixed name such
	    # as "temp.pdf" would be shared (and overwritten) by concurrent sessions.
	    # delete=False lets the file be reopened by path after this handle closes.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
	        temp_pdf.write(uploaded_file.getbuffer())
	        temp_pdf_path = temp_pdf.name

	    try:
	        with st.spinner("Processing PDF..."):
	            # Extract text from the PDF
	            extracted_text = extract_from_pdf(temp_pdf_path)

	            # Translate the extracted text
	            english_translation, urdu_translation = translate_text(extracted_text)
	    finally:
	        # Clean up the temporary file even when extraction or translation fails.
	        os.remove(temp_pdf_path)

	    # Display the translations
	    st.subheader("English Translation")
	    st.write(english_translation)

	    st.subheader("Urdu Translation")
	    st.write(urdu_translation)