Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

Multilingual_Translator-English-Urdu / app.py

tahirsher

Update app.py

40548f3 verified 5 months ago

raw

history blame

5.19 kB

	import os
	import re
	import tempfile
	import textwrap

	import fitz # PyMuPDF for PDF processing
	import pytesseract
	import streamlit as st
	from docx import Document
	from langdetect import detect
	from langdetect.lang_detect_exception import LangDetectException
	from PIL import Image
	from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration

	@st.cache_resource
	def _load_models():
	    """Load the BLIP-2 processor/model and the translation pipeline once.

	    Streamlit re-executes this script from the top on every user
	    interaction. Without caching, the BLIP-2 weights and the translation
	    pipeline would be re-created on each rerun; ``st.cache_resource`` keeps
	    a single shared copy for the lifetime of the server process.

	    Returns:
	        A ``(processor, model, translator)`` tuple.
	    """
	    blip_checkpoint = "Salesforce/blip2-opt-2.7b"
	    blip_processor = Blip2Processor.from_pretrained(blip_checkpoint)
	    blip_model = Blip2ForConditionalGeneration.from_pretrained(blip_checkpoint)
	    translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
	    return blip_processor, blip_model, translation_pipeline


	# BLIP-2 model and processor for image-to-text, plus the translation
	# pipeline, exposed under the same module-level names as before.
	processor, model, translator = _load_models()

	# Path to Tesseract executable for OCR
	pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


	def extract_text_from_image(image):
	    """Extract text from an image, preferring OCR over a BLIP-2 caption.

	    Tesseract OCR is run first because it transcribes the characters that
	    are actually in the image. BLIP-2, called without a prompt, generates a
	    short description of the picture and practically never returns an
	    empty string, so using it first would leave the OCR path unreachable.
	    The caption is kept only as a fallback when OCR finds nothing.

	    Args:
	        image: A PIL image (anything offering ``convert("RGB")``).

	    Returns:
	        The stripped OCR text, or the stripped BLIP-2 output when OCR
	        yields no text. May be an empty string.
	    """
	    image = image.convert("RGB")

	    # NOTE(review): OCR runs with Tesseract's default language; recognising
	    # Urdu script presumably needs a matching language pack and a `lang=`
	    # argument -- confirm what is installed on the host.
	    ocr_text = pytesseract.image_to_string(image).strip()
	    if ocr_text:
	        return ocr_text

	    # Fallback: let BLIP-2 generate text for the image.
	    inputs = processor(images=image, return_tensors="pt")
	    generated_ids = model.generate(**inputs)
	    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
	    return decoded[0].strip() if decoded else ""


	def extract_from_pdf(pdf_path):
	    """Extract text from a PDF, falling back to image extraction per page.

	    Each page's embedded text is used when present. A page with no
	    embedded text (for example a scan) is rendered to an RGB image and
	    passed to ``extract_text_from_image``.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The page texts joined with newlines, with surrounding whitespace
	        stripped.
	    """
	    doc = fitz.open(pdf_path)
	    try:
	        page_texts = []
	        for page in doc:
	            # Try extracting text directly
	            text = page.get_text()

	            # If no text, fall back to rendering the page as an image
	            if not text.strip():
	                # alpha=False keeps the samples at 3 bytes per pixel,
	                # which is what the "RGB" mode below expects.
	                pix = page.get_pixmap(alpha=False)
	                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	                text = extract_text_from_image(image)

	            page_texts.append(text)
	    finally:
	        # Release the file handle even if a page fails to process.
	        doc.close()

	    return "\n".join(page_texts).strip()


	def extract_from_word(docx_path):
	    """Return the text of all paragraphs in a .docx file, one per line.

	    Args:
	        docx_path: Path of the Word document to read.

	    Returns:
	        The paragraph texts joined with newlines, with leading and
	        trailing whitespace stripped.
	    """
	    paragraphs = Document(docx_path).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs).strip()


	def clean_text(text):
	    """Remove control characters from *text* without gluing words together.

	    Whitespace-like controls (tab, newline, carriage return, vertical tab,
	    form feed, NEL) separate words or lines, so each run of them is
	    replaced by a single space. Deleting them outright would merge the
	    last word of one line with the first word of the next. All remaining
	    C0/C1 control characters are dropped.

	    Args:
	        text: The raw extracted text.

	    Returns:
	        The cleaned text with leading and trailing whitespace stripped.
	    """
	    spaced = re.sub(r'[\t\n\r\x0b\x0c\x85]+', ' ', text)
	    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()


	def translate_text(text):
	    """Translate *text* into English, chunking it for the translation model.

	    The detected language is written to the Streamlit page. Text detected
	    as English is not translated. Otherwise the text is split into chunks
	    of at most 500 characters, broken at whitespace so that no word is cut
	    in half between chunks, and each chunk is sent to the translation
	    pipeline.

	    Args:
	        text: The text to translate.

	    Returns:
	        The translated text, or an explanatory message when there is
	        nothing to translate or the text is already English.
	    """
	    if not text.strip():
	        return "No text available for translation."

	    try:
	        detected_language = detect(text)
	    except LangDetectException:
	        # Raised when the text has no usable language features (for
	        # example digits or punctuation only); still attempt a translation.
	        detected_language = "unknown"
	    st.write(f"Detected language: {detected_language}")

	    if detected_language == "en":
	        return "The text is already in English."

	    # Words longer than the limit are still hard-split so that no chunk
	    # exceeds 500 characters.
	    chunks = textwrap.wrap(
	        text,
	        width=500,
	        break_long_words=True,
	        break_on_hyphens=False,
	    )

	    translated_parts = []
	    for chunk in chunks:
	        result = translator(chunk, max_length=400)
	        # Guard against an empty list as well as an unexpected shape.
	        if isinstance(result, list) and result and 'translation_text' in result[0]:
	            translated_parts.append(result[0]['translation_text'])
	    return " ".join(translated_parts).strip()


	def _wrap_to_width(text, max_width, fontname, fontsize):
	    """Greedily break *text* into lines at most *max_width* points wide."""
	    lines = []
	    for paragraph in text.splitlines() or [""]:
	        current = ""
	        for word in paragraph.split():
	            candidate = f"{current} {word}" if current else word
	            too_wide = fitz.get_text_length(
	                candidate, fontname=fontname, fontsize=fontsize
	            ) > max_width
	            if current and too_wide:
	                lines.append(current)
	                current = word
	            else:
	                # A single word wider than the line is kept whole.
	                current = candidate
	        lines.append(current)
	    return lines


	def create_pdf(translated_text, output_path):
	    """Write *translated_text* to a PDF at *output_path*, adding pages as needed.

	    ``insert_textbox`` inserts nothing at all when the text does not fit
	    the rectangle, so a long translation placed in a single box produced a
	    blank page. The text is therefore wrapped to the text-area width and
	    written line by line, starting a new page whenever one is full.

	    Args:
	        translated_text: The text to place in the document.
	        output_path: Destination path of the PDF file.
	    """
	    fontname = "helv"
	    fontsize = 12

	    # Text area on each page (same margins as the original single box).
	    rect = fitz.Rect(50, 50, 550, 750)

	    # Conservative line-height estimate so a page's lines stay within rect.
	    line_height = fontsize * 1.4
	    lines_per_page = max(1, int(rect.height // line_height))

	    lines = _wrap_to_width(translated_text, rect.width, fontname, fontsize)

	    doc = fitz.open()
	    try:
	        # `lines` always holds at least one entry, so at least one page is
	        # created and the document can be saved.
	        for start in range(0, len(lines), lines_per_page):
	            page = doc.new_page()
	            page.insert_text(
	                (rect.x0, rect.y0 + fontsize),
	                "\n".join(lines[start:start + lines_per_page]),
	                fontsize=fontsize,
	                fontname=fontname,
	                color=(0, 0, 0),
	            )
	        doc.save(output_path)
	    finally:
	        doc.close()


	st.title("Multilingual Document Translator")
	uploaded_file = st.file_uploader(
	    "Upload a document (PDF, Word, or Image)",
	    type=["pdf", "docx", "jpg", "jpeg", "png"],
	)

	if uploaded_file is not None:
	    # Work inside a private temporary directory: fixed file names in the
	    # working directory would be shared, and overwritten, by concurrent
	    # sessions. The directory and its contents are removed when the block
	    # exits, including when st.stop() or an error interrupts it.
	    with st.spinner("Processing document..."), tempfile.TemporaryDirectory() as work_dir:
	        file_extension = uploaded_file.name.split(".")[-1].lower()

	        # Validate before the extension is used to build a file path.
	        if file_extension not in ("pdf", "docx", "jpg", "jpeg", "png"):
	            st.error("Unsupported file format.")
	            st.stop()

	        temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
	        with open(temp_file_path, "wb") as f:
	            f.write(uploaded_file.getbuffer())

	        if file_extension == "pdf":
	            extracted_text = extract_from_pdf(temp_file_path)
	        elif file_extension == "docx":
	            extracted_text = extract_from_word(temp_file_path)
	        else:
	            # Close the image file once the text has been extracted.
	            with Image.open(temp_file_path) as image:
	                extracted_text = extract_text_from_image(image)

	        extracted_text = clean_text(extracted_text)
	        st.write("Extracted Text (First 500 characters):", extracted_text[:500])

	        translated_text = translate_text(extracted_text)

	        st.subheader("Translated Text (English)")
	        st.write(translated_text)

	        if translated_text.strip():
	            output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
	            create_pdf(translated_text, output_pdf_path)

	            # Read the bytes now; the temporary file is deleted when this
	            # block ends.
	            with open(output_pdf_path, "rb") as f:
	                pdf_bytes = f.read()

	            st.download_button(
	                label="Download Translated PDF",
	                data=pdf_bytes,
	                file_name="translated_document.pdf",
	                mime="application/pdf",
	            )
	        else:
	            st.warning("No content to save in the translated PDF.")