Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

Multilingual_Translator-English-Urdu / app.py

tahirsher

Update app.py

5c859a7 verified 8 months ago

raw

history blame

4.37 kB

	import os
	import tempfile

	import docx
	import pdfplumber
	import PyPDF2
	import pytesseract
	import streamlit as st
	from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
	from docx.shared import Pt
	from PIL import Image
	from transformers import pipeline

	# Translation model pipeline, loaded once at import time.
	# NOTE(review): going by the model name, opus-mt-mul-en translates *into*
	# English from multiple source languages - confirm this matches the
	# direction the app is meant to offer.
	translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

	# Tesseract binary used by the OCR fallback. The default is the usual Linux
	# location; set the TESSERACT_CMD environment variable to point elsewhere
	# instead of editing this file for each environment.
	pytesseract.pytesseract.tesseract_cmd = os.environ.get(
	    "TESSERACT_CMD", "/usr/bin/tesseract"
	)


	def extract_text_and_tables(pdf_path, ocr_resolution=300):
	    """Extract structured content from a PDF, including tables and text.

	    For every page, the tables found by pdfplumber are emitted first,
	    followed by the page text split into one block per non-blank line.
	    When pdfplumber returns no text for a page, the page is rendered to an
	    image and run through Tesseract OCR instead.

	    NOTE(review): text that sits inside a table is probably also returned
	    by ``page.extract_text()``, so table content may appear a second time
	    as text blocks - confirm against a sample document.

	    Args:
	        pdf_path: Path handed to ``pdfplumber.open``.
	        ocr_resolution: DPI used to rasterise a page for the OCR fallback.
	            pdfplumber's own default (72 DPI) is too coarse for reliable
	            OCR, hence the higher default here.

	    Returns:
	        A list of dicts in document order, each either
	        ``{"type": "table", "content": <rows>}`` or
	        ``{"type": "text", "content": <stripped line>}``.
	    """
	    content_blocks = []

	    with pdfplumber.open(pdf_path) as pdf:
	        for page in pdf.pages:
	            # Tables first, in the order pdfplumber reports them.
	            content_blocks.extend(
	                {"type": "table", "content": table}
	                for table in page.extract_tables()
	            )

	            text = page.extract_text()
	            if not text:
	                # No text layer (e.g. a scanned page): fall back to OCR.
	                page_image = page.to_image(resolution=ocr_resolution)
	                text = pytesseract.image_to_string(page_image.original)

	            if not text:
	                continue

	            for line in text.splitlines():
	                line = line.strip()
	                # Blank lines carry nothing to translate, so they are
	                # dropped here rather than passed along as empty blocks.
	                if line:
	                    content_blocks.append({"type": "text", "content": line})

	    return content_blocks


	def _translate_text(text):
	    """Translate one string; return "" for None, empty or whitespace-only input."""
	    if not text or not text.strip():
	        # Nothing to translate - avoid a pointless model call.
	        return ""
	    return translator(text, max_length=400)[0]["translation_text"]


	def translate_content(content_blocks):
	    """Translate extracted content while preserving its structure.

	    Args:
	        content_blocks: Iterable of dicts with a ``"type"`` key (``"text"``
	            or ``"table"``) and a ``"content"`` key: a string for text
	            blocks, a list of rows (lists of cell strings or ``None``) for
	            table blocks.

	    Returns:
	        A new list of blocks with the same shape. Blank text blocks and
	        blocks of any other type are omitted. Table dimensions are kept;
	        blank or ``None`` cells become ``""``.
	    """
	    translated_blocks = []

	    for block in content_blocks:
	        kind = block["type"]

	        if kind == "text":
	            text = block["content"]
	            # Blank text blocks are skipped entirely, not emitted as "".
	            if text and text.strip():
	                translated_blocks.append(
	                    {"type": "text", "content": _translate_text(text)}
	                )

	        elif kind == "table":
	            translated_table = [
	                [_translate_text(cell) for cell in row]
	                for row in block["content"]
	            ]
	            translated_blocks.append(
	                {"type": "table", "content": translated_table}
	            )

	    return translated_blocks


	def create_translated_doc(translated_blocks, output_path):
	    """Generate a Word document from translated text and table blocks.

	    Args:
	        translated_blocks: Iterable of dicts with ``"type"`` (``"text"`` or
	            ``"table"``) and ``"content"`` (a string, or a list of rows of
	            cell strings). Blocks of any other type are ignored.
	        output_path: Destination path passed to ``Document.save``.
	    """
	    doc = docx.Document()

	    # Paragraphs added without an explicit style share the document's
	    # default paragraph style, so the 12 pt size is set once on that style
	    # instead of being re-applied for every paragraph.
	    doc.styles["Normal"].font.size = Pt(12)

	    for block in translated_blocks:
	        if block["type"] == "text":
	            para = doc.add_paragraph(block["content"])
	            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

	        elif block["type"] == "table":
	            rows = block["content"]
	            # Size the grid by the widest row so that ragged rows cannot
	            # index past the last column.
	            n_cols = max((len(row) for row in rows), default=0)
	            if n_cols == 0:
	                # Empty table (no rows, or only empty rows): nothing to draw.
	                continue

	            table = doc.add_table(rows=len(rows), cols=n_cols)
	            for i, row in enumerate(rows):
	                for j, cell_text in enumerate(row):
	                    # Cells may be None when untranslated blocks are passed in.
	                    table.cell(i, j).text = cell_text or ""

	    doc.save(output_path)


	# Streamlit UI
	st.title("Professional Multilingual PDF Translator")
	uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])

	# File name of the generated document; also the name offered for download.
	output_docx_path = "translated_document.docx"

	if uploaded_file is not None:
	    pdf_bytes = uploaded_file.getvalue()
	    # Identifies this upload within the session. Streamlit reruns the whole
	    # script on every interaction (including a click on the download
	    # button), so the finished document is cached in session state and the
	    # translation only runs again when a different file is uploaded.
	    upload_key = (uploaded_file.name, hash(pdf_bytes))

	    if (
	        "translated_docx" not in st.session_state
	        or st.session_state["translated_for"] != upload_key
	    ):
	        with st.spinner("Processing and translating the document..."):
	            # A private temporary directory keeps concurrent sessions from
	            # overwriting each other's files and is removed automatically,
	            # even when a step below raises.
	            with tempfile.TemporaryDirectory() as work_dir:
	                temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
	                docx_path = os.path.join(work_dir, output_docx_path)

	                with open(temp_file_path, "wb") as f:
	                    f.write(pdf_bytes)

	                # Extract content from the PDF
	                content_blocks = extract_text_and_tables(temp_file_path)

	                # Translate content
	                translated_blocks = translate_content(content_blocks)

	                # Create translated DOCX
	                create_translated_doc(translated_blocks, docx_path)

	                # Read the result into memory before the directory goes away.
	                with open(docx_path, "rb") as f:
	                    docx_bytes = f.read()

	        st.session_state["translated_docx"] = docx_bytes
	        st.session_state["translated_for"] = upload_key

	    # Provide download link for the translated document
	    st.download_button(
	        label="Download Translated Document",
	        data=st.session_state["translated_docx"],
	        file_name=output_docx_path,
	        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	    )