File size: 3,712 Bytes
87fcfea add3a0f 7037128 b515916 87fcfea 04f9dd5 add3a0f db19b48 add3a0f ade99c4 87fcfea 7037128 add3a0f ade99c4 87fcfea 7037128 f53330e 87fcfea 7037128 add3a0f 87fcfea ade99c4 7037128 87fcfea ade99c4 7037128 87fcfea 7037128 87fcfea 7037128 87fcfea 7037128 87fcfea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import io
import os
import tempfile
import textwrap

import fitz  # PyMuPDF for PDF processing
import streamlit as st
from docx import Document  # For Word document processing
from PIL import Image  # For image processing
from transformers import pipeline
# Model checkpoints used by the app (both chosen as the smaller variants).
TROCR_MODEL_NAME = "microsoft/trocr-base-printed"
TRANSLATION_MODEL_NAME = "Helsinki-NLP/opus-mt-mul-en"


@st.cache_resource
def _load_pipelines():
    """Build the OCR and translation pipelines once and reuse them.

    Streamlit re-executes the script on every widget interaction; caching
    the pipelines as a shared resource avoids reloading both models on each
    rerun.

    Returns:
        A ``(image_to_text_pipeline, translation_pipeline)`` tuple.
    """
    ocr = pipeline("image-to-text", model=TROCR_MODEL_NAME)
    mt = pipeline("translation", model=TRANSLATION_MODEL_NAME)
    return ocr, mt


# Module-level names kept as-is because the helper functions reference them.
trocr_pipeline, translator = _load_pipelines()
def extract_text_from_image(image):
    """Run the TrOCR pipeline on an image and return the recognised text.

    Args:
        image: Image handed straight to ``trocr_pipeline``.

    Returns:
        The ``generated_text`` entry of the pipeline's first result.
    """
    return trocr_pipeline(image)[0]["generated_text"]
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, running OCR on the images embedded in it.

    For each page, every embedded image is decoded and passed through
    ``extract_text_from_image``; the page's own text layer is appended after
    the OCR output of that page's images.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        All collected text fragments, each followed by a newline.
    """
    fragments = []
    # The context managers release the document and each decoded image even
    # when OCR fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first item of the image tuple is its xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    fragments.append(extract_text_from_image(image))
            fragments.append(page.get_text())
    return "".join(f"{fragment}\n" for fragment in fragments)
def extract_from_word(docx_path):
    """Return the text of every paragraph in a Word document.

    Args:
        docx_path: Path of the ``.docx`` file to load.

    Returns:
        The paragraph texts in document order, each followed by a newline.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)
def _split_into_chunks(line, max_chars):
    """Split one line into whitespace-delimited pieces of at most *max_chars*."""
    chunks = []
    current = ""
    for word in line.split():
        # A single token longer than the budget (e.g. text written without
        # spaces) is hard-sliced so no chunk ever exceeds the limit.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, *, max_chunk_chars=400):
    """Translate text to English, feeding the model bounded-size chunks.

    Sending a whole document in one call overruns the translation model's
    input window and caps the output at ``max_length`` tokens, so long
    documents fail or come back truncated. The text is therefore translated
    line by line, with long lines split on whitespace.
    NOTE(review): the exact token limit depends on the model config (512 is
    typical for OPUS-MT checkpoints) - confirm that 400 characters stays
    below it for the expected source languages.

    Args:
        text: Source text; may be empty or ``None``.
        max_chunk_chars: Upper bound on the characters sent per model call.

    Returns:
        The translated lines joined by newlines; ``""`` when there is
        nothing to translate. Blank lines in the input are dropped.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1.
    """
    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be at least 1")
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return ""

    translated_lines = []
    for line in text.splitlines():
        chunks = _split_into_chunks(line, max_chunk_chars)
        if not chunks:
            continue  # blank line
        translated_lines.append(
            " ".join(
                # truncation=True is a safety net for chunks that still
                # tokenize to more than the model accepts.
                translator(chunk, max_length=400, truncation=True)[0]["translation_text"]
                for chunk in chunks
            )
        )
    return "\n".join(translated_lines)
def create_pdf(translated_text, output_path):
    """Write text to a new PDF, wrapping long lines and adding pages as needed.

    A single ``insert_text`` call neither wraps nor paginates, so anything
    beyond the first page's width or height would be lost. Each wrapped line
    is therefore placed individually and a new page is started when the
    bottom margin is reached.

    Args:
        translated_text: Text to render; existing line breaks are kept.
        output_path: Destination path of the PDF file.
    """
    margin = 50
    font_size = 12
    line_height = font_size * 1.2

    with fitz.open() as doc:  # closes the document once it has been saved
        page = doc.new_page()
        usable_width = page.rect.width - 2 * margin
        # Heuristic: assumes an average Helvetica glyph is about 0.55 em wide.
        # NOTE(review): unusually wide text (e.g. all capitals) may still run
        # into the right margin - measure text width if that matters.
        chars_per_line = max(1, int(usable_width / (font_size * 0.55)))

        lines = []
        for paragraph in translated_text.splitlines():
            # ``wrap`` returns [] for blank input; keep the blank line.
            lines.extend(textwrap.wrap(paragraph, width=chars_per_line) or [""])

        y = margin  # baseline of the first line, as in the single-call layout
        for line in lines:
            if y > page.rect.height - margin:
                page = doc.new_page()
                y = margin
            if line:
                page.insert_text(
                    (margin, y), line, fontsize=font_size, fontname="helv"
                )
            y += line_height

        doc.save(output_path)
# Streamlit UI: upload a document, extract its text, translate it to English
# and offer the translation as a downloadable PDF.
st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader(
    "Upload a document (PDF, Word, or Image)",
    type=["pdf", "docx", "jpg", "jpeg", "png"],
)

if uploaded_file is not None:
    file_extension = os.path.splitext(uploaded_file.name)[1].lstrip(".").lower()

    # A private temporary directory keeps concurrent sessions from
    # overwriting each other's files and is removed on every exit path,
    # including exceptions and st.stop().
    with tempfile.TemporaryDirectory() as work_dir:
        temp_file_path = os.path.join(work_dir, f"upload.{file_extension}")
        output_pdf_path = os.path.join(work_dir, "translated_document.pdf")

        with st.spinner("Processing document..."):
            # Save the upload to disk because the extractors take file paths.
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract text based on file type.
            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension in ("jpg", "jpeg", "png"):
                # Close the image so the temp file can always be deleted.
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)
            elif file_extension == "docx":
                extracted_text = extract_from_word(temp_file_path)
            else:
                st.error("Unsupported file format.")
                st.stop()

            if not extracted_text.strip():
                st.warning("No text could be extracted from the document.")
                st.stop()

            translated_text = translate_text(extracted_text)

            # Build the PDF and read it into memory so the download button
            # does not depend on a file that is about to be deleted.
            create_pdf(translated_text, output_pdf_path)
            with open(output_pdf_path, "rb") as f:
                pdf_bytes = f.read()

    # Display the translated text.
    st.subheader("Translated Text (English)")
    st.write(translated_text)

    # Provide a download button for the translated PDF.
    st.download_button(
        label="Download Translated PDF",
        data=pdf_bytes,
        file_name="translated_document.pdf",
        mime="application/pdf",
    )