|
import io
import os
import tempfile

import fitz
import streamlit as st
from docx import Document
from PIL import Image
from transformers import pipeline
|
|
|
|
|
@st.cache_resource
def _load_trocr_pipeline():
    """Build the TrOCR image-to-text pipeline used for OCR."""
    return pipeline("image-to-text", model="microsoft/trocr-base-printed")


# Streamlit re-executes the script on every interaction; going through the
# cached loader avoids re-instantiating the model on each rerun.
trocr_pipeline = _load_trocr_pipeline()
|
|
|
|
|
@st.cache_resource
def _load_translator():
    """Build the multilingual-to-English translation pipeline."""
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


# Streamlit re-executes the script on every interaction; going through the
# cached loader avoids re-instantiating the model on each rerun.
translator = _load_translator()
|
|
|
|
|
def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognized text.

    Args:
        image: Input handed straight to ``trocr_pipeline`` (callers in this
            file pass a PIL image).

    Returns:
        The ``generated_text`` field of the first prediction.
    """
    predictions = trocr_pipeline(image)
    first_prediction = predictions[0]
    return first_prediction['generated_text']
|
|
|
|
|
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, including OCR of its embedded images.

    For every page, each embedded image is run through
    ``extract_text_from_image`` first, then the page's own text layer is
    appended. Every piece is terminated by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages as a single string.
    """
    pieces = []
    # Closing the document releases the file handle so the caller can delete
    # the file afterwards (an open handle blocks deletion on Windows).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for image_info in page.get_images(full=True):
                xref = image_info[0]  # first item of each entry is the image xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    pieces.append(extract_text_from_image(image))
            pieces.append(page.get_text())
    return "".join(f"{piece}\n" for piece in pieces)
|
|
|
|
|
def extract_from_word(docx_path):
    """Return the text of every paragraph in a .docx file.

    Args:
        docx_path: Path of the Word document to read.

    Returns:
        All paragraph texts in document order, each followed by a newline.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)
|
|
|
|
|
def _split_into_chunks(text, limit):
    """Split ``text`` into pieces of at most ``limit`` characters.

    Whole lines are packed greedily into a chunk; a single line longer than
    ``limit`` is cut at the last space before the limit (or hard-cut when it
    contains no space).
    """
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        while len(line) > limit:
            if current:
                chunks.append(current)
                current = ""
            cut = line.rfind(" ", 0, limit)
            if cut <= 0:
                cut = limit
            chunks.append(line[:cut])
            line = line[cut:].lstrip(" ")
        if current and len(current) + len(line) > limit:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, max_chunk_chars=400):
    """Translate ``text`` to English with the module-level ``translator``.

    The text is translated in chunks rather than as one input, so long
    documents are neither rejected nor cut off at the ``max_length=400``
    generation cap. Text that fits in one chunk is sent as a single input.
    NOTE(review): the 400-character default is meant to stay below the
    model's input limit (512 tokens per the opus-mt model card) - confirm.

    Args:
        text: Source text in any language the model supports.
        max_chunk_chars: Maximum number of characters sent to the model in
            one call. Must be at least 1.

    Returns:
        The translated chunks joined by newlines, or ``""`` when ``text`` is
        empty or contains only whitespace.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1.
    """
    if max_chunk_chars < 1:
        raise ValueError("max_chunk_chars must be at least 1")
    if not text or not text.strip():
        return ""

    translated_chunks = []
    for chunk in _split_into_chunks(text, max_chunk_chars):
        if not chunk.strip():
            continue  # nothing to translate in a whitespace-only chunk
        result = translator(chunk, max_length=400, truncation=True)
        translated_chunks.append(result[0]['translation_text'])
    return "\n".join(translated_chunks)
|
|
|
|
|
def _wrap_line_to_width(line, max_width, fontname, fontsize):
    """Greedily wrap one line of text so each piece fits in ``max_width`` points.

    Returns a list with at least one element, so an empty input line is kept
    as a blank line. A single word wider than ``max_width`` is not broken.
    """
    wrapped = []
    current = ""
    for word in line.split(" "):
        candidate = f"{current} {word}" if current else word
        too_wide = (
            fitz.get_text_length(candidate, fontname=fontname, fontsize=fontsize)
            > max_width
        )
        if current and too_wide:
            wrapped.append(current)
            current = word
        else:
            current = candidate
    wrapped.append(current)
    return wrapped


def create_pdf(translated_text, output_path, fontsize=12, margin=50, fontname="helv"):
    """Write ``translated_text`` to a PDF file, wrapping and paginating it.

    Lines are wrapped to the printable width and new pages are added when the
    bottom margin is reached, so long text is not drawn outside the page.

    Args:
        translated_text: Text to render; ``None`` or ``""`` yields one blank page.
        output_path: Destination path of the PDF.
        fontsize: Font size in points.
        margin: Page margin in points; the first baseline is at ``(margin, margin)``.
        fontname: Name of the font passed to PyMuPDF.
    """
    line_height = fontsize * 1.2
    with fitz.open() as doc:
        # Always create one page: a document without pages cannot be saved.
        page = doc.new_page()
        max_width = page.rect.width - 2 * margin
        bottom = page.rect.height - margin
        y = margin
        for raw_line in (translated_text or "").splitlines():
            for line in _wrap_line_to_width(raw_line, max_width, fontname, fontsize):
                if y > bottom:
                    page = doc.new_page()
                    y = margin
                if line:
                    page.insert_text(
                        (margin, y), line, fontsize=fontsize, fontname=fontname
                    )
                y += line_height
        doc.save(output_path)
|
|
|
|
|
# Streamlit page: upload a document, extract its text, translate it to English,
# show the translation and offer it as a downloadable PDF.
st.title("Multilingual Document Translator")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, Word, or Image)",
    type=["pdf", "docx", "jpg", "jpeg", "png"],
)

if uploaded_file is not None:
    file_extension = uploaded_file.name.split(".")[-1].lower()

    # Validate before the extension is used to build a file path.
    if file_extension not in ("pdf", "docx", "jpg", "jpeg", "png"):
        st.error("Unsupported file format.")
        st.stop()

    # A per-run temporary directory keeps concurrent sessions from overwriting
    # each other's files and is removed even when processing fails or stops.
    with st.spinner("Processing document..."), tempfile.TemporaryDirectory() as work_dir:
        temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        if file_extension == "pdf":
            extracted_text = extract_from_pdf(temp_file_path)
        elif file_extension == "docx":
            extracted_text = extract_from_word(temp_file_path)
        else:
            with Image.open(temp_file_path) as image:
                extracted_text = extract_text_from_image(image)

        if not extracted_text.strip():
            st.warning("No text could be extracted from the document.")
            st.stop()

        translated_text = translate_text(extracted_text)

        output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
        create_pdf(translated_text, output_pdf_path)
        # Read the PDF into memory so it outlives the temporary directory.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.subheader("Translated Text (English)")
    st.write(translated_text)

    st.download_button(
        label="Download Translated PDF",
        data=pdf_bytes,
        file_name="translated_document.pdf",
        mime="application/pdf",
    )