File size: 3,712 Bytes
87fcfea
 
add3a0f
7037128
 
b515916
87fcfea
04f9dd5
add3a0f
 
db19b48
add3a0f
 
ade99c4
87fcfea
7037128
add3a0f
 
ade99c4
87fcfea
7037128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f53330e
87fcfea
 
 
 
 
 
 
 
 
7037128
add3a0f
87fcfea
 
 
 
 
 
 
 
ade99c4
7037128
87fcfea
 
ade99c4
7037128
87fcfea
7037128
87fcfea
 
 
7037128
87fcfea
 
 
 
 
 
 
 
 
 
 
 
 
7037128
87fcfea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import io
import os
import tempfile
import textwrap

import fitz  # PyMuPDF for PDF processing
import streamlit as st
from docx import Document  # For Word document processing
from PIL import Image  # For image processing
from transformers import pipeline

# Streamlit re-executes this script from the top on every user interaction
# (including a click on the download button). Building the pipelines inside a
# cached factory means the models are constructed once and then reused,
# instead of being rebuilt on every rerun.
@st.cache_resource
def _load_pipelines():
    """Build the OCR and translation pipelines.

    Returns:
        A ``(trocr_pipeline, translator)`` tuple:
        the TrOCR image-to-text pipeline (smaller "base printed" model) and
        the multilingual-to-English translation pipeline (smaller model).
    """
    ocr = pipeline("image-to-text", model="microsoft/trocr-base-printed")
    mul_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
    return ocr, mul_to_en


# Module-level names used by the extraction and translation helpers.
trocr_pipeline, translator = _load_pipelines()

# OCR helper: run the TrOCR pipeline on a single image
def extract_text_from_image(image):
    """Return the text the TrOCR pipeline generates for *image*.

    Args:
        image: The image to recognise; it is passed to the module-level
            ``trocr_pipeline`` unchanged.

    Returns:
        The ``generated_text`` field of the first prediction.
    """
    predictions = trocr_pipeline(image)
    first_prediction = predictions[0]
    return first_prediction["generated_text"]

# Function to extract text from a PDF
def extract_from_pdf(pdf_path):
    """Extract OCR text from embedded images plus the text layer of a PDF.

    For every page, each embedded image is run through
    ``extract_text_from_image`` first, then the page's own text is appended.
    Every piece is terminated by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        All extracted pieces concatenated into a single string, in page order.
    """
    pieces = []
    # The context manager closes the document even if OCR or decoding fails,
    # so the file handle is never left open.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first entry of the image tuple is its xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    pieces.append(extract_text_from_image(image))
            pieces.append(page.get_text())
    # Single join instead of repeated string concatenation.
    return "".join(f"{piece}\n" for piece in pieces)

# Word (.docx) helper: collect the text of every paragraph
def extract_from_word(docx_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The text of each entry in the document's ``paragraphs``, each
        followed by a newline, concatenated in order.
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

# Function to translate text to English
def translate_text(text, max_chars=400):
    """Translate *text* to English, one bounded segment at a time.

    Sending a whole document to the translator in a single call overflows the
    model's input window for anything longer than a short paragraph. The text
    is therefore split into lines, lines longer than *max_chars* are wrapped
    at word boundaries, and each segment is translated separately.
    ``truncation=True`` is passed as a backstop in case a segment is still
    too long once tokenised.

    Args:
        text: Source text; may span multiple lines.
        max_chars: Maximum number of characters per translated segment.
            NOTE(review): 400 is a conservative guess relative to the model's
            token limit — confirm for scripts with dense tokenisation.

    Returns:
        The translated text with one output line per non-blank input line, or
        an empty string when *text* contains nothing to translate.
    """
    if not text:
        return ""

    translated_lines = []
    for line in text.splitlines():
        # wrap() collapses whitespace and returns [] for blank lines, so
        # blank lines are skipped without calling the model.
        segments = textwrap.wrap(line, width=max_chars)
        if not segments:
            continue
        translated_lines.append(
            " ".join(
                translator(segment, max_length=400, truncation=True)[0][
                    "translation_text"
                ]
                for segment in segments
            )
        )
    return "\n".join(translated_lines)

# Helper: break text into lines that fit a given width in the PDF font
def _wrap_for_pdf(text, max_width, fontsize, fontname="helv"):
    """Yield lines of *text*, each at most *max_width* points wide.

    Existing line breaks are kept (blank lines are yielded as ``""``); longer
    lines are wrapped at word boundaries, and a single word wider than the
    line is split by characters.
    """

    def width(fragment):
        return fitz.get_text_length(fragment, fontname=fontname, fontsize=fontsize)

    for paragraph in text.splitlines():
        current = ""
        for word in paragraph.split():
            candidate = f"{current} {word}" if current else word
            if width(candidate) <= max_width:
                current = candidate
                continue
            if current:
                yield current
            # The word alone does not fit: peel off the longest prefix that does.
            while len(word) > 1 and width(word) > max_width:
                cut = len(word) - 1
                while cut > 1 and width(word[:cut]) > max_width:
                    cut -= 1
                yield word[:cut]
                word = word[cut:]
            current = word
        yield current


# Function to create a PDF from translated text
def create_pdf(translated_text, output_path, fontsize=12, margin=50):
    """Write *translated_text* to a PDF, wrapping lines and adding pages.

    A single ``insert_text`` call neither wraps long lines nor starts a new
    page, so anything beyond the first page area would be lost. Here the text
    is wrapped to the printable width and a new page is started whenever the
    next baseline would fall below the bottom margin.

    Args:
        translated_text: Text to render.
        output_path: Destination path of the PDF file.
        fontsize: Font size in points.
        margin: Page margin in points; also the first baseline, so the
            defaults start the text at (50, 50).
    """
    line_height = fontsize * 1.2
    # The context manager closes the document even if saving fails.
    with fitz.open() as doc:
        page = doc.new_page()  # always at least one page, even for empty text
        max_width = page.rect.width - 2 * margin
        bottom = page.rect.height - margin
        y = margin
        for line in _wrap_for_pdf(translated_text, max_width, fontsize):
            if y > bottom:
                page = doc.new_page()
                y = margin
            if line:
                page.insert_text(
                    (margin, y), line, fontsize=fontsize, fontname="helv"
                )
            y += line_height
        doc.save(output_path)

# Streamlit UI
st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])

if uploaded_file is not None:
    with st.spinner("Processing document..."):
        file_extension = uploaded_file.name.split(".")[-1].lower()

        # A private temporary directory per run: concurrent sessions cannot
        # overwrite each other's files, and everything is removed on exit
        # even when a step raises or st.stop() aborts the run.
        with tempfile.TemporaryDirectory() as work_dir:
            # Save the uploaded file temporarily
            temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract text based on file type
            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension in ["jpg", "jpeg", "png"]:
                # Close the image before the directory is removed.
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)
            elif file_extension == "docx":
                extracted_text = extract_from_word(temp_file_path)
            else:
                st.error("Unsupported file format.")
                st.stop()

            # Nothing to translate: report it instead of producing an empty PDF.
            if not extracted_text.strip():
                st.warning("No text could be extracted from the document.")
                st.stop()

            # Translate the extracted text
            translated_text = translate_text(extracted_text)

            # Display the translated text
            st.subheader("Translated Text (English)")
            st.write(translated_text)

            # Create a PDF from the translated text
            output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
            create_pdf(translated_text, output_pdf_path)

            # Read the PDF into memory so the download button does not depend
            # on a file that is deleted when the temporary directory closes.
            with open(output_pdf_path, "rb") as f:
                pdf_bytes = f.read()

        # Provide a download link for the translated PDF
        st.download_button(
            label="Download Translated PDF",
            data=pdf_bytes,
            file_name="translated_document.pdf",
            mime="application/pdf"
        )