File size: 5,186 Bytes
b57bd69
5bb4750
40548f3
3eaf646
7037128
 
63f5b6d
5bb4750
 
04f9dd5
40548f3
3eaf646
 
e9de34a
40548f3
add3a0f
ade99c4
40548f3
 
 
b57bd69
7037128
40548f3
 
e9de34a
3eaf646
 
 
e9de34a
40548f3
 
 
 
e9de34a
ade99c4
b57bd69
7037128
40548f3
7037128
 
40548f3
7037128
 
40548f3
 
 
 
 
 
 
 
 
 
 
63f5b6d
f53330e
b57bd69
87fcfea
 
 
 
 
63f5b6d
 
b57bd69
63f5b6d
 
87fcfea
b57bd69
7037128
5bb4750
 
 
 
 
 
 
 
 
e9de34a
b0b875d
 
5bb4750
 
 
b0b875d
87fcfea
b57bd69
87fcfea
 
 
b57bd69
d2195da
 
b57bd69
d2195da
 
 
 
 
 
 
87fcfea
ade99c4
b57bd69
87fcfea
 
ade99c4
7037128
87fcfea
 
 
 
7037128
87fcfea
63f5b6d
 
 
 
 
 
 
 
 
 
 
 
 
5bb4750
63f5b6d
 
 
 
 
 
b57bd69
 
 
 
 
 
 
 
 
 
 
 
 
63f5b6d
 
 
b57bd69
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import re
import tempfile
import textwrap

import fitz  # PyMuPDF for PDF processing
import pytesseract
import streamlit as st
from docx import Document
from langdetect import LangDetectException
from langdetect import detect
from PIL import Image
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration

# Streamlit re-executes this script from the top on every widget interaction.
# Loading the models through cached factories keeps a single copy per server
# process instead of rebuilding them on each rerun.
# NOTE(review): st.cache_resource needs Streamlit >= 1.18 -- confirm the
# deployed version.
BLIP2_MODEL_ID = "Salesforce/blip2-opt-2.7b"
TRANSLATION_MODEL_ID = "Helsinki-NLP/opus-mt-mul-en"


@st.cache_resource
def _load_blip2():
    """Load and cache the BLIP-2 processor and model used for image-to-text."""
    blip_processor = Blip2Processor.from_pretrained(BLIP2_MODEL_ID)
    blip_model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_MODEL_ID)
    return blip_processor, blip_model


@st.cache_resource
def _load_translator():
    """Load and cache the translation pipeline."""
    return pipeline("translation", model=TRANSLATION_MODEL_ID)


# Module-level names kept as-is: the functions below read them as globals.
processor, model = _load_blip2()
translator = _load_translator()

# Path to Tesseract executable for OCR
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def extract_text_from_image(image):
    """Extract text from an image, preferring OCR and falling back to BLIP-2.

    Tesseract is tried first because it transcribes the characters that are
    actually on the page. BLIP-2 generation is only used when OCR finds
    nothing (or the Tesseract binary is unavailable). Previously BLIP-2 ran
    first; since generation practically always yields a non-empty string,
    the OCR branch was effectively unreachable.

    Args:
        image: A PIL image; it is converted to RGB before processing.

    Returns:
        The stripped OCR text if any was found, otherwise the stripped
        BLIP-2 output (which may be an empty string).
    """
    image = image.convert("RGB")

    try:
        ocr_text = pytesseract.image_to_string(image).strip()
    except pytesseract.TesseractNotFoundError:
        # Missing Tesseract binary: degrade to the BLIP-2 path rather than
        # failing on every image.
        ocr_text = ""
    if ocr_text:
        return ocr_text

    # Fallback: let BLIP-2 generate text from the image.
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return decoded_text.strip()


def extract_from_pdf(pdf_path):
    """Extract text from a PDF, using image extraction for pages without text.

    Each page's embedded text is read directly. A page whose direct text is
    empty or whitespace-only is rendered to a pixmap and passed to
    ``extract_text_from_image`` instead.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The per-page texts joined with newlines, stripped of leading and
        trailing whitespace.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page in doc:
            # Try extracting text directly
            text = page.get_text()

            # If no text, fall back to rendering the page as an image
            if not text.strip():
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = extract_text_from_image(image)

            page_texts.append(text)
    finally:
        # Always release the file handle, even if extraction fails midway.
        doc.close()

    return "\n".join(page_texts).strip()


def extract_from_word(docx_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        docx_path: Path of the Word document to open.

    Returns:
        The text of every paragraph joined with newlines, stripped of
        leading and trailing whitespace.
    """
    paragraphs = Document(docx_path).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined.strip()


# Carriage returns, vertical tab, form feed and NEL all act as line breaks.
_LINE_BREAK_RE = re.compile(r'\r\n?|[\x0b\x0c\x85]')
# Remaining C0/C1 control characters, excluding tab (\x09) and newline (\x0a).
_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0e-\x1f\x7f-\x9f]')


def clean_text(text):
    """Remove non-printable control characters while keeping line structure.

    Line-break style characters are normalised to ``\\n`` and tabs are kept,
    so words on adjacent lines are no longer fused together (the previous
    pattern deleted ``\\n``, ``\\t`` and ``\\r`` along with the other control
    characters, turning "hello\\nworld" into "helloworld").

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    normalised = _LINE_BREAK_RE.sub("\n", text)
    return _CONTROL_CHAR_RE.sub("", normalised).strip()


def translate_text(text):
    """Translate *text* to English with the module-level translation pipeline.

    The detected language is reported via ``st.write``. The text is split
    into chunks of at most 500 characters on whitespace boundaries, so words
    are not cut in half, and each chunk is translated separately.

    Args:
        text: The text to translate.

    Returns:
        The translated chunks joined by single spaces. Two fixed messages
        are returned instead when there is nothing to translate: one for
        empty/whitespace-only input and one when the text is detected as
        English.
    """
    if not text.strip():
        return "No text available for translation."

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); attempt the translation anyway.
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")

    if detected_language == "en":
        return "The text is already in English."

    # textwrap breaks on whitespace and hard-splits only words longer than
    # the width, so scripts without spaces are still chunked.
    chunks = textwrap.wrap(text, width=500)

    translated_parts = []
    for chunk in chunks:
        # truncation=True guards against a chunk that tokenises to more
        # tokens than the model accepts.
        result = translator(chunk, max_length=400, truncation=True)
        if isinstance(result, list) and result and 'translation_text' in result[0]:
            translated_parts.append(result[0]['translation_text'])
    return " ".join(translated_parts).strip()


def _wrap_for_pdf(text, max_width, fontname, fontsize):
    """Break *text* into lines no wider than *max_width* points in the font."""

    def measure(fragment):
        return fitz.get_text_length(fragment, fontname=fontname, fontsize=fontsize)

    wrapped = []
    for paragraph in text.splitlines():
        current = ""
        for word in paragraph.split():
            candidate = f"{current} {word}" if current else word
            if measure(candidate) <= max_width:
                current = candidate
                continue
            if current:
                wrapped.append(current)
            current = word
            # A single word wider than the line is split at the longest
            # prefix that still fits.
            while len(current) > 1 and measure(current) > max_width:
                cut = 1
                while measure(current[:cut + 1]) <= max_width:
                    cut += 1
                wrapped.append(current[:cut])
                current = current[cut:]
        # Appended even when empty so blank lines between paragraphs survive.
        wrapped.append(current)
    return wrapped


def create_pdf(translated_text, output_path):
    """Write *translated_text* to a PDF at *output_path*, adding pages as needed.

    The text is wrapped to the width of the text area and laid out line by
    line, starting a new page whenever the area is full. The previous
    single ``insert_textbox`` call wrote nothing at all when the text did
    not fit its rectangle, giving a blank PDF for longer translations.

    Args:
        translated_text: The text to place in the document.
        output_path: Destination path of the PDF file.
    """
    fontname = "helv"
    fontsize = 12
    line_height = fontsize * 1.4

    # Text area on each page (same rectangle as the former textbox).
    rect = fitz.Rect(50, 50, 550, 750)

    lines = _wrap_for_pdf(translated_text, rect.width, fontname, fontsize)
    lines_per_page = max(1, int(rect.height // line_height))

    doc = fitz.open()
    try:
        # max(..., 1): always emit at least one page so the document can be saved.
        for start in range(0, max(len(lines), 1), lines_per_page):
            page = doc.new_page()
            baseline_y = rect.y0 + fontsize
            for line in lines[start:start + lines_per_page]:
                if line:
                    page.insert_text(
                        (rect.x0, baseline_y), line,
                        fontsize=fontsize,
                        fontname=fontname,
                        color=(0, 0, 0),
                    )
                baseline_y += line_height
        doc.save(output_path)
    finally:
        doc.close()


# ---- Streamlit UI -----------------------------------------------------------
# Upload a document, extract and clean its text, translate it to English and
# offer the result as a downloadable PDF.
SUPPORTED_IMAGE_EXTENSIONS = ["jpg", "jpeg", "png"]
SUPPORTED_EXTENSIONS = ["pdf", "docx"] + SUPPORTED_IMAGE_EXTENSIONS

st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])

if uploaded_file is not None:
    with st.spinner("Processing document..."):
        file_extension = uploaded_file.name.split(".")[-1].lower()

        # Reject unknown extensions before anything is written to disk.
        if file_extension not in SUPPORTED_EXTENSIONS:
            st.error("Unsupported file format.")
            st.stop()

        # A private per-run directory replaces the fixed "temp.<ext>" and
        # "translated_document.pdf" paths in the working directory, which
        # concurrent sessions would overwrite or delete for each other. The
        # directory and its contents are removed when the block exits,
        # including on errors.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, f"upload.{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension in SUPPORTED_IMAGE_EXTENSIONS:
                # Context manager closes the image file handle after use.
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)
            else:
                extracted_text = extract_from_word(temp_file_path)

            extracted_text = clean_text(extracted_text)
            st.write("Extracted Text (First 500 characters):", extracted_text[:500])

            translated_text = translate_text(extracted_text)

            st.subheader("Translated Text (English)")
            st.write(translated_text)

            if translated_text.strip():
                output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
                create_pdf(translated_text, output_pdf_path)

                # Read the bytes now: the file disappears with the temp dir.
                with open(output_pdf_path, "rb") as f:
                    pdf_bytes = f.read()

                st.download_button(
                    label="Download Translated PDF",
                    data=pdf_bytes,
                    file_name="translated_document.pdf",
                    mime="application/pdf"
                )
            else:
                st.warning("No content to save in the translated PDF.")