|
import os
import re
import tempfile

import fitz
import pytesseract
import streamlit as st
from docx import Document
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from PIL import Image
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
|
|
|
|
|
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") |
|
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b") |
|
|
|
|
|
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en") |
|
|
|
|
|
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" |
|
|
|
|
|
def extract_text_from_image(image): |
|
"""Extract text from image using OCR or BLIP-2.""" |
|
|
|
image = image.convert("RGB") |
|
inputs = processor(images=image, return_tensors="pt") |
|
generated_ids = model.generate(**inputs) |
|
decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] |
|
|
|
|
|
if not decoded_text.strip(): |
|
decoded_text = pytesseract.image_to_string(image) |
|
|
|
return decoded_text.strip() |
|
|
|
|
|
def extract_from_pdf(pdf_path): |
|
"""Extract text from PDF by combining direct extraction and OCR fallback.""" |
|
doc = fitz.open(pdf_path) |
|
full_text = "" |
|
|
|
for page_num in range(len(doc)): |
|
page = doc.load_page(page_num) |
|
|
|
|
|
text = page.get_text() |
|
|
|
|
|
if not text.strip(): |
|
pix = page.get_pixmap() |
|
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) |
|
text = extract_text_from_image(image) |
|
|
|
full_text += text + "\n" |
|
return full_text.strip() |
|
|
|
|
|
def extract_from_word(docx_path): |
|
doc = Document(docx_path) |
|
full_text = "" |
|
for para in doc.paragraphs: |
|
full_text += para.text + "\n" |
|
return full_text.strip() |
|
|
|
|
|
def clean_text(text): |
|
return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip() |
|
|
|
|
|
def translate_text(text): |
|
if not text.strip(): |
|
return "No text available for translation." |
|
|
|
detected_language = detect(text) |
|
st.write(f"Detected language: {detected_language}") |
|
|
|
if detected_language == "en": |
|
return "The text is already in English." |
|
|
|
chunks = [text[i:i + 500] for i in range(0, len(text), 500)] |
|
translated_text = "" |
|
for chunk in chunks: |
|
translated_chunk = translator(chunk, max_length=400) |
|
if isinstance(translated_chunk, list) and 'translation_text' in translated_chunk[0]: |
|
translated_text += translated_chunk[0]['translation_text'] + " " |
|
return translated_text.strip() |
|
|
|
|
|
def create_pdf(translated_text, output_path): |
|
doc = fitz.open() |
|
page = doc.new_page() |
|
|
|
|
|
rect = fitz.Rect(50, 50, 550, 750) |
|
|
|
|
|
page.insert_textbox( |
|
rect, translated_text, |
|
fontsize=12, |
|
fontname="helv", |
|
color=(0, 0, 0), |
|
) |
|
doc.save(output_path) |
|
|
|
|
|
st.title("Multilingual Document Translator") |
|
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"]) |
|
|
|
if uploaded_file is not None: |
|
with st.spinner("Processing document..."): |
|
file_extension = uploaded_file.name.split(".")[-1].lower() |
|
temp_file_path = f"temp.{file_extension}" |
|
with open(temp_file_path, "wb") as f: |
|
f.write(uploaded_file.getbuffer()) |
|
|
|
try: |
|
if file_extension == "pdf": |
|
extracted_text = extract_from_pdf(temp_file_path) |
|
elif file_extension in ["jpg", "jpeg", "png"]: |
|
image = Image.open(temp_file_path) |
|
extracted_text = extract_text_from_image(image) |
|
elif file_extension == "docx": |
|
extracted_text = extract_from_word(temp_file_path) |
|
else: |
|
st.error("Unsupported file format.") |
|
st.stop() |
|
|
|
extracted_text = clean_text(extracted_text) |
|
st.write("Extracted Text (First 500 characters):", extracted_text[:500]) |
|
|
|
translated_text = translate_text(extracted_text) |
|
|
|
st.subheader("Translated Text (English)") |
|
st.write(translated_text) |
|
|
|
if translated_text.strip(): |
|
output_pdf_path = "translated_document.pdf" |
|
create_pdf(translated_text, output_pdf_path) |
|
|
|
with open(output_pdf_path, "rb") as f: |
|
st.download_button( |
|
label="Download Translated PDF", |
|
data=f, |
|
file_name="translated_document.pdf", |
|
mime="application/pdf" |
|
) |
|
else: |
|
st.warning("No content to save in the translated PDF.") |
|
finally: |
|
if os.path.exists(temp_file_path): |
|
os.remove(temp_file_path) |
|
if os.path.exists("translated_document.pdf"): |
|
os.remove("translated_document.pdf") |
|
|