File size: 5,186 Bytes
b57bd69 5bb4750 40548f3 3eaf646 7037128 63f5b6d 5bb4750 04f9dd5 40548f3 3eaf646 e9de34a 40548f3 add3a0f ade99c4 40548f3 b57bd69 7037128 40548f3 e9de34a 3eaf646 e9de34a 40548f3 e9de34a ade99c4 b57bd69 7037128 40548f3 7037128 40548f3 7037128 40548f3 63f5b6d f53330e b57bd69 87fcfea 63f5b6d b57bd69 63f5b6d 87fcfea b57bd69 7037128 5bb4750 e9de34a b0b875d 5bb4750 b0b875d 87fcfea b57bd69 87fcfea b57bd69 d2195da b57bd69 d2195da 87fcfea ade99c4 b57bd69 87fcfea ade99c4 7037128 87fcfea 7037128 87fcfea 63f5b6d 5bb4750 63f5b6d b57bd69 63f5b6d b57bd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import os
import re
import tempfile
import textwrap

import fitz # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
import streamlit as st
from docx import Document
from langdetect import detect
from langdetect import LangDetectException
# Streamlit re-executes this script from the top on every user interaction.
# The loaders below are wrapped in st.cache_resource so the models are
# created once and reused instead of being reloaded on each rerun.
@st.cache_resource
def _load_blip2():
    """Load and return the BLIP-2 (processor, model) pair for image-to-text."""
    checkpoint = "Salesforce/blip2-opt-2.7b"
    return (
        Blip2Processor.from_pretrained(checkpoint),
        Blip2ForConditionalGeneration.from_pretrained(checkpoint),
    )


@st.cache_resource
def _load_translator():
    """Load and return the translation pipeline (opus-mt-mul-en)."""
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


# Module-level handles used by the extraction and translation functions.
processor, model = _load_blip2()
translator = _load_translator()

# Path to Tesseract executable for OCR
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_from_image(image):
    """Extract text from a PIL image, preferring OCR over BLIP-2.

    Tesseract OCR is tried first because it reads the characters that are
    actually present in the image. BLIP-2 generates free-form text from the
    picture and practically never returns an empty string, so using it first
    would leave the OCR path effectively unreachable.

    Args:
        image: A PIL image; it is converted to RGB before processing.

    Returns:
        The stripped OCR text, or the stripped BLIP-2 output when OCR finds
        nothing or the Tesseract executable is not available.
    """
    image = image.convert("RGB")

    try:
        # NOTE(review): uses Tesseract's default language; pass lang=... here
        # if the matching language packs are installed on the host.
        ocr_text = pytesseract.image_to_string(image).strip()
    except pytesseract.TesseractNotFoundError:
        # Deliberate best-effort: without the OCR binary, fall through to
        # BLIP-2 rather than failing the whole upload.
        ocr_text = ""
    if ocr_text:
        return ocr_text

    # Fallback to BLIP-2 when OCR produced no text
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return decoded_text.strip()
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, falling back to image extraction per page.

    Each page's embedded text is read directly. A page with no embedded text
    is rendered to an image and passed to ``extract_text_from_image``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, with surrounding
        whitespace stripped.
    """
    page_texts = []
    # The context manager closes the document even if a page fails, so the
    # file handle is not leaked and the caller can delete the file afterwards.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Try extracting text directly
            text = page.get_text()
            if not text.strip():
                # No embedded text: render the page and extract from the image.
                # Assumes the default pixmap is RGB without alpha, matching the
                # "RGB" mode below.
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                text = extract_text_from_image(image)
            page_texts.append(text)
    return "\n".join(page_texts).strip()
def extract_from_word(docx_path):
    """Return the text of the document's paragraphs, one per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by newlines, with surrounding whitespace
        stripped.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()
def clean_text(text):
    """Remove non-printable control characters while keeping line structure.

    Characters in the C0 (0x00-0x1F), DEL (0x7F) and C1 (0x80-0x9F) ranges are
    deleted, except tab, line feed and carriage return. Those three are kept
    so that words on adjacent lines or pages are not glued together.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text).strip()
def translate_text(text):
    """Translate text to English in chunks of at most 500 characters.

    The detected language is written to the Streamlit page. Text detected as
    English is not translated.

    Args:
        text: The text to translate.

    Returns:
        The translated chunks joined by single spaces, or a fixed message
        when the text is blank or already in English.
    """
    if not text.strip():
        return "No text available for translation."

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Detection can fail on input it cannot classify; still attempt
        # the translation instead of aborting.
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")
    if detected_language == "en":
        return "The text is already in English."

    translated_parts = []
    # textwrap breaks on whitespace, so a chunk boundary does not cut a word in
    # half; words longer than the width are still hard-split.
    for chunk in textwrap.wrap(text, width=500):
        result = translator(chunk, max_length=400)
        # Skip any result that does not have the expected shape.
        if isinstance(result, list) and result and 'translation_text' in result[0]:
            translated_parts.append(result[0]['translation_text'])
    return " ".join(translated_parts).strip()
def _wrap_lines(text, max_width, fontname, fontsize):
    """Yield lines of ``text`` whose rendered width does not exceed ``max_width``."""

    def width_of(piece):
        return fitz.get_text_length(piece, fontname=fontname, fontsize=fontsize)

    for paragraph in text.splitlines():
        current = ""
        for word in paragraph.split():
            candidate = f"{current} {word}" if current else word
            if width_of(candidate) <= max_width:
                current = candidate
                continue
            if current:
                yield current
            # A single word wider than the line is split by characters.
            while width_of(word) > max_width:
                cut = len(word) - 1
                while cut > 1 and width_of(word[:cut]) > max_width:
                    cut -= 1
                yield word[:cut]
                word = word[cut:]
            current = word
        # Also emits "" for an empty paragraph, preserving blank lines.
        yield current


def create_pdf(translated_text, output_path):
    """Write ``translated_text`` to a PDF, adding pages as needed.

    The text is wrapped to the width of the text area and placed line by
    line, starting a new page whenever the area is full. Inserting the whole
    text with a single ``insert_textbox`` call would write nothing when the
    text does not fit the rectangle, producing a blank page for long input.

    Args:
        translated_text: The text to write.
        output_path: Path of the PDF file to create.
    """
    fontname = "helv"
    fontsize = 12
    line_height = fontsize * 1.2
    # Text area on each page (same rectangle on every page)
    rect = fitz.Rect(50, 50, 550, 750)

    # The context manager closes the document after saving.
    with fitz.open() as doc:
        page = doc.new_page()
        baseline = rect.y0 + fontsize
        for line in _wrap_lines(translated_text, rect.width, fontname, fontsize):
            if baseline > rect.y1:
                page = doc.new_page()
                baseline = rect.y0 + fontsize
            if line:
                page.insert_text(
                    fitz.Point(rect.x0, baseline),
                    line,
                    fontsize=fontsize,
                    fontname=fontname,
                    color=(0, 0, 0),
                )
            baseline += line_height
        doc.save(output_path)
# Streamlit page: upload a document, extract its text, translate it to
# English and offer the translation as a downloadable PDF.
st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader(
    "Upload a document (PDF, Word, or Image)",
    type=["pdf", "docx", "jpg", "jpeg", "png"],
)

if uploaded_file is not None:
    file_extension = uploaded_file.name.split(".")[-1].lower()
    # Validate before anything is written to disk.
    if file_extension not in ("pdf", "docx", "jpg", "jpeg", "png"):
        st.error("Unsupported file format.")
        st.stop()

    # A per-run scratch directory keeps concurrent sessions from overwriting
    # each other's files; it is removed on exit, even if processing fails.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_file_path = os.path.join(work_dir, f"upload.{file_extension}")
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with st.spinner("Processing document..."):
            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension == "docx":
                extracted_text = extract_from_word(temp_file_path)
            else:
                # Close the image file handle once the text has been extracted.
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)

            extracted_text = clean_text(extracted_text)
            st.write("Extracted Text (First 500 characters):", extracted_text[:500])
            translated_text = translate_text(extracted_text)

        st.subheader("Translated Text (English)")
        st.write(translated_text)

        if translated_text.strip():
            output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
            create_pdf(translated_text, output_pdf_path)
            # Read the bytes now so the button does not depend on a file that
            # is deleted when the scratch directory is cleaned up.
            with open(output_pdf_path, "rb") as f:
                pdf_bytes = f.read()
            st.download_button(
                label="Download Translated PDF",
                data=pdf_bytes,
                file_name="translated_document.pdf",
                mime="application/pdf",
            )
        else:
            st.warning("No content to save in the translated PDF.")
|