|
import io
import os
import re
import tempfile
import textwrap

import fitz
import pytesseract
import streamlit as st
from PIL import Image
from transformers import pipeline
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def _load_translator(model_name):
    """Build a Hugging Face translation pipeline for *model_name*.

    Streamlit re-executes this script on every widget interaction; wrapping
    the load in ``st.cache_resource`` keeps the pipeline object alive between
    reruns instead of rebuilding it each time.
    """
    return pipeline("translation", model=model_name)


# Each model is loaded independently so that one failing download does not
# disable the other translation direction. On failure the error is shown in
# the UI and the translator is set to None.
try:
    translator_to_english = _load_translator("Helsinki-NLP/opus-mt-mul-en")
except Exception as e:  # top-level boundary: report in the UI and carry on
    st.error(f"Failed to load English translation model: {e}")
    translator_to_english = None

try:
    translator_to_urdu = _load_translator("Helsinki-NLP/opus-mt-en-ur")
except Exception as e:  # top-level boundary: report in the UI and carry on
    st.error(f"Failed to load Urdu translation model: {e}")
    translator_to_urdu = None
|
|
|
|
|
def extract_text_from_image(image):
    """Run Tesseract OCR on *image* and return the recognised text.

    Args:
        image: The image to recognise (callers in this file pass a PIL image).

    Returns:
        The string produced by Tesseract using the combined English and Urdu
        language data.
    """
    ocr_languages = "eng+urd"
    return pytesseract.image_to_string(image, lang=ocr_languages)
|
|
|
|
|
def extract_from_pdf(pdf_path):
    """Return all text found in the PDF at *pdf_path*.

    For every page, each embedded image is passed through OCR first and the
    page's own text layer is appended afterwards. Every piece is terminated
    with a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated OCR and text-layer output of the whole document.
    """
    pieces = []
    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first item of the image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as image:
                    pieces.append(extract_text_from_image(image))
            pieces.append(page.get_text())
    # Single join instead of repeated string concatenation.
    return "".join(f"{piece}\n" for piece in pieces)
|
|
|
|
|
def _split_into_chunks(text, max_chars=400):
    """Split *text* into whitespace-normalised chunks of at most *max_chars*.

    Chunks are cut at sentence boundaries where possible; a single sentence
    longer than *max_chars* is hard-wrapped on spaces.
    """
    normalised = " ".join(text.split())
    # Sentence enders: Latin punctuation plus the Urdu full stop (U+06D4).
    sentences = re.split(r"(?<=[.!?\u06d4])\s+", normalised)

    chunks = []
    current = ""
    for sentence in sentences:
        if not sentence:
            continue
        if current and len(current) + 1 + len(sentence) <= max_chars:
            current = f"{current} {sentence}"
            continue
        if current:
            chunks.append(current)
        # Start a new chunk; an oversized sentence is wrapped and only its
        # last piece stays open for further packing.
        *full, current = textwrap.wrap(sentence, width=max_chars) or [""]
        chunks.extend(full)
    if current:
        chunks.append(current)
    return chunks


def translate_text(text):
    """Translate *text* with the loaded English and Urdu models.

    The text is translated chunk by chunk rather than as one string, so a
    long document is neither rejected for exceeding the model's input size
    nor cut off by the ``max_length`` cap on the generated output.

    Args:
        text: The text to translate. Empty, whitespace-only or ``None`` input
            yields empty translations without calling any model.

    Returns:
        A ``(english_translation, urdu_translation)`` tuple of strings. A
        translation is ``""`` when its model failed to load. Translated
        chunks are joined with newlines.
    """
    chunks = _split_into_chunks(text) if text else []
    if not chunks:
        return "", ""

    def _run(translator):
        if not translator:
            return ""
        # truncation=True is a safety net for chunks that still tokenise
        # to more tokens than the model accepts.
        results = translator(chunks, max_length=400, truncation=True)
        return "\n".join(item["translation_text"] for item in results)

    # NOTE(review): the Urdu model (en -> ur) receives the original text, not
    # the English translation; confirm this is intended for non-English PDFs.
    return _run(translator_to_english), _run(translator_to_urdu)
|
|
|
|
|
# --- Streamlit UI: upload a PDF, extract its text, show both translations ---
st.title("PDF Document Translator")
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

if uploaded_file is not None:
    with st.spinner("Processing PDF..."):
        # Write the upload to a uniquely named temporary file so concurrent
        # sessions cannot overwrite each other's data. delete=False lets the
        # file be reopened by path after this handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(uploaded_file.getbuffer())
            temp_path = tmp.name

        try:
            extracted_text = extract_from_pdf(temp_path)
            english_translation, urdu_translation = translate_text(extracted_text)
        finally:
            # Remove the temporary file even when extraction or translation fails.
            os.remove(temp_path)

    st.subheader("English Translation")
    st.write(english_translation)

    st.subheader("Urdu Translation")
    st.write(urdu_translation)