Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

7037128

verified ·

1 Parent(s): 7149616

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -75

app.py CHANGED Viewed

@@ -1,84 +1,64 @@
-import streamlit as st
-from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
 import pytesseract
-from pdf2image import convert_from_path
 from PIL import Image
-import torch
-# Image-to-Text Model (TrOCR)
-def load_image_to_text_model():
-    """Load the TrOCR processor and model used for OCR.
-
-    Returns:
-        A ``(processor, model)`` pair. The first element fills the slot that
-        callers name ``tokenizer``: it is called directly on an image, its
-        result's ``.pixel_values`` is fed to ``model.generate``, and its
-        ``batch_decode`` turns the generated ids back into text. A plain text
-        tokenizer cannot produce ``pixel_values`` from an image, so a TrOCR
-        processor is loaded instead.
-    """
-    # Imported locally so the fix does not depend on the module-level imports.
-    from transformers import TrOCRProcessor
-    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
-    model = AutoModelForImageTextToText.from_pretrained("microsoft/trocr-large-printed")
-    return processor, model
-def extract_text_with_trocr(image, tokenizer, model):
-    """Run the OCR model on a single image and return the decoded text.
-
-    Args:
-        image: Image to recognise; passed straight to ``tokenizer``.
-        tokenizer: Callable that encodes the image (its result must expose
-            ``pixel_values``) and provides ``batch_decode``.
-        model: Object whose ``generate`` consumes the pixel values.
-
-    Returns:
-        The first decoded sequence, with special tokens stripped.
-    """
-    encoded = tokenizer(image, return_tensors="pt")
-    generated_ids = model.generate(encoded.pixel_values)
-    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-    return decoded[0]
-# Multilingual Translation Models
-def load_translation_models():
-    """Create the two translation pipelines used by the app.
-
-    Returns:
-        A tuple ``(translator_en, translator_ur)``: the first pipeline uses
-        the ``opus-mt-mul-en`` checkpoint, the second ``opus-mt-en-ur``.
-    """
-    model_ids = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
-    # Built in order: English pipeline first, then Urdu.
-    to_english, to_urdu = [
-        pipeline("translation", model=model_id, framework="pt")
-        for model_id in model_ids
-    ]
-    return to_english, to_urdu
-# Streamlit re-executes this script on every user interaction. Wrapping the
-# loaders in st.cache_resource keeps one loaded copy of each model per process
-# instead of re-creating them on every rerun.
-translator_en, translator_ur = st.cache_resource(load_translation_models)()
-tokenizer, trocr_model = st.cache_resource(load_image_to_text_model)()
-def extract_text_from_pdf_with_ocr(file_path):
-    """OCR every page of a PDF and return the text, one line per page.
-
-    The PDF at ``file_path`` is rasterised at 300 DPI and each page image is
-    run through ``extract_text_with_trocr``. Any exception is reported with
-    ``st.error``; text from pages processed before the failure is still
-    returned.
-    """
-    page_texts = []
-    try:
-        for page_image in convert_from_path(file_path, 300):
-            recognised = extract_text_with_trocr(page_image, tokenizer, trocr_model)
-            page_texts.append(recognised + "\n")
-    except Exception as e:
-        st.error(f"Error during OCR extraction: {e}")
-    return "".join(page_texts)
-# Translation Function
-def translate_text(text, translator, max_chunk_size=512):
-    """Translate text chunk by chunk with the given translator.
-
-    The text is split at whitespace into chunks of at most
-    ``max_chunk_size`` characters, so words are not cut in half at chunk
-    boundaries (only a single word longer than the limit is broken).
-    Runs of whitespace, including newlines, are normalised to single spaces.
-
-    Args:
-        text: Source text to translate.
-        translator: Callable returning ``[{'translation_text': ...}]`` for a
-            single input string.
-        max_chunk_size: Maximum number of characters per chunk.
-
-    Returns:
-        The chunk translations joined with single spaces, or ``""`` when the
-        text is empty or any chunk fails (the error is shown via ``st.error``).
-    """
-    import textwrap
-    text_chunks = textwrap.wrap(text, width=max_chunk_size, break_on_hyphens=False)
-    translations = []
-    for chunk in text_chunks:
-        try:
-            result = translator(chunk)
-            translations.append(result[0]['translation_text'])
-        except Exception as e:
-            st.error(f"Error during translation: {e}")
-            return ""
-    return " ".join(translations)
-# Streamlit UI
-import os
-import tempfile
-st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
-st.write("Translate image-based PDF or image files using advanced models.")
-uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
-target_language = st.radio("Select target language for translation", ["English", "Urdu"])
-if uploaded_file:
-    if uploaded_file.name.lower().endswith(".pdf"):
-        # The PDF OCR helper reads from a filesystem path, so the upload has
-        # to be written to disk first. A private temporary directory avoids
-        # name collisions and is removed automatically afterwards.
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            file_path = os.path.join(tmp_dir, "upload.pdf")
-            with open(file_path, "wb") as pdf_file:
-                pdf_file.write(uploaded_file.getbuffer())
-            text_content = extract_text_from_pdf_with_ocr(file_path)
-    else:
-        text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)
-    st.subheader("Extracted Text (Preview)")
-    st.write(text_content[:500] if text_content else "No content found in the file.")
-    if st.button("Translate"):
-        if text_content:
-            st.subheader(f"Translated Text ({target_language})")
-            # Stays None when no translator is available, so the output box
-            # is only drawn when there is a result to show.
-            translated_text = None
-            if target_language == "English" and translator_en:
-                translated_text = translate_text(text_content, translator_en)
-            elif target_language == "Urdu" and translator_ur:
-                translated_text = translate_text(text_content, translator_ur)
-            else:
-                st.warning("Translation model not loaded successfully.")
-            if translated_text is not None:
-                st.text_area("Translation Output", translated_text, height=300)
-        else:
-            st.warning("No text found to translate. Please upload a valid document.")

+import fitz  # PyMuPDF
 import pytesseract
 from PIL import Image
+from transformers import pipeline
+import streamlit as st
+import os
+# Set up the translation pipelines
+# Streamlit re-executes this script on every user interaction, so the
+# pipelines are built inside a cached loader and reused across reruns.
+@st.cache_resource
+def _load_translation_pipelines():
+    """Build and return the (to-English, to-Urdu) translation pipelines."""
+    to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
+    # NOTE(review): confirm that "Helsinki-NLP/opus-mt-mul-ur" exists on the
+    # Hugging Face Hub; pipeline() raises at start-up if the id is unknown.
+    to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur")
+    return to_english, to_urdu
+translator_to_english, translator_to_urdu = _load_translation_pipelines()
+# Function to extract text from an image using OCR
+def extract_text_from_image(image):
+    """Return the text Tesseract recognises in ``image``.
+
+    Recognition is requested with the combined ``eng+urd`` language setting.
+    """
+    return pytesseract.image_to_string(image, lang='eng+urd')
+# Function to extract images and text from a PDF
+def extract_from_pdf(pdf_path):
+    """Collect OCR text from embedded images plus the text layer of a PDF.
+
+    For every page, each embedded image is decoded and passed to
+    ``extract_text_from_image``; the page's own text follows. Every piece is
+    terminated by a newline.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The concatenated text of all pages, in page order.
+    """
+    # Imported here because the function needs io.BytesIO to wrap the raw
+    # image bytes for PIL.
+    import io
+    parts = []
+    # Opening the document as a context manager closes it even if OCR or
+    # image decoding raises part-way through.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            for img in page.get_images(full=True):
+                xref = img[0]  # first field of an image entry is its xref
+                base_image = doc.extract_image(xref)
+                image = Image.open(io.BytesIO(base_image["image"]))
+                parts.append(extract_text_from_image(image) + "\n")
+            parts.append(page.get_text() + "\n")
+    return "".join(parts)
+# Function to translate text to English and Urdu
+def translate_text(text, chunk_chars=400):
+    """Translate text to English and Urdu, one bounded chunk at a time.
+
+    The text is split at whitespace into chunks of at most ``chunk_chars``
+    characters and each chunk is translated separately. A long document is
+    therefore never sent through a pipeline as one oversized input, and the
+    ``max_length=400`` cap applies per chunk rather than to the whole
+    document. Runs of whitespace are normalised to single spaces.
+
+    Args:
+        text: Source text to translate.
+        chunk_chars: Maximum number of characters per chunk.
+
+    Returns:
+        A tuple ``(english_translation, urdu_translation)``. Both are empty
+        strings when ``text`` has no non-whitespace characters.
+    """
+    import textwrap
+    chunks = textwrap.wrap(text, width=chunk_chars, break_on_hyphens=False)
+    # Nothing to translate: skip the model calls entirely.
+    if not chunks:
+        return "", ""
+    english_translation = " ".join(
+        translator_to_english(chunk, max_length=400)[0]['translation_text']
+        for chunk in chunks
+    )
+    urdu_translation = " ".join(
+        translator_to_urdu(chunk, max_length=400)[0]['translation_text']
+        for chunk in chunks
+    )
+    return english_translation, urdu_translation
+# Streamlit UI
+import tempfile
+st.title("PDF Document Translator")
+uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
+if uploaded_file is not None:
+    with st.spinner("Processing PDF..."):
+        # Save the upload into a private temporary directory: concurrent
+        # sessions cannot overwrite each other's file, and the directory is
+        # removed even when extraction raises.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            pdf_path = os.path.join(tmp_dir, "upload.pdf")
+            with open(pdf_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+            # Extract text from the PDF
+            extracted_text = extract_from_pdf(pdf_path)
+        if extracted_text.strip():
+            # Translate the extracted text
+            english_translation, urdu_translation = translate_text(extracted_text)
+            # Display the translations
+            st.subheader("English Translation")
+            st.write(english_translation)
+            st.subheader("Urdu Translation")
+            st.write(urdu_translation)
+        else:
+            # Avoid sending empty input to the translation models.
+            st.warning("No text could be extracted from the uploaded PDF.")