Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

tahirsher committed on Jan 31

Commit

b0b875d

verified ·

1 Parent(s): add3a0f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import streamlit as st
 import os
 import io
 from docx import Document  # For Word document processing
 # Load the TrOCR model for image-to-text (smaller model)
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
@@ -17,20 +18,13 @@ def extract_text_from_image(image):
     result = trocr_pipeline(image)
     return result[0]['generated_text']
-# Function to extract text from a PDF
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
-        image_list = page.get_images(full=True)
-        for img_index, img in enumerate(image_list):
-            xref = img[0]
-            base_image = doc.extract_image(xref)
-            image_bytes = base_image["image"]
-            image = Image.open(io.BytesIO(image_bytes))
-            text = extract_text_from_image(image)
-            full_text += text + "\n"
         full_text += page.get_text() + "\n"
     return full_text
@@ -42,10 +36,15 @@ def extract_from_word(docx_path):
         full_text += para.text + "\n"
     return full_text
-# Function to translate text to English
 def translate_text(text):
-    translated_text = translator(text, max_length=400)[0]['translation_text']
-    return translated_text
 # Function to create a PDF from translated text
 def create_pdf(translated_text, output_path):

 import os
 import io
 from docx import Document  # For Word document processing
+import asyncio  # For asynchronous processing
 # Load the TrOCR model for image-to-text (smaller model)
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
     result = trocr_pipeline(image)
     return result[0]['generated_text']
+# Function to extract text from a PDF (optimized for performance)
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
+        # Extract text directly from the page (faster than OCR for text-based PDFs)
         full_text += page.get_text() + "\n"
     return full_text
         full_text += para.text + "\n"
     return full_text
+# Function to translate text to English (batched for performance)
 def translate_text(text):
+    # Split text into smaller chunks for faster translation
+    chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
+    translated_text = ""
+    for chunk in chunks:
+        translated_chunk = translator(chunk, max_length=400)[0]['translation_text']
+        translated_text += translated_chunk + " "
+    return translated_text.strip()
 # Function to create a PDF from translated text
 def create_pdf(translated_text, output_path):