Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import streamlit as st
|
|
| 5 |
import os
|
| 6 |
import io
|
| 7 |
from docx import Document # For Word document processing
|
|
|
|
| 8 |
|
| 9 |
# Load the TrOCR model for image-to-text (smaller model)
|
| 10 |
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
|
@@ -17,20 +18,13 @@ def extract_text_from_image(image):
|
|
| 17 |
result = trocr_pipeline(image)
|
| 18 |
return result[0]['generated_text']
|
| 19 |
|
| 20 |
-
# Function to extract text from a PDF
|
| 21 |
def extract_from_pdf(pdf_path):
|
| 22 |
doc = fitz.open(pdf_path)
|
| 23 |
full_text = ""
|
| 24 |
for page_num in range(len(doc)):
|
| 25 |
page = doc.load_page(page_num)
|
| 26 |
-
|
| 27 |
-
for img_index, img in enumerate(image_list):
|
| 28 |
-
xref = img[0]
|
| 29 |
-
base_image = doc.extract_image(xref)
|
| 30 |
-
image_bytes = base_image["image"]
|
| 31 |
-
image = Image.open(io.BytesIO(image_bytes))
|
| 32 |
-
text = extract_text_from_image(image)
|
| 33 |
-
full_text += text + "\n"
|
| 34 |
full_text += page.get_text() + "\n"
|
| 35 |
return full_text
|
| 36 |
|
|
@@ -42,10 +36,15 @@ def extract_from_word(docx_path):
|
|
| 42 |
full_text += para.text + "\n"
|
| 43 |
return full_text
|
| 44 |
|
| 45 |
-
# Function to translate text to English
|
| 46 |
def translate_text(text):
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Function to create a PDF from translated text
|
| 51 |
def create_pdf(translated_text, output_path):
|
|
|
|
| 5 |
import os
|
| 6 |
import io
|
| 7 |
from docx import Document # For Word document processing
|
| 8 |
+
import asyncio # For asynchronous processing
|
| 9 |
|
| 10 |
# Load the TrOCR model for image-to-text (smaller model)
|
| 11 |
# Shared pipeline object: the OCR helper calls trocr_pipeline(image) and reads
# result[0]['generated_text'] from what it returns.
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
|
|
|
| 18 |
result = trocr_pipeline(image)
|
| 19 |
return result[0]['generated_text']
|
| 20 |
|
| 21 |
+
# Function to extract text from a PDF (optimized for performance)
|
| 22 |
def extract_from_pdf(pdf_path):
    """Return the embedded text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output followed
        by a newline, in page order. No OCR is performed here; only text
        that the page itself exposes is collected.
    """
    # Use the document as a context manager so the file handle is released
    # even when a page fails to parse; the handle was previously left open
    # after every call.
    with fitz.open(pdf_path) as doc:
        # Iterating the document yields its pages in order, and a single
        # join avoids rebuilding the result string once per page.
        return "".join(page.get_text() + "\n" for page in doc)
|
| 30 |
|
|
|
|
| 36 |
full_text += para.text + "\n"
|
| 37 |
return full_text
|
| 38 |
|
| 39 |
+
# Function to translate text to English (split into chunks, translated one chunk at a time)
|
| 40 |
def translate_text(text, chunk_size=500, max_length=400):
    """Translate ``text`` piece by piece with the module-level ``translator``.

    The text is cut into pieces of at most ``chunk_size`` characters, each
    piece is passed to ``translator`` on its own, and the translated pieces
    are joined with single spaces.

    Args:
        text: The text to translate. ``None``, empty, or whitespace-only
            input yields an empty string without calling the translator.
        chunk_size: Upper bound, in characters, on each piece sent to the
            translator. Defaults to 500.
        max_length: Forwarded to the translator as its ``max_length``
            keyword for every piece. Defaults to 400.

    Returns:
        The joined translations with leading and trailing whitespace
        removed.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if not text or not text.strip():
        return ""

    translated_parts = []
    for chunk in _split_into_chunks(text, chunk_size):
        # A piece made only of whitespace carries nothing to translate, so
        # do not spend a model call on it.
        if not chunk.strip():
            continue
        result = translator(chunk, max_length=max_length)
        translated_parts.append(result[0]['translation_text'])
    return " ".join(translated_parts).strip()


def _split_into_chunks(text, chunk_size):
    """Cut ``text`` into pieces of at most ``chunk_size`` characters, preferring whitespace boundaries."""
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + chunk_size, total)
        # Only look for a better cut when the hard limit would land in the
        # middle of a word (i.e. the next character is not whitespace).
        if end < total and not text[end].isspace():
            cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
            # cut > start guarantees forward progress; with no usable
            # whitespace in the window, fall back to the hard limit.
            if cut > start:
                end = cut + 1
        chunks.append(text[start:end])
        start = end
    return chunks
|
| 48 |
|
| 49 |
# Function to create a PDF from translated text
|
| 50 |
def create_pdf(translated_text, output_path):
|