Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import streamlit as st
|
|
5 |
import os
|
6 |
import io
|
7 |
from docx import Document # For Word document processing
|
|
|
8 |
|
9 |
# Load the TrOCR model for image-to-text (smaller model)
|
10 |
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
@@ -17,20 +18,13 @@ def extract_text_from_image(image):
|
|
17 |
result = trocr_pipeline(image)
|
18 |
return result[0]['generated_text']
|
19 |
|
20 |
-
# Function to extract text from a PDF
|
21 |
def extract_from_pdf(pdf_path):
|
22 |
doc = fitz.open(pdf_path)
|
23 |
full_text = ""
|
24 |
for page_num in range(len(doc)):
|
25 |
page = doc.load_page(page_num)
|
26 |
-
|
27 |
-
for img_index, img in enumerate(image_list):
|
28 |
-
xref = img[0]
|
29 |
-
base_image = doc.extract_image(xref)
|
30 |
-
image_bytes = base_image["image"]
|
31 |
-
image = Image.open(io.BytesIO(image_bytes))
|
32 |
-
text = extract_text_from_image(image)
|
33 |
-
full_text += text + "\n"
|
34 |
full_text += page.get_text() + "\n"
|
35 |
return full_text
|
36 |
|
@@ -42,10 +36,15 @@ def extract_from_word(docx_path):
|
|
42 |
full_text += para.text + "\n"
|
43 |
return full_text
|
44 |
|
45 |
-
# Function to translate text to English
|
46 |
def translate_text(text):
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
# Function to create a PDF from translated text
|
51 |
def create_pdf(translated_text, output_path):
|
|
|
5 |
import os
|
6 |
import io
|
7 |
from docx import Document # For Word document processing
|
8 |
+
import asyncio # For asynchronous processing
|
9 |
|
10 |
# Load the TrOCR model for image-to-text (smaller model)
|
11 |
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
|
|
18 |
result = trocr_pipeline(image)
|
19 |
return result[0]['generated_text']
|
20 |
|
21 |
+
# Function to extract the embedded text layer from a PDF (no OCR)
def extract_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Text is read directly with ``page.get_text()``; no OCR is run here.
    Each page's text is followed by a newline.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string with the pages' text in document order.
    """
    # The context manager closes the document even if reading a page raises,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # One join instead of repeated ``+=`` on a growing string.
        return "".join(page.get_text() + "\n" for page in doc)
|
30 |
|
|
|
36 |
full_text += para.text + "\n"
|
37 |
return full_text
|
38 |
|
39 |
+
# Function to translate text to English (chunked to keep each model call small)
def _split_into_chunks(text, max_chars=500):
    """Split *text* into pieces of at most *max_chars*, preferring whitespace cuts."""
    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + max_chars, length)
        if end < length and not text[end].isspace():
            # The hard limit falls inside a word: back up to the last
            # whitespace in the window so the word is not cut in half.
            cut = max(text.rfind(ws, start, end) for ws in (" ", "\n", "\t"))
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            # Whitespace-only pieces carry nothing to translate.
            chunks.append(piece)
        start = end
    return chunks


def translate_text(text):
    """Translate *text* chunk by chunk with the module-level ``translator``.

    The text is split into chunks of at most 500 characters, cut at
    whitespace where possible so words are not broken across chunks. Each
    chunk is translated with ``max_length=400`` and the results are joined
    with single spaces.

    Args:
        text: The text to translate.

    Returns:
        The translated chunks joined by spaces, or ``""`` when *text* is
        empty or contains only whitespace (the translator is not called).
    """
    if not text or not text.strip():
        return ""
    translated_chunks = [
        translator(chunk, max_length=400)[0]['translation_text']
        for chunk in _split_into_chunks(text)
    ]
    return " ".join(translated_chunks).strip()
|
48 |
|
49 |
# Function to create a PDF from translated text
|
50 |
def create_pdf(translated_text, output_path):
|