tahirsher committed on
Commit
b0b875d
·
verified ·
1 Parent(s): add3a0f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -5,6 +5,7 @@ import streamlit as st
5
  import os
6
  import io
7
  from docx import Document # For Word document processing
 
8
 
9
  # Load the TrOCR model for image-to-text (smaller model)
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
@@ -17,20 +18,13 @@ def extract_text_from_image(image):
17
  result = trocr_pipeline(image)
18
  return result[0]['generated_text']
19
 
20
- # Function to extract text from a PDF
21
  def extract_from_pdf(pdf_path):
22
  doc = fitz.open(pdf_path)
23
  full_text = ""
24
  for page_num in range(len(doc)):
25
  page = doc.load_page(page_num)
26
- image_list = page.get_images(full=True)
27
- for img_index, img in enumerate(image_list):
28
- xref = img[0]
29
- base_image = doc.extract_image(xref)
30
- image_bytes = base_image["image"]
31
- image = Image.open(io.BytesIO(image_bytes))
32
- text = extract_text_from_image(image)
33
- full_text += text + "\n"
34
  full_text += page.get_text() + "\n"
35
  return full_text
36
 
@@ -42,10 +36,15 @@ def extract_from_word(docx_path):
42
  full_text += para.text + "\n"
43
  return full_text
44
 
45
- # Function to translate text to English
46
  def translate_text(text):
47
- translated_text = translator(text, max_length=400)[0]['translation_text']
48
- return translated_text
 
 
 
 
 
49
 
50
  # Function to create a PDF from translated text
51
  def create_pdf(translated_text, output_path):
 
5
  import os
6
  import io
7
  from docx import Document # For Word document processing
8
+ import asyncio # For asynchronous processing
9
 
10
  # Load the TrOCR model for image-to-text (smaller model)
11
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
 
18
  result = trocr_pipeline(image)
19
  return result[0]['generated_text']
20
 
21
+ # Function to extract text from a PDF (optimized for performance)
22
  def extract_from_pdf(pdf_path):
23
  doc = fitz.open(pdf_path)
24
  full_text = ""
25
  for page_num in range(len(doc)):
26
  page = doc.load_page(page_num)
27
+ # Extract text directly from the page (faster than OCR for text-based PDFs)
 
 
 
 
 
 
 
28
  full_text += page.get_text() + "\n"
29
  return full_text
30
 
 
36
  full_text += para.text + "\n"
37
  return full_text
38
 
39
+ # Function to translate text to English (processed in 500-character chunks, one at a time)
40
  def translate_text(text):
41
+ # Split text into fixed 500-character chunks and translate each chunk separately
42
+ chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
43
+ translated_text = ""
44
+ for chunk in chunks:
45
+ translated_chunk = translator(chunk, max_length=400)[0]['translation_text']
46
+ translated_text += translated_chunk + " "
47
+ return translated_text.strip()
48
 
49
  # Function to create a PDF from translated text
50
  def create_pdf(translated_text, output_path):