tahirsher committed on
Commit
2be5258
·
verified ·
1 Parent(s): cbb084d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -1,15 +1,16 @@
1
  import streamlit as st
2
- import PyPDF2
3
  import docx2txt
4
  from transformers import pipeline
5
  import pytesseract
6
  from pdf2image import convert_from_path
7
  from PIL import Image
 
8
 
9
- # Load translation models
10
  def load_translation_models():
 
11
  try:
12
- translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
13
  translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
14
  return translator_en, translator_ur
15
  except Exception as e:
@@ -22,11 +23,10 @@ def extract_text_from_pdf_with_ocr(file_path):
22
  """Extract text from image-based PDF using OCR."""
23
  text = ""
24
  try:
25
- # Convert PDF to images
26
- pages = convert_from_path(file_path, 300)
27
  for page in pages:
28
- image = Image.fromarray(page)
29
- text += pytesseract.image_to_string(image) + "\n"
30
  except Exception as e:
31
  st.error(f"Error during OCR extraction: {e}")
32
  return text
 
1
  import streamlit as st
 
2
  import docx2txt
3
  from transformers import pipeline
4
  import pytesseract
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
+ import os
8
 
9
+ # Initialize translation models
10
  def load_translation_models():
11
+ """Load translation models."""
12
  try:
13
+ translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
14
  translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
15
  return translator_en, translator_ur
16
  except Exception as e:
 
23
  """Extract text from image-based PDF using OCR."""
24
  text = ""
25
  try:
26
+ # Convert PDF to images with Poppler support
27
+ pages = convert_from_path(file_path, 300, poppler_path="/path-to-poppler-bin") # Update poppler_path on Windows if necessary
28
  for page in pages:
29
+ text += pytesseract.image_to_string(page) + "\n"
 
30
  except Exception as e:
31
  st.error(f"Error during OCR extraction: {e}")
32
  return text