Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
import streamlit as st
|
2 |
-
import PyPDF2
|
3 |
import docx2txt
|
4 |
from transformers import pipeline
|
5 |
import pytesseract
|
6 |
from pdf2image import convert_from_path
|
7 |
from PIL import Image
|
|
|
8 |
|
9 |
-
#
|
10 |
def load_translation_models():
|
|
|
11 |
try:
|
12 |
-
translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
|
13 |
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
14 |
return translator_en, translator_ur
|
15 |
except Exception as e:
|
@@ -22,11 +23,10 @@ def extract_text_from_pdf_with_ocr(file_path):
|
|
22 |
"""Extract text from image-based PDF using OCR."""
|
23 |
text = ""
|
24 |
try:
|
25 |
-
# Convert PDF to images
|
26 |
-
pages = convert_from_path(file_path, 300)
|
27 |
for page in pages:
|
28 |
-
|
29 |
-
text += pytesseract.image_to_string(image) + "\n"
|
30 |
except Exception as e:
|
31 |
st.error(f"Error during OCR extraction: {e}")
|
32 |
return text
|
|
|
1 |
import streamlit as st
|
|
|
2 |
import docx2txt
|
3 |
from transformers import pipeline
|
4 |
import pytesseract
|
5 |
from pdf2image import convert_from_path
|
6 |
from PIL import Image
|
7 |
+
import os
|
8 |
|
9 |
+
# Initialize translation models
|
10 |
def load_translation_models():
|
11 |
+
"""Load translation models."""
|
12 |
try:
|
13 |
+
translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
|
14 |
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
15 |
return translator_en, translator_ur
|
16 |
except Exception as e:
|
|
|
23 |
"""Extract text from image-based PDF using OCR."""
|
24 |
text = ""
|
25 |
try:
|
26 |
+
# Convert PDF to images with Poppler support
|
27 |
+
pages = convert_from_path(file_path, 300, poppler_path="/path-to-poppler-bin") # Update poppler_path on Windows if necessary
|
28 |
for page in pages:
|
29 |
+
text += pytesseract.image_to_string(page) + "\n"
|
|
|
30 |
except Exception as e:
|
31 |
st.error(f"Error during OCR extraction: {e}")
|
32 |
return text
|