Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

File size: 6,767 Bytes

9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
3d7adba
9c37e72
09d4214
 
29e33a8
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
b446f5c
a07988a
af0fae9
 
 
 
b446f5c
29e33a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5dc1c
29e33a8
 
 
1a9fa41
c7a7627
 
 
 
 
3f06691
c7a7627
37287e0
c7a7627
0975d28
 
 
c7a7627
 
 
 
37287e0
0975d28
 
 
1a9fa41
0975d28
07c7d3a
c9a18bc
9c37e72
e113d20
f1ae271
 
b3b4ade
5e3c2f2
ea6bf13
63c4e55
 
 
2bfe916
29e33a8
 
 
 
a2c3102
29e33a8
 
 
 
 
 
 
 
37287e0
29e33a8
 
 
 
a2c3102
c745dc1
4f3134d
8cc1e8b
6e25163
dd55b25
a811191
5f35583
c7a7627
d0a45f9
5f35583
c7a7627
d0a45f9
6c1c515
c745dc1
4f3134d
cd370f7
 
 
c7a7627
a811191
 
c7a7627
d0a45f9
a811191
c7a7627
d0a45f9
c745dc1
4f3134d
cd370f7
0975d28
ce65dbf
0975d28
d0a45f9
b0e2f8f
 
 
 
7af9178
b0e2f8f
 
 
 
 
 
 
d0a45f9
9c37e72
b9b4937

"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints and the bearer tokens sent with them.
#
# SECURITY: the access tokens below are committed in this file. They should be
# rotated and supplied from the environment instead. Each one can already be
# overridden with the environment variable named next to it; the literal is
# kept only as a backward-compatible fallback for existing deployments.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN_BANGLA", "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")}
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN_ENGLISH", "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")}
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN_GPT2", "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")}
	
def read_pdf(file):
    """Return the text of every page of a PDF as a single string.

    ``file`` is passed straight to ``PdfFileReader``. The result begins with
    one space, and the text of each page is followed by one space.
    """
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(page_no).extractText() + " "
        for page_no in range(reader.numPages)
    )
    return " " + "".join(page_texts)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text

def engsum(output):
    """Summarize English text with the remote model and show the result.

    The text is POSTed to ``API_URL1`` (authenticated with ``headers1``). On
    success the ``generated_text`` of the first result is rendered with
    ``st.success``. Problems are reported in the UI instead of being dropped
    silently or crashing the script run.

    Args:
        output: The text to summarize.

    Returns:
        None. The outcome is rendered through Streamlit.
    """
    # Nothing to summarize (e.g. OCR found no text): skip the network call.
    if not output or (isinstance(output, str) and not output.strip()):
        st.warning("No text found to summarize.")
        return

    def query(payload):
        # A timeout keeps a stalled connection from hanging the app forever.
        response = requests.post(API_URL1, headers=headers1, json=payload, timeout=120)
        return response.json()

    try:
        # NOTE(review): "min_length" is sent as a top-level field exactly as
        # before; confirm whether the endpoint honors it there or expects it
        # under a "parameters" object.
        out = query({
            "inputs": output,
            "min_length": 300
        })
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON.
        st.error(f"Summarization request failed: {exc}")
        return

    # Guard against an empty list before indexing the first result.
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("generated_text"):
        st.success(first["generated_text"])
    elif isinstance(out, dict) and out.get("error"):
        # The service reports failures as a JSON object with an "error" key.
        st.error(f"Summarization service error: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")
def bansum(text):
    """Summarize Bangla text with the remote model and show the result.

    The text is POSTed to ``API_URL0`` (authenticated with ``headers0``). On
    success the ``summary_text`` of the first result is rendered with
    ``st.success``. Problems are reported in the UI instead of being dropped
    silently or crashing the script run.

    Args:
        text: The text to summarize.

    Returns:
        None. The outcome is rendered through Streamlit.
    """
    # Nothing to summarize (e.g. OCR found no text): skip the network call.
    if not text or (isinstance(text, str) and not text.strip()):
        st.warning("No text found to summarize.")
        return

    def query(payload):
        # A timeout keeps a stalled connection from hanging the app forever.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=120)
        return response.json()

    try:
        # NOTE(review): "min_length" is sent as a top-level field exactly as
        # before; confirm whether the endpoint honors it there or expects it
        # under a "parameters" object.
        out = query({"inputs": text, "min_length": 300})
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a body that is not valid JSON.
        st.error(f"Summarization request failed: {exc}")
        return

    # Guard against an empty list before indexing the first result.
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("summary_text"):
        st.success(first["summary_text"])
    elif isinstance(out, dict) and out.get("error"):
        # The service reports failures as a JSON object with an "error" key.
        st.error(f"Summarization service error: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")

# Page heading. This statement is at module level, so it runs before main() is called.
st.title("Bangla and English Summarizer:")
#st.subheader("Input texts to summarize: ")
#@st.cache_resource(experimental_allow_widgets=True)
def _select_pdf_text(full_text, chars_per_line=100):
    """Let the user pick a range of approximate lines of *full_text*.

    A "line" is approximated as ``chars_per_line`` characters. The slider is
    given explicit bounds derived from the text length so that long documents
    do not exceed the widget's default 0-100 range, and the same factor is
    used for both ends of the slice. By default the whole text is selected.

    Returns the selected substring (the full text when it is too short for a
    meaningful range).
    """
    total_lines = -(-len(full_text) // chars_per_line)  # ceiling division
    if total_lines < 2:
        # A range slider needs min < max; short texts are used whole.
        return full_text
    first, last = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=(0, total_lines),
    )
    return full_text[first * chars_per_line:last * chars_per_line]


def _summarize_image(image_file, bangla_label, english_label):
    """OCR an uploaded or captured image and summarize it on button click.

    The image is decoded in memory and handed to pytesseract directly rather
    than being written to a fixed ``img.png`` on disk, which every session of
    the app would otherwise share and overwrite.
    """
    # Normalizing to RGB avoids modes (palette, CMYK, alpha) that OCR
    # preprocessing may not handle.
    image = Image.open(image_file).convert("RGB")
    st.text("Select the summarization type:")
    if st.button(bangla_label):
        bansum(pytesseract.image_to_string(image, lang="ben"))
    if st.button(english_label):
        engsum(pytesseract.image_to_string(image))


def main():
    """Build the sidebar inputs and summarize whichever input is present.

    Inputs are checked in this order: an uploaded PDF, an uploaded image, a
    camera capture, and finally text typed into the sidebar. PDFs are always
    summarized as English; for images and typed text the user picks Bangla or
    English with a button.
    """
    def change_photo_state():
        # Widget callback: remember that an upload/capture widget changed.
        st.session_state["photo"] = "done"

    message = st.sidebar.text_input("Type your text here!")
    uploaded_photo = st.sidebar.file_uploader(
        "Upload your Images/PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.sidebar.camera_input(
        "Capture a photo to summarize: ",
        on_change=change_photo_state,
    )
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing has been provided yet: render only the sidebar.
    if st.session_state["photo"] != "done" and not message:
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        text = _select_pdf_text(read_pdf(uploaded_photo))
        st.text("Selected text for summarize: ")
        st.success(text)
        # Skip the API call when the selected range is empty.
        if text.strip():
            st.text("Summarized Text: ")
            engsum(text)
    elif uploaded_photo:
        _summarize_image(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _summarize_image(camera_photo, "Bangla", "English")
    elif message:
        # The "photo" flag stays "done" after an upload is removed, so the
        # buttons are shown only when there is typed text to summarize.
        if st.button("Bangla"):
            bansum(message)
        if st.button("English"):
            engsum(message)
        # if st.button("English Text Generation"): 
        #     def query(payload):
        #     	response = requests.post(API_URL2, headers=headers2, json=payload)
        #     	return response.json()
            	
        #     out = query({
        #     	"inputs": text,
        #     })
        #     if isinstance(out, list) and out[0].get("generated_text"):
        #         text_output = out[0]["generated_text"]
        #         st.success(text_output)
        #         #text=text_output

# Entry point: build the UI only when this file is run as the main script.
if __name__ == '__main__':
    main()