File size: 9,716 Bytes
9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
ba45265
e6d3f33
9c37e72
09d4214
 
29e33a8
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
b446f5c
a07988a
af0fae9
 
 
 
ba45265
29e33a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5dc1c
29e33a8
 
 
1a9fa41
c7a7627
 
 
 
 
3f06691
c7a7627
37287e0
c7a7627
0975d28
 
 
c7a7627
 
 
 
37287e0
0975d28
 
 
47d9c5a
0a75d54
 
 
ec4347b
9d1426d
ebcff05
 
59ea779
ebcff05
 
59ea779
f1ebc19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59ea779
f1ebc19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d1426d
f1ebc19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d1426d
 
 
 
 
 
 
 
 
 
 
 
2dffa73
d0a45f9
9d1426d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Bangla and English Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints used by bansum() / engsum() below:
#   API_URL0 — csebuetnlp/mT5_multilingual_XLSum (Bangla/multilingual summaries)
#   API_URL1 — Michael-Vptn/text-summarization-t5-base (English summaries)
#   API_URL2 — gpt2 (text generation; only used by commented-out code below)
# SECURITY(review): the bearer tokens below are hard-coded in source. They
# should be revoked and loaded from environment variables or st.secrets
# instead of being committed to the repository.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}
@st.cache
def read_pdf(file):
    """Extract the text of every page of a PDF file-like object.

    Returns one string: a single leading space, then each page's extracted
    text followed by a space (the original concatenation format is kept so
    downstream slicing in main() behaves identically).

    NOTE(review): PdfFileReader / getPage / extractText are the legacy
    PyPDF2 1.x API names — confirm the pinned PyPDF2 version before
    migrating to PdfReader / pages / extract_text.
    """
    reader = PdfFileReader(file)
    page_texts = (reader.getPage(idx).extractText() for idx in range(reader.numPages))
    return " " + "".join(content + " " for content in page_texts)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text

def engsum(output):
    """Summarize English text via the Hugging Face t5-base API and render it.

    Shows the summary with st.success on success. On failure it now reports
    the problem with st.error (the original silently rendered nothing when
    the API returned an {"error": ...} dict, e.g. while the model loads).
    """
    def query(payload):
        # A timeout keeps the Streamlit app from hanging indefinitely if the
        # inference API stalls; raise_for_status is avoided because error
        # bodies still carry a useful JSON "error" message.
        response = requests.post(API_URL1, headers=headers1, json=payload, timeout=60)
        return response.json()

    out = query({
        "inputs": output,
        "min_length": 300
    })
    if isinstance(out, list) and out and isinstance(out[0], dict) and out[0].get("generated_text"):
        st.success(out[0]["generated_text"])
    elif isinstance(out, dict) and out.get("error"):
        # The inference API returns {"error": ...} (e.g. "model is loading").
        st.error(out["error"])
    else:
        st.error("Summarization failed: unexpected API response.")
def bansum(text):
    """Summarize Bangla text via the Hugging Face mT5 XLSum API and render it.

    Shows the summary with st.success on success. On failure it now reports
    the problem with st.error (the original silently rendered nothing when
    the API returned an {"error": ...} dict, e.g. while the model loads).
    """
    def query(payload):
        # A timeout keeps the Streamlit app from hanging indefinitely if the
        # inference API stalls.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        return response.json()

    out = query({"inputs": text, "min_length": 300})
    if isinstance(out, list) and out and isinstance(out[0], dict) and out[0].get("summary_text"):
        st.success(out[0]["summary_text"])
    elif isinstance(out, dict) and out.get("error"):
        # The inference API returns {"error": ...} (e.g. "model is loading").
        st.error(out["error"])
    else:
        st.error("Summarization failed: unexpected API response.")
    
@st.cache
def save(l):
    # Cached identity function: memoizes the value returned by the file
    # uploader so it survives Streamlit reruns triggered by other widgets.
    # NOTE(review): st.cache is deprecated in newer Streamlit releases —
    # presumably st.session_state would replace this; confirm the pinned
    # Streamlit version before migrating.
    return l
#@st.cache
def main():
    """Render the app UI.

    Left column: a Blenderbot chatbot. Right column: the summarizer — text
    input, camera capture, or an image/PDF upload, summarized in Bangla
    (bansum) or English (engsum) via OCR / PDF text extraction.
    """
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    a, b = st.columns([1, 1])

    def change_photo_state():
        # Widget on_change callback: flags that an image/file was provided
        # so the summarizer branch below renders.
        st.session_state["photo"] = "done"

    with st.container():
        with a:
            from streamlit_option_menu import option_menu
            from streamlit_chat import message as st_message
            from transformers import BlenderbotTokenizer
            from transformers import BlenderbotForConditionalGeneration
            st.title("Simple Chatbot for fun!")

            @st.experimental_singleton
            def get_models():
                # it may be necessary for other frameworks to cache the model
                # seems pytorch keeps an internal state of the conversation
                model_name = "facebook/blenderbot-400M-distill"
                tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
                model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
                return tokenizer, model

            if "history" not in st.session_state:
                st.session_state.history = []
            st.title("Hello Chatbot")

            def generate_answer():
                # on_change callback for the chat input: run the user's
                # message through Blenderbot and append both sides of the
                # exchange to the session chat history.
                tokenizer, model = get_models()
                user_message = st.session_state.input_text
                inputs = tokenizer(st.session_state.input_text, return_tensors="pt")
                result = model.generate(**inputs)
                message_bot = tokenizer.decode(
                    result[0], skip_special_tokens=True
                )  # .replace("<s>", "").replace("</s>", "")
                st.session_state.history.append({"message": user_message, "is_user": True})
                st.session_state.history.append({"message": message_bot, "is_user": False})

            # BUG FIX: the original wrapped the chat input and history loop in
            # a nested `def main():` that was never called, so the chatbot
            # input box and conversation never rendered. Render them directly.
            st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)
            for chat in st.session_state.history:
                st_message(**chat)
        with b:
            c2, c3 = st.columns([1,1])
            message = st.text_input("Type your text here!")
            camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
            uploaded_photo = save(c3.file_uploader("Upload your Images/PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state))
            if st.session_state["photo"]=="done" or message:
                if uploaded_photo and uploaded_photo.type=='application/pdf':
                    # PDF path: extract all text, let the user pick a slice.
                    tet = read_pdf(uploaded_photo)
                    # NOTE(review): the scaling constants below are mutually
                    # inconsistent (7*100 vs 7*10 vs 10*100) — presumably meant
                    # to map slider "lines" to character offsets. Preserved
                    # as-is; confirm the intended mapping before changing.
                    values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
                    text = tet[values[0]*7*10:values[1]*10*100] if values[0]!=len(tet)//(10*100) else tet[len(tet)//(10*100):]
                    if st.button("English Pdf Summarize"):
                        st.subheader("Selected text for summarize: ")
                        st.success(text)
                        st.subheader("Summarized Text: ")
                        engsum(text)
                elif uploaded_photo and uploaded_photo.type !='application/pdf':
                    # Uploaded image path: round-trip through a PNG on disk so
                    # cv2/pytesseract read a plain image file.
                    img = Image.open(uploaded_photo)
                    img.save("img.png")
                    img = cv2.imread("img.png")
                    st.text("Select the summarization type:")
                    c4, c5 = st.columns([1,1])
                    if c4.button("BENGALI"):
                        text = pytesseract.image_to_string(img, lang="ben")
                        st.subheader("সারাংশ/সারমর্ম")
                        bansum(text)
                    if c5.button("ENGLISH"):
                        text = pytesseract.image_to_string(img)
                        st.subheader("Summarized Text")
                        engsum(text)
                elif camera_photo:
                    # Camera path: identical OCR + summarize flow as uploads.
                    img = Image.open(camera_photo)
                    img.save("img.png")
                    img = cv2.imread("img.png")
                    st.text("Select the summarization type:")
                    c6, c7 = st.columns([1,1])
                    if c6.button("Bangla"):
                        text = pytesseract.image_to_string(img, lang="ben")
                        st.subheader("সারাংশ/সারমর্ম")
                        bansum(text)
                    if c7.button("English"):
                        text = pytesseract.image_to_string(img)
                        st.subheader("Summarized Text")
                        engsum(text)
                else:
                    # Plain typed text: summarize the message directly.
                    text = message
                    c8, c9 = st.columns([1,1])
                    if c8.button("Bangla"):
                        bansum(text)
                    if c9.button("English"):
                        engsum(text)


# Script entry point: render the Streamlit app when executed directly.
if __name__ == "__main__":
    main()