File size: 9,165 Bytes
9c37e72 dba2773 9c37e72 1a16a58 9c37e72 0c5b55b 9c37e72 0c5b55b 9c37e72 1a16a58 9c37e72 0c5b55b 9c37e72 6e58c44 bd18577 c75cc74 9c37e72 36603f5 09d4214 06dd768 09d4214 f4332f9 baf370a 06dd768 9c37e72 1ecea99 419e04c 9c37e72 baf370a fb120e2 d0ba2f9 f780d66 fb120e2 f780d66 25ae3be f780d66 fb120e2 2406036 fb120e2 cadb958 2406036 10ef8bd 54ee49c f6a6e42 9c37e72 9531d63 9c37e72 f6a6e42 9c37e72 e113d20 3f6c2be 9c37e72 1a16a58 f1ae271 0a7287e 4b6d85d 63c4e55 2181afa 5aeb295 d0ba2f9 63c4e55 f61fc0e 2181afa 8cc1e8b 6e25163 3e4f1f9 f5aabdb ed0375d 3e4f1f9 f5aabdb 3e4f1f9 f5aabdb 3e4f1f9 c08e6a6 b9c9fb8 cd370f7 fda8d0d b9b4937 c95ac40 9c37e72 c95ac40 9c37e72 b9b4937 c95ac40 b9b4937 c95ac40 b9b4937 9c37e72 b9b4937 b4eff56 fca844f b4eff56 b9b4937 9c37e72 b9b4937 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
"""
# Core Pkgs
import os
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
# NLP Pkgs
from textblob import TextBlob
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
@st.experimental_singleton
def _ocr_pdf_pages(pdf_bytes, lang):
    """Rasterise a PDF and OCR every page; cached on (pdf_bytes, lang).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        lang: Tesseract language code (e.g. ``"ben"``) or ``None`` for the
            Tesseract default.

    Returns:
        The OCR text of all pages, each page followed by a single space.
    """
    # pdf2image already yields PIL images, which pytesseract accepts directly,
    # so no intermediate file on disk is needed.
    pages = pdf2image.convert_from_bytes(pdf_bytes)
    return "".join(
        pytesseract.image_to_string(page, lang=lang) + " " for page in pages
    )


def read_pdf(file):
    """Return the OCR text of every page of a PDF.

    Args:
        file: The PDF to read. May be raw ``bytes``, a filesystem path, a
            readable binary file-like object, or a write-only file handle that
            has a ``name`` (it is flushed and re-opened for reading).

    Returns:
        The concatenated OCR text, each page followed by a single space.
    """
    if isinstance(file, (bytes, bytearray)):
        pdf_bytes = bytes(file)
    elif isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as handle:
            pdf_bytes = handle.read()
    else:
        # Make sure buffered writes reach the disk before re-reading by name.
        if not getattr(file, "closed", False) and hasattr(file, "flush"):
            file.flush()
        is_readable = getattr(file, "readable", None)
        if not getattr(file, "closed", False) and callable(is_readable) and is_readable():
            file.seek(0)
            pdf_bytes = file.read()
        else:
            with open(file.name, "rb") as handle:
                pdf_bytes = handle.read()

    # The widget is rendered once, outside both the page loop and the cached
    # helper: a checkbox per page would collide on its identity, and a widget
    # inside a cached function disappears on a cache hit.
    use_bangla = st.checkbox("Mark to see Bangla Image's Text")
    return _ocr_pdf_pages(pdf_bytes, "ben" if use_bangla else None)
def read_pdf_with_pdfplumber(file):
    """Extract the embedded text layer of every page of a PDF with pdfplumber.

    The previous body read ``image_name`` and ``all_page_text`` without ever
    assigning them, so any call raised before producing output. This version
    follows the function's name and uses pdfplumber's own text extraction.

    Args:
        file: A path or binary file-like object accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages, each page followed by a single space. Pages
        without extractable text contribute only the separator.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for pages without a text layer.
            page_texts.append((page.extract_text() or "") + " ")
    return "".join(page_texts)
# Module-level call: the page title is rendered whenever this script is executed.
st.title("Streamlit NLP APP")
@st.experimental_singleton
def text_analyzer(my_text):
    """Return one '"Token":...,\\n"Lemma":...' string per spaCy token of *my_text*.

    The text is processed with the ``en_core_web_sm`` spaCy pipeline; each
    entry pairs a token's surface text with its lemma.
    """
    pipeline = spacy.load('en_core_web_sm')
    template = '"Token":{},\n"Lemma":{}'
    rows = []
    for tok in pipeline(my_text):
        rows.append(template.format(tok.text, tok.lemma_))
    return rows
@st.experimental_singleton
def load_models():
    """Load the ``gpt2-large`` tokenizer and language model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = 'gpt2-large'
    gpt2_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model
# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
    """Return a one-element list describing the tokens and named entities of *my_text*.

    The text is processed with the ``en_core_web_sm`` spaCy pipeline. The
    single string in the result embeds the list of token texts and the list of
    ``(entity text, entity label)`` pairs.
    """
    parsed = spacy.load('en_core_web_sm')(my_text)
    words = []
    for tok in parsed:
        words.append(tok.text)
    named = []
    for ent in parsed.ents:
        named.append((ent.text, ent.label_))
    return ['"Token":{},\n"Entities":{}'.format(words, named)]
@st.experimental_singleton
def _load_t5_models():
    """Load the ``t5-base`` tokenizer and model once and reuse them across reruns."""
    t5_tokenizer = AutoTokenizer.from_pretrained('t5-base')
    t5_model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return t5_tokenizer, t5_model


def _image_file_to_bgr(image_file):
    """Decode an uploaded or captured image into a 3-channel BGR array for OpenCV.

    Decoding in memory avoids a shared ``img.png`` scratch file that concurrent
    sessions would overwrite.
    """
    rgb = np.array(Image.open(image_file).convert("RGB"))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def main():
    """ NLP Based Application with Streamlit

    Collects text from a text box, an uploaded image/PDF, or a camera photo
    (OCR via Tesseract), then offers entity extraction, sentiment analysis,
    spell correction, text generation and summarization on that text.
    """
    st.markdown("""
#### Description
##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
""")

    def change_photo_state():
        # Remember that the user has interacted with one of the image inputs.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # Every branch checks for None first: with text-only or camera-only
        # input there is no uploaded file to inspect.
        if uploaded_photo is not None and uploaded_photo.type == 'application/pdf':
            # Persist the upload and close the handle before OCR so the data
            # is fully on disk when read_pdf opens it.
            pdf_path = os.path.basename(uploaded_photo.name)
            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(uploaded_photo.getvalue())
            text = read_pdf(pdf_path)
            st.success(text)
        elif uploaded_photo is not None:
            image_bgr = _image_file_to_bgr(uploaded_photo)
            # get co-ordinates to crop the image
            _, line_coords = line_cor.mark_region(image_bgr)
            # cropping image img = image[y0:y1, x0:x1]
            # NOTE(review): assumes mark_region yields at least two (x, y)
            # corner points - confirm against line_cor.
            top_left, bottom_right = line_coords[0], line_coords[1]
            cropped = image_bgr[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
            # convert the image to black and white for better OCR
            _, binary = cv2.threshold(cropped, 120, 255, cv2.THRESH_BINARY)
            ocr_options = {"config": '--psm 6'}
            if st.checkbox("Bangla"):
                ocr_options["lang"] = "ben"
            text = str(pytesseract.image_to_string(binary, **ocr_options))
            st.success(text)
        elif camera_photo is not None:
            frame = _image_file_to_bgr(camera_photo)
            ocr_options = {"lang": "ben"} if st.checkbox("Mark to see Bangla Image's Text") else {}
            text = pytesseract.image_to_string(frame, **ocr_options)
            st.success(text)
        else:
            text = message

        if not text or not text.strip():
            # Nothing to analyse (e.g. the photo was removed or OCR found no text).
            st.warning("No text was found to analyse.")
        else:
            if st.checkbox("Show Named Entities English/Bangla"):
                entity_result = entity_analyzer(text)
                st.json(entity_result)
            if st.checkbox("Show Sentiment Analysis for English"):
                result_sentiment = TextBlob(text).sentiment
                st.success(result_sentiment)
            if st.checkbox("Spell Corrections for English"):
                st.success(TextBlob(text).correct())
            if st.checkbox("Text Generation"):
                if st.button("Generate"):
                    gpt2_tokenizer, gpt2_model = load_models()
                    input_ids = gpt2_tokenizer(text, return_tensors='pt').input_ids
                    st.text("Using Hugging Face Transformer, Contrastive Search ..")
                    output = gpt2_model.generate(input_ids, max_length=128)
                    st.success(gpt2_tokenizer.decode(output[0], skip_special_tokens=True))
            if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
                try:
                    summary_result = summarize(text)
                except ValueError as err:
                    # gensim refuses inputs that are too short to summarize.
                    st.warning(str(err))
                else:
                    st.success(summary_result)
            if st.checkbox("Mark to better English Text Summarization!"):
                t5_tokenizer, t5_model = _load_t5_models()
                inputs = t5_tokenizer.encode("summarize: " + text,
                                             return_tensors='pt',
                                             max_length=512,
                                             truncation=True)
                summary_ids = t5_model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
                # Drop padding/end-of-sequence markers, as the generation path does.
                summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
                st.success(summary)

    if st.button("REFRESH"):
        st.experimental_rerun()
    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")
# Run the app only when this file is executed as a script.
if __name__ == '__main__':
    main()
|