Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 5,362 Bytes

"""
## App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen (for the Awesome-Streamlit gallery)
Description
This is a Natural Language Processing (NLP) based app useful for basic NLP concepts such as the following:

+ Tokenization & Lemmatization using Spacy

+ Named Entity Recognition(NER) using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
"""
# Core Pkgs
import os
# System setup for Tesseract OCR: English and Bengali language data plus the
# pytesseract Python wrapper.
# `-y` answers apt-get's confirmation prompt; without it a non-interactive run
# aborts whenever additional packages have to be pulled in.
os.system('sudo apt-get install -y tesseract-ocr-eng')
os.system('sudo apt-get install -y tesseract-ocr-ben')
#os.system('sudo apt update')
# Streamlit re-executes this script from the top on every interaction, so the
# Bengali model is only fetched while it is not yet in the tessdata directory.
if not os.path.exists('/usr/local/share/tessdata/ben.traineddata'):
	# The raw GitHub file is already an uncompressed .traineddata, so there is
	# no .gz archive to gunzip before moving it into place.
	os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
	os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
os.system('pip install -q pytesseract')
import streamlit as st 
import os
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

# NLP Pkgs
from textblob import TextBlob 
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
@st.cache
def text_analyzer(my_text):
	"""Tokenize my_text with spaCy and describe every token.

	Loads the 'en_core_web_sm' pipeline, runs it over my_text and returns a
	list with one string per token, each holding the token text and its lemma.
	"""
	pipeline = spacy.load('en_core_web_sm')
	token_lemma_lines = []
	for tok in pipeline(my_text):
		token_lemma_lines.append('"Token":{},\n"Lemma":{}'.format(tok.text, tok.lemma_))
	return token_lemma_lines

# Function For Extracting Entities
@st.cache
def entity_analyzer(my_text):
	"""Extract tokens and named entities from my_text with spaCy.

	Loads the 'en_core_web_sm' pipeline, runs it over my_text and returns a
	single-element list whose string shows the list of token texts followed
	by the list of (entity text, entity label) pairs.
	"""
	parsed = spacy.load('en_core_web_sm')(my_text)
	token_texts = []
	for tok in parsed:
		token_texts.append(tok.text)
	labelled_entities = []
	for ent in parsed.ents:
		labelled_entities.append((ent.text, ent.label_))
	report = '"Token":{},\n"Entities":{}'.format(token_texts, labelled_entities)
	return [report]


def _ocr_image(image_file, bangla_label):
	"""Run Tesseract OCR on an uploaded or captured image and show the text.

	The image is written to "img.png" in the working directory and read back
	with OpenCV before being handed to pytesseract. A checkbox labelled
	bangla_label lets the user switch the OCR language to Bengali ("ben");
	otherwise pytesseract's default language is used.

	Returns the recognized text, which is also displayed with st.success.
	"""
	# Image.save returns None, so its result must not be kept as the image.
	Image.open(image_file).save("img.png")
	img = cv2.imread("img.png")
	if st.checkbox(bangla_label):
		text = pytesseract.image_to_string(img, lang="ben")
	else:
		text = pytesseract.image_to_string(img)
	st.success(text)
	return text


def main():
	""" NLP Based App with Streamlit

	Renders the page: named-entity extraction, sentiment analysis and spell
	correction (each behind its own checkbox), followed by a summary section
	that takes text from a camera photo, an uploaded image (both via OCR) or a
	plain text box and summarizes it with Gensim or T5.
	"""

	# Title
	st.title("Streamlit NLP APP")
	st.markdown("""
    	#### Description
    	+ This is a Natural Language Processing(NLP) Based App useful for basic NLP task
         NER,Sentiment, Spell Corrections and Summarization
    	""")


	# Entity Extraction
	if st.checkbox("Show Named Entities"):
		st.subheader("Analyze Your Text")

		message = st.text_area("Enter your Text","Typing Here ..")
		if st.button("Extract"):
			entity_result = entity_analyzer(message)
			st.json(entity_result)

	# Sentiment Analysis
	elif st.checkbox("Show Sentiment Analysis"):
		st.subheader("Analyse Your Text")
		message = st.text_area("Enter Text plz","Type Here .")
		if st.button("Analyze"):
			blob = TextBlob(message)
			result_sentiment = blob.sentiment
			st.success(result_sentiment)
	#Text Corrections
	elif st.checkbox("Spell Corrections"):
		st.subheader("Correct Your Text")
		message = st.text_area("Enter the Text","Type please ..")
		if st.button("Spell Corrections"):
			st.text("Using TextBlob ..")
			st.success(TextBlob(message).correct())

	def change_photo_state():
		# Widget callback: remember that a photo widget has been used.
		st.session_state["photo"]="done"

	st.subheader("Summary section, feed your image!")
	camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
	uploaded_photo = st.file_uploader("Upload Image, Containing English or Bangla texts",type=['jpg','png','jpeg'], on_change=change_photo_state)
	message = st.text_input("Or, drop your text here, only English text!")
	if "photo" not in st.session_state:
		st.session_state["photo"]="not done"

	if st.session_state["photo"]=="done" or message:
		# Default to the typed text so `text` is always bound; OCR output from
		# an image replaces it (the camera photo wins when both are present).
		text = message
		if uploaded_photo:
			text = _ocr_image(uploaded_photo, "Mark here to see in Bangla")
		if camera_photo:
			text = _ocr_image(camera_photo, "Mark here to see Bangla")

		# Summarization
		if st.checkbox("Mark here, Text Summarization for English and Bangla!"):
			try:
				summary_result = summarize(text)
			except ValueError as err:
				# Gensim rejects input it cannot split into more than one
				# sentence; report that instead of crashing the page.
				st.warning("Could not summarize this text: {}".format(err))
			else:
				st.success(summary_result)
		elif st.checkbox("Mark here, Better Text Summarization for English only!"):
			tokenizer = AutoTokenizer.from_pretrained('t5-base')
			model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
			# T5 selects its task from the "summarize: " prefix; the input is
			# truncated to 512 tokens.
			inputs = tokenizer.encode(
				"summarize: " + text,
				return_tensors='pt',
				max_length=512,
				truncation=True,
			)
			summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
			# Drop special tokens such as <pad> and </s> from the shown summary.
			summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
			st.success(summary)

	st.sidebar.subheader("About App")
	st.sidebar.subheader("By")
	st.sidebar.text("Soumen Sarker")

# Build the Streamlit page when this file is executed as a script.
if __name__ == '__main__':
	main()