"""Gradio app: extractive summarization of legal text with Legal-BERT.

Builds a `Summarizer` (bert-extractive-summarizer) on top of
`nlpaueb/legal-bert-base-uncased`, then serves a text-to-text Gradio
interface that returns a short whole-document summary followed by a
longer paragraph-by-paragraph summary.
"""
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
# BUG FIX: `AutoModel` is actually used below but was never imported
# (only `AutoModelForPreTraining` was), causing a NameError at startup.
from transformers import AutoTokenizer, AutoModel, AutoModelForPreTraining, AutoConfig

nltk.download('punkt')

# model_name = 'distilbert-base-uncased'
model_name = 'nlpaueb/legal-bert-base-uncased'
# model_name = 'laxya007/gpt2_legal'
# model_name = 'facebook/bart-large-cnn'

# Hugging Face model setup: expose hidden states so the extractive
# summarizer can cluster sentence embeddings.
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model,
                              custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))


def lincoln(content):
    """Summarize *content* and return the combined summary text.

    Parameters
    ----------
    content : str
        Raw input text; paragraphs are expected to be separated by
        blank lines ("\\n\\n").

    Returns
    -------
    str
        A short whole-document summary (ratio=0.1) followed by a
        "Larger Summary" built from per-paragraph summaries (ratio=0.01).
    """
    # BUG FIX: the original signature was `def lincoln(content=input_text)`
    # where `input_text` was undefined — a NameError when the def executed.
    summary_text = ""
    for paragraph in content.split("\n\n"):
        # Normalize whitespace: drop newlines/tabs and collapse runs of spaces.
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())

        # Keep only alphabetic tokens; skip empty/one-word paragraphs.
        tokens = word_tokenize(paragraph)
        tokens = [word for word in tokens if word.isalpha()]
        if len(tokens) <= 1:
            continue

        # Rebuild the paragraph from the filtered tokens.
        paragraph = ' '.join(tokens)
        print("\nParagraph:")
        print(paragraph + "\n")

        # BUG FIX: the original summarized the ENTIRE `content` on every
        # loop iteration instead of the prepared paragraph, doing redundant
        # whole-document passes and never producing per-paragraph summaries.
        summary = bert_legal_model(paragraph, ratio=0.01)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)

    # Whole-document summary at a higher ratio for the headline section.
    summary = bert_legal_model(content, ratio=0.1)
    all_text = (str(summary) + "\n\n\n"
                + "-------- The Larger Summary --------\n" + str(summary_text))

    # BUG FIX: the original ended with a bare `return`, so the Gradio
    # output box always showed None; return the assembled text instead.
    return all_text


iface = gr.Interface(
    lincoln,
    "text",
    "text",
)

if __name__ == "__main__":
    iface.launch(share=False)