Amitontheweb's picture
Upload app.py
c41038b verified
raw
history blame
4.5 kB
import numpy as np
import pandas as pd
import re
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hugging Face checkpoint ids for the two seq2seq models this app uses:
# one rewrites sentences, the other proposes titles.
PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"
TITLE_CHECKPOINT = "Ateeqq/news-title-generator"

# Loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_CHECKPOINT)
tokenizer_gen_title = AutoTokenizer.from_pretrained(TITLE_CHECKPOINT)
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained(TITLE_CHECKPOINT)
def generate_title(input_text):
    """Sample one title for ``input_text`` with the title-generator model.

    The text is tokenized with ``tokenizer_gen_title`` and passed to
    ``model_gen_title.generate`` with sampling enabled, so repeated calls
    on the same text can return different titles.

    Args:
        input_text: Text to generate a title for.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    # Tensors stay on the default device; the GPU transfer is not enabled.
    encoded = tokenizer_gen_title.encode(input_text, return_tensors="pt")

    sampling_options = {
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.8,
        "top_k": 20,
    }
    generated = model_gen_title.generate(encoded, **sampling_options)

    return tokenizer_gen_title.decode(generated[0], skip_special_tokens=True)
# Sentence boundary: one whitespace character that directly follows '.', '?'
# or '!'. The two negative lookbehinds suppress a split after patterns such
# as "e.g." (word char, period, word char, any char) and after a capital
# letter + lowercase letter + period such as "Dr.". Compiled once at import.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def split_into_sentences(paragraph):
    """Split a paragraph into a list of sentences for the paraphraser.

    Only the single whitespace character at each boundary is consumed, so
    any further whitespace (extra spaces, newlines) stays at the start of
    the following piece.

    Pieces that are empty or whitespace-only are dropped. They appear when
    the text ends with whitespace after its final punctuation mark, and
    would otherwise be handed to the model as a sentence with no content.

    Args:
        paragraph: Text to split.

    Returns:
        A list of non-blank sentence strings; empty when ``paragraph`` is
        empty or contains only whitespace.
    """
    return [
        piece
        for piece in _SENTENCE_BOUNDARY.split(paragraph)
        if piece.strip()
    ]
def paraphrase(
    text,
    beam_search,
    temperature=0.8,
    max_length=128
):
    """Reword ``text`` sentence by sentence and propose three titles for it.

    The input is split with ``split_into_sentences``; every non-blank piece
    is sent to the paraphraser model on its own (no cross-sentence context)
    using the ``paraphrase: `` prompt prefix. The reworded sentences are
    joined with single spaces, and ``generate_title`` is then called three
    times on the joined text.

    Args:
        text: Paragraph to reword. ``None``, empty, or whitespace-only
            input produces empty outputs without calling either model.
        beam_search: When truthy, generate with 20 beams; otherwise 1.
        temperature: Sampling temperature passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate`` for each
            sentence.

    Returns:
        A tuple ``(titles_html, paraphrased_text)``. ``titles_html`` holds
        three ``Title N: ...<br>`` entries with the title text
        HTML-escaped; ``paraphrased_text`` is the joined rewording.
    """
    # Function-scope import keeps this block self-contained.
    import html

    # Always hand back one value per output component, even when there is
    # nothing to reword.
    if text is None or not text.strip():
        return ("", "")

    rephrased = []  # one reworded sentence per non-blank input sentence
    for sentence in split_into_sentences(text):
        # A blank piece would reach the model as a bare prompt prefix.
        if not sentence.strip():
            continue
        input_ids = tokenizer(
            f'paraphrase: {sentence}',  # paraphrase prompt for the T5 model
            return_tensors="pt", padding="longest",
        ).input_ids
        outputs = model.generate(
            input_ids,
            do_sample=True,
            num_beams=20 if beam_search else 1,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=4,
        )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        rephrased.append(decoded[0])

    paraphrased_text = " ".join(rephrased)

    # The titles are rendered as HTML, so model output is escaped before it
    # is placed in the markup; only the <br> separators are literal HTML.
    titles_list = "".join(
        f"Title {number}: {html.escape(generate_title(paraphrased_text))}<br>"
        for number in range(1, 4)
    )
    return (titles_list, paraphrased_text)
# Explanatory text passed to the interface as its `article`. Written as
# well-formed HTML: list items sit inside <ul> and every <p> is closed.
_ARTICLE_HTML = (
    '<div align="left">'
    "<h1>AI Paraphraser and Title Generator</h1>"
    "<ul>"
    "<li>Each sentence is rephrased separately without context.</li>"
    "<li>Temperature: Increase value for more creative rewordings. Higher values may corrupt the sentence. Reset value after pressing 'Clear'</li>"
    "<li>Beam search: Try for safer and conservative rephrasing.</li>"
    "</ul>"
    "<p>Models:</p>"
    "<ul>"
    "<li>Training set derived by using Chat-GPT3.5. No competition intended.</li>"
    "<li>Original models: humarin/chatgpt_paraphraser_on_T5_base and Ateeqq/news-title-generator. Deployment code modified for long text inputs.</li>"
    "</ul>"
    "<p>Parameter details:</p>"
    "<ul>"
    "<li>For rephraser: Beam search: No. of beams = 20, no_repeat_ngram_size=4, do_sample=True.</li>"
    "<li>For title generator: do_sample=True, temperature=0.8, top_k = 20</li>"
    "</ul>"
    "</div>"
)

# The three inputs map positionally onto paraphrase(text, beam_search,
# temperature); the two outputs receive its (titles, rephrased text) tuple.
iface = gr.Interface(
    fn=paraphrase,
    inputs=[
        gr.Textbox(label="Paste text in the input box and press 'Submit'.", lines=10),
        "checkbox",
        gr.Slider(0.1, 2, 0.8),  # positional: minimum, maximum, initial value
    ],
    outputs=[
        gr.HTML(label="Titles:"),
        gr.Textbox(label="Rephrased text:", lines=15),
    ],
    title="AI Paraphraser with Title Generator",
    description="Sentence-to-sentence rewording backed with GPT-3.5 training set",
    article=_ARTICLE_HTML,
    flagging_mode='never',
)
iface.launch()