Amitontheweb's picture
Create app.py
1a8d567 verified
raw
history blame
2.97 kB
import numpy as np
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hub checkpoint ids, named once so each tokenizer/model pair stays in sync.
_PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"
_TITLE_CHECKPOINT = "Ateeqq/news-title-generator"

# Sentence paraphraser (used by paraphrase()).
tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASER_CHECKPOINT)

# Title generator (used by generate_title()).
tokenizer_gen_title = AutoTokenizer.from_pretrained(_TITLE_CHECKPOINT)
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained(_TITLE_CHECKPOINT)
def generate_title(input_text: str) -> str:
    """Generate one title for ``input_text`` with the title-generation model.

    Decoding uses sampling (``do_sample=True``, ``temperature=0.8``,
    ``top_k=20``), so repeated calls on the same text can return different
    titles.

    Args:
        input_text: Text to summarise into a title.

    Returns:
        The decoded title with special tokens removed.
    """
    # Tokenize, then place the ids on the device the model actually lives on.
    # Hard-coding 'cuda' breaks on CPU-only hosts and mismatches a model that
    # was loaded on the CPU and never moved.
    input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt")
    input_ids = input_ids.to(model_gen_title.device)

    output = model_gen_title.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=20,
    )
    return tokenizer_gen_title.decode(output[0], skip_special_tokens=True)
# A sentence boundary is a single whitespace character that follows '.', '?'
# or '!', unless the preceding text looks like an abbreviation such as "e.g."
# (letter-dot-letter-dot) or a title such as "Dr." (capital, lowercase, dot).
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def split_into_sentences(paragraph: str) -> list:
    """Split ``paragraph`` into a list of sentences for the paraphraser.

    The whitespace character that forms each boundary is consumed; any other
    whitespace (for example a newline following the boundary space) stays
    attached to the sentence it belongs to.

    Pieces that are empty or whitespace-only are dropped, so input that is
    empty or ends in whitespace after the final punctuation mark does not
    produce blank "sentences".

    Args:
        paragraph: Text to split.

    Returns:
        The non-blank sentences, in their original order.
    """
    pieces = _SENTENCE_BOUNDARY.split(paragraph)
    return [piece for piece in pieces if piece.strip()]
def paraphrase(
    question: str,
    no_repeat_ngram_size: int = 3,
    temperature: float = 0.8,
    max_length: int = 128,
) -> str:
    """Paraphrase ``question`` sentence by sentence and print three titles.

    The input is split with ``split_into_sentences``; each non-blank sentence
    is reworded independently by the paraphrasing model using sampling, and
    the results are joined with single spaces. Three titles for the combined
    text are then printed via ``generate_title``.

    Args:
        question: Paragraph to paraphrase.
        no_repeat_ngram_size: Passed through to ``model.generate``.
        temperature: Sampling temperature passed to ``model.generate``.
        max_length: Maximum generated length passed to ``model.generate``.

    Returns:
        The paraphrased paragraph, or an empty string when the input holds
        no non-blank sentence (no titles are printed in that case).
    """
    reworded = []  # One paraphrased sentence per non-blank input sentence
    for sentence in split_into_sentences(question):
        # A blank piece would be sent as a bare "paraphrase: " prompt and
        # yield made-up text, so skip it.
        if not sentence.strip():
            continue
        input_ids = tokenizer(
            f'paraphrase: {sentence}',  # Using paraphrase prompt for T5
            return_tensors="pt", padding="longest",
        ).input_ids.to(model.device)  # Keep inputs on the model's device
        outputs = model.generate(
            input_ids,
            do_sample=True,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        reworded.append(decoded[0])

    if not reworded:
        # Nothing to paraphrase, so there is nothing to title either.
        return ""

    # join() avoids the trailing space the old "+=" loop left on the result
    # (which was also fed to the title model).
    paraphrased_text = " ".join(reworded)

    # Print 3 titles; sampling makes each generate_title call differ.
    for title_number in range(1, 4):
        print(f"Title {title_number}: {generate_title(paraphrased_text)}")
        print("")

    return paraphrased_text