|
import numpy as np |
|
import pandas as pd |
|
import re |
|
import torch |
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
# Module-level model setup: both model pairs are loaded at import time
# (first run downloads the weights from the HuggingFace hub).
# NOTE(review): both models stay on CPU here — nothing moves them to a GPU.

# T5-base fine-tuned for sentence paraphrasing; used by paraphrase().
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

# Seq2seq model fine-tuned for news-title generation; used by generate_title().
tokenizer_gen_title = AutoTokenizer.from_pretrained("Ateeqq/news-title-generator")

model_gen_title = AutoModelForSeq2SeqLM.from_pretrained("Ateeqq/news-title-generator")
|
|
|
def generate_title(input_text):
    """Generate one news-style title for *input_text*.

    Encodes the text with the title-generation tokenizer, samples a single
    continuation from the title model (temperature 0.8, top-k 20, up to 100
    new tokens), and returns the decoded string. Sampling means repeated
    calls on the same input can return different titles.

    Args:
        input_text: The article/paragraph text to summarize into a title.

    Returns:
        The generated title as a plain string (special tokens stripped).
    """
    input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt")

    # Bug fix: the original sent the inputs to 'cuda' unconditionally while
    # model_gen_title was never moved off the CPU, causing a device-mismatch
    # RuntimeError (and a hard crash on machines without CUDA). Move the
    # inputs to whatever device the model's parameters actually live on.
    device = next(model_gen_title.parameters()).device
    input_ids = input_ids.to(device)

    output = model_gen_title.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=20,
    )

    decoded_text = tokenizer_gen_title.decode(output[0], skip_special_tokens=True)
    return decoded_text
|
|
|
|
|
|
|
def split_into_sentences(paragraph):
    """Split *paragraph* into sentences on `.`, `?`, or `!` followed by whitespace.

    Two lookbehinds suppress false splits: one for dotted abbreviations like
    "e.g." and one for two-letter titles like "Dr.". A string with no
    sentence-ending punctuation (including the empty string) comes back as a
    single-element list.

    Args:
        paragraph: The text to split.

    Returns:
        A list of sentence strings (terminators kept, separating space dropped).
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')
    return boundary.split(paragraph)
|
|
|
def paraphrase(
    question,
    no_repeat_ngram_size=3,
    temperature=0.8,
    max_length=128
):
    """Paraphrase *question* one sentence at a time with the T5 paraphraser.

    Each sentence is rewritten independently by sampling from the paraphrase
    model, then the rewrites are re-joined (each followed by a single space).
    As a side effect, three candidate titles for the paraphrased text are
    printed via generate_title().

    Args:
        question: The text to paraphrase.
        no_repeat_ngram_size: Forbid repeating n-grams of this size in output.
        temperature: Sampling temperature for generation.
        max_length: Maximum total length of each generated sentence.

    Returns:
        The paraphrased text (note: carries a trailing space).
    """
    rewritten = []

    for sent in split_into_sentences(question):
        # Prefix with the task tag the paraphrase model was fine-tuned on.
        encoded = tokenizer(
            f'paraphrase: {sent}',
            return_tensors="pt", padding="longest",
        ).input_ids

        generated = model.generate(
            encoded,
            do_sample=True,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )

        rewritten.append(
            tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        )

    # Re-assemble; every sentence (including the last) is followed by a space,
    # matching the original concatenation behavior.
    paraphrased_text = "".join(piece + " " for piece in rewritten)

    # Side effect: print three sampled title candidates for the full text.
    for title in range(1, 4):
        print(f"Title {title}: {generate_title(paraphrased_text)}")
        print("")

    return paraphrased_text
|
|
|
|
|
|