import re

import torch  # PyTorch backend used by the transformers models below
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Paraphrasing model: T5-base fine-tuned on ChatGPT-generated paraphrase pairs.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

# Title-generation model used to suggest headlines for the paraphrased text.
tokenizer_gen_title = AutoTokenizer.from_pretrained("Ateeqq/news-title-generator")
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained("Ateeqq/news-title-generator")
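# Both checkpoints are downloaded from the Hugging Face Hub on first use and cached
# locally by transformers; generation runs on CPU unless the models are explicitly
# moved to a GPU.
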
def generate_title(input_text):
    """Generate one candidate title for the given text."""
    input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt")

    # Sample a headline; top-k sampling keeps successive titles varied.
    output = model_gen_title.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=20,
    )

    decoded_text = tokenizer_gen_title.decode(output[0], skip_special_tokens=True)
    return decoded_text

def split_into_sentences(paragraph):
    """Split a paragraph into sentences on '.', '?' or '!', avoiding splits after abbreviations such as 'e.g.' or 'Mr.'."""
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    sentences = re.split(sentence_endings, paragraph)
    return sentences

def paraphrase(
    text,
    beam_search,
    temperature=0.8,
    max_length=128
):
    """Paraphrase the input sentence by sentence, then propose three titles for the result."""
    if not text:
        # Nothing to rephrase: return empty outputs so Gradio has a value for each field.
        return "", ""

    sentence_list = split_into_sentences(text)
    output = []

    # Rephrase each sentence independently (no cross-sentence context).
    for sentence in sentence_list:
        input_ids = tokenizer(
            f'paraphrase: {sentence}',
            return_tensors="pt",
            padding="longest",
        ).input_ids

        outputs = model.generate(
            input_ids,
            do_sample=True,
            num_beams=20 if beam_search else 1,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=4,
        )

        res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        output.append(res[0])

    # Re-join the rephrased sentences into a single paragraph.
    paraphrased_text = ""
    for sentence in output:
        paraphrased_text += sentence + " "

    # Suggest three alternative titles for the paraphrased text.
    titles_list = ""
    for title in range(1, 4):
        titles_list += f"Title {title}: {generate_title(paraphrased_text)}<br>"

    return titles_list, paraphrased_text

iface = gr.Interface(
    fn=paraphrase,
    inputs=[
        gr.Textbox(label="Paste text in the input box and press 'Submit'.", lines=10),
        gr.Checkbox(label="Beam search"),
        gr.Slider(0.1, 2, value=0.8, label="Temperature"),
    ],
    outputs=[
        gr.HTML(label="Titles:"),
        gr.Textbox(label="Rephrased text:", lines=15),
    ],
    title="AI Paraphraser with Title Generator",
    description="Sentence-to-sentence rewording backed by a GPT-3.5-derived training set",
    article="<div align=left><h1>AI Paraphraser and Title Generator</h1>"
            "<li>Each sentence is rephrased separately, without context.</li>"
            "<li>Temperature: increase the value for more creative rewordings. Higher values may corrupt the sentence. Reset the value after pressing 'Clear'.</li>"
            "<li>Beam search: try it for safer, more conservative rephrasing.</li>"
            "<p>Models:<br>"
            "<li>Training set derived using ChatGPT (GPT-3.5). No competition intended.</li>"
            "<li>Original models: humarin/chatgpt_paraphraser_on_T5_base and Ateeqq/news-title-generator. Deployment code modified for long text inputs.</li></p>"
            "<p>Parameter details:<br>"
            "<li>For the rephraser: beam search with num_beams=20, no_repeat_ngram_size=4, do_sample=True.</li>"
            "<li>For the title generator: do_sample=True, temperature=0.8, top_k=20.</li></div>",
    flagging_mode='never'
)

iface.launch()
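# Note: launch() also accepts options such as share=True, which creates a temporary
# public URL when the script is run locally; the defaults above are fine for a hosted
# deployment (e.g. a Hugging Face Space).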