Spaces:

Amitontheweb
/

Text_Paraphraser_Title_Generator

Sleeping

App Files Files Community

Amitontheweb committed on Oct 23, 2024

Commit

1a8d567

verified ·

1 Parent(s): c408ff4

Create app.py

Browse files

Files changed (1) hide show

app.py +82 -0

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import numpy as np
+import pandas as pd
+import re
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+tokenizer_gen_title = AutoTokenizer.from_pretrained("Ateeqq/news-title-generator")
+model_gen_title = AutoModelForSeq2SeqLM.from_pretrained("Ateeqq/news-title-generator")
+def generate_title(input_text): #Generate a title for input text with Ateeq model
+  input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt") #Tokenize input text
+  input_ids = input_ids.to('cuda') #Send tokenized inputs to gpu
+  output = model_gen_title.generate(input_ids,
+                          max_new_tokens=100,
+                          do_sample=True,
+                          temperature=0.8,
+                          top_k = 20
+                        )
+  decoded_text = tokenizer_gen_title.decode(output[0], skip_special_tokens=True)
+  return decoded_text
+def split_into_sentences(paragraph): #For paraphraser - return a list of sentences from input para
+    # Split sentences after period. Retains \n if part of the text, but not included in model output
+    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
+    sentences = re.split(sentence_endings, paragraph)
+    return sentences
+def paraphrase(
+    question,
+    #num_beams=10,
+    #num_beam_groups=10,
+    #num_return_sequences=1,
+    #repetition_penalty=1.0,
+    #diversity_penalty=1.0,
+    no_repeat_ngram_size=3,
+    temperature=0.8,
+    max_length=128
+):
+    sentence_list = split_into_sentences(question) #feed input para into sentence splitter
+    output = [] #List to hold the individual rephrased sentences obtained from the model
+    for sentence in sentence_list:
+        input_ids = tokenizer(
+            f'paraphrase: {sentence}', #Using paraphrase prompt for T5
+            return_tensors="pt", padding="longest",
+            #max_length=max_length,
+            #truncation=True,
+        ).input_ids
+        outputs = model.generate(
+            input_ids,
+            do_sample=True,
+            temperature=temperature,
+            max_length=max_length,
+            no_repeat_ngram_size=no_repeat_ngram_size
+        )
+        res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        output.append(res[0]) #Add rephrased sentence to list
+    paraphrased_text = "" #to hold the combined sentence output made from generated list
+    for sentence in output: #Join all new reworded sentences together
+        paraphrased_text += sentence + " "
+    for title in range (1,4): #Print 3 titles by calling Ateeq model fn - generate_title
+        print (f"Title {title}: {generate_title (paraphrased_text)}")
+        print ("")
+    return paraphrased_text # Return paraphrased text after printing three titles above