Amitontheweb committed on
Commit
1a8d567
·
verified ·
1 Parent(s): c408ff4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import re
4
+ import torch
5
+
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
# Hugging Face Hub checkpoint ids for the two seq2seq models used below.
_PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"
_TITLE_CHECKPOINT = "Ateeqq/news-title-generator"

# Paraphrasing model and its tokenizer (used by `paraphrase`).
tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASER_CHECKPOINT)

# Title-generation model and its tokenizer (used by `generate_title`).
tokenizer_gen_title = AutoTokenizer.from_pretrained(_TITLE_CHECKPOINT)
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained(_TITLE_CHECKPOINT)
13
+
14
def generate_title(input_text):
    """Generate one title for ``input_text`` with the title-generation model.

    Relies on the module-level ``tokenizer_gen_title`` and ``model_gen_title``.
    Generation samples (``do_sample=True``), so repeated calls with the same
    text may return different titles.

    Args:
        input_text: Text for which a title should be produced.

    Returns:
        The decoded title as a string, with special tokens stripped.
    """
    input_ids = tokenizer_gen_title.encode(input_text, return_tensors="pt")

    # Send the inputs to the device the model is on instead of a hard-coded
    # 'cuda': nothing shown in this file moves the model to the GPU, so CUDA
    # inputs would mismatch the model's device or fail on CPU-only hosts.
    input_ids = input_ids.to(model_gen_title.device)

    output = model_gen_title.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        top_k=20,
    )

    # generate() returns a batch; only one sequence was requested.
    return tokenizer_gen_title.decode(output[0], skip_special_tokens=True)
26
+
27
+
28
+
29
def split_into_sentences(paragraph):
    """Break ``paragraph`` into a list of sentence strings.

    The text is cut at a single whitespace character that directly follows
    ``.``, ``?`` or ``!``; that whitespace character is dropped from the
    result. Two negative lookbehinds suppress the cut after patterns such as
    ``e.g.`` (word char, dot, word char, any char) and ``Mr.`` (uppercase
    letter, lowercase letter, dot). Because ``\\s`` also matches a newline, a
    newline right after sentence punctuation acts as a cut point; newlines
    elsewhere stay inside their sentence.

    Args:
        paragraph: The text to split.

    Returns:
        A list of strings; text without any cut point comes back as a
        one-element list.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')
    return boundary.split(paragraph)
35
+
36
def paraphrase(
    question,
    no_repeat_ngram_size=3,
    temperature=0.8,
    max_length=128,
):
    """Paraphrase ``question`` one sentence at a time and print three titles.

    The input is split with ``split_into_sentences``; each non-blank sentence
    is sent to the module-level paraphrasing ``model`` with the
    ``paraphrase: `` prompt prefix, and the reworded sentences are joined
    with single spaces. Three titles for the combined text are then printed
    via ``generate_title``.

    Args:
        question: Text to paraphrase (one or more sentences).
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        max_length: Maximum generated length forwarded to ``model.generate``.

    Returns:
        The paraphrased text as a single string.
    """
    rewritten = []  # reworded sentences, in input order

    for sentence in split_into_sentences(question):
        # Whitespace after the final sentence makes the splitter emit an
        # empty piece; sending a bare prompt to the model adds only noise.
        if not sentence.strip():
            continue

        input_ids = tokenizer(
            f'paraphrase: {sentence}',  # paraphrase prompt for T5
            return_tensors="pt",
            padding="longest",
        ).input_ids
        # Keep inputs on the same device as the model.
        input_ids = input_ids.to(model.device)

        outputs = model.generate(
            input_ids,
            do_sample=True,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        rewritten.append(decoded[0])

    paraphrased_text = " ".join(rewritten)

    # Print 3 titles; sampling in generate_title makes each call independent.
    # NOTE(review): indentation was lost in this view of the file; the blank
    # line is assumed to follow each title -- confirm against the real file.
    for title in range(1, 4):
        print(f"Title {title}: {generate_title(paraphrased_text)}")
        print("")

    return paraphrased_text
81
+
82
+