thoristhor committed on
Commit
c392d0f
·
1 Parent(s): 61c9e21
Files changed (1) hide show
  1. helper.py +161 -0
helper.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk, pke, string, torch, requests, random
2
+ from nltk.tokenize import sent_tokenize
3
+ from nltk.corpus import stopwords
4
+ from flashtext import KeywordProcessor
5
+
6
def postprocesstext(content):
    """Upper-case the first character of every sentence in ``content``.

    The text is split into sentences with NLTK's ``sent_tokenize`` and the
    sentences are glued back together with single spaces.

    Args:
        content: Text to post-process (a ``str``).

    Returns:
        A string in which every sentence is preceded by one space, so a
        non-empty result starts with a leading space (callers are expected to
        ``strip()`` it). An empty string is returned when there are no
        sentences.
    """
    # Only the first character is touched. ``str.capitalize`` would also
    # lower-case the rest of the sentence and thereby destroy proper nouns
    # and acronyms ("Ron liked NASA." -> "Ron liked nasa.").
    return "".join(
        " " + sent[:1].upper() + sent[1:] for sent in sent_tokenize(content)
    )
12
+
13
def summarizer(text, model, tokenizer):
    """Generate an abstractive summary of ``text``.

    The input is flattened to one line, prefixed with ``"summarize: "``,
    tokenized (truncated to 512 tokens) and passed to ``model.generate`` with
    beam search. The first decoded sequence is post-processed with
    ``postprocesstext`` and stripped.

    Args:
        text: Raw text to summarize.
        model: Object exposing ``generate(...)``; presumably a Hugging Face
            seq2seq model such as T5 (the "summarize:" prefix suggests so) --
            confirm against the caller.
        tokenizer: Matching tokenizer; must be callable and expose
            ``decode``.

    Returns:
        The summary as a stripped string.
    """
    text = "summarize: " + text.strip().replace("\n", " ")
    max_len = 512

    # Send the inputs to wherever the model actually lives. Choosing "cuda"
    # merely because it is available breaks when the caller left the model on
    # the CPU. Fall back to the old heuristic for objects without ``.device``.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ``padding=False`` replaces the deprecated ``pad_to_max_length=False``;
    # calling the tokenizer replaces the deprecated ``encode_plus``.
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        min_length=75,
        max_length=300,
    )

    # Only one sequence is requested, so decode just the first result.
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
35
+
36
def get_nouns_multipartite(content):
    """Extract up to 15 noun keyphrases from ``content`` with MultipartiteRank.

    Candidates are restricted to the POS tags ``PROPN`` and ``NOUN``. This is
    a best-effort helper: any exception raised during extraction is printed
    and an empty list is returned instead of propagating the error.

    Args:
        content: Text to extract keyphrases from.

    Returns:
        A list with the first element of each item returned by
        ``get_n_best(n=15)`` (presumably ``(phrase, score)`` pairs -- confirm
        against the installed pke version), or ``[]`` on failure.
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)

        # Keep only proper nouns and nouns as candidates. No stoplist is
        # passed here, so the previously built (and unused) stoplist is gone.
        extractor.candidate_selection(pos={'PROPN', 'NOUN'})

        # Build the multipartite graph and rank candidates using a random
        # walk; alpha controls the weight adjustment mechanism, see TopicRank
        # for the threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        return [val[0] for val in extractor.get_n_best(n=15)]
    except Exception as e:
        # Deliberately broad: keyword extraction must never crash the caller.
        print("EXCEPTION: {}".format(e))
        return []
63
+
64
def filter_overlap_words(l):
    """Drop entries that are contained in another, different entry.

    An entry is removed when it occurs inside (``in``) any other distinct
    entry of ``l``; e.g. ``"zoo"`` is dropped when ``"zoo animals"`` is also
    present. Exact duplicates are collapsed to their first occurrence and the
    original order is preserved.

    Unlike the previous nested-loop version, a single-element list is
    returned unchanged (it used to yield ``[]`` because there was nothing to
    compare against) and duplicates no longer eliminate each other.

    Args:
        l: Sequence of strings (keyphrases).

    Returns:
        A new list with the surviving entries in their original order.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(l))
    return [
        word
        for word in unique
        if not any(word in other for other in unique if other != word)
    ]
79
+
80
def get_keywords(originaltext, summarytext, max_keywords=5):
    """Return title-cased keyphrases of the original text that occur in the summary.

    Keyphrases are extracted from ``originaltext`` with
    ``get_nouns_multipartite``; only those that flashtext's
    ``KeywordProcessor`` finds in ``summarytext`` are kept, in the order the
    extractor returned them.

    Args:
        originaltext: Full source text used for keyphrase extraction.
        summarytext: Summary in which the keyphrases must occur.
        max_keywords: Maximum number of keywords to return (default 5, the
            previously hard-coded limit).

    Returns:
        A list of at most ``max_keywords`` strings, each passed through
        ``str.title()``.
    """
    keywords = get_nouns_multipartite(originaltext)

    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)

    # A set gives O(1) membership tests in the filter below.
    keywords_found = set(keyword_processor.extract_keywords(summarytext))

    # Iterate over ``keywords`` (not the set) to keep the extractor's order.
    important_keywords = [kw for kw in keywords if kw in keywords_found]

    return [str(kw).title() for kw in important_keywords[:max_keywords]]
99
+
100
def get_question(context, answer, model, tokenizer):
    """Generate a question for ``answer`` given ``context``.

    The prompt ``"context: <context> answer: <answer>"`` is tokenized
    (truncated to 384 tokens) and passed to ``model.generate`` with beam
    search. The first decoded sequence is returned with any ``"question:"``
    marker removed and surrounding whitespace stripped.

    Args:
        context: Passage the question should be about.
        answer: Answer the generated question should lead to.
        model: Object exposing ``generate(...)``; presumably a Hugging Face
            seq2seq model fine-tuned for question generation -- confirm
            against the caller.
        tokenizer: Matching tokenizer; must be callable and expose
            ``decode``.

    Returns:
        The generated question as a stripped string.
    """
    text = "context: {} answer: {}".format(context, answer)

    # Send the inputs to wherever the model actually lives. Choosing "cuda"
    # merely because it is available breaks when the caller left the model on
    # the CPU. Fall back to the old heuristic for objects without ``.device``.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ``padding=False`` replaces the deprecated ``pad_to_max_length=False``;
    # calling the tokenizer replaces the deprecated ``encode_plus``.
    encoding = tokenizer(
        text,
        max_length=384,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=72,
    )

    # Only one sequence is requested, so decode just the first result.
    question = tokenizer.decode(outs[0], skip_special_tokens=True)
    return question.replace("question:", "").strip()
117
+
118
def get_related_word(word, max_words=4, timeout=10):
    """Fetch words related to ``word`` from the Datamuse API.

    Sends a GET request to ``https://api.datamuse.com/words`` with the query
    parameter ``ml=<word>`` and collects the ``"word"`` field of the returned
    entries, skipping empty entries and the query word itself.

    Args:
        word: Word to look up.
        max_words: Maximum number of related words to return (default 4, the
            previously hard-coded limit).
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of at most ``max_words`` strings, each passed through
        ``str.title()``.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
    """
    url = "https://api.datamuse.com/words"
    response = requests.get(url, params={"ml": word}, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()

    # Compare case-insensitively: callers may pass title-cased keywords,
    # so a case-sensitive check would let the query word through.
    target = str(word).lower()

    related_words = []
    for res in response.json():
        if len(related_words) >= max_words:
            break
        candidate = res.get("word", "")
        if candidate and candidate.lower() != target:
            related_words.append(str(candidate).title())
    return related_words
133
+
134
def get_final_option_list(ans, other_options):
    """Build a shuffled four-option answer list for a multiple-choice question.

    The correct answer is combined with the first three entries of
    ``other_options``; missing distractors are filled with the placeholder
    string ``"dummy"``. The four options are then shuffled with
    ``random.shuffle``.

    Args:
        ans: The correct answer.
        other_options: Iterable of distractors. May hold fewer than three
            entries; ``None`` or any other non-iterable is treated as empty.

    Returns:
        A ``(final_options, ans_index)`` pair: a 4-tuple of options and the
        index at which an option equal to ``ans`` is located.
    """
    # Catch only TypeError (None / non-iterable input) rather than using the
    # previous bare ``except: pass`` blocks, which hid every kind of error.
    try:
        distractors = list(other_options)[:3]
    except TypeError:
        distractors = []
    distractors += ["dummy"] * (3 - len(distractors))

    final_options = [ans] + distractors
    random.shuffle(final_options)
    final_options = tuple(final_options)

    # Fall back to 0 if nothing compares equal, as the original loop did.
    ans_index = next(
        (i for i, option in enumerate(final_options) if option == ans), 0
    )
    return final_options, ans_index
157
+
158
def load_raw_text():
    """Return the built-in sample passage about Billy and Ron at the zoo."""
    sentences = (
        "Billy and Ron are brothers.",
        "Billy is 5 years old.",
        "Ron is 7 years old.",
        "One day their mom took them to the zoo.",
        "Billy wore his red cap, and Ron wore his blue cap.",
        "They had fun watching all the animals.",
        "Ron liked the monkeys the best.",
        "He wanted to stay and watch them some more, but Billy wanted to go see the elephants.",
        "Elephants were Billy’s favorite.",
        "Their mom said it was time to go see the elephants, and Ron was sad.",
        "But their mom said they could come back and see the monkeys again before they left the zoo.",
        "Billy and Ron had a great day at the zoo.",
    )
    # The sentences are separated by exactly one space in the passage.
    return " ".join(sentences)
160
+
161
+ # https://www.k5learning.com/worksheets/reading-comprehension/grade-1-compare-contrast-a.pdf