"""Download important files for the pipeline. Uncomment the following lines if you are running this script for the first time"""
# !wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
# !tar -xvf  s2v_reddit_2015_md.tar.gz
# if tar file is already downloaded don't download it again
import os
import urllib.request
import tarfile

# Make sure the target directory exists before downloading into it
os.makedirs("models", exist_ok=True)

if not os.path.exists("models/s2v_reddit_2015_md.tar.gz"):
  print("Downloading Sense2Vec model")
  urllib.request.urlretrieve(
      "https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
      filename="models/s2v_reddit_2015_md.tar.gz")
else:
  print("Sense2Vec model already downloaded")

reddit_s2v = "models/s2v_reddit_2015_md.tar.gz"
extract_s2v = "models"
# The archive extracts to a folder named "s2v_old", which is also the path
# that cache_models loads below; checking for it avoids re-extracting on
# every run.
extract_s2v_folder = os.path.join(extract_s2v, "s2v_old")
if not os.path.isdir(extract_s2v_folder):
  with tarfile.open(reddit_s2v, 'r:gz') as tar:
    tar.extractall(extract_s2v)
else:
  print("Already extracted")

"""Import required libraries"""

import warnings
warnings.filterwarnings('ignore')

from transformers import T5ForConditionalGeneration, T5Tokenizer

import streamlit as st
from sense2vec import Sense2Vec

@st.cache(allow_output_mutation=True)
def cache_models(path_s2v, path_t5_model, path_t5_tokenizer):
    # Load the heavy models once; st.cache memoizes the return value so
    # Streamlit reruns do not reload them from disk on every interaction.
    s2v = Sense2Vec().from_disk(path_s2v)
    question_model = T5ForConditionalGeneration.from_pretrained(path_t5_model)
    question_tokenizer = T5Tokenizer.from_pretrained(path_t5_tokenizer)
    return (s2v, question_model, question_tokenizer)

s2v, question_model, question_tokenizer = cache_models(
    "models/s2v_old", 'ramsrigouthamg/t5_squad_v1', 't5-base')


"""Filter out same sense words using sense2vec algorithm"""

def filter_same_sense_words(original,wordlist):
  filtered_words=[]
  base_sense =original.split('|')[1] 
  for eachword in wordlist:
    if eachword[0].split('|')[1] == base_sense:
      filtered_words.append(eachword[0].split('|')[0].replace("_", " ").title().strip())
  return filtered_words
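
# Worked example with hypothetical keys (not from the actual vectors):
# sense2vec keys look like "word|SENSE", so only entries whose tag matches
# the original's survive:
#
#   filter_same_sense_words("machine_learning|NOUN",
#                           [("deep_learning|NOUN", 0.87), ("learn|VERB", 0.52)])
#   # -> ["Deep Learning"]   (the VERB entry is filtered out)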

def sense2vec_get_words(topn, input_keyword):
  output = []
  try:
    # get_best_sense returns None for out-of-vocabulary keywords, in which
    # case most_similar raises and we fall back to an empty list
    sense = s2v.get_best_sense(input_keyword)
    most_similar = s2v.most_similar(sense, n=topn)
    output = filter_same_sense_words(sense, most_similar)
    print(f"Similar:{output}")
  except Exception:
    output = []

  return output
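
# Illustrative calls (actual results depend on the loaded Reddit-2015
# vectors): a keyword in the vocabulary yields same-sense neighbours, while
# an unknown keyword safely yields an empty list:
#
#   sense2vec_get_words(10, "natural language processing")
#   sense2vec_get_words(10, "zzzz-not-a-word")  # -> []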

"""T5 Question generation"""
question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('t5-base')

def get_question(sentence, answer):
  # Build the SQuAD-style prompt the t5_squad_v1 checkpoint was trained on
  text = f"context: {sentence} answer: {answer} </s>"
  max_len = 256
  encoding = question_tokenizer.encode_plus(
      text, max_length=max_len, padding="max_length", truncation=True,
      return_tensors="pt")

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = question_model.generate(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 early_stopping=True,
                                 num_beams=5,
                                 num_return_sequences=1,
                                 no_repeat_ngram_size=2,
                                 max_length=200)

  # Skip special tokens so the decoded string is not prefixed with "<pad>"
  dec = [question_tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]

  question = dec[0].replace("question:", "").strip()
  return question
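
# Illustrative end-to-end usage (hypothetical inputs; the generated text
# depends on the downloaded models). Given a context sentence and an answer
# span, produce a question plus same-sense distractors for an MCQ item:
#
#   context = "The Eiffel Tower was completed in 1889 in Paris."
#   answer = "Eiffel Tower"
#   question = get_question(context, answer)       # e.g. "What was completed in 1889?"
#   distractors = sense2vec_get_words(10, answer)  # same-sense alternatives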