Update app.py
app.py CHANGED

@@ -19,10 +19,13 @@ import textract
 from scipy.special import softmax
 import pandas as pd
 from datetime import datetime
+
+
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
 tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
 model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
+
 if device == 'cuda:0':
     pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
 else:
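
Note: the retriever (multi-qa-mpnet-base-dot-v1) and reader (roberta-large-squad2) loaded above are used later through encode_query and pipe. encode_query itself is not part of this diff, so the following is only a minimal sketch of how such a helper usually looks for this retriever, assuming CLS pooling; the actual implementation in app.py may differ.

import torch

def encode_query(query, max_length=512):
    # Tokenize and move the tensors to the same device as the retriever model
    inputs = tokenizer(query, truncation=True, max_length=max_length,
                       padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(**inputs)
    # CLS pooling: use the first token's hidden state as the query embedding
    return out.last_hidden_state[:, 0].cpu().numpy()  # shape (1, 768)
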
@@ -90,7 +93,8 @@ def predict(query,data):
     hist = st + " " + st_hashed
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
-    try:
+
+    try: #if the same question was already asked for this document, upload question and answer
         df = pd.read_csv("{}.csv".format(hash(st)))
         list_outputs = []
         for i in range(k):
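
Note: the try-branch above only reads the per-question cache keyed by hash(st); the matching write happens elsewhere in app.py and is not shown in this diff. A sketch of what that save side could look like (save_answer_cache is a hypothetical name):

def save_answer_cache(st, df):
    # Hypothetical sketch: persist the answers table under the same
    # "{}.csv".format(hash(st)) key that the try-branch above reads back.
    df.to_csv("{}.csv".format(hash(st)), index=False)
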
@@ -105,7 +109,7 @@ def predict(query,data):
         print(e)
         print(st)
 
-    if name_to_save+".txt" in os.listdir():
+    if name_to_save+".txt" in os.listdir(): #if the document was already used, load its embeddings
         doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
         file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
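
Note: the three np.load calls above read back dictionaries that were pickled with np.save on an earlier run; that save step is outside this hunk. A minimal sketch of it (save_doc_cache is a hypothetical name; the file name patterns match the loads above):

import numpy as np

def save_doc_cache(name_to_save, doc_emb, doc_text, file_names_dicto):
    # np.save stores each dict as a 0-d object array, which is why the loads
    # above need allow_pickle='TRUE' and .item() to get the dict back.
    np.save('emb_{}.npy'.format(name_to_save), doc_emb)
    np.save('spans_{}.npy'.format(name_to_save), doc_text)
    np.save('file_{}.npy'.format(name_to_save), file_names_dicto)
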
@@ -125,6 +129,8 @@ def predict(query,data):
         doc_emb = doc_emb.reshape(-1, 768)
         with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
             f.write(text)
+
+    #once embeddings are calculated, run MIPS
     start = time.time()
     query_emb = encode_query(query)
 
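
Note: the new "#once embeddings are calculated, run MIPS" comment refers to maximum inner product search: the query embedding from encode_query is scored against every row of the (n_passages, 768) matrix with a dot product. A self-contained illustration with made-up data (the real code operates on doc_emb and query_emb from above):

import numpy as np

doc_emb = np.random.rand(10, 768).astype(np.float32)   # stand-in for the stored passage embeddings
query_emb = np.random.rand(768).astype(np.float32)     # stand-in for encode_query(query)

scores = doc_emb @ query_emb                           # one dot-product relevance score per passage
k = 3
top_idx = np.argsort(-scores)[:k]                      # indices of the k most relevant passages
print(scores[top_idx])
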
@@ -136,6 +142,8 @@ def predict(query,data):
     probs = softmax(sorted(scores,reverse = True)[:k])
     table = {"Passage":[],"Answer":[],"Probabilities":[]}
 
+
+    #get answers for each pair of question (from user) and top best passages
     for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
         passage = passage.replace("\n","")
         #passage = passage.replace(" . "," ")
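
Note: around the loop above, the top-k retrieval scores are turned into probabilities with scipy's softmax, and the question-answering pipeline built earlier extracts an answer span from each of the top passages. A minimal usage illustration (question, passage, and the values in the comments are made up):

from scipy.special import softmax

scores = [21.7, 19.2, 15.4]              # example top-k dot-product scores
probs = softmax(scores)                  # roughly [0.92, 0.08, 0.00]

answer = pipe(question="Who wrote the report?",
              context="The report was written by Alice in 2021.")
# answer is a dict such as {'score': ..., 'start': ..., 'end': ..., 'answer': 'Alice'}
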
@@ -155,7 +163,7 @@ def predict(query,data):
         table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
 
 
-
+    #format answers for ~nice output and save it for future (if the same question is asked again using same pdf)
     df = pd.DataFrame(table)
     print(df)
     print("time: "+ str(time.time()-start))