ekaterinatao committed on
Commit
c904f3b
·
verified ·
1 Parent(s): 6a8d825

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -90
app.py CHANGED
@@ -1,9 +1,5 @@
1
  import gradio as gr
2
- import torch
3
- import faiss
4
- import numpy as np
5
- import datasets
6
- from transformers import AutoTokenizer, AutoModel
7
 
8
  title = "HouseMD bot"
9
 
@@ -11,91 +7,6 @@ description = "Gradio Demo for telegram bot.\
11
  To use it, simply add your text message.\
12
  I've used the API on this Space to deploy the model on a Telegram bot."
13
 
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
-
17
- def embed_bert_cls(text, model, tokenizer):
18
- t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
19
- with torch.no_grad():
20
- model_output = model(**{k: v.to(model.device) for k, v in t.items()})
21
- embeds = model_output.last_hidden_state[:, 0, :]
22
- embeds = torch.nn.functional.normalize(embeds)
23
- return embeds[0].cpu().numpy()
24
-
25
-
26
- def get_ranked_docs(query, vec_query_base, data,
27
- bi_model, bi_tok, cross_model, cross_tok):
28
-
29
- vec_shape = vec_query_base.shape[1]
30
- index = faiss.IndexFlatL2(vec_shape)
31
- index.add(vec_query_base)
32
- xq = embed_bert_cls(query, bi_model, bi_tok)
33
- _, I = index.search(xq.reshape(1, vec_shape), 50)
34
- corpus = [data[int(i)]['answer'] for i in I[0]]
35
-
36
- queries = [query] * len(corpus)
37
- tokenized_texts = cross_tok(
38
- queries, corpus, max_length=128, padding=True, truncation=True, return_tensors="pt"
39
- ).to(device)
40
-
41
- with torch.no_grad():
42
- model_output = cross_model(
43
- **{k: v.to(cross_model.device) for k, v in tokenized_texts.items()}
44
- )
45
- ce_scores = model_output.last_hidden_state[:, 0, :]
46
- ce_scores = np.matmul(ce_scores, ce_scores.T)
47
- scores = ce_scores.cpu().numpy()
48
- scores_ix = np.argsort(scores)[::-1]
49
-
50
- return corpus[scores_ix[0][0]]
51
-
52
-
53
- def load_dataset(url='ekaterinatao/house_md_context3'):
54
-
55
- dataset = datasets.load_dataset(url, split='train')
56
- house_dataset = dataset.filter(lambda row: row['labels'] == 0)
57
-
58
- return house_dataset
59
-
60
-
61
- def load_cls_base(url='ekaterinatao/house_md_cls_embeds'):
62
-
63
- cls_dataset = datasets.load_dataset(url, split='train')
64
- cls_base = np.stack([embed['cls_embeds'] for embed in cls_dataset])
65
-
66
- return cls_base
67
-
68
-
69
- def load_bi_enc_model(checkpoint='ekaterinatao/house-md-bot-bert-bi-encoder'):
70
-
71
- bi_model = AutoModel.from_pretrained(checkpoint)
72
- bi_tok = AutoTokenizer.from_pretrained(checkpoint)
73
-
74
- return bi_model, bi_tok
75
-
76
-
77
- def load_cross_enc_model(checkpoint='ekaterinatao/house-md-bot-bert-cross-encoder'):
78
-
79
- cross_model = AutoModel.from_pretrained(checkpoint)
80
- cross_tok = AutoTokenizer.from_pretrained(checkpoint)
81
-
82
- return cross_model, cross_tok
83
-
84
-
85
- def get_answer(message):
86
-
87
- dataset = load_dataset()
88
- cls_base = load_cls_base()
89
- bi_enc_model = load_bi_enc_model()
90
- cross_enc_model = load_cross_enc_model()
91
-
92
- answer = get_ranked_docs(
93
- query=message, vec_query_base=cls_base, data=dataset,
94
- bi_model=bi_enc_model[0], bi_tok=bi_enc_model[1],
95
- cross_model=cross_enc_model[0], cross_tok=cross_enc_model[1]
96
- )
97
- return answer
98
-
99
 
100
  interface = gr.Interface(
101
  fn=get_answer,
 
1
  import gradio as gr
2
+ from utils.get_answer import get_answer
 
 
 
 
3
 
4
  title = "HouseMD bot"
5
 
 
7
  To use it, simply add your text message.\
8
  I've used the API on this Space to deploy the model on a Telegram bot."
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  interface = gr.Interface(
12
  fn=get_answer,