lorenpe2 commited on
Commit
c186b27
·
1 Parent(s): 6457b4b

FEAT: Code without models

Browse files
Files changed (4) hide show
  1. .gitignore +6 -0
  2. app.py +107 -2
  3. inference_tokenizer.py +34 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .idea
2
+ .pyc
3
+ __pycache__
4
+ local
5
+ *wandb*
6
+ *temp*
app.py CHANGED
@@ -1,4 +1,109 @@
 
 
 
 
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import Dict, List, Tuple, Union
4
+
5
+ import torch
6
+ import pandas
7
  import streamlit as st
8
+ import matplotlib.pyplot as plt
9
+
10
+ from inference_tokenizer import NextSentencePredictionTokenizer
11
+
12
+
13
+ @st.cache_resource
14
+ def get_model(model_path):
15
+ from transformers import BertForNextSentencePrediction
16
+ _model = BertForNextSentencePrediction.from_pretrained(model_path)
17
+ _model.eval()
18
+ return _model
19
+
20
+
21
+ @st.cache_resource
22
+ def get_tokenizer(tokenizer_path):
23
+ from transformers import BertTokenizer
24
+ tokenizer = BertTokenizer.from_pretrained(os.path.join(tokenizer_path, "tokenizer"))
25
+ tokenizer_args = {
26
+ "padding": "max_length",
27
+ "max_length_ctx": 256,
28
+ "max_length_res": 64,
29
+ "truncation": "only_first",
30
+ "return_tensors": "np",
31
+ # will be transfer to tensor later during the training (because of some memory problem with tensors)
32
+ "is_split_into_words": True,
33
+ }
34
+ special_token = " "
35
+ # todo better than hardcoded
36
+ if tokenizer_path == "./model/e09d71f55f4b6fc20135f856bf029322a3265d8d":
37
+ special_token = "[unused1]"
38
+ tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
39
+ _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, special_token=special_token, **tokenizer_args)
40
+ return _inference_tokenizer
41
+
42
+
43
+ model_option = st.selectbox(
44
+ 'Which model do you want to use?',
45
+ ('./model/c3c3bdb7ad80396e69de171995e2038f900940c8', './model/e09d71f55f4b6fc20135f856bf029322a3265d8d'))
46
+
47
+ model = get_model(model_option)
48
+ inference_tokenizer = get_tokenizer(model_option)
49
+
50
+
51
+ def get_evaluation_data(_context: List, special_delimiter=" "):
52
+ output_data = []
53
+ for _dict in _context:
54
+ _dict: Dict
55
+ c = special_delimiter.join(_dict["context"])
56
+ for source in _dict["answers"].values():
57
+ for _t, sentences in source.items():
58
+ for sentence in sentences:
59
+ output_data.append([c, sentence, _t])
60
+ return output_data
61
+
62
+
63
+ option = st.selectbox("Choose type of evaluation:",
64
+ ["01 - Raw text (one line)", "02 - JSON (aggregated)"])
65
+
66
+ with st.form("input_text"):
67
+ if "01" in option:
68
+ context = st.text_area("Insert context here (sentences divided by ||):")
69
+ actual_text = st.text_input("Actual text")
70
+
71
+ input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
72
+ output_model = model(**input_tensor.data).logits
73
+
74
+ output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0]
75
+ prop_follow = output_model[0]
76
+ prop_not_follow = output_model[1]
77
+
78
+ # Every form must have a submit button.
79
+ submitted = st.form_submit_button("Submit")
80
+ if submitted:
81
+ fig, ax = plt.subplots()
82
+ ax.pie([prop_follow, prop_not_follow], labels=["Probability - Follow", "Probability - Not Follow"],
83
+ autopct='%1.1f%%')
84
+ st.pyplot(fig)
85
+ elif "02" in option:
86
+ context = st.text_area("Insert JSON here")
87
+ if "{" in context:
88
+ evaluation_data = get_evaluation_data(_context=json.loads(context))
89
+ results = []
90
+ accuracy = []
91
+ # Every form must have a submit button.
92
+ submitted = st.form_submit_button("Submit")
93
+ if submitted:
94
+ for datapoint in evaluation_data:
95
+ c, s, human_label = datapoint
96
+ input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s)
97
+ output_model = model(**input_tensor.data).logits
98
+ output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0]
99
+ prop_follow = output_model[0]
100
+ prop_not_follow = output_model[1]
101
 
102
+ results.append((c, s, human_label, prop_follow, prop_not_follow))
103
+ if human_label == "coherent":
104
+ accuracy.append(int(prop_follow > prop_not_follow))
105
+ else:
106
+ accuracy.append(int(prop_not_follow > prop_follow))
107
+ st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %")
108
+ df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)", "Probability (not-follow)"])
109
+ st.dataframe(df)
inference_tokenizer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Dict
3
+
4
+
5
+ class NextSentencePredictionTokenizer:
6
+
7
+ def __init__(self, _tokenizer, special_token, **_tokenizer_args):
8
+ self.tokenizer = _tokenizer
9
+ self.tokenizer_args = _tokenizer_args
10
+ self.max_length_ctx = self.tokenizer_args.get("max_length_ctx")
11
+ self.max_length_res = self.tokenizer_args.get("max_length_res")
12
+ del self.tokenizer_args["max_length_ctx"]
13
+ del self.tokenizer_args["max_length_res"]
14
+ self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
15
+ self.special_token = special_token
16
+
17
+ def get_item(self, context: str, actual_sentence: str):
18
+ actual_item = {"ctx": context.replace("||", self.special_token), "res": actual_sentence}
19
+ tokenized = self._tokenize_row(actual_item)
20
+
21
+ for key in tokenized.data.keys():
22
+ tokenized.data[key] = torch.reshape(torch.from_numpy(tokenized.data[key]), (1, -1))
23
+ return tokenized
24
+
25
+ def _tokenize_row(self, row: Dict):
26
+ ctx_tokens = row["ctx"].split(" ")
27
+ res_tokens = row["res"].split(" ")
28
+ # -5 for additional information like [SEP], [CLS]
29
+ ctx_tokens = ctx_tokens[-self.max_length_ctx:]
30
+ res_tokens = res_tokens[-self.max_length_res:]
31
+ _args = (ctx_tokens, res_tokens)
32
+ tokenized_row = self.tokenizer(*_args, **self.tokenizer_args)
33
+ return tokenized_row
34
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ streamlit
4
+ matplotlib
5
+ numpy
6
+ pandas