lorenpe2 committed
Commit 2b6660e · 1 parent: 7bf60b0

FEAT: New model

Files changed (23)
  1. app.py +43 -31
  2. inference_tokenizer.py +11 -6
  3. model/3c090bb39725194fae09a603caac8c3d9014df49/config.json +26 -0
  4. model/3c090bb39725194fae09a603caac8c3d9014df49/info.json +4 -0
  5. model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json +1 -0
  6. model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin +3 -0
  7. model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json +1 -0
  8. model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json +1 -0
  9. model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin +3 -0
  10. model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt +0 -0
  11. model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json +4 -0
  12. model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json +1 -0
  13. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
  14. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
  15. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
  16. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt} +0 -0
  17. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json +4 -0
  18. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json +1 -0
  19. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
  20. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
  21. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
  22. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt} +0 -0
  23. requirements.txt +0 -1
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import glob
 import json
 from typing import Dict, List, Tuple, Union
 
@@ -21,52 +22,59 @@ def get_model(model_path):
 @st.cache_resource
 def get_tokenizer(tokenizer_path):
     from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained(os.path.join(tokenizer_path, "tokenizer"))
-    tokenizer_args = {
-        "padding": "max_length",
-        "max_length_ctx": 256,
-        "max_length_res": 64,
-        "truncation": "only_first",
-        "return_tensors": "np",
-        # will be transfer to tensor later during the training (because of some memory problem with tensors)
-        "is_split_into_words": True,
-    }
-    special_token = " "
-    # todo better than hardcoded
-    if tokenizer_path == "./model/e09d71f55f4b6fc20135f856bf029322a3265d8d":
-        special_token = "[unused1]"
+    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+    if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
+        with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
+            meta_info = json.load(f)
+            tokenizer_args = meta_info["tokenizer_args"]
+            special_token = meta_info["kwargs"]["special_token"]
+    else:
+        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot infer proper configuration!")
+
+    if special_token != " ":
         tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
-    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, special_token=special_token, **tokenizer_args)
+    print(special_token)
+    print(tokenizer_args)
+    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
     return _inference_tokenizer
 
 
-model_option = st.selectbox(
-    'Which model do you want to use?',
-    ('./model/c3c3bdb7ad80396e69de171995e2038f900940c8', './model/e09d71f55f4b6fc20135f856bf029322a3265d8d'))
+models_path = glob.glob("./model/*/info.json")
+models = {}
+for model_path in models_path:
+    with open(model_path, "r") as f:
+        model_data = json.load(f)
+        model_data["path"] = model_path.replace("info.json", "")
+        models[model_data["model"]] = model_data
 
-model = get_model(model_option)
-inference_tokenizer = get_tokenizer(model_option)
+model_name = st.selectbox('Which model do you want to use?',
+                          (x for x in sorted(models.keys())))
 
+model_path = models[model_name]["path"]
 
-def get_evaluation_data(_context: List, special_delimiter=" "):
+model = get_model(model_path)
+inference_tokenizer = get_tokenizer(model_path)
+
+
+def get_evaluation_data(_context: List) -> List[Tuple[List, str, str]]:
     output_data = []
     for _dict in _context:
         _dict: Dict
-        c = special_delimiter.join(_dict["context"])
         for source in _dict["answers"].values():
             for _t, sentences in source.items():
                 for sentence in sentences:
-                    output_data.append([c, sentence, _t])
+                    output_data.append((_dict["context"], sentence, _t))
     return output_data
 
 
-option = st.selectbox("Choose type of evaluation:",
-                      ["01 - Raw text (one line)", "02 - JSON (aggregated)"])
+option = st.selectbox("Choose type of input:",
+                      ["01 - String (one turn per line)", "02 - JSON (aggregated)"])
 
 with st.form("input_text"):
     if "01" in option:
-        context = st.text_area("Insert context here (sentences divided by ||):")
-        actual_text = st.text_input("Actual text")
+        context = st.text_area("Insert context here (one turn per line):")
+        actual_text = st.text_input("Insert current turn:")
+        context = list(filter(lambda x: len(x.strip()) >= 1, context.split("\n")))
 
         input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
        output_model = model(**input_tensor.data).logits
@@ -75,7 +83,6 @@ with st.form("input_text"):
         prop_follow = output_model[0]
         prop_not_follow = output_model[1]
 
-        # Every form must have a submit button.
         submitted = st.form_submit_button("Submit")
         if submitted:
             fig, ax = plt.subplots()
@@ -83,12 +90,12 @@ with st.form("input_text"):
                    autopct='%1.1f%%')
             st.pyplot(fig)
     elif "02" in option:
-        context = st.text_area("Insert JSON here")
+        context = st.text_area("Insert JSON here:")
         if "{" in context:
             evaluation_data = get_evaluation_data(_context=json.loads(context))
             results = []
             accuracy = []
-            # Every form must have a submit button.
+
             submitted = st.form_submit_button("Submit")
             if submitted:
                 for datapoint in evaluation_data:
@@ -105,5 +112,10 @@ with st.form("input_text"):
                     else:
                         accuracy.append(int(prop_not_follow > prop_follow))
                 st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %")
-                df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)", "Probability (not-follow)"])
+                df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
+                                                        "Probability (not-follow)"])
                 st.dataframe(df)
+
+st.markdown("## Description of models:")
+for x in sorted(models.values(), key=lambda x: x["model"]):
+    st.write((str(x["model"] + " - " + x["description"])))
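
With this change, app.py no longer hard-codes checkpoint paths: every directory under ./model/ that contains an info.json (selectbox label and description) and a meta-info.json (tokenizer configuration) is discovered via glob and offered in the UI. Below is a minimal sketch, added for illustration, of registering an additional checkpoint directory; the directory name and values are placeholders, only the keys mirror what get_tokenizer and the committed info.json files use.

# Hypothetical example: adding another checkpoint so the glob("./model/*/info.json")
# discovery above picks it up. Directory name and values are placeholders; the keys
# match what app.py reads (info.json -> "model"/"description",
# meta-info.json -> "tokenizer_args" and "kwargs"["special_token"]).
import json
import os

model_dir = "./model/my-new-checkpoint"  # hypothetical directory name
os.makedirs(model_dir, exist_ok=True)

with open(os.path.join(model_dir, "info.json"), "w") as f:
    json.dump({"model": "BERT-NSP-v4",  # label shown in the selectbox (placeholder)
               "description": "Short human-readable description shown in the app."}, f, indent=2)

with open(os.path.join(model_dir, "meta-info.json"), "w") as f:
    json.dump({"kwargs": {"special_token": "[unused1]"},
               "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256,
                                  "max_length_res": 40, "truncation": "only_first",
                                  "return_tensors": "np", "is_split_into_words": True,
                                  "naive_approach": True, "special_token": "[unused1]"}}, f)

The same directory also needs the usual Hugging Face artifacts (config.json, pytorch_model.bin, vocab.txt, tokenizer_config.json, special_tokens_map.json), as added for the new model in this commit, so that get_model and BertTokenizer.from_pretrained can load from that path.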
inference_tokenizer.py CHANGED
@@ -1,21 +1,26 @@
 import torch
-from typing import Dict
+from typing import Dict, List
 
 
 class NextSentencePredictionTokenizer:
 
-    def __init__(self, _tokenizer, special_token, **_tokenizer_args):
+    def __init__(self, _tokenizer, **_tokenizer_args):
         self.tokenizer = _tokenizer
         self.tokenizer_args = _tokenizer_args
         self.max_length_ctx = self.tokenizer_args.get("max_length_ctx")
         self.max_length_res = self.tokenizer_args.get("max_length_res")
+        self.special_token = self.tokenizer_args.get("special_token")
+        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
+
+        # cleaning
+        del self.tokenizer_args["special_token"]
+        del self.tokenizer_args["naive_approach"]
         del self.tokenizer_args["max_length_ctx"]
         del self.tokenizer_args["max_length_res"]
-        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
-        self.special_token = special_token
 
-    def get_item(self, context: str, actual_sentence: str):
-        actual_item = {"ctx": context.replace("||", self.special_token), "res": actual_sentence}
+    def get_item(self, context: List[str], actual_sentence: str):
+        context_str = f" {self.special_token} ".join(context) if self.special_token != " " else " ".join(context)
+        actual_item = {"ctx": context_str, "res": actual_sentence}
         tokenized = self._tokenize_row(actual_item)
 
         for key in tokenized.data.keys():
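
get_item now takes the context as a list of turns rather than a single "||"-delimited string and joins the turns with the configured special token (or a plain space). A minimal usage sketch, assuming the checkpoint directory and meta-info.json layout added in this commit; the dialogue turns are illustrative.

# Illustrative usage of the new interface; the directory refers to files added in
# this commit, the example turns are made up.
import json
import os
from transformers import BertTokenizer
from inference_tokenizer import NextSentencePredictionTokenizer

model_dir = "./model/3c090bb39725194fae09a603caac8c3d9014df49"
with open(os.path.join(model_dir, "meta-info.json"), "r") as f:
    meta_info = json.load(f)

tokenizer = BertTokenizer.from_pretrained(model_dir)
special_token = meta_info["kwargs"]["special_token"]
if special_token != " ":
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})

nsp_tokenizer = NextSentencePredictionTokenizer(tokenizer, **meta_info["tokenizer_args"])

# Context is a list of turns; get_item joins them with the special token internally.
context = ["Hi, how are you?", "Fine, thanks. Any plans for the weekend?"]
item = nsp_tokenizer.get_item(context=context, actual_sentence="I am going hiking.")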
model/3c090bb39725194fae09a603caac8c3d9014df49/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "bert-base-cased",
+  "architectures": [
+    "BertForNextSentencePrediction"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 28996
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v3",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-cased", "tokenizer": "bert-base-cased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
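
Note how these tokenizer_args are consumed: NextSentencePredictionTokenizer does not forward max_length_ctx, max_length_res, special_token, or naive_approach to the underlying tokenizer; it sums the two length limits into a single max_length and drops the helper keys. A small sketch of the effective setting for this checkpoint:

# Effective max_length derived from the committed tokenizer_args
# (mirrors the arithmetic in NextSentencePredictionTokenizer.__init__).
max_length_ctx = 256   # token budget for the joined context turns
max_length_res = 40    # token budget for the candidate response
max_length = max_length_ctx + max_length_res  # 296 tokens per (context, response) pair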
model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:627fe3220abd88a13cdd5e4befc4b7d8ec31412ed55d9e97c03c7aaf73b95b01
+size 433334133
model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
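
Because "[unused1]" is registered under additional_special_tokens, the tokenizer keeps it as a single token when it appears between context turns. A short sketch of that behavior, assuming the checkpoint directory added in this commit; the sample sentence and the expected split are illustrative.

# The special divider token is not split into word pieces once it is registered
# as an additional special token (loaded here from the committed tokenizer files).
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("./model/3c090bb39725194fae09a603caac8c3d9014df49")
print(tok.tokenize("Hello there [unused1] How are you?"))
# roughly: ['Hello', 'there', '[unused1]', 'How', 'are', 'you', '?']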
model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efb6633e7e9dbd843917b80027f14e34338c15f651ec8cfe995646b0415e76a2
+size 3195
model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v1",
+  "description": "Model trained on DailyDialogue. Context is taken as is - no separation of individual turns. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/dm6ab7ma/logs?workspace=user-petr-lorenc"
+}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": " ", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": " "}}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v2",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt} RENAMED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 torch
 transformers
-streamlit
 matplotlib
 numpy
 pandas