lorenpe2 committed
Commit 76f757a · 1 Parent(s): 65be65b

FEAT: New version of model trained as sequence classification

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. app.py +18 -14
  2. data/example_data.py +0 -0
  3. inference_tokenizer.py +0 -1
  4. model/3c090bb39725194fae09a603caac8c3d9014df49/awscliv2.zip +3 -0
  5. model/3c090bb39725194fae09a603caac8c3d9014df49/onnx/model.onnx +3 -0
  6. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/config.json +0 -0
  7. model/4a70ad1033ceec48447d5319d0863d442e976823/info.json +4 -0
  8. model/4a70ad1033ceec48447d5319d0863d442e976823/meta-info.json +56 -0
  9. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/pytorch_model.bin +1 -1
  10. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → 4a70ad1033ceec48447d5319d0863d442e976823}/special_tokens_map.json +3 -0
  11. model/4a70ad1033ceec48447d5319d0863d442e976823/tokenizer_config.json +15 -0
  12. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/training_args.bin +1 -1
  13. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/vocab.txt +0 -0
  14. model/6a62f122a90e090b285f0344a1d79e753f2000bb/info.json +0 -4
  15. model/6a62f122a90e090b285f0344a1d79e753f2000bb/meta-info.json +0 -1
  16. model/6a62f122a90e090b285f0344a1d79e753f2000bb/special_tokens_map.json +0 -1
  17. model/6a62f122a90e090b285f0344a1d79e753f2000bb/tokenizer_config.json +0 -1
  18. model/berttokenizer.zip +3 -0
  19. model/berttokenizer/special_tokens_map.json +8 -0
  20. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → berttokenizer}/tokenizer.json +10 -10
  21. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/tokenizer_config.json +1 -0
  22. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/vocab.txt +0 -0
  23. model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json +0 -4
  24. model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json +0 -1
  25. model/d1dd8365cbf16ff423f537e2291c61a91c717ed1/onnx/model.onnx +3 -0
  26. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json +0 -4
  27. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json +0 -1
  28. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/optimizer.pt +0 -3
  29. model/f1f881389fb38108e623689999ceaaaf398c5e92/info.json +0 -4
  30. model/f1f881389fb38108e623689999ceaaaf398c5e92/meta-info.json +0 -1
  31. model/f1f881389fb38108e623689999ceaaaf398c5e92/special_tokens_map.json +0 -1
  32. model/f1f881389fb38108e623689999ceaaaf398c5e92/tokenizer_config.json +0 -1
  33. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/config.json +3 -2
  34. model/new_version_dummy/onnx/model.onnx +3 -0
  35. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/pytorch_model.bin +2 -2
  36. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/special_tokens_map.json +3 -0
  37. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/tokenizer.json +9 -0
  38. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/tokenizer_config.json +1 -1
  39. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_dummy}/training_args.bin +2 -2
  40. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/vocab.txt +0 -0
  41. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/config.json +2 -2
  42. model/new_version_v1/onnx/model.onnx +3 -0
  43. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/pytorch_model.bin +2 -2
  44. model/new_version_v1/special_tokens_map.json +10 -0
  45. model/new_version_v1/tokenizer_config.json +15 -0
  46. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/training_args.bin +2 -2
  47. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/vocab.txt +0 -0
  48. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v2}/config.json +4 -3
  49. model/new_version_v2/info.json +4 -0
  50. model/new_version_v2/meta-info.json +28 -0
app.py CHANGED
@@ -22,7 +22,7 @@ def get_model(_model_path):
         _model_package = meta_info["kwargs"].get("model_package", "transformers")
         _model_class = meta_info["kwargs"].get("model_class", "BertForNextSentencePrediction")
     else:
-        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot interfere proper configuration!")
+        raise FileNotFoundError("Model is provided without meta-info.json. Cannot interfere proper configuration!")
 
     model_class = get_class(_model_package, _model_class)
     _model = model_class.from_pretrained(_model_path)
@@ -32,8 +32,8 @@ def get_model(_model_path):
 
 def get_tokenizer(tokenizer_path):
     print(f"Getting tokenizer at {tokenizer_path}")
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
     if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
         with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
             meta_info = json.load(f)
@@ -44,8 +44,7 @@ def get_tokenizer(tokenizer_path):
 
     if special_token != " ":
         tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
-    # print(special_token)
-    # print(tokenizer_args)
+
    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
    return _inference_tokenizer
 
@@ -108,7 +107,10 @@ option = st.selectbox("Choose type of input:",
                       "02 - JSON (aggregated)",
                       "03 - JSON (example CA-OOD)",
                       "04 - JSON (example Elysai)",
-                      "05 - Diagnostic mode"])
+                      "05 - Diagnostic mode",
+                      "06 - JSON (example Elysai - large)"])
+
+progres_bar = st.progress(0.0, text="Inference")
 
 with st.form("input_text"):
     if "01" in option:
@@ -129,10 +131,9 @@ with st.form("input_text"):
         ax.pie([prop_follow, prop_not_follow], labels=["Probability - Follow", "Probability - Not Follow"],
                autopct='%1.1f%%')
         st.pyplot(fig)
-    elif "02" in option or "03" in option or "04" in option:
-        from data.example_data import ca_ood, elysai
+    elif "02" in option or "03" in option or "04" in option or "06" in option:
+        from data.example_data import ca_ood, elysai, elysai_large
 
-        choices = [ca_ood, elysai]
         option: str
         # > Python 3.10
         # match option.split("-")[0].strip():
@@ -143,12 +144,14 @@ with st.form("input_text"):
         # case _:
         #     text = ""
         option = option.split("-")[0].strip()
+        text = ""
         if option == "03":
-            text = json.dumps(choices[0])
+            text = json.dumps(ca_ood)
         elif option == "04":
-            text = json.dumps(choices[1])
-        else:
-            test = ""
+            text = json.dumps(elysai)
+        elif option == "06":
+            text = json.dumps(elysai_large)
+
         context = st.text_area("Insert JSON here:", value=str(text))
 
         if "{" in context:
@@ -158,7 +161,8 @@ with st.form("input_text"):
 
     submitted = st.form_submit_button("Submit")
     if submitted:
-        for datapoint in data_for_evaluation:
+        for idx, datapoint in enumerate(data_for_evaluation):
+            progres_bar.progress(idx/len(data_for_evaluation), text="Inference")
             c, s, human_label = datapoint
             input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s)
             output_model = model(**input_tensor.data).logits
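
Note on the app.py changes above: the loader now treats a missing meta-info.json as a hard error in get_model, and get_tokenizer resolves the tokenizer through AutoTokenizer instead of hard-coding BertTokenizer. A minimal sketch of that loading pattern, assuming a checkpoint directory laid out like the ones under model/; the helper name load_checkpoint and the standalone layout are illustrative, not part of this repo:

import importlib
import json
import os

from transformers import AutoTokenizer


def load_checkpoint(path):
    # Each checkpoint directory ships a meta-info.json describing which class to load;
    # without it the proper configuration cannot be inferred, so fail fast.
    meta_path = os.path.join(path, "meta-info.json")
    if not os.path.isfile(meta_path):
        raise FileNotFoundError("Model is provided without meta-info.json.")
    with open(meta_path, "r") as f:
        meta_info = json.load(f)

    package = meta_info["kwargs"].get("model_package", "transformers")
    class_name = meta_info["kwargs"].get("model_class", "BertForNextSentencePrediction")

    # Rough equivalent of get_class(_model_package, _model_class) in app.py.
    model_class = getattr(importlib.import_module(package), class_name)
    model = model_class.from_pretrained(path)

    # AutoTokenizer resolves the concrete tokenizer from tokenizer_config.json,
    # so the same code works for slow and fast BERT tokenizer checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(path)
    special_token = meta_info["tokenizer_args"].get("special_token", " ")
    if special_token != " ":
        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    return model, tokenizer, meta_info["tokenizer_args"]
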
data/example_data.py CHANGED
The diff for this file is too large to render. See raw diff
 
inference_tokenizer.py CHANGED
@@ -1,7 +1,6 @@
 import torch
 from typing import Dict, List
 
-
 class NextSentencePredictionTokenizer:
 
     def __init__(self, _tokenizer, **_tokenizer_args):
model/3c090bb39725194fae09a603caac8c3d9014df49/awscliv2.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4962990f24634814b6d4834a5c105a524c8895fca478a8fc17f7cc7e6191fa4
+size 57717779
model/3c090bb39725194fae09a603caac8c3d9014df49/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e7543fd13846a928c1309459cd3a068dc82221a4d22ca647e054d658ea1063
+size 433513952
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/config.json RENAMED
File without changes
model/4a70ad1033ceec48447d5319d0863d442e976823/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v7",
+  "description": "Model trained on full version of DailyDialogue and CommonDialogues + down=sampled version of SODA and AirDialogue. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). Added method for **Data Augmentation**. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/b40zgvoo/overview?workspace=user-petr-lorenc"
+}
model/4a70ad1033ceec48447d5319d0863d442e976823/meta-info.json ADDED
@@ -0,0 +1,56 @@
+{
+  "args": [],
+  "kwargs": {
+    "model_package": "models",
+    "model_class": "OwnBertForNextSentencePrediction",
+    "data_root": "/home/lorenpe2/project/data",
+    "data_sources": [
+      [
+        "COMMON_DIALOGUES",
+        "common_dialogues/train.json",
+        "common_dialogues/valid_frozen.json",
+        "common_dialogues/test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "daily_dialogues/dialogues_text.train.txt",
+        "daily_dialogues/dev_frozen.json",
+        "daily_dialogues/test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "air_dialogue/subsampled_train.txt",
+        "air_dialogue/subsampled_validation_frozen.json",
+        "air_dialogue/subsampled_test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "soda/subsampled_train.txt",
+        "soda/subsampled_validation_frozen.json",
+        "soda/subsampled_test_frozen.json"
+      ]
+    ],
+    "pretrained_model": "bert-base-uncased",
+    "tokenizer": "bert-base-uncased",
+    "approach": "UNIQUE_RANDOM_CONTEXT",
+    "data_augmentation": [
+      "ADD_PARTLY_BROKEN_CONTEXT",
+      "ADD_SMALLER_CONTEXT"
+    ],
+    "special_token": "[unused1]",
+    "learning_rate": 5e-07,
+    "warmup_ratio": 0.1,
+    "freeze_prefinetuning": true,
+    "prefinenuting_epoch": 10,
+    "finetuning_epochs": 75
+  },
+  "tokenizer_args": {
+    "padding": "max_length",
+    "max_length_ctx": 32,
+    "max_length_res": 8,
+    "truncation": "only_first",
+    "return_tensors": "np",
+    "is_split_into_words": true,
+    "special_token": "[unused1]"
+  }
+}
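
The tokenizer_args block above is what NextSentencePredictionTokenizer receives in app.py: a [unused1] turn separator, pre-split words, numpy tensors, and a 32/8 token budget for context and response. A rough sketch of how such arguments could be applied to a (context, response) pair follows; how the two length budgets are combined is an assumption here, and the real logic lives in inference_tokenizer.py:

from transformers import AutoTokenizer

# Illustrative only: the actual encoding is done by NextSentencePredictionTokenizer.
tokenizer = AutoTokenizer.from_pretrained("model/4a70ad1033ceec48447d5319d0863d442e976823")

context_turns = ["how are you?", "fine, thanks", "any plans for tonight?"]
response = "i am going to the cinema"

# Dialogue history joined with the special_token declared in meta-info.json.
context = " [unused1] ".join(context_turns)

encoded = tokenizer(
    text=context.split(),            # is_split_into_words=True expects pre-tokenized input
    text_pair=response.split(),
    padding="max_length",
    max_length=32 + 8,               # max_length_ctx + max_length_res (assumed to be summed)
    truncation="only_first",         # cut the context, keep the candidate response intact
    is_split_into_words=True,
    return_tensors="np",
)
print(encoded["input_ids"].shape)    # (1, 40)
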
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb065d01ae7e4e255d81ca3fdafadb21c42daea7cafdaaac3c21923f11827641
+oid sha256:8a0200d7532286b0e8aae550933d8a083274c7d8bcba41bd1ac989f9efb1bb1d
 size 438871109
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → 4a70ad1033ceec48447d5319d0863d442e976823}/special_tokens_map.json RENAMED
@@ -1,4 +1,7 @@
 {
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
model/4a70ad1033ceec48447d5319d0863d442e976823/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0032b55f2a68888f89b97e84498c69d4a7a65403f1a209c41da390a1762f76fc
+oid sha256:fb391844e5d1b871e851254cc8388e4803682dd147af1fdb5067fa64fbe530aa
 size 3195
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/vocab.txt RENAMED
File without changes
model/6a62f122a90e090b285f0344a1d79e753f2000bb/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v6",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). Added method for **Data Augmentation**. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/xvzhhw2r/overview?workspace=user-petr-lorenc"
-}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"model_package": "models", "model_class": "OwnBertForNextSentencePrediction", "data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid_frozen.json", "common_dialogues/test_frozen.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dev_frozen.json", "daily_dialogues/test_frozen.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "approach": "IGNORE_DUPLICITIES", "data_augmentation": ["ADD_PARTLY_BROKEN_CONTEXT"], "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1, "freeze_prefinetuning": true, "prefinenuting_epoch": 10, "finetuning_epochs": 75}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]"}}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/special_tokens_map.json DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/tokenizer_config.json DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
model/berttokenizer.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d253965be810ac94eed5cf080d45c958e282f0450d64dff77803b272e145d8e
+size 320047
model/berttokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]",
+  "additional_special_tokens": ["[unused1]", "[unused2]", "[unused3]", "[unused4]", "[unused5]", "[unused6]", "[unused7]", "[unused8]", "[unused9]", "[unused10]"]
+}
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → berttokenizer}/tokenizer.json RENAMED
@@ -5,48 +5,48 @@
   "added_tokens": [
     {
       "id": 0,
+      "special": true,
       "content": "[PAD]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 100,
+      "special": true,
       "content": "[UNK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 101,
+      "special": true,
       "content": "[CLS]",
       "single_word": false,
      "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 102,
+      "special": true,
       "content": "[SEP]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 103,
+      "special": true,
       "content": "[MASK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     }
   ],
   "normalizer": {
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/tokenizer_config.json RENAMED
@@ -3,6 +3,7 @@
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 512,
+  "name_or_path": "bert-base-uncased",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "special_tokens_map_file": null,
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/vocab.txt RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v1",
-  "description": "Model trained on DailyDialogue. Context is taken as is - no separation of individual turns. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/dm6ab7ma/logs?workspace=user-petr-lorenc"
-}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": " ", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": " "}}
model/d1dd8365cbf16ff423f537e2291c61a91c717ed1/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a65ee19216a6cfe592c1b5d7b35dabe9182a60d9bf7dbe415a7ad491b6c64733
+size 438044019
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v2",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
-}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/optimizer.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ab19babe7ae39d1b6746d1dec58ab7758b0bacc33346b66e9c7da94419bebf96
-size 124944384
model/f1f881389fb38108e623689999ceaaaf398c5e92/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v5",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/vzpwetvm/overview?workspace=user-petr-lorenc"
-}
model/f1f881389fb38108e623689999ceaaaf398c5e92/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"model_package": "models", "model_class": "OwnBertForNextSentencePrediction", "data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid_frozen.json", "common_dialogues/test_frozen.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dev_frozen.json", "daily_dialogues/test_frozen.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1, "freeze_prefinetuning": true, "prefinenuting_epoch": 10, "finetuning_epochs": 75}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]"}}
model/f1f881389fb38108e623689999ceaaaf398c5e92/special_tokens_map.json DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
model/f1f881389fb38108e623689999ceaaaf398c5e92/tokenizer_config.json DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/config.json RENAMED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "bert-base-uncased",
   "architectures": [
-    "BertForNextSentencePrediction"
+    "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
@@ -18,8 +18,9 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_dummy/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adf2c61611e66c3efcca5e6d866b354596fff14c0f5d49ef4ff73bcbc77a20bc
+size 438201824
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f791a2952d2707e82b275b14738b0fcd52c56b9a6acd597f4480829737d4368
-size 438022005
+oid sha256:04a843b2644531a9f6ac15659f22d89bc288c17886e9c6d2561431a7a0add441
+size 438007925
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/special_tokens_map.json RENAMED
@@ -1,4 +1,7 @@
 {
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/tokenizer.json RENAMED
@@ -12,6 +12,15 @@
       "normalized": false,
       "special": true
     },
+    {
+      "id": 2,
+      "content": "[unused1]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
     {
       "id": 100,
       "content": "[UNK]",
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/tokenizer_config.json RENAMED
@@ -1,11 +1,11 @@
 {
+  "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "special_tokens_map_file": null,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_dummy}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49a0183a62c25be44cbf2f333cf9224204e7fbfe84cd9054d94775d61daa9774
-size 3195
+oid sha256:a5e18fad4f08f10e21116128e5ffdbc3b9da56804fae548529abfcafe8814d94
+size 3899
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/vocab.txt RENAMED
File without changes
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/config.json RENAMED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "bert-base-uncased",
+  "_name_or_path": "/home/lorenpe2/project/hf_models/bert-base-uncased",
   "architectures": [
     "OwnBertForNextSentencePrediction"
   ],
@@ -19,7 +19,7 @@
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_v1/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a4908d96ecead256b5c4f921a6ae08fe156bdb3a11aceb5aa7b8677dac9322e
+size 438174352
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aec095ee8fc2c88ca0460f59e19df6b38c5c91d38a3ab04928ce7eb996c0d62a
-size 438022005
+oid sha256:7ed3691d1c07d77a0a6796fb4a5f61a2e774849cb7489c97a183097cb4f693aa
+size 438856837
model/new_version_v1/special_tokens_map.json ADDED
@@ -0,0 +1,10 @@
+{
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
model/new_version_v1/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9965b76852aa7c689ed048738e3db2f0b386154cddbfb42d9da7a064a9f9231
-size 3195
+oid sha256:bed99c1c5b736531dce1c0822e055bb8ad10a1634b9235d9a35e7d6aa4356d58
+size 4091
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/vocab.txt RENAMED
File without changes
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v2}/config.json RENAMED
@@ -1,7 +1,7 @@
 {
-  "_name_or_path": "bert-base-uncased",
+  "_name_or_path": "/home/lorenpe2/project/hf_models/bert-base-uncased",
   "architectures": [
-    "BertForNextSentencePrediction"
+    "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
@@ -18,8 +18,9 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_v2/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-SEQUENCE-CLASSIFICATION",
+  "description": "Model trained on subset of DailyDialogue, CommonDialogues, ChitChatDataset, AirDialogue and SODA. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/wfsx1sga/overview?workspace=user-petr-lorenc"
+}
model/new_version_v2/meta-info.json ADDED
@@ -0,0 +1,28 @@
+{
+  "args": [],
+  "kwargs": {
+    "model_package": "transformers",
+    "model_class": "AutoModelForSequenceClassification",
+    "data_root": "/home/lorenpe2/project/data",
+    "data_sources": [],
+    "pretrained_model": "bert-base-uncased",
+    "tokenizer": "bert-base-uncased",
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]",
+    "learning_rate": 5e-07,
+    "warmup_ratio": 0.1,
+    "freeze_prefinetuning": true,
+    "prefinenuting_epoch": 10,
+    "finetuning_epochs": 75
+  },
+  "tokenizer_args": {
+    "padding": "max_length",
+    "max_length_ctx": 256,
+    "max_length_res": 40,
+    "truncation": "only_first",
+    "return_tensors": "np",
+    "is_split_into_words": true,
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]"
+  }
+}
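
Unlike the older checkpoints, new_version_v2 declares the stock transformers class AutoModelForSequenceClassification in its meta-info.json, and the commit also ships onnx/model.onnx files next to several checkpoints. A possible export recipe is sketched below; the repo only contains the resulting ONNX files, so the opset, input names, and dynamic axes here are assumptions, and the directory is assumed to also hold the weights and tokenizer files (this view is truncated at 50 files):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

path = "model/new_version_v2"
model = AutoModelForSequenceClassification.from_pretrained(path).eval()
tokenizer = AutoTokenizer.from_pretrained(path)

# Return plain tuples instead of ModelOutput objects so tracing stays simple.
model.config.return_dict = False

dummy = tokenizer("context [unused1] context", "response", return_tensors="pt")
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
    f"{path}/onnx/model.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["logits"],
    dynamic_axes={name: {0: "batch", 1: "sequence"}
                  for name in ["input_ids", "attention_mask", "token_type_ids"]},
    opset_version=14,
)
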