lorenpe2 committed
Commit 76f757a · 1 Parent(s): 65be65b

FEAT: New version of model trained as sequence classification

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. app.py +18 -14
  2. data/example_data.py +0 -0
  3. inference_tokenizer.py +0 -1
  4. model/3c090bb39725194fae09a603caac8c3d9014df49/awscliv2.zip +3 -0
  5. model/3c090bb39725194fae09a603caac8c3d9014df49/onnx/model.onnx +3 -0
  6. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/config.json +0 -0
  7. model/4a70ad1033ceec48447d5319d0863d442e976823/info.json +4 -0
  8. model/4a70ad1033ceec48447d5319d0863d442e976823/meta-info.json +56 -0
  9. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/pytorch_model.bin +1 -1
  10. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → 4a70ad1033ceec48447d5319d0863d442e976823}/special_tokens_map.json +3 -0
  11. model/4a70ad1033ceec48447d5319d0863d442e976823/tokenizer_config.json +15 -0
  12. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/training_args.bin +1 -1
  13. model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/vocab.txt +0 -0
  14. model/6a62f122a90e090b285f0344a1d79e753f2000bb/info.json +0 -4
  15. model/6a62f122a90e090b285f0344a1d79e753f2000bb/meta-info.json +0 -1
  16. model/6a62f122a90e090b285f0344a1d79e753f2000bb/special_tokens_map.json +0 -1
  17. model/6a62f122a90e090b285f0344a1d79e753f2000bb/tokenizer_config.json +0 -1
  18. model/berttokenizer.zip +3 -0
  19. model/berttokenizer/special_tokens_map.json +8 -0
  20. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → berttokenizer}/tokenizer.json +10 -10
  21. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/tokenizer_config.json +1 -0
  22. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/vocab.txt +0 -0
  23. model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json +0 -4
  24. model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json +0 -1
  25. model/d1dd8365cbf16ff423f537e2291c61a91c717ed1/onnx/model.onnx +3 -0
  26. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json +0 -4
  27. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json +0 -1
  28. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/optimizer.pt +0 -3
  29. model/f1f881389fb38108e623689999ceaaaf398c5e92/info.json +0 -4
  30. model/f1f881389fb38108e623689999ceaaaf398c5e92/meta-info.json +0 -1
  31. model/f1f881389fb38108e623689999ceaaaf398c5e92/special_tokens_map.json +0 -1
  32. model/f1f881389fb38108e623689999ceaaaf398c5e92/tokenizer_config.json +0 -1
  33. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/config.json +3 -2
  34. model/new_version_dummy/onnx/model.onnx +3 -0
  35. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/pytorch_model.bin +2 -2
  36. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/special_tokens_map.json +3 -0
  37. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/tokenizer.json +9 -0
  38. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/tokenizer_config.json +1 -1
  39. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_dummy}/training_args.bin +2 -2
  40. model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/vocab.txt +0 -0
  41. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/config.json +2 -2
  42. model/new_version_v1/onnx/model.onnx +3 -0
  43. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/pytorch_model.bin +2 -2
  44. model/new_version_v1/special_tokens_map.json +10 -0
  45. model/new_version_v1/tokenizer_config.json +15 -0
  46. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/training_args.bin +2 -2
  47. model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/vocab.txt +0 -0
  48. model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v2}/config.json +4 -3
  49. model/new_version_v2/info.json +4 -0
  50. model/new_version_v2/meta-info.json +28 -0
app.py CHANGED
@@ -22,7 +22,7 @@ def get_model(_model_path):
         _model_package = meta_info["kwargs"].get("model_package", "transformers")
         _model_class = meta_info["kwargs"].get("model_class", "BertForNextSentencePrediction")
     else:
-        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot interfere proper configuration!")
+        raise FileNotFoundError("Model is provided without meta-info.json. Cannot interfere proper configuration!")
 
     model_class = get_class(_model_package, _model_class)
     _model = model_class.from_pretrained(_model_path)
@@ -32,8 +32,8 @@ def get_model(_model_path):
 
 def get_tokenizer(tokenizer_path):
     print(f"Getting tokenizer at {tokenizer_path}")
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
     if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
         with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
             meta_info = json.load(f)
@@ -44,8 +44,7 @@ def get_tokenizer(tokenizer_path):
 
     if special_token != " ":
         tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
-    # print(special_token)
-    # print(tokenizer_args)
+
    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
    return _inference_tokenizer
 
@@ -108,7 +107,10 @@ option = st.selectbox("Choose type of input:",
                       "02 - JSON (aggregated)",
                       "03 - JSON (example CA-OOD)",
                       "04 - JSON (example Elysai)",
-                      "05 - Diagnostic mode"])
+                      "05 - Diagnostic mode",
+                      "06 - JSON (example Elysai - large)"])
+
+progres_bar = st.progress(0.0, text="Inference")
 
 with st.form("input_text"):
     if "01" in option:
@@ -129,10 +131,9 @@ with st.form("input_text"):
         ax.pie([prop_follow, prop_not_follow], labels=["Probability - Follow", "Probability - Not Follow"],
                autopct='%1.1f%%')
         st.pyplot(fig)
-    elif "02" in option or "03" in option or "04" in option:
-        from data.example_data import ca_ood, elysai
+    elif "02" in option or "03" in option or "04" in option or "06" in option:
+        from data.example_data import ca_ood, elysai, elysai_large
 
-        choices = [ca_ood, elysai]
         option: str
         # > Python 3.10
         # match option.split("-")[0].strip():
@@ -143,12 +144,14 @@ with st.form("input_text"):
         # case _:
         #     text = ""
         option = option.split("-")[0].strip()
+        text = ""
         if option == "03":
-            text = json.dumps(choices[0])
+            text = json.dumps(ca_ood)
         elif option == "04":
-            text = json.dumps(choices[1])
-        else:
-            test = ""
+            text = json.dumps(elysai)
+        elif option == "06":
+            text = json.dumps(elysai_large)
+
         context = st.text_area("Insert JSON here:", value=str(text))
 
         if "{" in context:
@@ -158,7 +161,8 @@ with st.form("input_text"):
 
     submitted = st.form_submit_button("Submit")
     if submitted:
-        for datapoint in data_for_evaluation:
+        for idx, datapoint in enumerate(data_for_evaluation):
+            progres_bar.progress(idx/len(data_for_evaluation), text="Inference")
             c, s, human_label = datapoint
             input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s)
             output_model = model(**input_tensor.data).logits
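
Note on the app.py changes above: the loader now treats a missing meta-info.json as a hard error in get_model, and get_tokenizer resolves the tokenizer through AutoTokenizer instead of hard-coding BertTokenizer. A minimal sketch of that loading pattern, assuming a checkpoint directory laid out like the ones under model/; the helper name load_checkpoint and the standalone layout are illustrative, not part of this repo:

import importlib
import json
import os

from transformers import AutoTokenizer


def load_checkpoint(path):
    # Each checkpoint directory ships a meta-info.json describing which class to load;
    # without it the proper configuration cannot be inferred, so fail fast.
    meta_path = os.path.join(path, "meta-info.json")
    if not os.path.isfile(meta_path):
        raise FileNotFoundError("Model is provided without meta-info.json.")
    with open(meta_path, "r") as f:
        meta_info = json.load(f)

    package = meta_info["kwargs"].get("model_package", "transformers")
    class_name = meta_info["kwargs"].get("model_class", "BertForNextSentencePrediction")

    # Rough equivalent of get_class(_model_package, _model_class) in app.py.
    model_class = getattr(importlib.import_module(package), class_name)
    model = model_class.from_pretrained(path)

    # AutoTokenizer resolves the concrete tokenizer from tokenizer_config.json,
    # so the same code works for slow and fast BERT tokenizer checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(path)
    special_token = meta_info["tokenizer_args"].get("special_token", " ")
    if special_token != " ":
        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    return model, tokenizer, meta_info["tokenizer_args"]
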
data/example_data.py CHANGED
The diff for this file is too large to render. See raw diff
 
inference_tokenizer.py CHANGED
@@ -1,7 +1,6 @@
 import torch
 from typing import Dict, List
 
-
 class NextSentencePredictionTokenizer:
 
     def __init__(self, _tokenizer, **_tokenizer_args):
model/3c090bb39725194fae09a603caac8c3d9014df49/awscliv2.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4962990f24634814b6d4834a5c105a524c8895fca478a8fc17f7cc7e6191fa4
+size 57717779
model/3c090bb39725194fae09a603caac8c3d9014df49/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e7543fd13846a928c1309459cd3a068dc82221a4d22ca647e054d658ea1063
+size 433513952
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/config.json RENAMED
File without changes
model/4a70ad1033ceec48447d5319d0863d442e976823/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v7",
+  "description": "Model trained on full version of DailyDialogue and CommonDialogues + down=sampled version of SODA and AirDialogue. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). Added method for **Data Augmentation**. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/b40zgvoo/overview?workspace=user-petr-lorenc"
+}
model/4a70ad1033ceec48447d5319d0863d442e976823/meta-info.json ADDED
@@ -0,0 +1,56 @@
+{
+  "args": [],
+  "kwargs": {
+    "model_package": "models",
+    "model_class": "OwnBertForNextSentencePrediction",
+    "data_root": "/home/lorenpe2/project/data",
+    "data_sources": [
+      [
+        "COMMON_DIALOGUES",
+        "common_dialogues/train.json",
+        "common_dialogues/valid_frozen.json",
+        "common_dialogues/test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "daily_dialogues/dialogues_text.train.txt",
+        "daily_dialogues/dev_frozen.json",
+        "daily_dialogues/test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "air_dialogue/subsampled_train.txt",
+        "air_dialogue/subsampled_validation_frozen.json",
+        "air_dialogue/subsampled_test_frozen.json"
+      ],
+      [
+        "DAILY_DIALOGUES",
+        "soda/subsampled_train.txt",
+        "soda/subsampled_validation_frozen.json",
+        "soda/subsampled_test_frozen.json"
+      ]
+    ],
+    "pretrained_model": "bert-base-uncased",
+    "tokenizer": "bert-base-uncased",
+    "approach": "UNIQUE_RANDOM_CONTEXT",
+    "data_augmentation": [
+      "ADD_PARTLY_BROKEN_CONTEXT",
+      "ADD_SMALLER_CONTEXT"
+    ],
+    "special_token": "[unused1]",
+    "learning_rate": 5e-07,
+    "warmup_ratio": 0.1,
+    "freeze_prefinetuning": true,
+    "prefinenuting_epoch": 10,
+    "finetuning_epochs": 75
+  },
+  "tokenizer_args": {
+    "padding": "max_length",
+    "max_length_ctx": 32,
+    "max_length_res": 8,
+    "truncation": "only_first",
+    "return_tensors": "np",
+    "is_split_into_words": true,
+    "special_token": "[unused1]"
+  }
+}
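
The tokenizer_args block above is what NextSentencePredictionTokenizer receives in app.py: a [unused1] turn separator, pre-split words, numpy tensors, and a 32/8 token budget for context and response. A rough sketch of how such arguments could be applied to a (context, response) pair follows; how the two length budgets are combined is an assumption here, and the real logic lives in inference_tokenizer.py:

from transformers import AutoTokenizer

# Illustrative only: the actual encoding is done by NextSentencePredictionTokenizer.
tokenizer = AutoTokenizer.from_pretrained("model/4a70ad1033ceec48447d5319d0863d442e976823")

context_turns = ["how are you?", "fine, thanks", "any plans for tonight?"]
response = "i am going to the cinema"

# Dialogue history joined with the special_token declared in meta-info.json.
context = " [unused1] ".join(context_turns)

encoded = tokenizer(
    text=context.split(),            # is_split_into_words=True expects pre-tokenized input
    text_pair=response.split(),
    padding="max_length",
    max_length=32 + 8,               # max_length_ctx + max_length_res (assumed to be summed)
    truncation="only_first",         # cut the context, keep the candidate response intact
    is_split_into_words=True,
    return_tensors="np",
)
print(encoded["input_ids"].shape)    # (1, 40)
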
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb065d01ae7e4e255d81ca3fdafadb21c42daea7cafdaaac3c21923f11827641
+oid sha256:8a0200d7532286b0e8aae550933d8a083274c7d8bcba41bd1ac989f9efb1bb1d
 size 438871109
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → 4a70ad1033ceec48447d5319d0863d442e976823}/special_tokens_map.json RENAMED
@@ -1,4 +1,7 @@
 {
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
model/4a70ad1033ceec48447d5319d0863d442e976823/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0032b55f2a68888f89b97e84498c69d4a7a65403f1a209c41da390a1762f76fc
+oid sha256:fb391844e5d1b871e851254cc8388e4803682dd147af1fdb5067fa64fbe530aa
 size 3195
model/{6a62f122a90e090b285f0344a1d79e753f2000bb → 4a70ad1033ceec48447d5319d0863d442e976823}/vocab.txt RENAMED
File without changes
model/6a62f122a90e090b285f0344a1d79e753f2000bb/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v6",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). Added method for **Data Augmentation**. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/xvzhhw2r/overview?workspace=user-petr-lorenc"
-}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"model_package": "models", "model_class": "OwnBertForNextSentencePrediction", "data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid_frozen.json", "common_dialogues/test_frozen.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dev_frozen.json", "daily_dialogues/test_frozen.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "approach": "IGNORE_DUPLICITIES", "data_augmentation": ["ADD_PARTLY_BROKEN_CONTEXT"], "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1, "freeze_prefinetuning": true, "prefinenuting_epoch": 10, "finetuning_epochs": 75}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]"}}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/special_tokens_map.json DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
model/6a62f122a90e090b285f0344a1d79e753f2000bb/tokenizer_config.json DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
model/berttokenizer.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d253965be810ac94eed5cf080d45c958e282f0450d64dff77803b272e145d8e
+size 320047
model/berttokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]",
+  "additional_special_tokens": ["[unused1]", "[unused2]", "[unused3]", "[unused4]", "[unused5]", "[unused6]", "[unused7]", "[unused8]", "[unused9]", "[unused10]"]
+}
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → berttokenizer}/tokenizer.json RENAMED
@@ -5,48 +5,48 @@
   "added_tokens": [
     {
       "id": 0,
+      "special": true,
       "content": "[PAD]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 100,
+      "special": true,
       "content": "[UNK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 101,
+      "special": true,
       "content": "[CLS]",
       "single_word": false,
      "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 102,
+      "special": true,
       "content": "[SEP]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     },
     {
       "id": 103,
+      "special": true,
       "content": "[MASK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": false
     }
   ],
   "normalizer": {
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/tokenizer_config.json RENAMED
@@ -3,6 +3,7 @@
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 512,
+  "name_or_path": "bert-base-uncased",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "special_tokens_map_file": null,
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → berttokenizer}/vocab.txt RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v1",
-  "description": "Model trained on DailyDialogue. Context is taken as is - no separation of individual turns. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/dm6ab7ma/logs?workspace=user-petr-lorenc"
-}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": " ", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": " "}}
model/d1dd8365cbf16ff423f537e2291c61a91c717ed1/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a65ee19216a6cfe592c1b5d7b35dabe9182a60d9bf7dbe415a7ad491b6c64733
+size 438044019
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v2",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
-}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/optimizer.pt DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ab19babe7ae39d1b6746d1dec58ab7758b0bacc33346b66e9c7da94419bebf96
-size 124944384
model/f1f881389fb38108e623689999ceaaaf398c5e92/info.json DELETED
@@ -1,4 +0,0 @@
-{
-  "model": "BERT-NSP-v5",
-  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). Using frozen test set to better compare models and therefore trained longer time (about 60 epochs). The model also have bigger classification head (from one layer liner as classical). More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/vzpwetvm/overview?workspace=user-petr-lorenc"
-}
model/f1f881389fb38108e623689999ceaaaf398c5e92/meta-info.json DELETED
@@ -1 +0,0 @@
-{"args": [], "kwargs": {"model_package": "models", "model_class": "OwnBertForNextSentencePrediction", "data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid_frozen.json", "common_dialogues/test_frozen.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dev_frozen.json", "daily_dialogues/test_frozen.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1, "freeze_prefinetuning": true, "prefinenuting_epoch": 10, "finetuning_epochs": 75}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "approach": "IGNORE_DUPLICITIES", "special_token": "[unused1]"}}
model/f1f881389fb38108e623689999ceaaaf398c5e92/special_tokens_map.json DELETED
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
model/f1f881389fb38108e623689999ceaaaf398c5e92/tokenizer_config.json DELETED
@@ -1 +0,0 @@
-{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/config.json RENAMED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "bert-base-uncased",
   "architectures": [
-    "BertForNextSentencePrediction"
+    "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
@@ -18,8 +18,9 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_dummy/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adf2c61611e66c3efcca5e6d866b354596fff14c0f5d49ef4ff73bcbc77a20bc
+size 438201824
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f791a2952d2707e82b275b14738b0fcd52c56b9a6acd597f4480829737d4368
-size 438022005
+oid sha256:04a843b2644531a9f6ac15659f22d89bc288c17886e9c6d2561431a7a0add441
+size 438007925
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/special_tokens_map.json RENAMED
@@ -1,4 +1,7 @@
 {
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_dummy}/tokenizer.json RENAMED
@@ -12,6 +12,15 @@
       "normalized": false,
       "special": true
     },
+    {
+      "id": 2,
+      "content": "[unused1]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
     {
       "id": 100,
       "content": "[UNK]",
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/tokenizer_config.json RENAMED
@@ -1,11 +1,11 @@
 {
+  "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "special_tokens_map_file": null,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_dummy}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:49a0183a62c25be44cbf2f333cf9224204e7fbfe84cd9054d94775d61daa9774
-size 3195
+oid sha256:a5e18fad4f08f10e21116128e5ffdbc3b9da56804fae548529abfcafe8814d94
+size 3899
model/{e09d71f55f4b6fc20135f856bf029322a3265d8d → new_version_dummy}/vocab.txt RENAMED
File without changes
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/config.json RENAMED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "bert-base-uncased",
+  "_name_or_path": "/home/lorenpe2/project/hf_models/bert-base-uncased",
   "architectures": [
     "OwnBertForNextSentencePrediction"
   ],
@@ -19,7 +19,7 @@
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_v1/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a4908d96ecead256b5c4f921a6ae08fe156bdb3a11aceb5aa7b8677dac9322e
+size 438174352
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aec095ee8fc2c88ca0460f59e19df6b38c5c91d38a3ab04928ce7eb996c0d62a
-size 438022005
+oid sha256:7ed3691d1c07d77a0a6796fb4a5f61a2e774849cb7489c97a183097cb4f693aa
+size 438856837
model/new_version_v1/special_tokens_map.json ADDED
@@ -0,0 +1,10 @@
+{
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
model/new_version_v1/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v1}/training_args.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9965b76852aa7c689ed048738e3db2f0b386154cddbfb42d9da7a064a9f9231
-size 3195
+oid sha256:bed99c1c5b736531dce1c0822e055bb8ad10a1634b9235d9a35e7d6aa4356d58
+size 4091
model/{f1f881389fb38108e623689999ceaaaf398c5e92 → new_version_v1}/vocab.txt RENAMED
File without changes
model/{c3c3bdb7ad80396e69de171995e2038f900940c8 → new_version_v2}/config.json RENAMED
@@ -1,7 +1,7 @@
 {
-  "_name_or_path": "bert-base-uncased",
+  "_name_or_path": "/home/lorenpe2/project/hf_models/bert-base-uncased",
   "architectures": [
-    "BertForNextSentencePrediction"
+    "BertForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
@@ -18,8 +18,9 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
-  "transformers_version": "4.17.0",
+  "transformers_version": "4.30.0.dev0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
model/new_version_v2/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-SEQUENCE-CLASSIFICATION",
+  "description": "Model trained on subset of DailyDialogue, CommonDialogues, ChitChatDataset, AirDialogue and SODA. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/wfsx1sga/overview?workspace=user-petr-lorenc"
+}
model/new_version_v2/meta-info.json ADDED
@@ -0,0 +1,28 @@
+{
+  "args": [],
+  "kwargs": {
+    "model_package": "transformers",
+    "model_class": "AutoModelForSequenceClassification",
+    "data_root": "/home/lorenpe2/project/data",
+    "data_sources": [],
+    "pretrained_model": "bert-base-uncased",
+    "tokenizer": "bert-base-uncased",
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]",
+    "learning_rate": 5e-07,
+    "warmup_ratio": 0.1,
+    "freeze_prefinetuning": true,
+    "prefinenuting_epoch": 10,
+    "finetuning_epochs": 75
+  },
+  "tokenizer_args": {
+    "padding": "max_length",
+    "max_length_ctx": 256,
+    "max_length_res": 40,
+    "truncation": "only_first",
+    "return_tensors": "np",
+    "is_split_into_words": true,
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]"
+  }
+}
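
Unlike the older checkpoints, new_version_v2 declares the stock transformers class AutoModelForSequenceClassification in its meta-info.json, and the commit also ships onnx/model.onnx files next to several checkpoints. A possible export recipe is sketched below; the repo only contains the resulting ONNX files, so the opset, input names, and dynamic axes here are assumptions, and the directory is assumed to also hold the weights and tokenizer files (this view is truncated at 50 files):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

path = "model/new_version_v2"
model = AutoModelForSequenceClassification.from_pretrained(path).eval()
tokenizer = AutoTokenizer.from_pretrained(path)

# Return plain tuples instead of ModelOutput objects so tracing stays simple.
model.config.return_dict = False

dummy = tokenizer("context [unused1] context", "response", return_tensors="pt")
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
    f"{path}/onnx/model.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["logits"],
    dynamic_axes={name: {0: "batch", 1: "sequence"}
                  for name in ["input_ids", "attention_mask", "token_type_ids"]},
    opset_version=14,
)
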