lorenpe2 committed
Commit 2b6660e · 1 parent: 7bf60b0

FEAT: New model

Files changed (23)
  1. app.py +43 -31
  2. inference_tokenizer.py +11 -6
  3. model/3c090bb39725194fae09a603caac8c3d9014df49/config.json +26 -0
  4. model/3c090bb39725194fae09a603caac8c3d9014df49/info.json +4 -0
  5. model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json +1 -0
  6. model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin +3 -0
  7. model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json +1 -0
  8. model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json +1 -0
  9. model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin +3 -0
  10. model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt +0 -0
  11. model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json +4 -0
  12. model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json +1 -0
  13. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
  14. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
  15. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
  16. model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt} +0 -0
  17. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json +4 -0
  18. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json +1 -0
  19. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
  20. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
  21. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
  22. model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt} +0 -0
  23. requirements.txt +0 -1
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import glob
 import json
 from typing import Dict, List, Tuple, Union
 
@@ -21,52 +22,59 @@ def get_model(model_path):
 @st.cache_resource
 def get_tokenizer(tokenizer_path):
     from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained(os.path.join(tokenizer_path, "tokenizer"))
-    tokenizer_args = {
-        "padding": "max_length",
-        "max_length_ctx": 256,
-        "max_length_res": 64,
-        "truncation": "only_first",
-        "return_tensors": "np",
-        # will be transfer to tensor later during the training (because of some memory problem with tensors)
-        "is_split_into_words": True,
-    }
-    special_token = " "
-    # todo better than hardcoded
-    if tokenizer_path == "./model/e09d71f55f4b6fc20135f856bf029322a3265d8d":
-        special_token = "[unused1]"
+    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+    if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
+        with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
+            meta_info = json.load(f)
+            tokenizer_args = meta_info["tokenizer_args"]
+            special_token = meta_info["kwargs"]["special_token"]
+    else:
+        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot infer proper configuration!")
+
+    if special_token != " ":
         tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
-    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, special_token=special_token, **tokenizer_args)
+    print(special_token)
+    print(tokenizer_args)
+    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
     return _inference_tokenizer
 
 
-model_option = st.selectbox(
-    'Which model do you want to use?',
-    ('./model/c3c3bdb7ad80396e69de171995e2038f900940c8', './model/e09d71f55f4b6fc20135f856bf029322a3265d8d'))
+models_path = glob.glob("./model/*/info.json")
+models = {}
+for model_path in models_path:
+    with open(model_path, "r") as f:
+        model_data = json.load(f)
+        model_data["path"] = model_path.replace("info.json", "")
+        models[model_data["model"]] = model_data
 
-model = get_model(model_option)
-inference_tokenizer = get_tokenizer(model_option)
+model_name = st.selectbox('Which model do you want to use?',
+                          (x for x in sorted(models.keys())))
 
+model_path = models[model_name]["path"]
 
-def get_evaluation_data(_context: List, special_delimiter=" "):
+model = get_model(model_path)
+inference_tokenizer = get_tokenizer(model_path)
+
+
+def get_evaluation_data(_context: List) -> List[Tuple[List, str, str]]:
     output_data = []
     for _dict in _context:
         _dict: Dict
-        c = special_delimiter.join(_dict["context"])
         for source in _dict["answers"].values():
             for _t, sentences in source.items():
                 for sentence in sentences:
-                    output_data.append([c, sentence, _t])
+                    output_data.append((_dict["context"], sentence, _t))
     return output_data
 
 
-option = st.selectbox("Choose type of evaluation:",
-                      ["01 - Raw text (one line)", "02 - JSON (aggregated)"])
+option = st.selectbox("Choose type of input:",
+                      ["01 - String (one turn per line)", "02 - JSON (aggregated)"])
 
 with st.form("input_text"):
     if "01" in option:
-        context = st.text_area("Insert context here (sentences divided by ||):")
-        actual_text = st.text_input("Actual text")
+        context = st.text_area("Insert context here (one turn per line):")
+        actual_text = st.text_input("Insert current turn:")
+        context = list(filter(lambda x: len(x.strip()) >= 1, context.split("\n")))
 
         input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
        output_model = model(**input_tensor.data).logits
@@ -75,7 +83,6 @@ with st.form("input_text"):
         prop_follow = output_model[0]
         prop_not_follow = output_model[1]
 
-        # Every form must have a submit button.
         submitted = st.form_submit_button("Submit")
         if submitted:
             fig, ax = plt.subplots()
@@ -83,12 +90,12 @@ with st.form("input_text"):
                    autopct='%1.1f%%')
             st.pyplot(fig)
     elif "02" in option:
-        context = st.text_area("Insert JSON here")
+        context = st.text_area("Insert JSON here:")
         if "{" in context:
             evaluation_data = get_evaluation_data(_context=json.loads(context))
             results = []
             accuracy = []
-            # Every form must have a submit button.
+
             submitted = st.form_submit_button("Submit")
             if submitted:
                 for datapoint in evaluation_data:
@@ -105,5 +112,10 @@ with st.form("input_text"):
                     else:
                         accuracy.append(int(prop_not_follow > prop_follow))
                 st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %")
-                df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)", "Probability (not-follow)"])
+                df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
+                                                        "Probability (not-follow)"])
                 st.dataframe(df)
+
+st.markdown("## Description of models:")
+for x in sorted(models.values(), key=lambda x: x["model"]):
+    st.write((str(x["model"] + " - " + x["description"])))
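
With this change, app.py no longer hard-codes checkpoint paths: every directory under ./model/ that contains an info.json (selectbox label and description) and a meta-info.json (tokenizer configuration) is discovered via glob and offered in the UI. Below is a minimal sketch, added for illustration, of registering an additional checkpoint directory; the directory name and values are placeholders, only the keys mirror what get_tokenizer and the committed info.json files use.

# Hypothetical example: adding another checkpoint so the glob("./model/*/info.json")
# discovery above picks it up. Directory name and values are placeholders; the keys
# match what app.py reads (info.json -> "model"/"description",
# meta-info.json -> "tokenizer_args" and "kwargs"["special_token"]).
import json
import os

model_dir = "./model/my-new-checkpoint"  # hypothetical directory name
os.makedirs(model_dir, exist_ok=True)

with open(os.path.join(model_dir, "info.json"), "w") as f:
    json.dump({"model": "BERT-NSP-v4",  # label shown in the selectbox (placeholder)
               "description": "Short human-readable description shown in the app."}, f, indent=2)

with open(os.path.join(model_dir, "meta-info.json"), "w") as f:
    json.dump({"kwargs": {"special_token": "[unused1]"},
               "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256,
                                  "max_length_res": 40, "truncation": "only_first",
                                  "return_tensors": "np", "is_split_into_words": True,
                                  "naive_approach": True, "special_token": "[unused1]"}}, f)

The same directory also needs the usual Hugging Face artifacts (config.json, pytorch_model.bin, vocab.txt, tokenizer_config.json, special_tokens_map.json), as added for the new model in this commit, so that get_model and BertTokenizer.from_pretrained can load from that path.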
inference_tokenizer.py CHANGED
@@ -1,21 +1,26 @@
 import torch
-from typing import Dict
+from typing import Dict, List
 
 
 class NextSentencePredictionTokenizer:
 
-    def __init__(self, _tokenizer, special_token, **_tokenizer_args):
+    def __init__(self, _tokenizer, **_tokenizer_args):
         self.tokenizer = _tokenizer
         self.tokenizer_args = _tokenizer_args
         self.max_length_ctx = self.tokenizer_args.get("max_length_ctx")
         self.max_length_res = self.tokenizer_args.get("max_length_res")
+        self.special_token = self.tokenizer_args.get("special_token")
+        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
+
+        # cleaning
+        del self.tokenizer_args["special_token"]
+        del self.tokenizer_args["naive_approach"]
         del self.tokenizer_args["max_length_ctx"]
         del self.tokenizer_args["max_length_res"]
-        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
-        self.special_token = special_token
 
-    def get_item(self, context: str, actual_sentence: str):
-        actual_item = {"ctx": context.replace("||", self.special_token), "res": actual_sentence}
+    def get_item(self, context: List[str], actual_sentence: str):
+        context_str = f" {self.special_token} ".join(context) if self.special_token != " " else " ".join(context)
+        actual_item = {"ctx": context_str, "res": actual_sentence}
         tokenized = self._tokenize_row(actual_item)
 
         for key in tokenized.data.keys():
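
get_item now takes the context as a list of turns rather than a single "||"-delimited string and joins the turns with the configured special token (or a plain space). A minimal usage sketch, assuming the checkpoint directory and meta-info.json layout added in this commit; the dialogue turns are illustrative.

# Illustrative usage of the new interface; the directory refers to files added in
# this commit, the example turns are made up.
import json
import os
from transformers import BertTokenizer
from inference_tokenizer import NextSentencePredictionTokenizer

model_dir = "./model/3c090bb39725194fae09a603caac8c3d9014df49"
with open(os.path.join(model_dir, "meta-info.json"), "r") as f:
    meta_info = json.load(f)

tokenizer = BertTokenizer.from_pretrained(model_dir)
special_token = meta_info["kwargs"]["special_token"]
if special_token != " ":
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})

nsp_tokenizer = NextSentencePredictionTokenizer(tokenizer, **meta_info["tokenizer_args"])

# Context is a list of turns; get_item joins them with the special token internally.
context = ["Hi, how are you?", "Fine, thanks. Any plans for the weekend?"]
item = nsp_tokenizer.get_item(context=context, actual_sentence="I am going hiking.")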
model/3c090bb39725194fae09a603caac8c3d9014df49/config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "bert-base-cased",
+  "architectures": [
+    "BertForNextSentencePrediction"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 28996
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v3",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-cased", "tokenizer": "bert-base-cased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
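
Note how these tokenizer_args are consumed: NextSentencePredictionTokenizer does not forward max_length_ctx, max_length_res, special_token, or naive_approach to the underlying tokenizer; it sums the two length limits into a single max_length and drops the helper keys. A small sketch of the effective setting for this checkpoint:

# Effective max_length derived from the committed tokenizer_args
# (mirrors the arithmetic in NextSentencePredictionTokenizer.__init__).
max_length_ctx = 256   # token budget for the joined context turns
max_length_res = 40    # token budget for the candidate response
max_length = max_length_ctx + max_length_res  # 296 tokens per (context, response) pair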
model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:627fe3220abd88a13cdd5e4befc4b7d8ec31412ed55d9e97c03c7aaf73b95b01
+size 433334133
model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
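
Because "[unused1]" is registered under additional_special_tokens, the tokenizer keeps it as a single token when it appears between context turns. A short sketch of that behavior, assuming the checkpoint directory added in this commit; the sample sentence and the expected split are illustrative.

# The special divider token is not split into word pieces once it is registered
# as an additional special token (loaded here from the committed tokenizer files).
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("./model/3c090bb39725194fae09a603caac8c3d9014df49")
print(tok.tokenize("Hello there [unused1] How are you?"))
# roughly: ['Hello', 'there', '[unused1]', 'How', 'are', 'you', '?']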
model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efb6633e7e9dbd843917b80027f14e34338c15f651ec8cfe995646b0415e76a2
+size 3195
model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v1",
+  "description": "Model trained on DailyDialogue. Context is taken as is - no separation of individual turns. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/dm6ab7ma/logs?workspace=user-petr-lorenc"
+}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": " ", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": " "}}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json} RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v2",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json} RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt} RENAMED
File without changes
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 torch
 transformers
-streamlit
 matplotlib
 numpy
 pandas