FEAT: New model
- app.py +43 -31
- inference_tokenizer.py +11 -6
- model/3c090bb39725194fae09a603caac8c3d9014df49/config.json +26 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/info.json +4 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json +1 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin +3 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json +1 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json +1 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin +3 -0
- model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt +0 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json +4 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json +1 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
- model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt} +0 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json +4 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json +1 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json} +0 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json} +0 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json} +0 -0
- model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt} +0 -0
- requirements.txt +0 -1
app.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import glob
 import json
 from typing import Dict, List, Tuple, Union
 
@@ -21,52 +22,59 @@ def get_model(model_path):
 @st.cache_resource
 def get_tokenizer(tokenizer_path):
     from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained(
-    special_token = " "
-    # todo better than hardcoded
-    if tokenizer_path == "./model/e09d71f55f4b6fc20135f856bf029322a3265d8d":
-        special_token = "[unused1]"
+    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
+    if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
+        with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
+            meta_info = json.load(f)
+        tokenizer_args = meta_info["tokenizer_args"]
+        special_token = meta_info["kwargs"]["special_token"]
+    else:
+        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot interfere proper configuration!")
+
+    if special_token != " ":
         tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
+    print(special_token)
+    print(tokenizer_args)
+    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
     return _inference_tokenizer
 
 
+models_path = glob.glob("./model/*/info.json")
+models = {}
+for model_path in models_path:
+    with open(model_path, "r") as f:
+        model_data = json.load(f)
+        model_data["path"] = model_path.replace("info.json", "")
+        models[model_data["model"]] = model_data
 
+model_name = st.selectbox('Which model do you want to use?',
+                          (x for x in sorted(models.keys())))
 
+model_path = models[model_name]["path"]
 
+model = get_model(model_path)
+inference_tokenizer = get_tokenizer(model_path)
+
+
+def get_evaluation_data(_context: List) -> List[Tuple[List, str, str]]:
     output_data = []
     for _dict in _context:
         _dict: Dict
-        c = special_delimiter.join(_dict["context"])
         for source in _dict["answers"].values():
             for _t, sentences in source.items():
                 for sentence in sentences:
-                    output_data.append([
+                    output_data.append((_dict["context"], sentence, _t))
     return output_data
 
 
-option = st.selectbox("Choose type of
-                      ["01 -
+option = st.selectbox("Choose type of input:",
+                      ["01 - String (one turn per line)", "02 - JSON (aggregated)"])
 
 with st.form("input_text"):
     if "01" in option:
-        context = st.text_area("Insert context here (
-        actual_text = st.text_input("
+        context = st.text_area("Insert context here (one turn per line):")
+        actual_text = st.text_input("Insert current turn:")
+        context = list(filter(lambda x: len(x.strip()) >= 1, context.split("\n")))
 
         input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
        output_model = model(**input_tensor.data).logits
@@ -75,7 +83,6 @@ with st.form("input_text"):
         prop_follow = output_model[0]
         prop_not_follow = output_model[1]
 
-        # Every form must have a submit button.
         submitted = st.form_submit_button("Submit")
         if submitted:
             fig, ax = plt.subplots()
@@ -83,12 +90,12 @@ with st.form("input_text"):
                    autopct='%1.1f%%')
             st.pyplot(fig)
     elif "02" in option:
-        context = st.text_area("Insert JSON here")
+        context = st.text_area("Insert JSON here:")
         if "{" in context:
             evaluation_data = get_evaluation_data(_context=json.loads(context))
             results = []
             accuracy = []
 
         submitted = st.form_submit_button("Submit")
         if submitted:
             for datapoint in evaluation_data:
@@ -105,5 +112,10 @@ with st.form("input_text"):
             else:
                 accuracy.append(int(prop_not_follow > prop_follow))
             st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %")
-            df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
+            df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
+                                                    "Probability (not-follow)"])
             st.dataframe(df)
+
+st.markdown("## Description of models:")
+for x in sorted(models.values(), key=lambda x: x["model"]):
+    st.write((str(x["model"] + " - " + x["description"])))
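Note on the "02 - JSON (aggregated)" mode: get_evaluation_data expects a list of dialogue records, each with a "context" field (list of previous turns) and an "answers" mapping of source name → label → candidate sentences. The sketch below shows such an input; the turns, the source name, and the label keys ("follow" / "not-follow") are illustrative assumptions, only the nesting is implied by the code above.

import json

# Hypothetical input for the JSON text area; the structure follows get_evaluation_data,
# all concrete values (turns, source name, label keys) are made up for illustration.
example = [
    {
        "context": ["Hi, how are you?", "I am fine, thanks."],
        "answers": {
            "some_source": {
                "follow": ["Glad to hear that!"],
                "not-follow": ["The train leaves at noon."]
            }
        }
    }
]
print(json.dumps(example, indent=2))  # paste the printed JSON into the text area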
inference_tokenizer.py
CHANGED
@@ -1,21 +1,26 @@
 import torch
-from typing import Dict
+from typing import Dict, List
 
 
 class NextSentencePredictionTokenizer:
 
-    def __init__(self, _tokenizer,
+    def __init__(self, _tokenizer, **_tokenizer_args):
         self.tokenizer = _tokenizer
         self.tokenizer_args = _tokenizer_args
         self.max_length_ctx = self.tokenizer_args.get("max_length_ctx")
         self.max_length_res = self.tokenizer_args.get("max_length_res")
+        self.special_token = self.tokenizer_args.get("special_token")
+        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
+
+        # cleaning
+        del self.tokenizer_args["special_token"]
+        del self.tokenizer_args["naive_approach"]
         del self.tokenizer_args["max_length_ctx"]
         del self.tokenizer_args["max_length_res"]
-        self.tokenizer_args["max_length"] = self.max_length_ctx + self.max_length_res
-        self.special_token = special_token
 
-    def get_item(self, context: str, actual_sentence: str):
-
+    def get_item(self, context: List[str], actual_sentence: str):
+        context_str = f" {self.special_token} ".join(context) if self.special_token != " " else " ".join(context)
+        actual_item = {"ctx": context_str, "res": actual_sentence}
         tokenized = self._tokenize_row(actual_item)
 
         for key in tokenized.data.keys():
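A minimal usage sketch of the reworked tokenizer, assuming the BERT-NSP-v2 checkpoint directory and its meta-info.json shown below; the example context and sentence are made up, everything else mirrors get_tokenizer in app.py.

import json
from transformers import BertTokenizer
from inference_tokenizer import NextSentencePredictionTokenizer

model_dir = "./model/e09d71f55f4b6fc20135f856bf029322a3265d8d"
with open(f"{model_dir}/meta-info.json", "r") as f:
    meta_info = json.load(f)

tokenizer = BertTokenizer.from_pretrained(model_dir)
special_token = meta_info["kwargs"]["special_token"]  # "[unused1]" for this model
if special_token != " ":
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})

# The constructor deletes special_token / naive_approach / max_length_* from tokenizer_args.
inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **meta_info["tokenizer_args"])
input_tensor = inference_tokenizer.get_item(
    context=["Hi, how are you?", "I am fine, thanks."],  # one turn per list item
    actual_sentence="Glad to hear that!",
)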
model/3c090bb39725194fae09a603caac8c3d9014df49/config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "bert-base-cased",
+  "architectures": [
+    "BertForNextSentencePrediction"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 28996
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/info.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v3",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. Improved training arguments (warmup, smaller learning rate). More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/3c090bb39725194fae09a603caac8c3d9014df49/meta-info.json
ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-cased", "tokenizer": "bert-base-cased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-07, "warmup_ratio": 0.1}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 40, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/3c090bb39725194fae09a603caac8c3d9014df49/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:627fe3220abd88a13cdd5e4befc4b7d8ec31412ed55d9e97c03c7aaf73b95b01
+size 433334133
model/3c090bb39725194fae09a603caac8c3d9014df49/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "additional_special_tokens": ["[unused1]"]}
model/3c090bb39725194fae09a603caac8c3d9014df49/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
model/3c090bb39725194fae09a603caac8c3d9014df49/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efb6633e7e9dbd843917b80027f14e34338c15f651ec8cfe995646b0415e76a2
+size 3195
model/3c090bb39725194fae09a603caac8c3d9014df49/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
model/c3c3bdb7ad80396e69de171995e2038f900940c8/info.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v1",
+  "description": "Model trained on DailyDialogue. Context is taken as is - no separation of individual turns. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/dm6ab7ma/logs?workspace=user-petr-lorenc"
+}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/meta-info.json
ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": " ", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": " "}}
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/special_tokens_map.json → special_tokens_map.json}
RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer.json → tokenizer.json}
RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/tokenizer_config.json → tokenizer_config.json}
RENAMED
File without changes
model/c3c3bdb7ad80396e69de171995e2038f900940c8/{tokenizer/vocab.txt → vocab.txt}
RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/info.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-NSP-v2",
+  "description": "Model trained on DailyDialogue and CommonDialogues. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/ll69cliu/logs?workspace=user-petr-lorenc"
+}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/meta-info.json
ADDED
@@ -0,0 +1 @@
+{"args": [], "kwargs": {"data_root": "/home/lorenpe2/project/data", "data_sources": [["COMMON_DIALOGUES", "common_dialogues/train.json", "common_dialogues/valid.json", "common_dialogues/test.json"], ["DAILY_DIALOGUES", "daily_dialogues/dialogues_text.train.txt", "daily_dialogues/dialogues_text.dev.txt", "daily_dialogues/dialogues_text.test.txt"]], "pretrained_model": "bert-base-uncased", "tokenizer": "bert-base-uncased", "naive_approach": true, "special_token": "[unused1]", "learning_rate": 5e-05}, "tokenizer_args": {"padding": "max_length", "max_length_ctx": 256, "max_length_res": 64, "truncation": "only_first", "return_tensors": "np", "is_split_into_words": true, "naive_approach": true, "special_token": "[unused1]"}}
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/special_tokens_map.json → special_tokens_map.json}
RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer.json → tokenizer.json}
RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/tokenizer_config.json → tokenizer_config.json}
RENAMED
File without changes
model/e09d71f55f4b6fc20135f856bf029322a3265d8d/{tokenizer/vocab.txt → vocab.txt}
RENAMED
File without changes
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
 torch
 transformers
-streamlit
 matplotlib
 numpy
 pandas