FEAT: new regression model
- app.py +39 -15
- model/new_version_v3/config.json +33 -0
- model/new_version_v3/info.json +4 -0
- model/new_version_v3/meta-info.json +28 -0
- model/new_version_v3/pytorch_model.bin +3 -0
- model/new_version_v3/special_tokens_map.json +10 -0
- model/new_version_v3/tokenizer.json +0 -0
- model/new_version_v3/tokenizer_config.json +13 -0
- model/new_version_v3/training_args.bin +3 -0
- model/new_version_v3/vocab.txt +0 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
 import os
 import re
 import numpy as np
+import scipy as sp
 import glob
 import json
 from typing import Dict, List, Tuple, Union
@@ -122,9 +123,14 @@ if "01" in option:
     input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
     output_model = model(**input_tensor.data).logits

-    output_model =
-
-
+    output_model = output_model.detach().numpy()[0]
+    if len(output_model) == 2:  # classification
+        output_model = sp.special.softmax(output_model, axis=-1)
+        prop_follow = output_model[0]
+        prop_not_follow = output_model[1]
+    elif len(output_model) == 1:  # regression
+        prop_follow = 1 - output_model[0]
+        prop_not_follow = 1 - prop_follow

     submitted = st.form_submit_button("Submit")
     if submitted:
@@ -165,13 +171,18 @@ if "02" in option or "03" in option or "04" in option or "06" in option:
     submitted = st.form_submit_button("Submit")
     if submitted:
         for idx, datapoint in enumerate(data_for_evaluation):
-            progres_bar.progress(idx/len(data_for_evaluation), text="Inference")
+            progres_bar.progress(idx / len(data_for_evaluation), text="Inference")
             c, s, human_label = datapoint
             input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s)
             output_model = model(**input_tensor.data).logits
-            output_model =
-
-
+            output_model = output_model.detach().numpy()[0]
+            if len(output_model) == 2:  # classification
+                output_model = sp.special.softmax(output_model, axis=-1)
+                prop_follow = output_model[0]
+                prop_not_follow = output_model[1]
+            elif len(output_model) == 1:  # regression
+                prop_follow = 1 - output_model[0]
+                prop_not_follow = 1 - prop_follow

             results.append((c, s, human_label, prop_follow, prop_not_follow))
             if human_label == "coherent":
@@ -192,15 +203,21 @@ if "05" in option:
     data_for_evaluation = get_evaluation_data_from_dialogue(_clean_conversational_line(context).split("\n"))
     lines = []
     scores = np.zeros(shape=(len(data_for_evaluation), context_size))
-    for datapoint in data_for_evaluation:
+    for idx, datapoint in enumerate(data_for_evaluation):
+        progres_bar.progress(idx / len(data_for_evaluation), text="Inference")
         for actual_sentence, contexts in datapoint.items():
             lines.append(actual_sentence)
             for c in contexts:
                 input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=actual_sentence)
                 output_model = model(**input_tensor.data).logits
-                output_model =
-
-
+                output_model = output_model.detach().numpy()[0]
+                if len(output_model) == 2:  # classification
+                    output_model = sp.special.softmax(output_model, axis=-1)
+                    prop_follow = output_model[0]
+                    prop_not_follow = output_model[1]
+                elif len(output_model) == 1:  # regression
+                    prop_follow = 1 - output_model[0]
+                    prop_not_follow = 1 - prop_follow
                 scores[len(lines) - 1][len(c) - 1] = prop_follow

     aggregated_result = []
@@ -210,8 +227,10 @@ if "05" in option:

 if "07" in option:
     from data.example_data import dbc
+
     select_conversation = st.selectbox("Which dialogue to evaluate", list(range(len(dbc))), index=0)
-    context = st.text_area("Insert dialogue here (one turn per line):",
+    context = st.text_area("Insert dialogue here (one turn per line):",
+                           value=json.dumps([dbc[int(select_conversation)]]))
     st.markdown("# Formatted form")
     context_json = json.loads(context)
     output = ""
@@ -236,9 +255,14 @@ if "07" in option:
             for c in contexts:
                 input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=actual_sentence)
                 output_model = model(**input_tensor.data).logits
-                output_model =
-
-
+                output_model = output_model.detach().numpy()[0]
+                if len(output_model) == 2:  # classification
+                    output_model = sp.special.softmax(output_model, axis=-1)
+                    prop_follow = output_model[0]
+                    prop_not_follow = output_model[1]
+                elif len(output_model) == 1:  # regression
+                    prop_follow = 1 - output_model[0]
+                    prop_not_follow = 1 - prop_follow
                 scores[len(lines) - 1][len(c) - 1] = prop_follow

     for idx, line in enumerate(lines):
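The block repeated in every option branch above is the new logit post-processing. Distilled into a standalone helper it reads as below; this is a sketch, the to_probabilities name is ours rather than the app's, and the regression branch treats the single score as a distance whose complement is prop_follow, exactly as in the diff:

from scipy.special import softmax  # app uses `import scipy as sp`; the explicit
                                   # submodule import is safer on older SciPy

def to_probabilities(logits):
    """Sketch of the diff's post-processing: logits -> (prop_follow, prop_not_follow)."""
    out = logits.detach().numpy()[0]   # first (and only) item of the batch
    if len(out) == 2:                  # classification head: two logits -> softmax
        out = softmax(out, axis=-1)
        return out[0], out[1]
    if len(out) == 1:                  # regression head: one score, 0 = follows
        prop_follow = 1 - out[0]
        return prop_follow, 1 - prop_follow
    raise ValueError(f"unexpected logit shape: {out.shape}")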
model/new_version_v3/config.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "_name_or_path": "/home/lorenpe2/project/hf_models/bert-base-uncased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "regression",
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
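The key lines in config.json are "problem_type": "regression" and the single entry in id2label: the checkpoint emits one logit per context-response pair, which is why app.py now needs the regression branch above. A minimal loading sketch, assuming the Space's relative model/new_version_v3 path:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("model/new_version_v3")
tokenizer = AutoTokenizer.from_pretrained("model/new_version_v3")

assert model.config.problem_type == "regression"
assert model.config.num_labels == 1  # single logit -> handled by the len(...) == 1 branch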
model/new_version_v3/info.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "model": "BERT-REGRESSION",
+  "description": "Model trained on subset of DailyDialogue, CommonDialogues, ChitChatDataset, AirDialogue and SODA. Using [unused1] token to divide sentences in context. More info can be found at https://wandb.ai/alquist/next-sentence-prediction/runs/66pz87ta/overview?workspace=user-petr-lorenc"
+}
model/new_version_v3/meta-info.json
ADDED
@@ -0,0 +1,28 @@
+{
+  "args": [],
+  "kwargs": {
+    "model_package": "transformers",
+    "model_class": "AutoModelForSequenceClassification",
+    "data_root": "/home/lorenpe2/project/data",
+    "data_sources": [],
+    "pretrained_model": "bert-base-uncased",
+    "tokenizer": "bert-base-uncased",
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]",
+    "learning_rate": 5e-07,
+    "warmup_ratio": 0.1,
+    "freeze_prefinetuning": true,
+    "prefinenuting_epoch": 10,
+    "finetuning_epochs": 75
+  },
+  "tokenizer_args": {
+    "padding": "max_length",
+    "max_length_ctx": 256,
+    "max_length_res": 40,
+    "truncation": "only_first",
+    "return_tensors": "np",
+    "is_split_into_words": true,
+    "approach": "IGNORE_DUPLICITIES",
+    "special_token": "[unused1]"
+  }
+}
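The tokenizer_args block mirrors what the Space's inference_tokenizer forwards to the underlying Hugging Face tokenizer; max_length_ctx and max_length_res are custom keys consumed by that wrapper, not standard transformers arguments. A hypothetical reconstruction of one such call (the summed length budget is our assumption):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model/new_version_v3")

context_words = "Hi there ! [unused1] How was your day ?".split()  # [unused1] divides turns
response_words = "Pretty good , thanks for asking .".split()

encoded = tokenizer(
    context_words,
    response_words,
    is_split_into_words=True,   # meta-info: inputs arrive pre-split into words
    padding="max_length",
    truncation="only_first",    # trim the context, never the response
    max_length=256 + 40,        # assumption: max_length_ctx + max_length_res combined
    return_tensors="np",
)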
model/new_version_v3/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e3abc0ca002886035ddc3b08b66e8931313e3a0f3dbf9866d3b04c25432100e
+size 438004853
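The three lines above are a Git LFS pointer, not the weights themselves: a clone without LFS yields only this text file, while the real payload is about 438 MB. One way to fetch the resolved file is via huggingface_hub; the repo id below is a placeholder, not taken from this page:

from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="<user>/<space-name>",   # hypothetical: the Space containing this commit
    repo_type="space",
    filename="model/new_version_v3/pytorch_model.bin",
)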
model/new_version_v3/special_tokens_map.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "additional_special_tokens": [
+    "[unused1]"
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
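Listing [unused1] under additional_special_tokens is what keeps the context divider atomic: the tokenizer neither lower-cases it nor splits it into word pieces. A quick check, assuming the files above are on disk:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model/new_version_v3")
print(tokenizer.additional_special_tokens)          # ['[unused1]']
print(tokenizer.tokenize("hello [unused1] world"))  # ['hello', '[unused1]', 'world']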
model/new_version_v3/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
model/new_version_v3/tokenizer_config.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
model/new_version_v3/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6e790d899c6f03e1990e4d4ff2bd7e63eead63ed41f89122176b62ddefd330e
+size 4091
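training_args.bin is likewise LFS-tracked: a pickled transformers TrainingArguments object of about 4 KB. To inspect the training setup beyond what meta-info.json records, it can be loaded back; a sketch, assuming a compatible transformers install:

import torch

# weights_only=False because this is a pickled TrainingArguments object,
# not a plain tensor state dict (required on PyTorch >= 2.6)
args = torch.load("model/new_version_v3/training_args.bin", weights_only=False)
print(args.learning_rate)  # should match the 5e-07 recorded in meta-info.json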
model/new_version_v3/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff