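"""Streamlit demo for next-sentence-prediction (dialogue coherence) models.

Each directory under ./model/ that ships an info.json shows up in the model
picker; the selected checkpoint scores whether a candidate turn coherently
follows the given dialogue context.
"""
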
import os
import glob
import json
from typing import Dict, List, Tuple

import torch
import pandas
import streamlit as st
import matplotlib.pyplot as plt

from inference_tokenizer import NextSentencePredictionTokenizer


@st.cache_resource
def get_model(model_path):
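    """Load the next-sentence-prediction model once and cache it across Streamlit reruns."""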
    from transformers import BertForNextSentencePrediction
    _model = BertForNextSentencePrediction.from_pretrained(model_path)
    _model.eval()
    return _model


@st.cache_resource
def get_tokenizer(tokenizer_path):
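    """Build the inference tokenizer, configured from the checkpoint's meta-info.json."""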
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
        with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
            meta_info = json.load(f)
            tokenizer_args = meta_info["tokenizer_args"]
            special_token = meta_info["kwargs"]["special_token"]
    else:
        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot infer the proper configuration!")

    if special_token != " ":
        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    print(special_token)
    print(tokenizer_args)
    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
    return _inference_tokenizer


# Collect every checkpoint shipped with the app: each ./model/<name>/ directory
# must contain an info.json with at least "model" and "description" keys.
models_path = glob.glob("./model/*/info.json")
models = {}
for model_path in models_path:
    with open(model_path, "r") as f:
        model_data = json.load(f)
        model_data["path"] = model_path.replace("info.json", "")
        models[model_data["model"]] = model_data

model_name = st.selectbox('Which model do you want to use?',
                          sorted(models.keys()))

model_path = models[model_name]["path"]

model = get_model(model_path)
inference_tokenizer = get_tokenizer(model_path)


def get_evaluation_data(_context: List[Dict]) -> List[Tuple[List, str, str]]:
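    """Flatten aggregated JSON into (context, sentence, label) triples.

    Expected input shape (inferred from the loops below; key names other than
    "context", "answers" and "coherent" are illustrative):
    [{"context": ["turn 1", ...],
      "answers": {"source-name": {"coherent": ["good turn", ...],
                                  "incoherent": ["bad turn", ...]}}}]
    """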
    output_data = []
    for _dict in _context:
        for source in _dict["answers"].values():
            for _t, sentences in source.items():
                for sentence in sentences:
                    output_data.append((_dict["context"], sentence, _t))
    return output_data


option = st.selectbox("Choose type of input:",
                      ["01 - String (one turn per line)", "02 - JSON (aggregated)"])

with st.form("input_text"):
    if "01" in option:
        context = st.text_area("Insert context here (one turn per line):")
        actual_text = st.text_input("Insert current turn:")
        # Keep only non-empty turns.
        context = list(filter(lambda x: len(x.strip()) >= 1, context.split("\n")))

        submitted = st.form_submit_button("Submit")
        if submitted:
            # Run inference only once the form is submitted.
            input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
            with torch.no_grad():
                output_model = model(**input_tensor.data).logits
            output_model = torch.softmax(output_model, dim=-1).numpy()[0]
            prop_follow = output_model[0]
            prop_not_follow = output_model[1]

            fig, ax = plt.subplots()
            ax.pie([prop_follow, prop_not_follow], labels=["Probability - Follow", "Probability - Not Follow"],
                   autopct='%1.1f%%')
            st.pyplot(fig)
    elif "02" in option:
        context = st.text_area("Insert JSON here:")
        if "{" in context:
            evaluation_data = get_evaluation_data(_context=json.loads(context))
        results = []
        accuracy = []

        submitted = st.form_submit_button("Submit")
        if submitted:
            for datapoint in evaluation_data:
                c, s, human_label = datapoint
                input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s)
                output_model = model(**input_tensor.data).logits
                output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0]
                prop_follow = output_model[0]
                prop_not_follow = output_model[1]

                results.append((c, s, human_label, prop_follow, prop_not_follow))
                if human_label == "coherent":
                    accuracy.append(int(prop_follow > prop_not_follow))
                else:
                    accuracy.append(int(prop_not_follow > prop_follow))
            st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %")
            df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
                                                    "Probability (not-follow)"])
            st.dataframe(df)

st.markdown("## Description of models:")
for model_info in sorted(models.values(), key=lambda m: m["model"]):
    st.write(f"{model_info['model']} - {model_info['description']}")