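"""Streamlit demo: judge whether a dialogue turn coherently follows its context
with a BERT next-sentence-prediction (NSP) model.

Layout inferred from the code below: each ./model/<name>/ directory holds a saved
transformers checkpoint plus an info.json ({"model": ..., "description": ...})
and a meta-info.json carrying the tokenizer configuration.
"""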
import os
import glob
import json
from typing import Dict, List, Tuple, Union

import torch
import pandas
import streamlit as st
import matplotlib.pyplot as plt

from inference_tokenizer import NextSentencePredictionTokenizer
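
# Loaders take the directory of the selected model. BertForNextSentencePrediction
# yields two logits; by the transformers convention, index 0 means "sentence B
# follows sentence A" and index 1 means it does not.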
def get_model(model_path):
    from transformers import BertForNextSentencePrediction
    _model = BertForNextSentencePrediction.from_pretrained(model_path)
    _model.eval()
    return _model
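
# The tokenizer directory must ship a meta-info.json describing how the dialogue
# context is flattened into a single NSP input. Assumed shape (not verified
# against the training code): {"tokenizer_args": {...}, "kwargs": {"special_token": "..."}}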
def get_tokenizer(tokenizer_path):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    if os.path.isfile(os.path.join(tokenizer_path, "meta-info.json")):
        with open(os.path.join(tokenizer_path, "meta-info.json"), "r") as f:
            meta_info = json.load(f)
        tokenizer_args = meta_info["tokenizer_args"]
        special_token = meta_info["kwargs"]["special_token"]
    else:
        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot infer proper configuration!")
    if special_token != " ":
        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
    print(special_token)  # debug output
    print(tokenizer_args)
    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
    return _inference_tokenizer
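
# Discover every bundled model through its info.json; the containing directory
# becomes the path handed to get_model/get_tokenizer.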
models_path = glob.glob("./model/*/info.json")
models = {}
for model_path in models_path:
    with open(model_path, "r") as f:
        model_data = json.load(f)
    model_data["path"] = model_path.replace("info.json", "")
    models[model_data["model"]] = model_data

model_name = st.selectbox("Which model do you want to use?",
                          sorted(models.keys()))
model_path = models[model_name]["path"]
model = get_model(model_path)
inference_tokenizer = get_tokenizer(model_path)
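
# The aggregated-JSON modes expect a list of records shaped as the access pattern
# below implies. Field names come from the code; "some_source" and "incoherent"
# are illustrative (any label other than "coherent" is scored as not-follow):
# [
#   {
#     "context": ["first turn", "second turn"],
#     "answers": {
#       "some_source": {
#         "coherent": ["a follow-up that fits"],
#         "incoherent": ["a follow-up that does not"]
#       }
#     }
#   }
# ]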
def get_evaluation_data_from_json(_context: List) -> List[Tuple[List, str, str]]:
    output_data = []
    for _dict in _context:
        _dict: Dict
        for source in _dict["answers"].values():
            for _t, sentences in source.items():
                for sentence in sentences:
                    output_data.append((_dict["context"], sentence, _t))
    return output_data
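
# Diagnostic mode: score every turn after the first against each suffix of its
# (up to five-turn) context, so one can see how much history the model needs
# before it accepts the turn.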
def get_evaluation_data_from_dialogue(_context: List) -> List[Tuple[List, str, Union[str, None]]]:
    output_data = []
    for idx, _line in enumerate(_context):
        if idx == 0:
            continue
        actual_context = _context[max(0, idx - 5):idx]
        actual_sentence = _line
        # One datapoint per context window size (1..len(actual_context)).
        for window in range(1, len(actual_context) + 1):
            output_data.append((actual_context[-window:], actual_sentence, None))
    return output_data
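
# UI: one form whose body depends on the chosen input mode.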
option = st.selectbox("Choose type of input:",
                      ["01 - String (one turn per line)",
                       "02 - JSON (aggregated)",
                       "03 - JSON (example CA-OOD)",
                       "04 - JSON (example Elysai)",
                       "05 - Diagnostic mode"])
with st.form("input_text"):
    if "01" in option:
        context = st.text_area("Insert context here (one turn per line):")
        actual_text = st.text_input("Insert current turn:")
        submitted = st.form_submit_button("Submit")
        if submitted:
            # Run inference only after submission so the model never sees empty input.
            context = list(filter(lambda x: len(x.strip()) >= 1, context.split("\n")))
            input_tensor = inference_tokenizer.get_item(context=context, actual_sentence=actual_text)
            output_model = model(**input_tensor.data).logits
            output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0]
            prop_follow = output_model[0]
            prop_not_follow = output_model[1]
            fig, ax = plt.subplots()
            ax.pie([prop_follow, prop_not_follow],
                   labels=["Probability - Follow", "Probability - Not Follow"],
                   autopct="%1.1f%%")
            st.pyplot(fig)
elif "02" in option or "03" in option or "04" in option: | |
from data.example_data import ca_ood, elysai | |
choices = [ca_ood, elysai] | |
option: str | |
# > Python 3.10 | |
# match option.split("-")[0].strip(): | |
# case "03": | |
# text = json.dumps(choices[0]) | |
# case "04": | |
# text = json.dumps(choices[1]) | |
# case _: | |
# text = "" | |
option = option.split("-")[0].strip() | |
if option == "03": | |
text = json.dumps(choices[0]) | |
elif option == "04": | |
text = json.dumps(choices[1]) | |
else: | |
test = "" | |
context = st.text_area("Insert JSON here:", value=str(text)) | |
if "{" in context: | |
data_for_evaluation = get_evaluation_data_from_json(_context=json.loads(context)) | |
results = [] | |
accuracy = [] | |
submitted = st.form_submit_button("Submit") | |
if submitted: | |
for datapoint in data_for_evaluation: | |
c, s, human_label = datapoint | |
input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s) | |
output_model = model(**input_tensor.data).logits | |
output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0] | |
prop_follow = output_model[0] | |
prop_not_follow = output_model[1] | |
results.append((c, s, human_label, prop_follow, prop_not_follow)) | |
if human_label == "coherent": | |
accuracy.append(int(prop_follow > prop_not_follow)) | |
else: | |
accuracy.append(int(prop_not_follow > prop_follow)) | |
st.metric(label="Accuracy", value=f"{sum(accuracy) / len(accuracy)} %") | |
df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)", | |
"Probability (not-follow)"]) | |
st.dataframe(df) | |
elif "05" in option: | |
context = st.text_area("Insert dialogue here (one turn per line):") | |
submitted = st.form_submit_button("Submit") | |
if submitted: | |
aggregated_result = [] | |
data_for_evaluation = get_evaluation_data_from_dialogue(context.split("\n")) | |
for datapoint in data_for_evaluation: | |
c, s, _ = datapoint | |
input_tensor = inference_tokenizer.get_item(context=c, actual_sentence=s) | |
output_model = model(**input_tensor.data).logits | |
output_model = torch.softmax(output_model, dim=-1).detach().numpy()[0] | |
prop_follow = output_model[0] | |
prop_not_follow = output_model[1] | |
aggregated_result.append((c, s, prop_follow)) | |
st.table(aggregated_result) | |

st.markdown("## Description of models:")
for x in sorted(models.values(), key=lambda x: x["model"]):
    st.write(x["model"] + " - " + x["description"])
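
# To run locally (standard Streamlit invocation; the script name is assumed):
#   streamlit run app.py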