Spaces:

lorenpe2
/

next-sentence-probability

Runtime error

App Files Files Community

next-sentence-probability / app.py

lorenpe2

FEAT: new models; reload the model each time something changes (not ideal, but better than st.cache_resource)

822e1b3 about 2 years ago

raw

history blame

7.52 kB

	import os
	import glob
	import json
	from typing import Dict, List, Tuple, Union

	import torch
	import pandas
	import streamlit as st
	import matplotlib.pyplot as plt


	from inference_tokenizer import NextSentencePredictionTokenizer
	from models import get_class
	from models import OwnBertForNextSentencePrediction

	def get_model(_model_path):
	    """Load the model stored in ``_model_path`` and call ``eval()`` on it.

	    The directory must contain a ``meta-info.json`` file. Its ``"kwargs"``
	    entry may name the ``model_package`` and ``model_class`` to load; when
	    absent they default to ``"transformers"`` and
	    ``"BertForNextSentencePrediction"``.

	    Args:
	        _model_path: Directory holding the model files and ``meta-info.json``.

	    Returns:
	        The object returned by ``model_class.from_pretrained(_model_path)``
	        after ``eval()`` has been called on it.

	    Raises:
	        FileNotFoundError: If ``meta-info.json`` is missing from the directory.
	    """
	    print(f"Getting model at {_model_path}")
	    meta_info_path = os.path.join(_model_path, "meta-info.json")
	    if not os.path.isfile(meta_info_path):
	        raise FileNotFoundError("Model is provided without meta-info.json. Cannot infer proper configuration!")

	    with open(meta_info_path, "r", encoding="utf-8") as f:
	        meta_info = json.load(f)
	    _model_package = meta_info["kwargs"].get("model_package", "transformers")
	    _model_class = meta_info["kwargs"].get("model_class", "BertForNextSentencePrediction")

	    # get_class resolves the (package, class name) pair to the class object.
	    model_class = get_class(_model_package, _model_class)
	    _model = model_class.from_pretrained(_model_path)
	    _model.eval()
	    return _model


	def get_tokenizer(tokenizer_path):
	    """Build the inference tokenizer configured by ``meta-info.json``.

	    ``meta-info.json`` in ``tokenizer_path`` must provide ``"tokenizer_args"``
	    (forwarded as keyword arguments to ``NextSentencePredictionTokenizer``)
	    and ``"kwargs"["special_token"]``. Any special token other than a single
	    space is registered on the BERT tokenizer as an additional special token.

	    Args:
	        tokenizer_path: Directory holding the tokenizer files and
	            ``meta-info.json``.

	    Returns:
	        A ``NextSentencePredictionTokenizer`` wrapping the loaded
	        ``BertTokenizer``.

	    Raises:
	        FileNotFoundError: If ``meta-info.json`` is missing from the directory.
	    """
	    print(f"Getting tokenizer at {tokenizer_path}")
	    # Validate the configuration first so a bad path fails fast, before the
	    # comparatively expensive transformers import and tokenizer load.
	    meta_info_path = os.path.join(tokenizer_path, "meta-info.json")
	    if not os.path.isfile(meta_info_path):
	        raise FileNotFoundError("Tokenizer is provided without meta-info.json. Cannot infer proper configuration!")

	    with open(meta_info_path, "r", encoding="utf-8") as f:
	        meta_info = json.load(f)
	    tokenizer_args = meta_info["tokenizer_args"]
	    special_token = meta_info["kwargs"]["special_token"]

	    from transformers import BertTokenizer
	    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

	    # A single space is the marker for "no extra special token".
	    if special_token != " ":
	        tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
	    _inference_tokenizer = NextSentencePredictionTokenizer(tokenizer, **tokenizer_args)
	    return _inference_tokenizer


	# Discover every model directory that ships an info.json descriptor and
	# index the descriptors by their "model" name.
	models_path = glob.glob("./model/*/info.json")
	models = {}
	for info_file in models_path:
	    with open(info_file, "r") as handle:
	        descriptor = json.load(handle)
	    # Remember the directory (with trailing separator) the descriptor came from.
	    descriptor["path"] = info_file.replace("info.json", "")
	    models[descriptor["model"]] = descriptor


	# Offer the discovered models in alphabetical order; the first one is preselected.
	available_model_names = sorted(models.keys())
	model_name = st.selectbox('Which model do you want to use?',
	                          available_model_names,
	                          index=0)

	# The model and its tokenizer live in the same directory and are loaded on
	# every run of the script.
	model_path = models[model_name]["path"]
	model = get_model(model_path)
	inference_tokenizer = get_tokenizer(model_path)


	def get_evaluation_data_from_json(_context: List) -> List[Tuple[List, str, str]]:
	    """Flatten aggregated JSON records into ``(context, sentence, label)`` tuples.

	    Each record is a dict with a ``"context"`` entry and an ``"answers"``
	    mapping of ``source -> {label: [sentences]}``. One tuple is produced per
	    sentence, in record / source / label / sentence order.
	    """
	    return [
	        (record["context"], candidate, label)
	        for record in _context
	        for answers_of_source in record["answers"].values()
	        for label, candidates in answers_of_source.items()
	        for candidate in candidates
	    ]


	def get_evaluation_data_from_dialogue(_context: List, max_context: int = 5) -> List[Tuple[List, str, Union[str, None]]]:
	    """Expand a dialogue into ``(context, sentence, None)`` evaluation tuples.

	    Every turn except the first is treated as the sentence to score. For each
	    such turn, the up-to-``max_context`` preceding turns form a window, and
	    one tuple is emitted for every non-empty suffix of that window.

	    Args:
	        _context: The dialogue turns in order.
	        max_context: Maximum number of preceding turns used as context.
	            Values below 1 yield no tuples.

	    Returns:
	        A list of ``(context_turns, sentence, None)`` tuples. For each
	        sentence the full window comes first, followed by the suffixes of
	        length 1, 2, ..., ``len(window) - 1``.
	    """
	    output_data = []
	    for idx, actual_sentence in enumerate(_context):
	        if idx == 0:
	            continue
	        actual_context = _context[max(0, idx - max_context):idx]
	        if not actual_context:
	            continue
	        # Full window first, then progressively longer suffixes. Spelled out
	        # explicitly instead of relying on ``seq[-0:]`` returning the whole list.
	        suffix_lengths = [len(actual_context), *range(1, len(actual_context))]
	        for length in suffix_lengths:
	            output_data.append((actual_context[-length:], actual_sentence, None))
	    return output_data


	# Input modes offered to the user; the two-digit prefix is what the form
	# below matches on.
	input_type_choices = [
	    "01 - String (one turn per line)",
	    "02 - JSON (aggregated)",
	    "03 - JSON (example CA-OOD)",
	    "04 - JSON (example Elysai)",
	    "05 - Diagnostic mode",
	]
	option = st.selectbox("Choose type of input:", input_type_choices)


	def _predict_follow_probabilities(_context, _sentence):
	    """Score ``_sentence`` against ``_context`` with the loaded model.

	    Returns the softmax outputs at index 0 and index 1, which the app treats
	    as "follow" and "not follow" respectively.
	    """
	    input_tensor = inference_tokenizer.get_item(context=_context, actual_sentence=_sentence)
	    # Pure inference: no gradients are needed.
	    with torch.no_grad():
	        logits = model(**input_tensor.data).logits
	    probabilities = torch.softmax(logits, dim=-1).detach().numpy()[0]
	    return probabilities[0], probabilities[1]


	with st.form("input_text"):
	    if "01" in option:
	        # Single query: free-text context plus the turn to score.
	        context = st.text_area("Insert context here (one turn per line):")
	        actual_text = st.text_input("Insert current turn:")
	        context = [turn for turn in context.split("\n") if turn.strip()]

	        submitted = st.form_submit_button("Submit")
	        if submitted:
	            # Run the model only once the user actually submits the form.
	            prop_follow, prop_not_follow = _predict_follow_probabilities(context, actual_text)
	            fig, ax = plt.subplots()
	            ax.pie([prop_follow, prop_not_follow], labels=["Probability - Follow", "Probability - Not Follow"],
	                   autopct='%1.1f%%')
	            st.pyplot(fig)
	    elif "02" in option or "03" in option or "04" in option:
	        # Batch evaluation from JSON; "03"/"04" pre-fill the text area with
	        # bundled examples, "02" starts empty.
	        from data.example_data import ca_ood, elysai
	        example_by_option = {"03": ca_ood, "04": elysai}
	        option: str
	        option = option.split("-")[0].strip()
	        if option in example_by_option:
	            text = json.dumps(example_by_option[option])
	        else:
	            text = ""
	        context = st.text_area("Insert JSON here:", value=str(text))

	        # Stay defined (and empty) when the text area holds no JSON yet.
	        data_for_evaluation = []
	        if "{" in context:
	            data_for_evaluation = get_evaluation_data_from_json(_context=json.loads(context))
	        results = []
	        accuracy = []

	        submitted = st.form_submit_button("Submit")
	        if submitted:
	            for c, s, human_label in data_for_evaluation:
	                prop_follow, prop_not_follow = _predict_follow_probabilities(c, s)

	                results.append((c, s, human_label, prop_follow, prop_not_follow))
	                # A prediction counts as correct when the more probable class
	                # agrees with the human label.
	                if human_label == "coherent":
	                    accuracy.append(int(prop_follow > prop_not_follow))
	                else:
	                    accuracy.append(int(prop_not_follow > prop_follow))
	            if accuracy:
	                # Shown with a percent sign, so scale the fraction to 0-100.
	                st.metric(label="Accuracy", value=f"{100 * sum(accuracy) / len(accuracy):.1f} %")
	            else:
	                st.warning("No evaluation data found in the provided JSON.")
	            df = pandas.DataFrame(results, columns=["Context", "Query", "Human Label", "Probability (follow)",
	                                                    "Probability (not-follow)"])
	            st.dataframe(df)
	    elif "05" in option:
	        # Diagnostic mode: score every turn against each suffix of its history.
	        context = st.text_area("Insert dialogue here (one turn per line):")
	        submitted = st.form_submit_button("Submit")
	        if submitted:
	            aggregated_result = []
	            # Drop blank lines, matching how mode "01" treats its context.
	            dialogue_turns = [turn for turn in context.split("\n") if turn.strip()]
	            data_for_evaluation = get_evaluation_data_from_dialogue(dialogue_turns)
	            for c, s, _ in data_for_evaluation:
	                prop_follow, prop_not_follow = _predict_follow_probabilities(c, s)

	                aggregated_result.append((c, s, prop_follow))
	            st.table(aggregated_result)

	# List every available model with its description, ordered by model name.
	st.markdown("## Description of models:")
	for model_info in sorted(models.values(), key=lambda info: info["model"]):
	    summary_line = model_info["model"] + " - " + model_info["description"]
	    st.write(str(summary_line))