biomed-multi-alignment

Running

App Files Files Community

biomed-multi-alignment / new_app.py

matanninio

new_app now works for ppi

4c8737b 7 months ago

raw

history blame

9.77 kB

	import gradio as gr
	import torch
	from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
	from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
	from mammal.keys import *
	from mammal.model import Mammal
	from abc import ABC, abstractmethod
	class MammalObjectBroker():
	    """Holds a model path and lazily loads the Mammal model and its tokenizer.

	    Nothing is loaded at construction time; the model and the tokenizer op are
	    created on first access of the ``model`` / ``tokenizer_op`` properties and
	    cached for later accesses.

	    Attributes:
	        model_path: path/identifier handed to ``from_pretrained``.
	        name: display name of the model; defaults to ``model_path``.
	        tasks: names of the tasks this model can serve (empty if none given).
	    """

	    def __init__(self, model_path: str, name: str = None, task_list: list[str] = None) -> None:
	        """Store the configuration without loading anything.

	        Args:
	            model_path: path/identifier handed to ``from_pretrained``.
	            name: optional display name; ``model_path`` is used when omitted.
	            task_list: optional list of task names supported by this model.
	        """
	        self.model_path = model_path
	        self.name = model_path if name is None else name

	        # Always define ``tasks`` (the original wrote the empty default to
	        # ``self.task``, leaving ``tasks`` undefined). Copy the list so later
	        # changes to the caller's list do not leak into this object.
	        self.tasks = list(task_list) if task_list is not None else []

	        # Lazily populated caches, see the properties below.
	        self._model = None
	        self._tokenizer_op = None

	    @property
	    def model(self) -> "Mammal":
	        """Return the model, loading it and switching it to eval mode on first use."""
	        if self._model is None:
	            self._model = Mammal.from_pretrained(self.model_path)
	            self._model.eval()
	        return self._model

	    @property
	    def tokenizer_op(self):
	        """Return the tokenizer op, loading it from ``model_path`` on first use."""
	        if self._tokenizer_op is None:
	            self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
	        return self._tokenizer_op





	class MammalTask(ABC):
	    """Abstract description of a demo task that is served by a Mammal model.

	    Concrete tasks provide prompt generation, sample-dict creation, output
	    decoding and a gradio demo group. The demo group is built once and cached.
	    """

	    def __init__(self, name: str) -> None:
	        self.name = name
	        self.description = None
	        # Cached gradio demo; created on the first call to ``demo``.
	        self._demo = None

	    @abstractmethod
	    def generate_prompt(self, **kwargs) -> str:
	        """Build the prompt string for this task from task-specific inputs.

	        Args:
	            **kwargs: task-specific inputs, defined by the concrete task.

	        Returns:
	            str: the prompt.
	        """
	        raise NotImplementedError()

	    @abstractmethod
	    def crate_sample_dict(self, prompt: str, **kwargs) -> dict:
	        """Turn a prompt into a sample dict that can be fed to the model.

	        Args:
	            prompt (str): prompt produced by ``generate_prompt``.
	            **kwargs: task-specific extras, defined by the concrete task.

	        Returns:
	            dict: sample_dict for feeding into model
	        """
	        raise NotImplementedError()

	    # Not marked abstract: the base implementation simply raises.
	    def run_model(self, sample_dict, model:Mammal):
	        """Run the model on a sample dict; the base implementation always raises."""
	        raise NotImplementedError()

	    @abstractmethod
	    def create_demo(self, model_name_dropdown):
	        """Create the gradio demo group for this task.

	        Args:
	            model_name_dropdown: component passed through from ``demo``.

	        Returns:
	            The object that ``demo`` caches and returns.
	        """
	        raise NotImplementedError()

	    def demo(self, model_name_dropdown=None):
	        """Return the cached demo, creating it with ``create_demo`` on first use."""
	        if self._demo is not None:
	            return self._demo
	        self._demo = self.create_demo(model_name_dropdown=model_name_dropdown)
	        return self._demo

	    @abstractmethod
	    def decode_output(self, batch_dict, model:Mammal):
	        """Convert the model's output batch dict into the values shown to the user."""
	        raise NotImplementedError()



	# Module-level registries, filled in further down in this file:
	# task name -> task object, and model name -> model holder.
	all_tasks = {}
	all_models = {}

	class PpiTask(MammalTask):
	    """Protein-Protein Interaction (PPI) demo task.

	    Builds a prompt from two protein sequences, tokenizes it, calls
	    ``model.generate`` and reports the decoded output together with the score
	    read at the ``<1>`` token.
	    """

	    def __init__(self):
	        super().__init__(name="PPI")
	        self.description = "Protein-Protein Interaction (PPI)"
	        # Sequences used as the initial values of the two input text boxes.
	        self.examples = {
	            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
	            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
	        }
	        # Must be an f-string: a plain string would show "{self.description}"
	        # literally in the rendered heading.
	        self.markup_text = (
	            f"\n# Mammal based {self.description} demonstration\n"
	            "\n"
	            "Given two protein sequences, estimate if the proteins interact or not."
	        )

	    @staticmethod
	    def positive_token_id(model_holder: MammalObjectBroker):
	        """token for positive binding

	        Args:
	            model_holder (MammalObjectBroker): holder providing the tokenizer op

	        Returns:
	            int: id of positive binding token
	        """
	        return model_holder.tokenizer_op.get_token_id("<1>")

	    def generate_prompt(self, prot1: str, prot2: str) -> str:
	        """Formatting prompt to match pre-training syntax

	        Args:
	            prot1 (str): sequence of protein number 1
	            prot2 (str): sequence of protein number 2

	        Returns:
	            str: prompt
	        """
	        prompt = (
	            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
	            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
	            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
	            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
	            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
	        )
	        return prompt

	    def crate_sample_dict(self, prompt: str, model_holder: MammalObjectBroker):
	        """Tokenize the prompt into a sample dict for the model.

	        Args:
	            prompt (str): prompt produced by ``generate_prompt``.
	            model_holder (MammalObjectBroker): holder providing the tokenizer op.

	        Returns:
	            dict: sample dict holding the prompt, and the token ids and
	            attention mask converted to torch tensors.
	        """
	        # Create and load sample
	        sample_dict = {ENCODER_INPUTS_STR: prompt}

	        # Tokenize
	        sample_dict = model_holder.tokenizer_op(
	            sample_dict=sample_dict,
	            key_in=ENCODER_INPUTS_STR,
	            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
	            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
	        )
	        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
	            sample_dict[ENCODER_INPUTS_TOKENS]
	        )
	        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
	            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
	        )
	        return sample_dict

	    def run_model(self, sample_dict, model: Mammal):
	        """Call ``model.generate`` on a single sample and return its batch dict."""
	        # Generate Prediction
	        batch_dict = model.generate(
	            [sample_dict],
	            output_scores=True,
	            return_dict_in_generate=True,
	            max_new_tokens=5,
	        )
	        return batch_dict

	    def decode_output(self, batch_dict, model_holder):
	        """Extract the decoded text and the positive-token score.

	        Args:
	            batch_dict: output of ``run_model``.
	            model_holder (MammalObjectBroker): holder providing the tokenizer op.

	        Returns:
	            tuple: (decoded output string, score as a Python number)
	        """
	        # Get output
	        generated_output = model_holder.tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
	        # NOTE(review): the [0][1] indexing presumably selects the first sample
	        # and the second generated position - confirm against the model output.
	        positive_id = self.positive_token_id(model_holder)
	        score = batch_dict["model.out.scores"][0][1][positive_id].item()

	        return generated_output, score

	    def create_and_run_prompt(self, model_name, protein1, protein2):
	        """Run the full pipeline for the model registered under ``model_name``.

	        Returns:
	            tuple: (prompt, decoded output, score)
	        """
	        model_holder = all_models[model_name]
	        prompt = self.generate_prompt(protein1, protein2)
	        sample_dict = self.crate_sample_dict(prompt=prompt, model_holder=model_holder)
	        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
	        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
	        return res

	    def create_demo(self, model_name_dropdown):
	        """Build the gradio group holding the PPI inputs, run button and outputs.

	        Args:
	            model_name_dropdown: gradio component whose value is passed to
	                ``create_and_run_prompt`` as the model name.

	        Returns:
	            The ``gr.Group`` containing the PPI demo.
	        """
	        with gr.Group() as demo:
	            gr.Markdown(self.markup_text)
	            with gr.Row():
	                prot1 = gr.Textbox(
	                    label="Protein 1 sequence",
	                    # info="standard",
	                    interactive=True,
	                    lines=3,
	                    value=self.examples["protein_calmodulin"],
	                )
	                prot2 = gr.Textbox(
	                    label="Protein 2 sequence",
	                    # info="standard",
	                    interactive=True,
	                    lines=3,
	                    value=self.examples["protein_calcineurin"],
	                )
	            with gr.Row():
	                run_mammal = gr.Button(
	                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
	                )
	            with gr.Row():
	                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)

	            with gr.Row():
	                decoded = gr.Textbox(label="Mammal output")
	                # Created as a named component instead of inline in the
	                # ``outputs`` list, so its place in the layout is explicit.
	                score_box = gr.Number(label="PPI score")
	                run_mammal.click(
	                    fn=self.create_and_run_prompt,
	                    inputs=[model_name_dropdown, prot1, prot2],
	                    outputs=[prompt_box, decoded, score_box],
	                )
	            with gr.Row():
	                gr.Markdown(
	                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
	                )
	            demo.visible = True
	        return demo

	# Register the PPI task under its own name.
	ppi_task = PpiTask()
	all_tasks[ppi_task.name] = ppi_task

	# Register the model that serves the PPI task; its name defaults to the model path.
	ppi_model = MammalObjectBroker(
	    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m",
	    task_list=["PPI"],
	)
	all_models[ppi_model.name] = ppi_model

	# TODO: add the DTI model once its task list is defined, e.g.
	# tdi_model = MammalTrainedModel(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd")
	# all_models.append(tdi_model)


	def create_application():
	    """Assemble the top-level gradio application.

	    The app has a task dropdown, a model dropdown listing the registered models
	    that support the selected task, and the PPI demo group.

	    Returns:
	        The ``gr.Blocks`` application (not yet launched).
	    """

	    def models_for_task(task_name):
	        """Return the names of the registered models whose tasks include ``task_name``."""
	        return [
	            model_name
	            for model_name, model in all_models.items()
	            if task_name in model.tasks
	        ]

	    def task_change(value):
	        """Refresh the model dropdown after the selected task changed."""
	        choices = models_for_task(value)
	        # Always return an explicit update. Returning nothing when no model
	        # matches would leave the previous task's models listed as choices.
	        return gr.update(choices=choices, value=choices[0] if choices else None)

	    def set_ppi_vis(main_text):
	        """Keep the PPI group visible whichever task is selected."""
	        print(f"main text is {main_text}")
	        return gr.Group(visible=True)
	        # return gr.Group(visible=(main_text == "PPI"))
	        # , gr.Group( visible=(main_text == "DTI") )

	    with gr.Blocks() as demo:
	        task_dropdown = gr.Dropdown(
	            choices=["select demo"] + list(all_tasks.keys()),
	            interactive=True,
	        )
	        # No task is selected yet, so this normally starts out empty.
	        model_name_dropdown = gr.Dropdown(
	            choices=models_for_task(task_dropdown.value),
	            interactive=True,
	        )
	        task_dropdown.change(
	            task_change, inputs=[task_dropdown], outputs=[model_name_dropdown]
	        )

	        ppi_demo = all_tasks["PPI"].demo(model_name_dropdown=model_name_dropdown)
	        ppi_demo.visible = True
	        # dtb_demo = create_tdb_demo()

	        task_dropdown.change(
	            set_ppi_vis, inputs=task_dropdown, outputs=[ppi_demo]
	        )
	    return demo

	# Set by main(); None until the application has been built.
	full_demo = None


	def main():
	    """Build the application, keep it in the module-level ``full_demo`` and launch it."""
	    global full_demo
	    app = create_application()
	    full_demo = app
	    app.launch(show_error=True, share=False)


	if __name__ == "__main__":
	    main()