# Scraped file-view header (not part of the program):
# author "matanninio", commit 4c8737b "new_app now works for ppi", 9.77 kB.
import gradio as gr
import torch
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
from mammal.keys import *
from mammal.model import Mammal
from abc import ABC, abstractmethod
class MammalObjectBroker:
    """Hold a model path and lazily load the matching model and tokenizer.

    Nothing is loaded at construction time. The model and the tokenizer op are
    each created on first access of the corresponding property and then reused.

    Attributes:
        model_path: Path or identifier handed to ``from_pretrained``.
        name: Lookup/display name; falls back to ``model_path``.
        tasks: Names of the tasks this model is registered for (possibly empty).
    """

    def __init__(self, model_path: str, name: str = None, task_list: list[str] = None) -> None:
        """Store the configuration; loading is deferred to the properties.

        Args:
            model_path: Path or identifier handed to ``from_pretrained``.
            name: Optional name; ``model_path`` is used when omitted.
            task_list: Optional task names served by this model.
        """
        self.model_path = model_path
        self.name = model_path if name is None else name
        # Always define ``tasks``, even without a task list, because callers
        # filter models with ``task_name in broker.tasks``. The list is copied
        # so later changes to the caller's list do not leak in.
        self.tasks = [] if task_list is None else list(task_list)
        self._model = None
        self._tokenizer_op = None

    @property
    def model(self) -> "Mammal":
        """Return the model, loading it and switching it to eval mode on first use."""
        if self._model is None:
            self._model = Mammal.from_pretrained(self.model_path)
            self._model.eval()
        return self._model

    @property
    def tokenizer_op(self):
        """Return the tokenizer op, loading it from ``model_path`` on first use."""
        if self._tokenizer_op is None:
            self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
        return self._tokenizer_op
class MammalTask(ABC):
    """Abstract base class for a demo task driven by a Mammal model.

    A concrete task provides prompt generation, sample-dict creation, output
    decoding and a gradio demo. The demo is built once and then cached.
    """

    def __init__(self, name: str) -> None:
        """Store the task name and start with no description and no demo."""
        self.name = name
        self.description = None
        self._demo = None

    @abstractmethod
    def generate_prompt(self, **kwargs) -> str:
        """Build the prompt string for this task.

        Args:
            **kwargs: Task-specific inputs, defined by the concrete task.

        Returns:
            str: The prompt.
        """
        raise NotImplementedError()

    @abstractmethod
    def crate_sample_dict(self, prompt: str, **kwargs) -> dict:
        """Turn a prompt into the sample dict that is fed to the model.

        Args:
            prompt (str): Prompt string for this task.
            **kwargs: Task-specific extras, defined by the concrete task.

        Returns:
            dict: sample_dict for feeding into model
        """
        raise NotImplementedError()

    # Not decorated with @abstractmethod, so subclasses are not forced to
    # override it; the base version only raises.
    def run_model(self, sample_dict, model: Mammal):
        """Run ``model`` on ``sample_dict``; must be provided by the subclass."""
        raise NotImplementedError()

    @abstractmethod
    def create_demo(self, model_name_dropdown):
        """Create the gradio demo group for this task.

        Args:
            model_name_dropdown: Model selector passed through from ``demo``.

        Returns:
            The demo object built by the concrete task.
        """
        raise NotImplementedError()

    def demo(self, model_name_dropdown=None):
        """Return the cached demo, building it with ``create_demo`` on first use."""
        if self._demo is not None:
            return self._demo
        self._demo = self.create_demo(model_name_dropdown=model_name_dropdown)
        return self._demo

    @abstractmethod
    def decode_output(self, batch_dict, model: Mammal):
        """Extract the task result from the model's output batch."""
        raise NotImplementedError()
#self._setup()
# def _setup(self):
# pass
# Module-level registries, keyed by name: tasks in one, model brokers in the other.
all_tasks = {}
all_models = {}
class PpiTask(MammalTask):
    """Protein-protein interaction (PPI) demo task.

    Builds a binding-affinity-class prompt from two protein sequences, runs the
    selected model on it, and reports the decoded output together with the
    score read at the positive ("<1>") token.
    """

    def __init__(self):
        """Set the task name, description, example sequences and intro text."""
        super().__init__(name="PPI")
        self.description = "Protein-Protein Interaction (PPI)"
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Must be an f-string: otherwise the heading shows the literal
        # "{self.description}" placeholder instead of the task description.
        self.markup_text = (
            f"\n# Mammal based {self.description} demonstration\n"
            "Given two protein sequences, estimate if the proteins interact or not."
        )

    @staticmethod
    def positive_token_id(model_holder: MammalObjectBroker):
        """Return the id of the token that marks positive binding.

        Args:
            model_holder (MammalObjectBroker): broker whose tokenizer is queried

        Returns:
            int: id of the "<1>" token
        """
        return model_holder.tokenizer_op.get_token_id("<1>")

    def generate_prompt(self, prot1, prot2):
        """Format the two sequences into the prompt syntax used by the model.

        Args:
            prot1 (str): sequence of protein number 1
            prot2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        prompt = (
            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
        )
        return prompt

    def crate_sample_dict(self, prompt: str, model_holder: MammalObjectBroker):
        """Tokenize ``prompt`` into a sample dict holding tensor inputs.

        Args:
            prompt (str): prompt produced by ``generate_prompt``
            model_holder (MammalObjectBroker): broker providing the tokenizer op

        Returns:
            dict: sample dict with the prompt, its token ids and attention mask
        """
        sample_dict = {ENCODER_INPUTS_STR: prompt}

        # The tokenizer op writes the token ids and the attention mask into
        # the sample dict under the given output keys.
        sample_dict = model_holder.tokenizer_op(
            sample_dict=sample_dict,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )

        # Convert both tokenizer outputs to tensors before they reach the model.
        for key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
            sample_dict[key] = torch.tensor(sample_dict[key])
        return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        """Generate a prediction for a single sample.

        Args:
            sample_dict: sample produced by ``crate_sample_dict``
            model (Mammal): model used for generation

        Returns:
            The batch dict returned by ``model.generate`` (scores included).
        """
        batch_dict = model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )
        return batch_dict

    def decode_output(self, batch_dict, model_holder):
        """Decode the generated tokens and read the positive-class score.

        Args:
            batch_dict: output of ``run_model``
            model_holder (MammalObjectBroker): broker providing the tokenizer

        Returns:
            tuple: (decoded output string, score at the positive token)
        """
        generated_output = model_holder.tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
        # NOTE(review): presumably [0] selects the only sample in the batch and
        # [1] the generated position that holds the class token - confirm.
        score = batch_dict["model.out.scores"][0][1][self.positive_token_id(model_holder)].item()
        return generated_output, score

    def create_and_run_prompt(self, model_name, protein1, protein2):
        """Run the full pipeline for one pair of proteins.

        Args:
            model_name: key of the model in the ``all_models`` registry
            protein1 (str): sequence of protein number 1
            protein2 (str): sequence of protein number 2

        Returns:
            tuple: (prompt, decoded output, score)

        Raises:
            gr.Error: if ``model_name`` is not a registered model, e.g. when
                no model has been selected yet.
        """
        model_holder = all_models.get(model_name)
        if model_holder is None:
            # Show a readable message in the UI instead of a bare KeyError.
            raise gr.Error("Please select a task and a model before running the prompt.")
        prompt = self.generate_prompt(protein1, protein2)
        sample_dict = self.crate_sample_dict(prompt=prompt, model_holder=model_holder)
        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
        return res

    def create_demo(self, model_name_dropdown):
        """Build the gradio group with the PPI inputs, run button and outputs.

        Args:
            model_name_dropdown: dropdown whose value selects the model to run

        Returns:
            gr.Group: the group containing the PPI demo widgets
        """
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)
            with gr.Row():
                prot1 = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                prot2 = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )
            with gr.Row():
                run_mammal = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
                )
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
            with gr.Row():
                decoded = gr.Textbox(label="Mammal output")
            # Created at group level right after the output row, so it is
            # rendered in the same place as before while having a name.
            score_box = gr.Number(label="PPI score")
            run_mammal.click(
                fn=self.create_and_run_prompt,
                inputs=[model_name_dropdown, prot1, prot2],
                outputs=[prompt_box, decoded, score_box],
            )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
        demo.visible = True
        return demo
# Register the PPI task under its own name.
ppi_task = PpiTask()
all_tasks[ppi_task.name] = ppi_task

# Register the model used for PPI; with no explicit name given, the broker's
# name (and therefore the registry key) is the model path.
ppi_model = MammalObjectBroker(
    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m",
    task_list=["PPI"],
)
all_models[ppi_model.name] = ppi_model
# tdi_model = MammalTrainedModel(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd") TODO: ## task list still empty
# all_models.append(tdi_model)
def create_application():
    """Build the gradio application with a task selector and a model selector.

    The model dropdown is refreshed whenever the task dropdown changes, and the
    PPI demo group is wired to the same model dropdown.

    Returns:
        gr.Blocks: the assembled application (not launched).
    """

    def models_for(task_name):
        """Return the names of registered models that list ``task_name``."""
        return [
            model_name
            for model_name, model in all_models.items()
            if task_name in model.tasks
        ]

    def task_change(value):
        """Refresh the model dropdown for the newly selected task."""
        choices = models_for(value)
        # Always return an update: when no model matches, the dropdown is
        # emptied rather than left showing the previous task's models.
        return gr.update(choices=choices, value=choices[0] if choices else None)

    def set_ppi_vis(main_text):
        """Keep the PPI group visible whatever task is selected."""
        print(f"main text is {main_text}")
        return gr.Group(visible=True)

    with gr.Blocks() as demo:
        task_dropdown = gr.Dropdown(
            choices=["select demo"] + list(all_tasks.keys()),
            interactive=True,
        )
        # No task is selected yet, so this normally starts out empty.
        model_name_dropdown = gr.Dropdown(
            choices=models_for(task_dropdown.value),
            interactive=True,
        )
        task_dropdown.change(task_change, inputs=[task_dropdown], outputs=[model_name_dropdown])

        ppi_demo = all_tasks["PPI"].demo(model_name_dropdown=model_name_dropdown)
        ppi_demo.visible = True

        task_dropdown.change(set_ppi_vis, inputs=task_dropdown, outputs=[ppi_demo])
    return demo
# Module-level handle to the application; filled in by main().
full_demo = None


def main():
    """Create the application, publish it as ``full_demo`` and launch it."""
    global full_demo
    app = create_application()
    full_demo = app
    app.launch(show_error=True, share=False)


if __name__ == "__main__":
    main()