# biomed-multi-alignment
#
# Running
#
# File size: 9,768 Bytes
#
# 4c8737b

import gradio as gr
import torch
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
from mammal.keys import *
from mammal.model import Mammal
from abc import ABC, abstractmethod
class MammalObjectBroker:
    """Holds a model path and lazily loads the matching Mammal model and tokenizer.

    Nothing is loaded at construction time. The model and the tokenizer are
    each created on first access of the corresponding property and the same
    object is returned on every later access.

    Attributes:
        model_path: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.
        name: Name of this broker; defaults to ``model_path``.
        tasks: Names of the tasks this model is registered for.
    """

    def __init__(self, model_path: str, name: str = None, task_list: list[str] = None) -> None:
        """Record the model location and metadata without loading anything.

        Args:
            model_path: Identifier handed to ``from_pretrained``.
            name: Optional name; ``model_path`` is used when omitted.
            task_list: Optional task names this model supports; an empty
                list is used when omitted.
        """
        self.model_path = model_path
        self.name = model_path if name is None else name

        # ``tasks`` must exist on every instance: callers filter models with
        # ``value in model.tasks``. The list is copied so that later changes
        # to the caller's list do not silently alter this broker.
        self.tasks = [] if task_list is None else list(task_list)

        # Filled in on first access of the ``model`` / ``tokenizer_op`` properties.
        self._model = None
        self._tokenizer_op = None

    @property
    def model(self) -> "Mammal":
        """Return the model, loading it and switching it to eval mode on first access."""
        if self._model is None:
            self._model = Mammal.from_pretrained(self.model_path)
            self._model.eval()
        return self._model

    @property
    def tokenizer_op(self):
        """Return the tokenizer op, loading it from ``model_path`` on first access."""
        if self._tokenizer_op is None:
            self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
        return self._tokenizer_op
    
    
    
    

class MammalTask(ABC):
    """Abstract base class for a task exposed in the demo application.

    A concrete task provides prompt generation, sample-dict creation, output
    decoding and a Gradio demo group. The demo group is built once, on the
    first call to ``demo``, and cached on the instance.
    """

    def __init__(self, name: str) -> None:
        """Store the task name; description and demo start out unset.

        Args:
            name: Name of the task.
        """
        self._demo = None
        self.description = None
        self.name = name

    @abstractmethod
    def generate_prompt(self, **kwargs) -> str:
        """Format a prompt to match the pre-training syntax.

        Args:
            **kwargs: Task-specific inputs defined by the concrete subclass.

        Returns:
            str: The prompt.
        """
        raise NotImplementedError()

    @abstractmethod
    def crate_sample_dict(self, prompt: str, **kwargs) -> dict:
        """Create the sample dict for a prompt.

        Args:
            prompt (str): Prompt to place into the sample dict.
            **kwargs: Task-specific extras defined by the concrete subclass.

        Returns:
            dict: sample_dict for feeding into model
        """
        raise NotImplementedError()

    # Not enforced as abstract: the ``@abstractmethod`` decorator is disabled
    # here, so a subclass that omits this method only fails when it is called.
    def run_model(self, sample_dict, model: Mammal):
        """Run ``model`` on ``sample_dict``; the base implementation always raises."""
        raise NotImplementedError()

    @abstractmethod
    def create_demo(self, model_name_dropdown):
        """Create the Gradio demo group for this task.

        Args:
            model_name_dropdown: Passed through unchanged from ``demo``.

        Returns:
            The demo object that ``demo`` caches and returns.
        """
        raise NotImplementedError()

    def demo(self, model_name_dropdown=None):
        """Return the cached demo, building it with ``create_demo`` on first use.

        ``model_name_dropdown`` is only used by the call that builds the demo;
        later calls return the cached object and ignore the argument.
        """
        if self._demo is not None:
            return self._demo
        self._demo = self.create_demo(model_name_dropdown=model_name_dropdown)
        return self._demo

    @abstractmethod
    def decode_output(self, batch_dict, model: Mammal):
        """Decode the model output held in ``batch_dict``."""
        raise NotImplementedError()
    


# Module-level registries, filled in below and read by the UI code:
# ``all_tasks`` maps a task name to its task object and ``all_models`` maps a
# model name to its model holder.
all_tasks = {}
all_models = {}

class PpiTask(MammalTask):
    """Protein-Protein Interaction (PPI) task.

    Builds the PPI prompt from two protein sequences, tokenizes it, runs the
    model's ``generate`` and decodes the result, and provides the Gradio
    group that drives those steps from the UI.
    """

    def __init__(self):
        """Set the task name, description, example sequences and intro text."""
        super().__init__(name="PPI")
        self.description = "Protein-Protein Interaction (PPI)"
        # Default sequences pre-filled into the two text boxes of the demo.
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Must be an f-string: as a plain string the heading would show the
        # literal text "{self.description}" instead of the task description.
        self.markup_text = f"""
    # Mammal based {self.description} demonstration
    
    Given two protein sequences, estimate if the proteins interact or not."""

    @staticmethod
    def positive_token_id(model_holder: MammalObjectBroker):
        """Return the id of the token that marks positive binding.

        Args:
            model_holder (MammalObjectBroker): holder whose tokenizer is queried

        Returns:
            The id the tokenizer assigns to the ``<1>`` token.
        """
        return model_holder.tokenizer_op.get_token_id("<1>")

    def generate_prompt(self, prot1, prot2):
        """Format a prompt to match the pre-training syntax.

        Args:
            prot1 (str): sequence of protein number 1
            prot2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        prompt = (
            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
        )
        return prompt

    def crate_sample_dict(self, prompt: str, model_holder: MammalObjectBroker):
        """Create a tokenized sample dict from a prompt.

        Args:
            prompt (str): prompt as produced by ``generate_prompt``
            model_holder (MammalObjectBroker): holder providing the tokenizer

        Returns:
            dict: sample dict whose token ids and attention mask have been
            converted to torch tensors
        """
        # Create the sample and place the raw prompt under the encoder-input key.
        sample_dict = dict()
        sample_dict[ENCODER_INPUTS_STR] = prompt

        # Tokenize
        sample_dict = model_holder.tokenizer_op(
            sample_dict=sample_dict,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )

        # Convert the tokenizer outputs to tensors.
        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
            sample_dict[ENCODER_INPUTS_TOKENS]
        )
        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
        )
        return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        """Generate a prediction for a single sample.

        Args:
            sample_dict: tokenized sample as returned by ``crate_sample_dict``
            model (Mammal): model to run

        Returns:
            The batch dict returned by ``model.generate``.
        """
        batch_dict = model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )
        return batch_dict

    def decode_output(self, batch_dict, model_holder):
        """Decode the generated tokens and read the positive-class score.

        Args:
            batch_dict: batch dict as returned by ``run_model``
            model_holder: holder providing the tokenizer

        Returns:
            tuple: ``(generated_output, score)`` - the decoded prediction and
            the score stored for the positive (``<1>``) token.
        """
        # NOTE(review): index [0] presumably selects the single sample of the
        # batch and [1] the output position holding the class token - confirm
        # against the layout produced by Mammal.generate.
        generated_output = model_holder.tokenizer_op._tokenizer.decode(
            batch_dict[CLS_PRED][0]
        )
        positive_id = self.positive_token_id(model_holder)
        score = batch_dict["model.out.scores"][0][1][positive_id].item()

        return generated_output, score

    def create_and_run_prompt(self, model_name, protein1, protein2):
        """Build the prompt, run the selected model and decode its answer.

        Args:
            model_name: key of the model in ``all_models``
            protein1: sequence of protein number 1
            protein2: sequence of protein number 2

        Returns:
            tuple: ``(prompt, generated_output, score)``

        Raises:
            gr.Error: if ``model_name`` is not a registered model, e.g. when
                no model has been selected in the UI yet.
        """
        # Report a readable message in the UI instead of a bare KeyError.
        if model_name not in all_models:
            raise gr.Error("Please select a task and a model before running the prompt.")
        model_holder = all_models[model_name]
        prompt = self.generate_prompt(protein1, protein2)
        sample_dict = self.crate_sample_dict(prompt=prompt, model_holder=model_holder)
        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
        return res

    def create_demo(self, model_name_dropdown):
        """Create the Gradio group for the PPI demo.

        Args:
            model_name_dropdown: component whose value is passed to
                ``create_and_run_prompt`` as the model name

        Returns:
            The ``gr.Group`` holding all PPI widgets.
        """
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)
            with gr.Row():
                prot1 = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                prot2 = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )
            with gr.Row():
                run_mammal = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
                )
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)

            with gr.Row():
                decoded = gr.Textbox(label="Mammal output")
                # Created inside this row so the score is shown next to the
                # decoded output.
                score_box = gr.Number(label="PPI score")
                run_mammal.click(
                    fn=self.create_and_run_prompt,
                    inputs=[model_name_dropdown, prot1, prot2],
                    outputs=[prompt_box, decoded, score_box],
                )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
            demo.visible = True
            return demo

# Register the PPI task under its own name.
ppi_task = PpiTask()
all_tasks[ppi_task.name] = ppi_task

# Register the model offered for the PPI task. No explicit name is given, so
# it is registered under its model path.
ppi_model = MammalObjectBroker(
    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m",
    task_list=["PPI"],
)
all_models[ppi_model.name] = ppi_model

# TODO: register the DTI model
# ("ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd") once its task list
# is defined; it is not registered yet.


def create_application():
    """Build the Gradio application.

    The application has a task dropdown, a model dropdown that lists the
    models registered for the selected task, and the PPI demo group.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """

    def models_for_task(task_name):
        """Return the names of the registered models that list ``task_name``."""
        return [
            model_name
            for model_name, model in all_models.items()
            if task_name in model.tasks
        ]

    def task_change(value):
        """Update the model dropdown after the selected task changed."""
        choices = models_for_task(value)
        # Always send an explicit update: when no model matches, the choices
        # are cleared instead of leaving the previous task's models selectable.
        return gr.update(choices=choices, value=choices[0] if choices else None)

    with gr.Blocks() as demo:
        task_dropdown = gr.Dropdown(
            choices=["select demo"] + list(all_tasks.keys()),
            interactive=True,
        )
        model_name_dropdown = gr.Dropdown(
            choices=models_for_task(task_dropdown.value),
            interactive=True,
        )
        task_dropdown.change(task_change, inputs=[task_dropdown], outputs=[model_name_dropdown])

        ppi_demo = all_tasks["PPI"].demo(model_name_dropdown=model_name_dropdown)
        ppi_demo.visible = True

        def set_ppi_vis(main_text):
            """Return the visibility update for the PPI group on task change."""
            print(f"main text is {main_text}")
            # The PPI group is always shown, whatever task is selected.
            # NOTE(review): with more than one demo this presumably becomes
            # ``visible=(main_text == "PPI")`` plus one output per demo group.
            return gr.Group(visible=True)

        task_dropdown.change(
            set_ppi_vis, inputs=task_dropdown, outputs=[ppi_demo]
        )
        return demo

# Module-level handle to the application; stays None until main() has run.
full_demo = None


def main():
    """Create the application, publish it as ``full_demo`` and launch it."""
    global full_demo
    app = create_application()
    full_demo = app
    app.launch(share=False, show_error=True)


if __name__ == "__main__":
    main()