biomed-multi-alignment

Sleeping

App Files Files Community

matanninio committed on Dec 1, 2024

Commit

4c8737b

1 Parent(s): 83ccd79

new_app now works for ppi

Browse files

Files changed (1) hide show

new_app.py +297 -0

new_app.py ADDED Viewed

	@@ -0,0 +1,297 @@

+import gradio as gr
+import torch
+from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
+from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
+from mammal.keys import *
+from mammal.model import Mammal
+from abc import ABC, abstractmethod
+class MammalObjectBroker():
+    def __init__(self, model_path: str, name:str= None, task_list: list[str]=None) -> None:
+        self.model_path = model_path
+        if name is None:
+            name = model_path
+        self.name = name
+        if task_list is not None:
+            self.tasks=task_list
+        else:
+            self.task = []
+        self._model = None
+        self._tokenizer_op = None
+    @property
+    def model(self)-> Mammal:
+        if self._model is None:
+            self._model =  Mammal.from_pretrained(self.model_path)
+            self._model.eval()
+        return self._model
+    @property
+    def tokenizer_op(self):
+        if self._tokenizer_op is None:
+            self._tokenizer_op =  ModularTokenizerOp.from_pretrained(self.model_path)
+        return self._tokenizer_op
+class MammalTask(ABC):
+    def __init__(self, name:str) -> None:
+            self.name = name
+            self.description = None
+            self._demo = None
+    @abstractmethod
+    def generate_prompt(self, **kwargs) -> str:
+        """Formatting prompt to match pre-training syntax
+        Args:
+            prot1 (_type_): _description_
+            prot2 (_type_): _description_
+        Raises:
+            No: _description_
+        """
+        raise NotImplementedError()
+    @abstractmethod
+    def crate_sample_dict(self, prompt: str, **kwargs) -> dict:
+        """Formatting prompt to match pre-training syntax
+        Args:
+            prompt (str): _description_
+        Returns:
+            dict: sample_dict for feeding into model
+        """
+        raise NotImplementedError()
+    # @abstractmethod
+    def run_model(self, sample_dict, model:Mammal):
+        raise NotImplementedError()
+    @abstractmethod
+    def create_demo(self, model_name_dropdown):
+        """create an gradio demo group
+        Returns:
+            _type_: _description_
+        """
+        raise NotImplementedError()
+    def demo(self,model_name_dropdown=None):
+        if self._demo is None:
+            self._demo = self.create_demo(model_name_dropdown=model_name_dropdown)
+        return self._demo
+    @abstractmethod
+    def decode_output(self,batch_dict, model:Mammal):
+        raise NotImplementedError()
+    #self._setup()
+    # def _setup(self):
+    #     pass
+all_tasks = dict()
+all_models= dict()
+class PpiTask(MammalTask):
+    def __init__(self):
+        super().__init__(name="PPI")
+        self.description = "Protein-Protein Interaction (PPI)"
+        self.examples = {
+            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
+            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
+        }
+        self.markup_text = """
+    # Mammal based {self.description} demonstration
+    Given two protein sequences, estimate if the proteins interact or not."""
+    @staticmethod
+    def positive_token_id(model_holder: MammalObjectBroker):
+        """token for positive binding
+        Args:
+            model (MammalTrainedModel): model holding tokenizer
+        Returns:
+            int: id of positive binding token
+        """
+        return model_holder.tokenizer_op.get_token_id("<1>")
+    def generate_prompt(self, prot1, prot2):
+        """Formatting prompt to match pre-training syntax
+        Args:
+            prot1 (str): sequance of protein number 1
+            prot2 (str): sequance of protein number 2
+        Returns:
+            str: prompt
+        """
+        prompt =  "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"\
+            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"\
+            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"\
+            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"\
+            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
+        return prompt
+    def crate_sample_dict(self,prompt: str, model_holder:MammalObjectBroker):
+        # Create and load sample
+        sample_dict = dict()
+        sample_dict[ENCODER_INPUTS_STR] = prompt
+        # Tokenize
+        sample_dict = model_holder.tokenizer_op(
+            sample_dict=sample_dict,
+            key_in=ENCODER_INPUTS_STR,
+            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
+            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
+        )
+        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
+            sample_dict[ENCODER_INPUTS_TOKENS]
+        )
+        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
+            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
+        )
+        return sample_dict
+    def run_model(self, sample_dict, model: Mammal):
+        # Generate Prediction
+        batch_dict = model.generate(
+            [sample_dict],
+            output_scores=True,
+            return_dict_in_generate=True,
+            max_new_tokens=5,
+        )
+        return batch_dict
+    def decode_output(self,batch_dict, model_holder):
+        # Get output
+        generated_output = model_holder.tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
+        score = batch_dict["model.out.scores"][0][1][self.positive_token_id(model_holder)].item()
+        return generated_output, score
+    def create_and_run_prompt(self,model_name,protein1, protein2):
+        model_holder = all_models[model_name]
+        prompt = self.generate_prompt(protein1, protein2)
+        sample_dict = self.crate_sample_dict(prompt=prompt, model_holder=model_holder)
+        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
+        res = prompt, *self.decode_output(batch_dict,model_holder=model_holder)
+        return res
+    def create_demo(self,model_name_dropdown):
+    # """
+    # ### Using the model from
+    # ```{model} ```
+    # """
+        with gr.Group() as demo:
+            gr.Markdown(self.markup_text)
+            with gr.Row():
+                prot1 = gr.Textbox(
+                    label="Protein 1 sequence",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["protein_calmodulin"],
+                )
+                prot2 = gr.Textbox(
+                    label="Protein 2 sequence",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["protein_calcineurin"],
+                )
+            with gr.Row():
+                run_mammal = gr.Button(
+                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
+                )
+            with gr.Row():
+                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
+            with gr.Row():
+                decoded = gr.Textbox(label="Mammal output")
+                run_mammal.click(
+                    fn=self.create_and_run_prompt,
+                    inputs=[model_name_dropdown, prot1, prot2],
+                    outputs=[prompt_box, decoded, gr.Number(label="PPI score")],
+                )
+            with gr.Row():
+                gr.Markdown(
+                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
+                )
+            demo.visible = True
+            return demo
+ppi_task = PpiTask()
+all_tasks[ppi_task.name]=ppi_task
+ppi_model = MammalObjectBroker(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m", task_list=["PPI"])
+all_models[ppi_model.name]=ppi_model
+# tdi_model = MammalTrainedModel(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd")  TODO: ## task list still empty
+# all_models.append(tdi_model)
+def create_application():
+    def task_change(value):
+        choices=[model_name for model_name, model in all_models.items() if value in model.tasks]
+        if choices:
+            return  gr.update(choices=choices, value=choices[0])
+        else:
+            return
+        # return model_name_dropdown
+    with gr.Blocks() as demo:
+        task_dropdown = gr.Dropdown(choices=["select demo"] + list(all_tasks.keys()))
+        task_dropdown.interactive = True
+        model_name_dropdown = gr.Dropdown(choices=[model_name for model_name, model in all_models.items() if task_dropdown.value in model.tasks], interactive=True)
+        task_dropdown.change(task_change,inputs=[task_dropdown],outputs=[model_name_dropdown])
+        ppi_demo = all_tasks["PPI"].demo(model_name_dropdown = model_name_dropdown)
+        ppi_demo.visible = True
+        # dtb_demo = create_tdb_demo()
+        def set_ppi_vis(main_text):
+            main_text=main_text
+            print(f"main text is {main_text}")
+            return gr.Group(visible=True)
+            #return gr.Group(visible=(main_text == "PPI"))
+        # , gr.Group(                visible=(main_text == "DTI")            )
+        task_dropdown.change(
+            set_ppi_vis, inputs=task_dropdown, outputs=[ppi_demo]
+        )
+        return demo
+full_demo=None
+def main():
+    global full_demo
+    full_demo = create_application()
+    full_demo.launch(show_error=True, share=False)
+if __name__ == "__main__":
+    main()