Spaces:

samsl
/

D-SCRIPT

Running on T4

App Files Files Community

samsl committed on Sep 15, 2023

Commit

e09f17f

1 Parent(s): d43f920

add tt3d with prostt5 prediction of 3di sequences

Browse files

Files changed (5) hide show

.gitignore +6 -0
app.py +123 -21
dscript_architecture1.png +0 -0
predict_3di.py +354 -0
requirements.txt +2 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__/
+models
+cnn_chkpnt
+foldseek
+*.fasta
+*.tar.gz

app.py CHANGED Viewed

@@ -1,52 +1,151 @@
 import gradio as gr
 import pandas as pd
 from pathlib import Path
 from Bio import SeqIO
 from dscript.pretrained import get_pretrained
 from dscript.language_model import lm_embed
 from tqdm.auto import tqdm
 from uuid import uuid4
 model_map = {
     "D-SCRIPT": "human_v1",
-    "Topsy-Turvy": "human_v2"
 }
-def predict(model, sequence_file, pairs_file):
     run_id = uuid4()
-    gr.Info("Loading model...")
-    _ = lm_embed("M")
-    model = get_pretrained(model_map[model])
-    gr.Info("Loading files...")
     try:
         seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
-    except ValueError as e:
-        gr.Error("Invalid FASTA file - duplicate entry")
     if Path(pairs_file.name).suffix == ".csv":
         pairs = pd.read_csv(pairs_file.name)
     elif Path(pairs_file.name).suffix == ".tsv":
         pairs = pd.read_csv(pairs_file.name, sep="\t")
-    pairs.columns = ["protein1", "protein2"]
-    gr.Info("Predicting...")
     results = []
-    progress = gr.Progress(track_tqdm=True)
-    for i, r in tqdm(pairs.iterrows(), total=len(pairs)):
-        gr.Info(f"[{i+1}/{len(pairs)}]")
         prot1 = r["protein1"]
         prot2 = r["protein2"]
         seq1 = str(seqs[prot1].seq)
         seq2 = str(seqs[prot2].seq)
         lm1 = lm_embed(seq1)
         lm2 = lm_embed(seq2)
-        interaction = model.predict(lm1, lm2).item()
         results.append([prot1, prot2, interaction])
-        progress((i, len(pairs)))
     results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
@@ -59,16 +158,19 @@ def predict(model, sequence_file, pairs_file):
 demo = gr.Interface(
     fn=predict,
     inputs = [
-        gr.Dropdown(label="Model", choices = ["D-SCRIPT", "Topsy-Turvy"], value = "Topsy-Turvy"),
         gr.File(label="Sequences (.fasta)", file_types = [".fasta"]),
-        gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"])
     ],
     outputs = [
         gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction']),
         gr.File(label="Download results", type="file")
-    ]
 )
 if __name__ == "__main__":
-    demo.queue(max_size=20)
-    demo.launch()

+import time
 import gradio as gr
 import pandas as pd
+import torch
 from pathlib import Path
 from Bio import SeqIO
 from dscript.pretrained import get_pretrained
 from dscript.language_model import lm_embed
 from tqdm.auto import tqdm
 from uuid import uuid4
+from predict_3di import get_3di_sequences, predictions_to_dict, one_hot_3di_sequence
+# Maps the model label offered in the UI dropdown to the pretrained-model name
+# that predict() hands to get_pretrained().
 model_map = {
     "D-SCRIPT": "human_v1",
+    "Topsy-Turvy": "human_v2",
+    "TT3D": "human_tt3d",
 }
+# Theme name and static page text passed to gr.Interface(...) further down
+# (theme, title, description and article keyword arguments).
+theme = "Default"
+title = "D-SCRIPT: Predicting Protein-Protein Interactions"
+description = """
+"""
+article = """
+<hr>
+<img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/>
+<hr>
+D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences.
+It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact,
+a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage
+in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and,
+since structure is more conserved evolutionarily than sequence, improves generalizability across species.
+<hr>
+Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the
+individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate
+top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based,
+multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by
+incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the
+ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data.
+"""
+# Column index of each 3Di letter in the one-hot encoding built by
+# one_hot_3di_sequence(); the position in the string is the index, with "X"
+# occupying the last (21st) slot.
+fold_vocab = {
+    letter: index for index, letter in enumerate("DPVQAWKEITLFGSMHCRYNX")
+}
+def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
+    """Score each protein pair listed in ``pairs_file`` with the chosen model.
+
+    Args:
+        model_name: Key of ``model_map`` selecting the pretrained model.
+        pairs_file: Uploaded .csv/.tsv file; its two columns name the proteins
+            of each pair.
+        sequence_file: Uploaded FASTA file holding the sequences of those
+            proteins.
+        progress: Gradio progress tracker used to report loop progress.
+
+    Raises:
+        gr.Error: If the FASTA file has duplicate entries, the pairs file has
+            an unsupported extension or not exactly two columns, a pair names
+            a protein that is absent from the FASTA file, or (TT3D only) a
+            3Di sequence could not be predicted for one of the proteins.
+    """
     run_id = uuid4()
+    # torch.cuda is a module, not a callable: build the device with torch.device.
+    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+    use_cuda = device.type == "cuda"
+    # gr.Info("Loading model...")
+    _ = lm_embed("M", use_cuda = use_cuda)
+    model = get_pretrained(model_map[model_name]).to(device)
+    # gr.Info("Loading files...")
     try:
         seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
+    except ValueError as _:
+        raise gr.Error("Invalid FASTA file - duplicate entry")
     if Path(pairs_file.name).suffix == ".csv":
         pairs = pd.read_csv(pairs_file.name)
     elif Path(pairs_file.name).suffix == ".tsv":
         pairs = pd.read_csv(pairs_file.name, sep="\t")
+    else:
+        # Without this branch `pairs` would be unbound below.
+        raise gr.Error("Invalid pairs file - must be a .csv or .tsv file")
+    try:
+        pairs.columns = ["protein1", "protein2"]
+    except ValueError as _:
+        raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'")
+    # Fail early with a readable message instead of a KeyError inside the loop.
+    pair_ids = set(pairs["protein1"]).union(set(pairs["protein2"]))
+    missing_ids = sorted(str(k) for k in pair_ids if k not in seqs)
+    if missing_ids:
+        raise gr.Error(f"Invalid pairs file - {len(missing_ids)} protein(s) not found in the FASTA file (e.g. '{missing_ids[0]}')")
+    do_foldseek = model_name == "TT3D"
+    if do_foldseek:
+        seqs_to_translate = {k: str(seqs[k].seq) for k in pair_ids}
+        gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
+        # NOTE(review): error_fn is only *called* by get_3di_sequences, so
+        # gr.Error is constructed there but never raised; failed batches are
+        # caught by the completeness check below instead.
+        predictions = get_3di_sequences(
+            seqs_to_translate,
+            model_dir = "Rostlab/ProstT5",
+            report_fn = gr.Info,
+            error_fn = gr.Error,
+            device=device,
+            )
+        foldseek_sequences = predictions_to_dict(predictions)
+        foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab).to(device) for k, s in foldseek_sequences.items()}
+        untranslated = sorted(str(k) for k in seqs_to_translate if k not in foldseek_embeddings)
+        if untranslated:
+            raise gr.Error(f"Could not predict 3Di sequences for {len(untranslated)} protein(s) (e.g. '{untranslated[0]}')")
+    progress(0, desc="Starting...")
     results = []
+    for i in progress.tqdm(range(len(pairs))):
+        r = pairs.iloc[i]
         prot1 = r["protein1"]
         prot2 = r["protein2"]
         seq1 = str(seqs[prot1].seq)
         seq2 = str(seqs[prot2].seq)
+        fold1 = foldseek_embeddings[prot1] if do_foldseek else None
+        fold2 = foldseek_embeddings[prot2] if do_foldseek else None
+        # Keep the embeddings on the same device as the model.
+        lm1 = lm_embed(seq1, use_cuda = use_cuda).to(device)
+        lm2 = lm_embed(seq2, use_cuda = use_cuda).to(device)
+        interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item()
         results.append([prot1, prot2, interaction])
     results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
+# Gradio UI definition. The input components are listed in the same order as
+# predict()'s parameters (model_name, pairs_file, sequence_file).
 demo = gr.Interface(
     fn=predict,
     inputs = [
+        gr.Dropdown(label="Model", choices = ["D-SCRIPT", "Topsy-Turvy", "TT3D"], value = "Topsy-Turvy"),
+        gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"]),
         gr.File(label="Sequences (.fasta)", file_types = [".fasta"]),
     ],
     outputs = [
         gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction']),
         gr.File(label="Download results", type="file")
+    ],
+    title = title,
+    description = description,
+    article = article,
+    theme = theme,
 )
 if __name__ == "__main__":
+    # Enable request queueing (max_size=20) and start the app.
+    demo.queue(max_size=20).launch()

dscript_architecture1.png ADDED Viewed

predict_3di.py ADDED Viewed

	@@ -0,0 +1,354 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 16 14:27:44 2023
+@author: mheinzinger
+"""
+import argparse
+import time
+from pathlib import Path
+from urllib import request
+import shutil
+import numpy as np
+import torch
+from torch import nn
+from transformers import T5EncoderModel, T5Tokenizer
+# Module-wide default device: the first CUDA GPU when one is available,
+# otherwise the CPU. Chosen (and printed) at import time; read by
+# get_T5_model() and main().
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+print("Using device: {}".format(device))
+# Convolutional neural network (two convolutional layers)
+class CNN(nn.Module):
+    """Two-layer convolutional classifier applied along the sequence axis.
+
+    Takes per-residue embeddings with 1024 features and emits 20 class scores
+    per residue (the 3Di classes, per the notes in forward()).
+    NOTE(review): load_predictor() loads a saved state dict into this module,
+    so the layer names/order presumably have to stay as they are.
+    """
+    def __init__( self ):
+        super(CNN, self).__init__()
+        # forward() reshapes the input to (B, F, L, 1), so a (7, 1) kernel with
+        # (3, 0) padding slides over 7 consecutive residues and preserves L.
+        self.classifier = nn.Sequential(
+            nn.Conv2d(1024, 32, kernel_size=(7, 1), padding=(3, 0)),  # 7x32
+            nn.ReLU(),
+            nn.Dropout(0.0),
+            nn.Conv2d(32, 20, kernel_size=(7, 1), padding=(3, 0))
+        )
+    def forward(self, x):
+        """
+            Map embeddings of shape (B x L x F) to class scores of shape (B x N x L).
+
+            L = protein length
+            B = batch-size
+            F = number of features (1024 for embeddings)
+            N = number of classes (20 for 3Di)
+        """
+        x = x.permute(0, 2, 1).unsqueeze(dim=-1)  # IN: X = (B x L x F); OUT: (B x F x L x 1)
+        Yhat = self.classifier(x)  # OUT: Yhat = (B x N x L x 1)
+        Yhat = Yhat.squeeze(dim=-1)  # IN: (B x N x L x 1); OUT: (B x N x L)
+        return Yhat
+def one_hot_3di_sequence(sequence, vocab):
+    """One-hot encode ``sequence`` using the letter-to-column map ``vocab``.
+
+    Args:
+        sequence: String (or other sequence) of letters to encode.
+        vocab: Mapping from letter to column index.
+
+    Returns:
+        A float32 tensor of shape (1, len(sequence), len(vocab)) with a single
+        1 per row at the column given by ``vocab``.
+
+    Raises:
+        ValueError: If ``sequence`` contains a letter that is not in ``vocab``.
+    """
+    foldseek_enc = torch.zeros(
+        len(sequence), len(vocab), dtype=torch.float32
+    )
+    for i, a in enumerate(sequence):
+        # Explicit check instead of `assert`, which is stripped under -O.
+        if a not in vocab:
+            raise ValueError(
+                "Unknown letter {!r} at position {} of the sequence".format(a, i)
+            )
+        foldseek_enc[i, vocab[a]] = 1
+    return foldseek_enc.unsqueeze(0)
+def get_T5_model(model_dir):
+    """Load the T5 encoder and its tokenizer from ``model_dir``.
+
+    The encoder is moved to the module-level ``device`` and switched to eval
+    mode. Returns the pair ``(encoder, tokenizer)``.
+    """
+    print("Loading T5 from: {}".format(model_dir))
+    encoder = T5EncoderModel.from_pretrained(model_dir)
+    encoder = encoder.to(device).eval()
+    tokenizer = T5Tokenizer.from_pretrained(model_dir, do_lower_case=False)
+    return encoder, tokenizer
+def read_fasta( fasta_path, split_char, id_field ):
+    '''
+        Reads in a FASTA file containing one or more sequences.
+
+        The identifier of each entry is field `id_field` of the header line
+        after splitting it at `split_char`; "/" and "." in the identifier are
+        replaced by "_". Whitespace and gap characters ("-") are removed from
+        the sequence lines; blank lines are ignored.
+
+        Returns a dictionary mapping identifier to sequence, or None if a
+        lower-case sequence line is found (which indicates 3Di input).
+        Raises ValueError if sequence data appears before the first header.
+    '''
+    sequences = dict()
+    uniprot_id = None  # no header seen yet
+    with open( fasta_path, 'r' ) as fasta_f:
+        for line in fasta_f:
+            # get uniprot ID from header and create new entry
+            if line.startswith('>'):
+                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
+                # replace tokens that are mis-interpreted when loading h5
+                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
+                sequences[ uniprot_id ] = ''
+                continue
+            s = ''.join( line.split() ).replace("-","")
+            if not s: # blank line: nothing to append
+                continue
+            if s.islower(): # sanity check to avoid mix-up of 3Di and AA input
+                print("The input file was in lower-case which indicates 3Di-input." +
+                      "This predictor only operates on amino-acid-input (upper-case)." +
+                      "Exiting now ..."
+                      )
+                return None
+            if uniprot_id is None:
+                raise ValueError(
+                    "Malformed FASTA file {}: sequence data before the first '>' header".format(fasta_path)
+                )
+            sequences[ uniprot_id ] += s
+    return sequences
+# Class index -> letter table shared by write_predictions() and
+# predictions_to_dict() (previously duplicated in both functions).
+_SS_MAPPING = dict(enumerate("ACDEFGHIKLMNPQRSTVWY"))
+def _decode_prediction(yhats):
+    """Turn an iterable of predicted class indices into a string of letters."""
+    return "".join(_SS_MAPPING[int(yhat)] for yhat in yhats)
+def write_predictions(predictions, out_path):
+    """Write ``predictions`` to ``out_path`` in FASTA format.
+
+    Args:
+        predictions: Mapping from sequence id to an iterable of class indices
+            (0-19).
+        out_path: Path of the file to (over)write.
+
+    Each entry is written as ">id" followed by the decoded letters on the next
+    line; entries are separated by newlines with no trailing newline.
+
+    Raises:
+        KeyError: If a class index is outside 0-19.
+    """
+    # Decode before opening the file so a bad index cannot leave a truncated
+    # output file behind.
+    decoded = predictions_to_dict(predictions)
+    with open(out_path, 'w') as out_f:
+        out_f.write( '\n'.join(
+            ">{}\n{}".format(seq_id, letters) for seq_id, letters in decoded.items()
+            ) )
+    print(f"Finished writing results to {out_path}")
+    return None
+def predictions_to_dict(predictions):
+    """Decode ``predictions`` into a dict mapping sequence id to letter string.
+
+    Args:
+        predictions: Mapping from sequence id to an iterable of class indices
+            (0-19).
+
+    Raises:
+        KeyError: If a class index is outside 0-19.
+    """
+    results = {seq_id: _decode_prediction(yhats) for seq_id, yhats in predictions.items()}
+    return results
+def toCPU(tensor):
+    """Detach ``tensor``, move it to the CPU and return it as a NumPy array.
+
+    For tensors with more than one dimension, a trailing dimension of size 1
+    is squeezed away first.
+    """
+    host_copy = tensor.detach().cpu()
+    if tensor.dim() > 1:
+        host_copy = host_copy.squeeze(dim=-1)
+    return host_copy.numpy()
+def download_file(url,local_path):
+    """Download ``url`` to ``local_path``.
+
+    Args:
+        url: Address of the file to fetch.
+        local_path: ``pathlib.Path`` of the destination file; missing parent
+            directories are created.
+
+    The data is first written to a sibling "<name>.part" file and renamed
+    afterwards, so an interrupted download never leaves a truncated file at
+    ``local_path`` (load_predictor() treats an existing file as a valid cache).
+    """
+    # parents/exist_ok: also works for nested or already existing directories.
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    print("Downloading: {}".format(url))
+    req = request.Request(url, headers={
+          'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
+      })
+    partial_path = local_path.with_name(local_path.name + ".part")
+    with request.urlopen(req) as response, open(partial_path, 'wb') as outfile:
+          shutil.copyfileobj(response, outfile)
+    partial_path.replace(local_path)
+    return None
+def load_predictor( weights_link="https://rostlab.org/~deepppi/prostt5/cnn_chkpnt/model.pt" , device=torch.device("cpu")):
+    """Return the CNN classifier with its pre-trained weights, on ``device``.
+
+    The checkpoint is expected at ./cnn_chkpnt/model.pt relative to the
+    current working directory and is downloaded from ``weights_link`` when
+    that file does not exist. The model is returned in eval mode.
+    """
+    checkpoint_p = Path.cwd() / "cnn_chkpnt" / "model.pt"
+    # fetch the weights on first use only
+    if not checkpoint_p.exists():
+        download_file(weights_link, checkpoint_p)
+    # NOTE(review): torch.load unpickles the downloaded file, so the weights
+    # source has to be trusted.
+    state = torch.load(checkpoint_p, map_location=device)
+    model = CNN()
+    model.load_state_dict(state["state_dict"])
+    return model.eval().to(device)
+def get_3di_sequences( seq_dict, model_dir, device,
+                   max_residues=4000, max_seq_len=1000, max_batch=100,report_fn=print,error_fn=print,half_precision=False):
+    """Predict a 3Di class index per residue for every sequence in ``seq_dict``.
+
+    Args:
+        seq_dict: Mapping from identifier to amino-acid sequence.
+        model_dir: Location of the T5 checkpoint, passed to get_T5_model().
+        device: Device on which the encoder, the CNN and the inputs are placed.
+        max_residues: A batch is processed once its residue count (with the
+            latest sequence counted twice) reaches this value.
+        max_seq_len: A sequence longer than this triggers processing of the
+            current batch immediately.
+        max_batch: Maximum number of sequences per batch.
+        report_fn: Callable receiving progress messages.
+        error_fn: Callable receiving error messages.
+        half_precision: Run encoder and CNN in fp16 when True.
+
+    Returns:
+        Dict mapping identifier to a 1-D NumPy array (dtype byte) holding one
+        class index per residue. Sequences of a batch whose embedding raised
+        a RuntimeError are reported through ``error_fn`` and left out.
+    """
+    predictions = dict()
+    report_fn('Total number of sequences: {}'.format(len(seq_dict)))
+    # Nothing to do: return before loading any model and before the average
+    # below divides by zero.
+    if not seq_dict:
+        return predictions
+    prefix = "<AA2fold>"
+    model, vocab = get_T5_model(model_dir)
+    # get_T5_model() uses the module-level default device; make sure the
+    # encoder sits on the device the inputs are sent to.
+    model = model.to(device)
+    predictor = load_predictor(device=device)
+    if half_precision:
+        model = model.half()
+        predictor = predictor.half()
+    avg_length = sum(len(seq) for seq in seq_dict.values()) / len(seq_dict)
+    n_long     = sum(1 for seq in seq_dict.values() if len(seq)>max_seq_len)
+    # sort sequences by length to trigger OOM at the beginning
+    seq_dict   = sorted( seq_dict.items(), key=lambda kv: len(kv[1]), reverse=True )
+    report_fn("Average sequence length: {}".format(avg_length))
+    report_fn("Number of sequences >{}: {}".format(max_seq_len, n_long))
+    start = time.time()
+    batch = list()
+    for seq_idx, (pdb_id, seq) in enumerate(seq_dict,1):
+        # replace non-standard AAs
+        seq = seq.replace('U','X').replace('Z','X').replace('O','X')
+        seq_len = len(seq)
+        seq = prefix + ' ' + ' '.join(list(seq))
+        batch.append((pdb_id,seq,seq_len))
+        # count residues in current batch and add the last sequence length to
+        # avoid that batches with (n_res_batch > max_residues) get processed
+        n_res_batch = sum([ s_len for  _, _, s_len in batch ]) + seq_len
+        if len(batch) >= max_batch or n_res_batch>=max_residues or seq_idx==len(seq_dict) or seq_len>max_seq_len:
+            pdb_ids, seqs, seq_lens = zip(*batch)
+            batch = list()
+            token_encoding = vocab.batch_encode_plus(seqs,
+                                                     add_special_tokens=True,
+                                                     padding="longest",
+                                                     return_tensors='pt'
+                                                     ).to(device)
+            try:
+                with torch.no_grad():
+                    embedding_repr = model(token_encoding.input_ids,
+                                           attention_mask=token_encoding.attention_mask
+                                           )
+            except RuntimeError:
+                error_fn("RuntimeError during embedding for {} (L={})".format(
+                    pdb_id, seq_len)
+                    )
+                continue
+            # ProtT5 appends a special tokens at the end of each sequence
+            # Mask this also out during inference while taking into account the prefix
+            for idx, s_len in enumerate(seq_lens):
+                token_encoding.attention_mask[idx,s_len+1] = 0
+            # extract last hidden states (=embeddings)
+            residue_embedding = embedding_repr.last_hidden_state.detach()
+            # mask out padded elements in the attention output (can be non-zero) for further processing/prediction
+            residue_embedding = residue_embedding*token_encoding.attention_mask.unsqueeze(dim=-1)
+            # slice off embedding of special token prepended before to each sequence
+            residue_embedding = residue_embedding[:,1:]
+            # inference only: no autograd graph needed for the classifier
+            with torch.no_grad():
+                prediction = predictor(residue_embedding)
+            prediction = toCPU(torch.max( prediction, dim=1, keepdim=True )[1] ).astype(np.byte)
+            # batch-size x 1 x padded seq_len
+            # extra token is added at the end of the seq
+            for batch_idx, identifier in enumerate(pdb_ids):
+                s_len = seq_lens[batch_idx]
+                # slice off padding and special token appended to the end of the sequence;
+                # index the singleton axis explicitly so that a length-1 sequence
+                # stays a 1-D array (a bare squeeze() would make it 0-d)
+                predictions[identifier] = prediction[batch_idx, 0, 0:s_len]
+                if s_len != len(predictions[identifier]):
+                    message = f"Length mismatch for {identifier}: is:{len(predictions[identifier])} vs should:{s_len}"
+                    error_fn(message)
+                    raise AssertionError(message)
+    end = time.time()
+    report_fn('Total number of predictions: {}'.format(len(predictions)))
+    # every batch may have failed: avoid dividing by zero in the summary
+    time_per_prot = (end-start)/len(predictions) if predictions else float("nan")
+    report_fn('Total time: {:.2f}[s]; time/prot: {:.4f}[s]; avg. len= {:.2f}'.format(
+            end-start, time_per_prot, avg_length))
+    return predictions
+def create_arg_parser():
+    """Creates and returns the ArgumentParser object."""
+    # Instantiate the parser
+    parser = argparse.ArgumentParser(description=(
+            'predict_3di.py predicts 3Di sequences with ProstT5 for a given text ' +
+            'file containing amino-acid sequence(s) in FASTA-format. ' +
+            'Example: python predict_3di.py --input /path/to/some_AA_sequences.fasta --output /path/to/some_3Di_sequences.fasta --half 1' ) )
+    # Required argument
+    parser.add_argument( '-i', '--input', required=True, type=str,
+                    help='A path to a fasta-formatted text file containing protein sequence(s).')
+    # Required argument
+    parser.add_argument( '-o', '--output', required=True, type=str,
+                    help='A path for saving the predicted 3Di sequences as a fasta-formatted text file.')
+    # Optional argument
+    parser.add_argument('--model', required=False, type=str,
+                    default="Rostlab/ProstT5",
+                    help='Either a path to a directory holding the checkpoint for a pre-trained model or a huggingface repository link.' )
+    # Optional argument
+    parser.add_argument('--split_char', type=str,
+                    default='!',
+                    help='The character for splitting the FASTA header in order to retrieve ' +
+                        "the protein identifier. Should be used in conjunction with --id. " +
+                        "Default: '!' ")
+    # Optional argument
+    parser.add_argument('--id', type=int,
+                    default=0,
+                    help='The index of the identifier field after splitting the ' +
+                        "FASTA header at --split_char. " +
+                        'Default: 0')
+    # Optional argument
+    parser.add_argument('--half', type=int,
+                    default=1,
+                    help="Whether to use half_precision or not. Default: 1 (half-precision)")
+    return parser
+def main():
+    """Command-line entry point: FASTA in, predicted 3Di sequences out.
+
+    Parses the arguments, reads the input FASTA file, predicts the 3Di
+    classes on the module-level ``device`` and writes them to the output
+    path. Exits with an error when half precision is requested on the CPU or
+    when read_fasta() rejects the input.
+    """
+    parser     = create_arg_parser()
+    args       = parser.parse_args()
+    seq_path   = Path( args.input ) # path to input FASTAS
+    out_path   = Path( args.output) # path where predictions should be written to
+    model_dir  = args.model # path/repo_link to checkpoint
+    if out_path.is_file():
+        print("Output file is already existing and will be overwritten ...")
+    split_char = args.split_char
+    id_field   = args.id
+    half_precision = int(args.half) != 0
+    # `device` is a torch.device, so compare its type (comparing the object
+    # itself with the string "cpu" never matched); no assert, so the check
+    # survives `python -O`.
+    if half_precision and device.type == "cpu":
+        parser.error("Running fp16 on CPU is not supported, yet (pass --half 0)")
+    seq_dict = read_fasta( seq_path, split_char, id_field )
+    # read_fasta() has already printed the reason when it returns None
+    if seq_dict is None:
+        raise SystemExit(1)
+    predictions = get_3di_sequences(
+        seq_dict,
+        model_dir,
+        device,
+        half_precision=half_precision,
+        )
+    print("Writing results now to disk ...")
+    write_predictions(predictions,out_path)
+if __name__ == '__main__':
+    main()

requirements.txt CHANGED Viewed

@@ -2,3 +2,5 @@ dscript
 biopython
 pandas
 tqdm

 biopython
 pandas
 tqdm
+transformers
+sentencepiece