Dinesh1102 committed on
Commit
6159956
·
verified ·
1 Parent(s): b7e6390

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +122 -0
  2. model5layer.weights.h5 +3 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tqdm
2
+ from Bio import SeqIO
3
+ import numpy as np
4
+ import pandas as pd
5
+ import tensorflow as tf
6
+ import os
7
+ import json
8
+ from typing import Dict
9
+ from collections import Counter
10
+ import random
11
+ import obonet
12
+ from transformers import T5Tokenizer, T5EncoderModel
13
+ import torch
14
+ import re
15
+ import gradio as gr
16
+
17
# Use the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# ProtT5 tokenizer. Case is preserved (do_lower_case=False).
tokenizer = T5Tokenizer.from_pretrained(
    'Rostlab/prot_t5_xl_half_uniref50-enc',
    do_lower_case=False,
)

# ProtT5 encoder, moved onto the selected device.
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
model = model.to(device)
24
+
25
def get_embeddings(seq):
    """Return the mean-pooled ProtT5 embedding of a single protein sequence.

    Args:
        seq: Amino-acid sequence as a string of one-letter codes. Case is
            ignored; the sequence is upper-cased before tokenization.

    Returns:
        A 1-D ``torch.Tensor`` on ``device``: the encoder's last hidden state
        for this sequence averaged over the token axis.
    """
    # The ProtT5 vocabulary is upper-case only, while FASTA files may contain
    # lower-case (soft-masked) residues, so normalise the case first. Then map
    # the rare/ambiguous residues U, Z, O and B to X and put a space between
    # residues, which is the input format the tokenizer expects.
    spaced_sequence = " ".join(re.sub(r"[UZOB]", "X", seq.upper()))

    encoded = tokenizer.batch_encode_plus(
        [spaced_sequence],
        add_special_tokens=True,
        padding="longest",
    )
    input_ids = torch.tensor(encoded['input_ids']).to(device)
    attention_mask = torch.tensor(encoded['attention_mask']).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids,
                               attention_mask=attention_mask)

    # The batch holds exactly one sequence, so no padding is added.
    # NOTE(review): no slicing is done here, so any special token appended by
    # the tokenizer (add_special_tokens=True) is included in the mean. This is
    # kept as-is on the assumption that the classifier weights were trained on
    # embeddings pooled the same way -- TODO confirm before changing.
    token_embeddings = embedding_repr.last_hidden_state[0]
    return token_embeddings.mean(dim=0)
43
+
44
def _build_classifier(input_dim=1024, num_labels=1500):
    """Build and compile the 5-hidden-layer dense classifier (weights not loaded)."""
    layers = [tf.keras.layers.BatchNormalization(input_shape=[input_dim])]
    # Five identical hidden blocks: Dense(512, relu) followed by Dropout(0.2).
    for _ in range(5):
        layers.append(tf.keras.layers.Dense(units=512, activation='relu'))
        layers.append(tf.keras.layers.Dropout(0.2))
    # One independent sigmoid output per label (multi-label prediction).
    layers.append(tf.keras.layers.Dense(units=num_labels, activation='sigmoid'))

    classifier = tf.keras.Sequential(layers)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy', tf.keras.metrics.AUC()],
    )
    return classifier


def predict(filepath):
    """Predict function labels for every protein in a FASTA file.

    Each record is embedded with ``get_embeddings`` and the embeddings are
    scored by the dense classifier whose weights are read from
    ``./model5layer.weights.h5``. Label names come from the columns of
    ``./labels.csv`` (after dropping its ``Unnamed: 0`` column).

    Two CSV files are written to the current working directory, one row per
    FASTA record in file order (no sequence-ID column is written):

    * ``predictions.csv`` -- 0/1 calls; a score below 0.4 becomes 0, else 1.
    * ``decimal.csv`` -- the raw classifier scores.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        The string ``"predictions.csv"``.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    embedding_dim = 1024
    threshold = 0.4

    # Parse the file once; the record count sizes the embedding matrix.
    records = list(SeqIO.parse(filepath, "fasta"))
    if not records:
        # Fail early with a clear message instead of handing the classifier
        # an empty batch.
        raise ValueError("No FASTA records found in the uploaded file.")

    embeds = np.zeros((len(records), embedding_dim))
    for row, record in enumerate(records):
        embeds[row] = get_embeddings(str(record.seq)).detach().cpu().numpy()

    # Named `classifier` so it does not shadow the module-level ProtT5 `model`.
    classifier = _build_classifier(input_dim=embedding_dim, num_labels=1500)
    classifier.load_weights('./model5layer.weights.h5')

    labels_df = pd.read_csv('./labels.csv')
    labels_df = labels_df.drop(columns='Unnamed: 0')
    label_columns = labels_df.columns

    predictions = classifier.predict(embeds)

    # Vectorised thresholding. Written as "< threshold -> 0, otherwise 1" so
    # that a NaN score still maps to 1, exactly as in the element-wise form.
    binary_calls = np.where(predictions < threshold, 0, 1)
    predictions_df = pd.DataFrame(binary_calls, columns=label_columns)

    # Cast to float64 so the scores are written with the same precision as
    # when the frame was built from a list of individual scalars.
    scores_df = pd.DataFrame(predictions.astype(np.float64), columns=label_columns)

    predictions_df.to_csv("predictions.csv", index=False)
    scores_df.to_csv("decimal.csv", index=False)
    return "predictions.csv"
116
+
117
# Gradio UI: upload one file, run `predict` on it, and offer the resulting
# CSV for download.
demo = gr.Interface(
    fn=predict,
    inputs="file",
    outputs="file",
    title='Protein Function Prediction using fasta file,upload a fasta file',
)
demo.launch()
model5layer.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e428c434fd3149bdc45e027ec969625f624fecc7c80268046ae7c5af768fd497
3
+ size 28216412