Update README.md
README.md CHANGED
@@ -42,29 +42,59 @@ A small snippet of code is given here in order to retrieve both logits and embeddings
Updated snippet:

```python
from transformers import AutoTokenizer, AutoModel
import torch

features = [
    "protein_coding_gene",
    "lncRNA",
    "exon",
    "intron",
    "splice_donor",
    "splice_acceptor",
    "5UTR",
    "3UTR",
    "CTCF-bound",
    "polyA_signal",
    "enhancer_Tissue_specific",
    "enhancer_Tissue_invariant",
    "promoter_Tissue_specific",
    "promoter_Tissue_invariant",
]

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", trust_remote_code=True)
model = AutoModel.from_pretrained("InstaDeepAI/segment_nt_30kb_multi_species", trust_remote_code=True)
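# Optional aside (not in the original snippet): switch the model to eval mode so that
# dropout and other training-only behaviour is disabled during inference. This is
# plain PyTorch usage, nothing SegmentNT-specific.
model = model.eval()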
# Choose the length to which the input sequences are padded. By default, the
# model max length is chosen, but feel free to decrease it as the time taken to
# obtain the embeddings increases significantly with it.
# The number of DNA tokens (excluding the prepended CLS token) needs to be divisible by
# 2 to the power of the number of downsampling blocks, i.e. 4.
max_length = 12 + 1

assert (max_length - 1) % 4 == 0, (
    "The number of DNA tokens (excluding the prepended CLS token) needs to be divisible by "
    "2 to the power of the number of downsampling blocks, i.e. 4."
)
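# Hedged alternative (not in the original snippet): the padding length could instead be
# derived from the tokenizer. tokenizer.model_max_length is a standard Hugging Face
# tokenizer attribute; the rounding keeps the DNA-token count divisible by 4, matching
# the assert above. Uncomment to pad to the model's maximum length (noticeably slower):
# max_length = (tokenizer.model_max_length - 1) // 4 * 4 + 1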
# Create two dummy DNA sequences and tokenize them
sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
tokens = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]

# Infer
attention_mask = tokens != tokenizer.pad_token_id
outs = model(
    tokens,
    attention_mask=attention_mask,
    output_hidden_states=True
)
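# Aside (not from the original snippet): the forward pass above could also be wrapped in
# `with torch.no_grad():` to avoid building the autograd graph; the snippet instead
# detaches the logits right below, which is enough for this read-only use.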
# Obtain the logits over the genomic features
logits = outs.logits.detach()
# Transform them into probabilities
probabilities = torch.nn.functional.softmax(logits, dim=-1)
print(f"Probabilities shape: {probabilities.shape}")

# Get the probabilities associated with the intron feature
idx_intron = features.index("intron")
probabilities_intron = probabilities[:, :, idx_intron]
print(f"Intron probabilities shape: {probabilities_intron.shape}")
```
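As a rough follow-up (an illustration, not part of the README diff), the per-feature probabilities can be thresholded into binary calls and collected into one track per genomic feature; the 0.5 cutoff is arbitrary, and the exact axis layout of `probabilities` should be checked against the shape printed above.

```python
# Continuation sketch: assumes `probabilities`, `probabilities_intron` and `features`
# from the snippet above are in scope. The 0.5 threshold is an arbitrary illustrative
# cutoff, not a value recommended by the model card.
intron_calls = probabilities_intron > 0.5
print(f"Positions called as intron: {intron_calls.sum().item()}")

# One probability track per annotated genomic feature, using the same indexing
# convention as the snippet (feature index on the third axis).
feature_tracks = {name: probabilities[:, :, features.index(name)] for name in features}
print(f"Number of feature tracks: {len(feature_tracks)}")
```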