Spaces:

westlake-repl
/

Demo_ProTrek_650M

Running

App Files Files Community

LTEnjoy committed 4 days ago

Commit

9606143

verified ·

1 Parent(s): ee76aa3

Delete model

Browse files

Files changed (6) hide show

model/ProTrek/protein_encoder.py +0 -95
model/ProTrek/protrek_trimodal_model.py +0 -874
model/ProTrek/structure_encoder.py +0 -86
model/ProTrek/text_encoder.py +0 -81
model/abstract_model.py +0 -401
model/model_interface.py +0 -104

model/ProTrek/protein_encoder.py DELETED Viewed

@@ -1,95 +0,0 @@
-import torch
-from tqdm import tqdm
-from torch.nn.functional import normalize
-from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer
-class ProteinEncoder(torch.nn.Module):
-    """
-    Protein sequence encoder built on a HuggingFace ``EsmForMaskedLM``.
-    The hidden state at sequence position 0 is projected by a linear layer to ``out_dim``;
-    the language-model head is kept so masked-token logits can be returned on request.
-    """
-    def __init__(self,
-                 config_path: str,
-                 out_dim: int,
-                 load_pretrained: bool = True,
-                 gradient_checkpointing: bool = False):
-        """
-        Args:
-            config_path: Path passed to ``from_pretrained`` for the ESM config, the model
-                weights (when ``load_pretrained`` is True) and the tokenizer
-            out_dim    : Output dimension of the protein representation
-            load_pretrained: Whether to load pretrained weights; otherwise the model is
-                built from the config alone
-            gradient_checkpointing: Whether to use gradient checkpointing
-        """
-        super().__init__()
-        config = EsmConfig.from_pretrained(config_path)
-        if load_pretrained:
-            self.model = EsmForMaskedLM.from_pretrained(config_path)
-        else:
-            self.model = EsmForMaskedLM(config)
-        # Projection from the ESM hidden size to the shared representation size
-        self.out = torch.nn.Linear(config.hidden_size, out_dim)
-        # Set gradient checkpointing
-        self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing
-        # Remove contact head
-        self.model.esm.contact_head = None
-        # Remove position embedding if the embedding type is ``rotary``
-        if config.position_embedding_type == "rotary":
-            self.model.esm.embeddings.position_embeddings = None
-        self.tokenizer = EsmTokenizer.from_pretrained(config_path)
-    def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """
-        Compute protein representation for the given proteins
-        Args:
-            proteins: A list of protein sequences
-            batch_size: Batch size for inference
-            verbose: Whether to show a tqdm progress bar
-        Returns:
-            The per-batch outputs of ``forward`` concatenated along dim 0 and normalized
-            along the last dimension with ``torch.nn.functional.normalize``
-        """
-        # NOTE(review): nothing here disables autograd; wrap the call in torch.no_grad()
-        # if it is meant for inference only - confirm with callers.
-        # Run on whatever device the module's parameters live on
-        device = next(self.parameters()).device
-        protein_repr = []
-        if verbose:
-            iterator = tqdm(range(0, len(proteins), batch_size), desc="Computing protein embeddings")
-        else:
-            iterator = range(0, len(proteins), batch_size)
-        for i in iterator:
-            # Tokenize one slice of sequences, padded to the longest in the slice
-            protein_inputs = self.tokenizer.batch_encode_plus(proteins[i:i + batch_size],
-                                                              return_tensors="pt",
-                                                              padding=True)
-            protein_inputs = {k: v.to(device) for k, v in protein_inputs.items()}
-            # Mask logits are not requested here, so the second return value is None
-            output, _ = self.forward(protein_inputs)
-            protein_repr.append(output)
-        # NOTE(review): with an empty ``proteins`` list the loop never runs and torch.cat
-        # receives an empty list, which raises - confirm callers never pass [].
-        protein_repr = torch.cat(protein_repr, dim=0)
-        return normalize(protein_repr, dim=-1)
-    def forward(self, inputs: dict, get_mask_logits: bool = False):
-        """
-        Encode protein sequence into protein representation
-        Args:
-            inputs: A dictionary containing the following keys:
-                - input_ids: [batch, seq_len]
-                - attention_mask: [batch, seq_len]
-            get_mask_logits: Whether to return the logits for masked tokens
-        Returns:
-            protein_repr: [batch, protein_repr_dim]
-            mask_logits : [batch, seq_len, vocab_size], or None when ``get_mask_logits``
-                is False
-        """
-        last_hidden_state = self.model.esm(**inputs).last_hidden_state
-        # Take the hidden state at position 0 as the sequence summary
-        # (presumably the tokenizer's leading <cls> token - TODO confirm)
-        reprs = last_hidden_state[:, 0, :]
-        reprs = self.out(reprs)
-        # Get logits for masked tokens
-        if get_mask_logits:
-            mask_logits = self.model.lm_head(last_hidden_state)
-        else:
-            mask_logits = None
-        return reprs, mask_logits

model/ProTrek/protrek_trimodal_model.py DELETED Viewed

@@ -1,874 +0,0 @@
-import torch
-import torch.distributed as dist
-import torchmetrics
-import json
-import math
-import numpy as np
-import os
-import copy
-import faiss
-import time
-import pandas as pd
-import random
-from tqdm import tqdm
-from .protein_encoder import ProteinEncoder
-from .structure_encoder import StructureEncoder
-from .text_encoder import TextEncoder
-from ..abstract_model import AbstractModel
-from ..model_interface import register_model
-from utils.mpr import MultipleProcessRunnerSimplifier
-from torch.nn.functional import normalize, cross_entropy
-from utils.constants import residue_level, sequence_level
-from sklearn.metrics import roc_auc_score
-def multilabel_cross_entropy(logits, labels):
-    """
-    Compute cross entropy loss for multilabel classification. See "https://arxiv.org/pdf/2208.02955.pdf"
-    For every sample the loss is log(1 + sum over (negative, positive) pairs of
-    exp(negative_logit - positive_logit)); the per-sample losses are averaged.
-    Args:
-        logits: [num_samples, num_classes]
-        labels: [num_samples, num_classes], entries equal to 1 mark positives and entries
-            equal to 0 mark negatives
-    Returns:
-        The mean per-sample loss
-    """
-    loss = 0
-    for pred, label in zip(logits, labels):
-        pos_logits = pred[label == 1]
-        neg_logits = pred[label == 0]
-        # Broadcast to all pairwise differences: [num_neg, num_pos]
-        diff = neg_logits.unsqueeze(-1) - pos_logits
-        # NOTE(review): torch.exp can overflow for large logit gaps; a logsumexp-based
-        # form would be numerically safer - confirm before changing.
-        loss += torch.log(1 + torch.exp(diff).sum())
-    return loss / len(logits)
-    # Everything below is unreachable (it follows the return): an alternative
-    # logsumexp-based formulation that the author left disabled.
-    # pred = (1 - 2 * labels) * logits
-    # pred_neg = pred - labels * 1e12
-    # pred_pos = pred - (1 - labels) * 1e12
-    #
-    # zeros = torch.zeros_like(logits[..., :1], dtype=logits.dtype)
-    # pred_neg = torch.cat([pred_neg, zeros], dim=-1)
-    # pred_pos = torch.cat([pred_pos, zeros], dim=-1)
-    #
-    # neg_loss = torch.logsumexp(pred_neg, dim=-1)
-    # pos_loss = torch.logsumexp(pred_pos, dim=-1)
-    #
-    # return (neg_loss + pos_loss).mean()
-@register_model
-class ProTrekTrimodalModel(AbstractModel):
-    def __init__(self,
-                 protein_config: str,
-                 text_config: str,
-                 structure_config: str = None,
-                 repr_dim: int = 1024,
-                 temperature: float = 0.07,
-                 load_protein_pretrained: bool = True,
-                 load_text_pretrained: bool = True,
-                 use_mlm_loss: bool = False,
-                 use_zlpr_loss: bool = False,
-                 use_saprot: bool = False,
-                 gradient_checkpointing: bool = False,
-                 **kwargs):
-        """
-        Args:
-            protein_config: Path to the config file for protein sequence encoder
-            text_config: Path to the config file for text encoder
-            structure_config: Path to the config file for structure encoder. When None, no
-                structure encoder is created and all structure-related losses are skipped
-            repr_dim: Output dimension of the protein and text representation
-            temperature: Initial temperature dividing the similarity logits; it becomes a
-                learnable parameter in ``initialize_model``
-            load_protein_pretrained: Whether to load pretrained weights for protein encoder
-            load_text_pretrained: Whether to load pretrained weights for text encoder
-            use_mlm_loss: Whether to use masked language modeling loss
-            use_zlpr_loss: Whether to use zlpr loss. See "https://arxiv.org/pdf/2208.02955.pdf"
-            use_saprot: Whether to use SaProt as protein encoder
-            gradient_checkpointing: Whether to use gradient checkpointing for protein encoder
-            **kwargs: Forwarded unchanged to ``AbstractModel.__init__``
-        """
-        # Attributes are stored before calling the parent constructor, presumably because
-        # AbstractModel.__init__ invokes initialize_model(), which reads them - TODO confirm.
-        self.protein_config = protein_config
-        self.structure_config = structure_config
-        self.text_config = text_config
-        self.repr_dim = repr_dim
-        self.temperature = temperature
-        self.load_protein_pretrained = load_protein_pretrained
-        self.load_text_pretrained = load_text_pretrained
-        self.use_mlm_loss = use_mlm_loss
-        self.use_zlpr_loss = use_zlpr_loss
-        self.use_saprot = use_saprot
-        self.gradient_checkpointing = gradient_checkpointing
-        super().__init__(**kwargs)
-    def initialize_metrics(self, stage: str) -> dict:
-        """
-        Build the accuracy metrics for one stage.
-        Args:
-            stage: Prefix used for every metric name, e.g. "train"
-        Returns:
-            A dict mapping "{stage}_<source>_<target>_acc" (retrieval accuracy) and, when the
-            MLM loss is enabled, "{stage}_<modality>_mask_acc" to ``torchmetrics.Accuracy``
-        """
-        # NOTE(review): newer torchmetrics releases require a ``task`` argument for
-        # Accuracy - confirm the pinned torchmetrics version.
-        return_dict = {
-            f"{stage}_protein_text_acc": torchmetrics.Accuracy(),
-            f"{stage}_text_protein_acc": torchmetrics.Accuracy(),
-        }
-        if self.use_mlm_loss:
-            # Label -1 is ignored, matching ignore_index=-1 of the MLM loss in loss_func
-            return_dict[f"{stage}_protein_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
-            if self.structure_config is not None:
-                return_dict[f"{stage}_structure_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
-        # Cross-modal metrics that involve the structure encoder
-        if self.structure_config is not None:
-            return_dict[f"{stage}_structure_protein_acc"] = torchmetrics.Accuracy()
-            return_dict[f"{stage}_structure_text_acc"] = torchmetrics.Accuracy()
-            return_dict[f"{stage}_text_structure_acc"] = torchmetrics.Accuracy()
-            return_dict[f"{stage}_protein_structure_acc"] = torchmetrics.Accuracy()
-        return return_dict
-    def initialize_model(self):
-        """
-        Create the protein and text encoders, the learnable temperature and, when
-        ``structure_config`` is set, the structure encoder; collect them in ``self.model``.
-        """
-        # Initialize encoders
-        self.protein_encoder = ProteinEncoder(self.protein_config,
-                                              self.repr_dim,
-                                              self.load_protein_pretrained,
-                                              self.gradient_checkpointing)
-        self.text_encoder = TextEncoder(self.text_config,
-                                        self.repr_dim,
-                                        self.load_text_pretrained,
-                                        self.gradient_checkpointing)
-        # Learnable temperature (replaces the float stored by __init__ under the same name)
-        self.temperature = torch.nn.Parameter(torch.tensor(self.temperature))
-        # self.model is used for saving and loading
-        # NOTE(review): a ParameterList is filled with nn.Module entries here; ModuleList is
-        # the usual container for modules - confirm this works on the torch version in use.
-        self.model = torch.nn.ParameterList([self.temperature,
-                                             self.protein_encoder,
-                                             self.text_encoder])
-        # If the structure encoder is specified
-        if self.structure_config is not None:
-            self.structure_encoder = StructureEncoder(self.structure_config, self.repr_dim)
-            self.model.append(self.structure_encoder)
-    def get_text_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """Return the text encoder's representations for ``texts`` (delegates to ``text_encoder.get_repr``)."""
-        return self.text_encoder.get_repr(texts, batch_size, verbose)
-    def get_structure_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """
-        Return the structure encoder's representations for ``proteins`` (delegates to
-        ``structure_encoder.get_repr``). ``self.structure_encoder`` is only created by
-        ``initialize_model`` when ``structure_config`` is not None.
-        """
-        return self.structure_encoder.get_repr(proteins, batch_size, verbose)
-    def get_protein_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """Return the protein encoder's representations for ``proteins`` (delegates to ``protein_encoder.get_repr``)."""
-        return self.protein_encoder.get_repr(proteins, batch_size, verbose)
-    def forward(self, protein_inputs: dict, text_inputs: dict, structure_inputs: dict = None):
-        """
-        Run every configured encoder on its inputs.
-        Args:
-            protein_inputs: A dictionary for protein encoder
-            structure_inputs: A dictionary for structure encoder; only used when
-                ``structure_config`` is not None
-            text_inputs   : A dictionary for text encoder
-        Returns:
-            A list ``[text_repr, protein_repr, protein_mask_logits]``, extended with
-            ``[structure_repr, structure_mask_logits]`` when the structure encoder is
-            configured. ``loss_func`` unpacks the list in this order.
-        """
-        # The second argument asks the encoder for masked-token logits only when MLM is on
-        protein_repr, protein_mask_logits = self.protein_encoder(protein_inputs, self.use_mlm_loss)
-        text_repr = self.text_encoder(text_inputs)
-        outputs = [text_repr, protein_repr, protein_mask_logits]
-        if self.structure_config is not None:
-            structure_repr, structure_mask_logits = self.structure_encoder(structure_inputs, self.use_mlm_loss)
-            outputs += [structure_repr, structure_mask_logits]
-        return outputs
-    def loss_func(self, stage: str, outputs, labels):
-        """
-        Contrastive loss between every ordered pair of modalities, plus optional MLM losses.
-        Args:
-            stage: Stage name used to select the metrics in ``self.metrics``
-            outputs: The list returned by ``forward``
-            labels: Dict read only when ``use_mlm_loss`` is set: "seq_labels" and, with a
-                structure encoder, "struc_labels" (-1 entries are ignored)
-        Returns:
-            The unweighted mean of all contrastive and MLM loss terms
-        """
-        if self.structure_config is not None:
-            text_repr, protein_repr, protein_mask_logits, structure_repr, structure_mask_logits = outputs
-        else:
-            text_repr, protein_repr, protein_mask_logits = outputs
-        device = text_repr.device
-        text_repr = normalize(text_repr, dim=-1)
-        protein_repr = normalize(protein_repr, dim=-1)
-        # Gather representations from all GPUs
-        # The gathered copies are detached, so gradients only flow through the local batch
-        all_protein_repr = self.all_gather(protein_repr).view(-1, protein_repr.shape[-1]).detach()
-        all_text_repr = self.all_gather(text_repr).view(-1, text_repr.shape[-1]).detach()
-        if self.structure_config is not None:
-            structure_repr = normalize(structure_repr, dim=-1)
-            all_structure_repr = self.all_gather(structure_repr).view(-1, structure_repr.shape[-1]).detach()
-        # text_idx = labels["text_idx"]
-        # text_candidates = labels["text_candidates"]
-        #
-        # # Gather all text ids
-        # text_inds = self.all_gather(text_idx).flatten()
-        # # Create text classification labels
-        # text_labels = torch.zeros(len(text_candidates), len(text_inds), dtype=int).to(device)
-        # for i, candidate in enumerate(text_candidates):
-        #     for j, idx in enumerate(text_inds):
-        #         if idx.item() in candidate:
-        #             text_labels[i, j] = 1
-        #
-        # # Gather text labels from all GPUs
-        # text_labels = self.all_gather(text_labels).view(-1, text_labels.shape[-1])
-        #
-        # # Protein classification labels are the transpose of text labels
-        # protein_labels = text_labels.T
-        # Batch size
-        # NOTE(review): dist.get_rank() presumably requires an initialised process group,
-        # so this would not run in a non-distributed setting - confirm.
-        rank = dist.get_rank()
-        bs = text_repr.shape[0]
-        # Get current labels
-        # protein_labels = protein_labels[rank * bs: rank * bs + bs]
-        # text_labels = text_labels[rank * bs: rank * bs + bs]
-        # Create classification labels between structure and sequence
-        # Sample i of this rank is matched to row rank * bs + i of the gathered tensors;
-        # this assumes every rank contributes the same batch size - TODO confirm.
-        bs_labels = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(device)
-        # Query modality -> list of target modalities it is contrasted against
-        if self.structure_config is not None:
-            pairs = {
-                "protein": ["structure", "text"],
-                "structure": ["protein", "text"],
-                "text": ["protein", "structure"]
-            }
-        else:
-            pairs = {
-                "protein": ["text"],
-                "text": ["protein"]
-            }
-        loss_list = []
-        for k, values in pairs.items():
-            for v in values:
-                # Only calculate the similarity for the current batch
-                # NOTE(review): eval() is used only to look up the local variables
-                # "<k>_repr" / "all_<v>_repr"; a dict keyed by modality would avoid eval.
-                sim = torch.matmul(eval(f"{k}_repr"), eval(f"all_{v}_repr").T).div(self.temperature)
-                # if k == "text":
-                #     if self.use_zlpr_loss:
-                #         loss = multilabel_cross_entropy(sim, protein_labels)
-                #     else:
-                #         loss = cross_entropy(sim, bs_labels)
-                #
-                #     pred = []
-                #     for s, l in zip(sim, protein_labels):
-                #         n_label = l.sum()
-                #         topk = torch.topk(s, k=n_label).indices
-                #         if l[topk].sum() == n_label:
-                #             pred.append(1)
-                #         else:
-                #             pred.append(0)
-                #
-                #     pred = torch.tensor(pred).to(device)
-                #     label = torch.ones_like(pred)
-                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(pred.detach(), label)
-                #     # if v == "protein":
-                #     #     acc = self.metrics[stage][f"{stage}_{k}_{v}_acc"].compute()
-                #     #     print(f"{stage}_{k}_{v}_acc: {acc:.4f}")
-                #
-                # elif v == "text":
-                #     if self.use_zlpr_loss:
-                #         loss = multilabel_cross_entropy(sim, text_labels)
-                #     else:
-                #         loss = cross_entropy(sim, bs_labels)
-                #
-                #     pred = []
-                #     for s, l in zip(sim, text_labels):
-                #         n_label = l.sum()
-                #         topk = torch.topk(s, k=n_label).indices
-                #         if l[topk].sum() == n_label:
-                #             pred.append(1)
-                #         else:
-                #             pred.append(0)
-                #
-                #     pred = torch.tensor(pred).to(device)
-                #     label = torch.ones_like(pred)
-                #     # if k == "protein":
-                #     #     acc = pred.sum() / len(pred)
-                #     #     print(f"{stage}_{k}_{v}_acc: {acc:.4f}")
-                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(pred.detach(), label)
-                #
-                # else:
-                #     loss = cross_entropy(sim, bs_labels)
-                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(sim.detach(), bs_labels)
-                # With the block above disabled, use_zlpr_loss has no effect in this method
-                loss = cross_entropy(sim, bs_labels)
-                self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(sim.detach(), bs_labels)
-                loss_list.append(loss)
-        # Masked language modeling loss
-        if self.use_mlm_loss:
-            k_label = [("protein", labels["seq_labels"])]
-            if self.structure_config is not None:
-                k_label.append(("structure", labels["struc_labels"]))
-            for k, label in k_label:
-                # Looks up protein_mask_logits / structure_mask_logits by name
-                logits = eval(f"{k}_mask_logits")
-                # merge the first and second dimension of logits
-                logits = logits.view(-1, logits.shape[-1])
-                label = label.flatten().to(device)
-                mlm_loss = cross_entropy(logits, label, ignore_index=-1)
-                loss_list.append(mlm_loss)
-                self.metrics[stage][f"{stage}_{k}_mask_acc"].update(logits.detach(), label)
-        # Every term carries equal weight
-        loss = sum(loss_list) / len(loss_list)
-        if stage == "train":
-            log_dict = self.get_log_dict("train")
-            log_dict["train_loss"] = loss
-            self.log_info(log_dict)
-            # Reset train metrics
-            self.reset_metrics("train")
-        return loss
-    def padded_gather(self, tensor: torch.Tensor):
-        """
-        Gather tensors from all GPUs, allowing different shapes at the batch dimension.
-        Each rank zero-pads its tensor to the largest batch size, the padded tensors are
-        gathered and flattened to [-1, last_dim], and the first ``sum(all_sizes)`` rows
-        are returned.
-        Args:
-            tensor: Local tensor; the padding buffer is created as
-                [max_size, tensor.shape[-1]], so a 2-D input is assumed
-        """
-        # Get the size of the tensor
-        size = tensor.shape[0]
-        # NOTE(review): max()/sum() below iterate over all_sizes, which assumes all_gather
-        # returns a 1-D tensor of per-rank sizes - confirm behaviour on a single device.
-        all_sizes = self.all_gather(torch.tensor(size, device=tensor.device))
-        max_size = max(all_sizes)
-        # Pad the tensor
-        if size != max_size:
-            tmp = torch.zeros(max_size, tensor.shape[-1], dtype=tensor.dtype, device=tensor.device)
-            tmp[:size] = tensor
-            tensor = tmp
-        padded_tensor = self.all_gather(tensor).view(-1, tensor.shape[-1])
-        # NOTE(review): truncating to the first sum(all_sizes) rows only drops the padding
-        # when every padded row sits after all real rows, i.e. when only trailing ranks are
-        # short. That holds for the ceil-span splits used by the callers in this class.
-        tensor = padded_tensor[:sum(all_sizes)]
-        return tensor
-    def _get_protein_indices(self):
-        """
-        Embed every protein in ``self.uniprot2label`` with the protein encoder, sharded
-        across ranks, and return a faiss inner-product index over all embeddings.
-        """
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        if self.use_saprot:
-            proteins = []
-            for sub_dict in self.uniprot2label.values():
-                aa_seq = sub_dict["seq"]
-                foldseek_seq = sub_dict["foldseek"]
-                # NOTE(review): assert is stripped under ``python -O``; raise explicitly if
-                # this length check must always run.
-                assert len(aa_seq) == len(foldseek_seq)
-                # Interleave each amino-acid character with its foldseek character
-                seq = "".join([a + b for a, b in zip(aa_seq, foldseek_seq)])
-                proteins.append(seq)
-        else:
-            proteins = [sub_dict["seq"] for sub_dict in self.uniprot2label.values()]
-        # Contiguous shard of at most ``span`` proteins per rank; trailing ranks may get fewer
-        span = math.ceil(len(proteins) / world_size)
-        sub_proteins = proteins[rank * span: (rank + 1) * span]
-        # Display the progress bar on the rank 0 process
-        verbose = self.trainer.local_rank == 0
-        # Get protein representations
-        sub_protein_repr = self.protein_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
-        protein_repr = self.padded_gather(sub_protein_repr)
-        # Construct faiss index
-        d = protein_repr.shape[-1]
-        protein_indices = faiss.IndexFlatIP(d)
-        protein_indices.add(protein_repr.cpu().numpy())
-        return protein_indices
-    def _get_structure_indices(self):
-        """
-        Embed the "foldseek" sequence of every entry in ``self.uniprot2label`` with the
-        structure encoder, sharded across ranks, and return a faiss inner-product index.
-        """
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        proteins = [sub_dict["foldseek"] for sub_dict in self.uniprot2label.values()]
-        # Contiguous shard of at most ``span`` entries per rank; trailing ranks may get fewer
-        span = math.ceil(len(proteins) / world_size)
-        sub_proteins = proteins[rank * span: (rank + 1) * span]
-        # Display the progress bar on the rank 0 process
-        verbose = self.trainer.local_rank == 0
-        # Get protein representations
-        sub_protein_repr = self.structure_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
-        protein_repr = self.padded_gather(sub_protein_repr)
-        # Construct faiss index
-        d = protein_repr.shape[-1]
-        structure_indices = faiss.IndexFlatIP(d)
-        structure_indices.add(protein_repr.cpu().numpy())
-        return structure_indices
-    def _get_text_indices(self):
-        """
-        Embed the texts of every subsection in ``self.label2text`` and build one faiss
-        inner-product index per subsection, plus a "Total" index assembled from the
-        per-subsection embeddings.
-        Returns:
-            Dict mapping subsection name (including "Total") to its faiss index
-        """
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        # Display the progress bar on the rank 0 process
-        verbose = self.trainer.local_rank == 0
-        if verbose:
-            iterator = tqdm(self.label2text.keys(), desc="Get text representations")
-        else:
-            iterator = self.label2text.keys()
-        text_embeddings = {}
-        for subsection in iterator:
-            # "Total" holds references into the other subsections and is built afterwards
-            if subsection == "Total":
-                continue
-            texts = []
-            for text_list in self.label2text[subsection].values():
-                # Only use the first text for efficiency
-                texts.append(text_list[0:1])
-            # Contiguous shard of at most ``span`` labels per rank
-            span = math.ceil(len(texts) / world_size)
-            texts = texts[rank * span: (rank + 1) * span]
-            embeddings = []
-            for text_list in texts:
-                text_repr = self.text_encoder.get_repr(text_list)
-                # Average over the texts of one label, then re-normalize
-                mean_repr = text_repr.mean(dim=0, keepdim=True)
-                norm_repr = torch.nn.functional.normalize(mean_repr, dim=-1)
-                embeddings.append(norm_repr)
-            if len(embeddings) > 0:
-                embeddings = torch.cat(embeddings, dim=0)
-            else:
-                # This rank received no texts: contribute an empty [0, repr_dim] tensor
-                embeddings = torch.zeros(0, self.repr_dim, dtype=self.dtype, device=self.device)
-            text_repr = self.padded_gather(embeddings)
-            text_embeddings[subsection] = text_repr
-        # Aggregate text embeddings for global retrieval
-        total_embeddings = []
-        for idx in self.label2text["Total"].values():
-            # Each value has the form "<subsection>|<row index>"
-            subsection, i = idx.split("|")
-            total_embeddings.append(text_embeddings[subsection][int(i)])
-        text_embeddings["Total"] = torch.stack(total_embeddings)
-        # Construct faiss index
-        text_indices = {}
-        for subsection, text_repr in text_embeddings.items():
-            d = text_repr.shape[-1]
-            text_indices[subsection] = faiss.IndexFlatIP(d)
-            text_indices[subsection].add(text_repr.cpu().numpy())
-        return text_indices
-    def _protein2text(self, modality: str, protein_indices, text_indices: dict):
-        """
-        Evaluate protein-to-text retrieval: for every Swiss-Prot protein and subsection,
-        rank all texts of that subsection by similarity and score the ranking.
-        Args:
-            modality: Name inserted into the result keys ("{modality}2Text_...")
-            protein_indices: faiss index holding one embedding per ``uniprot2label`` entry
-            text_indices: Dict mapping subsection name to a faiss index of text embeddings
-        Returns:
-            Dict of mean MRR / MAP / AUC for "Total" and for the residue-level,
-            sequence-level and combined subsection groups
-        """
-        def do(process_id, idx, row, writer):
-            """Score one (subsection, protein) query and write [ap, mrr, auc] as a JSON line."""
-            subsection, uniprot_id, prob_idx, label = row
-            # Retrieve ranking results
-            p_embedding = protein_indices.reconstruct(prob_idx).reshape(1, -1)
-            text_inds = text_indices[subsection]
-            sim_scores, rank_inds = text_inds.search(p_embedding, text_inds.ntotal)
-            sim_scores, rank_inds = sim_scores[0], rank_inds[0]
-            # Calculate Average Precision(AP)
-            ranks = []
-            label = set(label)
-            for i, rk in enumerate(rank_inds):
-                # Find the rank of this label in all labels
-                if rk in label:
-                    ranks.append(i + 1)
-            ranks = np.array(ranks)
-            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])
-            # Calculate Mean Reciprocal Rank(MRR)
-            # NOTE(review): if none of the labels occurs in rank_inds, ``ranks`` is empty and
-            # ranks[0] raises IndexError - confirm every query has at least one indexed label.
-            best_rank = ranks[0]
-            mrr = 1 / best_rank
-            # Calculate the AUC
-            true_labels = np.zeros_like(sim_scores)
-            true_labels[ranks - 1] = 1
-            # AUC is undefined when only one class is present; 0 is reported in that case
-            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
-                auc = 0
-            else:
-                auc = roc_auc_score(true_labels, sim_scores)
-            output = json.dumps([ap, mrr, auc])
-            writer.write(output + "\n")
-        inputs = []
-        swissprot_subsections = set()
-        for subsection in text_indices.keys():
-            for i, (uniprot_id, labels) in enumerate(self.uniprot2label.items()):
-                if uniprot_id in self.swissprot_ids:
-                    if subsection in labels:
-                        swissprot_subsections.add(subsection)
-                        label = labels[subsection]
-                        # ``i`` is the protein's row in protein_indices (same iteration order)
-                        inputs.append((subsection, uniprot_id, i, label))
-        # Randomly shuffle the inputs
-        # Fixed seed so that every rank produces the same order
-        random.seed(20000812)
-        random.shuffle(inputs)
-        # Split inputs into chunks for parallel processing
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        span = math.ceil(len(inputs) / world_size)
-        sub_inputs = inputs[rank * span: (rank + 1) * span]
-        # Display the progress bar on the rank 0 process
-        verbose = self.trainer.local_rank == 0
-        if verbose:
-            print("Evaluating on each subsection...")
-        # NOTE(review): hard-coded absolute directory; this fails on machines where it does
-        # not exist - consider a tempfile-based path.
-        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
-        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
-                                              return_results=True)
-        outputs = mpr.run()
-        os.remove(tmp_path)
-        # Aggregate results
-        tensor_outputs = []
-        for output in outputs:
-            ap, mrr, auc = json.loads(output)
-            tensor_outputs.append([float(ap), float(mrr), float(auc)])
-        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
-        tensor_outputs = self.padded_gather(tensor_outputs)
-        # Record results
-        avg_results = {}
-        for subsection in swissprot_subsections:
-            avg_results[subsection] = {"map": [],
-                                       "mrr": [],
-                                       "auc": []}
-        # NOTE(review): pairing by position assumes mpr.run() returns results in input order
-        # on every rank - verify against MultipleProcessRunnerSimplifier.
-        # (``input`` shadows the builtin of the same name inside this loop.)
-        for input, output in zip(inputs, tensor_outputs):
-            ap, mrr, auc = output
-            subsection, _, _, _ = input
-            avg_results[subsection]["map"].append(ap.cpu().item())
-            avg_results[subsection]["mrr"].append(mrr.cpu().item())
-            avg_results[subsection]["auc"].append(auc.cpu().item())
-        # Raises KeyError if no Swiss-Prot entry carries a "Total" label
-        results = {
-            f"{modality}2Text_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
-            f"{modality}2Text_Total_map": np.mean(avg_results["Total"]["map"]),
-            f"{modality}2Text_Total_auc": np.mean(avg_results["Total"]["auc"]),
-        }
-        # Average MRR, MAP and AUC over the subsections belonging to each level
-        for level, labels in [("residue-level", residue_level),
-                              ("sequence-level", sequence_level),
-                              ("all", residue_level | sequence_level)]:
-            mrrs = []
-            maps = []
-            aucs = []
-            for subsection in labels:
-                if subsection in avg_results:
-                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
-                    maps.append(np.mean(avg_results[subsection]["map"]))
-                    aucs.append(np.mean(avg_results[subsection]["auc"]))
-            results[f"{modality}2Text_{level}_mrr"] = np.mean(mrrs)
-            results[f"{modality}2Text_{level}_map"] = np.mean(maps)
-            results[f"{modality}2Text_{level}_auc"] = np.mean(aucs)
-        return results
-    def _text2protein(self, modality: str, protein_indices, text_indices: dict):
-        """
-        Evaluate text-to-protein retrieval: for every text attached to a Swiss-Prot
-        protein, rank all proteins by similarity and score the ranking.
-        Args:
-            modality: Name inserted into the result keys ("Text2{modality}_...")
-            protein_indices: faiss index holding one embedding per ``uniprot2label`` entry
-            text_indices: Dict mapping subsection name to a faiss index of text embeddings
-        Returns:
-            Dict of mean MRR / MAP / AUC for "Total" and for the residue-level,
-            sequence-level and combined subsection groups
-        """
-        def do(process_id, idx, row, writer):
-            """Score one (subsection, text) query and write [ap, mrr, auc] as a JSON line."""
-            subsection, text_id, label = row
-            # Retrieve ranking results
-            t_embedding = text_indices[subsection].reconstruct(text_id).reshape(1, -1)
-            sim_scores, rank_inds = protein_indices.search(t_embedding, protein_indices.ntotal)
-            sim_scores, rank_inds = sim_scores[0], rank_inds[0]
-            # Calculate Average Precision(AP)
-            ranks = []
-            label = set(label)
-            for i, rk in enumerate(rank_inds):
-                # Find the rank of this label in all labels
-                if rk in label:
-                    ranks.append(i + 1)
-            ranks = np.array(ranks)
-            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])
-            # Calculate Mean Reciprocal Rank(MRR)
-            # NOTE(review): if none of the labels occurs in rank_inds, ``ranks`` is empty and
-            # ranks[0] raises IndexError - confirm every query has at least one indexed label.
-            best_rank = ranks[0]
-            mrr = 1 / best_rank
-            # Calculate the AUC
-            true_labels = np.zeros_like(sim_scores)
-            true_labels[ranks - 1] = 1
-            # AUC is undefined when only one class is present; 0 is reported in that case
-            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
-                auc = 0
-            else:
-                auc = roc_auc_score(true_labels, sim_scores)
-            output = json.dumps([ap, mrr, auc])
-            writer.write(output + "\n")
-        # subsection -> text id -> list of protein row indices annotated with that text
-        text2label = {}
-        swissprot_subsections = set()
-        for i, (uniprot_id, subsections) in enumerate(self.uniprot2label.items()):
-            # Only evaluate the texts in Swiss-Prot
-            if uniprot_id not in self.swissprot_ids:
-                continue
-            for subsection, text_ids in subsections.items():
-                # "seq" and "foldseek" hold sequences, not text labels
-                if subsection == "seq" or subsection == "foldseek":
-                    continue
-                swissprot_subsections.add(subsection)
-                if subsection not in text2label:
-                    text2label[subsection] = {}
-                for text_id in text_ids:
-                    text2label[subsection][text_id] = text2label[subsection].get(text_id, []) + [i]
-        inputs = []
-        for subsection in swissprot_subsections:
-            for i, (text_id, label) in enumerate(text2label[subsection].items()):
-                inputs.append((subsection, text_id, label))
-        # Randomly shuffle the inputs
-        # NOTE(review): ``inputs`` is built by iterating a set of strings, whose order can
-        # differ between processes unless PYTHONHASHSEED is fixed; the seeded shuffle then
-        # does not guarantee the same order on every rank - confirm.
-        random.seed(20000812)
-        random.shuffle(inputs)
-        # Split inputs into chunks for parallel processing
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        span = math.ceil(len(inputs) / world_size)
-        sub_inputs = inputs[rank * span: (rank + 1) * span]
-        # Display the progress bar on the rank 0 process
-        verbose = self.trainer.local_rank == 0
-        if verbose:
-            print("Evaluating on each text...")
-        # Add time stamp to the temporary file name to avoid conflicts
-        # NOTE(review): hard-coded absolute directory; this fails on machines where it does
-        # not exist - consider a tempfile-based path.
-        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
-        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
-                                              return_results=True)
-        outputs = mpr.run()
-        os.remove(tmp_path)
-        # Aggregate results
-        tensor_outputs = []
-        for output in outputs:
-            ap, mrr, auc = json.loads(output)
-            tensor_outputs.append([float(ap), float(mrr), float(auc)])
-        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
-        tensor_outputs = self.padded_gather(tensor_outputs)
-        # Record results
-        avg_results = {}
-        for subsection in swissprot_subsections:
-            avg_results[subsection] = {"map": [],
-                                       "mrr": [],
-                                       "auc": []}
-        # NOTE(review): pairing by position assumes mpr.run() returns results in input order
-        # on every rank - verify against MultipleProcessRunnerSimplifier.
-        # (``input`` shadows the builtin of the same name inside this loop.)
-        for input, output in zip(inputs, tensor_outputs):
-            ap, mrr, auc = output
-            subsection, _, _ = input
-            avg_results[subsection]["map"].append(ap.cpu().item())
-            avg_results[subsection]["mrr"].append(mrr.cpu().item())
-            avg_results[subsection]["auc"].append(auc.cpu().item())
-        # Raises KeyError if no Swiss-Prot entry carries a "Total" label
-        results = {
-            f"Text2{modality}_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
-            f"Text2{modality}_Total_map": np.mean(avg_results["Total"]["map"]),
-            f"Text2{modality}_Total_auc": np.mean(avg_results["Total"]["auc"]),
-        }
-        # Average MRR, MAP and AUC over the subsections belonging to each level
-        for level, labels in [("residue-level", residue_level),
-                              ("sequence-level", sequence_level),
-                              ("all", residue_level | sequence_level)]:
-            mrrs = []
-            maps = []
-            aucs = []
-            for subsection in labels:
-                if subsection in avg_results:
-                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
-                    maps.append(np.mean(avg_results[subsection]["map"]))
-                    aucs.append(np.mean(avg_results[subsection]["auc"]))
-            results[f"Text2{modality}_{level}_mrr"] = np.mean(mrrs)
-            results[f"Text2{modality}_{level}_map"] = np.mean(maps)
-            results[f"Text2{modality}_{level}_auc"] = np.mean(aucs)
-        return results
-    def retrieval_eval(self) -> dict:
-        """
-        Run the retrieval evaluation between protein sequences and texts in both directions
-        Returns:
-            A dictionary merging the results of ``_protein2text`` and ``_text2protein``
-        """
-        # NOTE: only the "Sequence" modality is evaluated here; structure retrieval is not wired in
-        seq_index = self._get_protein_indices()
-        txt_index = self._get_text_indices()
-        merged = {}
-        # Protein -> text first, then text -> protein, so that later keys take precedence
-        for retrieve in (self._protein2text, self._text2protein):
-            merged.update(retrieve("Sequence", seq_index, txt_index))
-        return merged
-    def _apply_bert_mask(self, tokens, tokenizer, mask_ratio):
-        """
-        Apply BERT-style random corruption to a tokenized protein
-        Args:
-            tokens: A list of token strings
-            tokenizer: Tokenizer providing ``get_vocab``, ``convert_tokens_to_ids`` and ``mask_token``
-            mask_ratio: Probability of selecting each token for prediction. Must be positive
-        Returns:
-            masked_tokens: A copy of ``tokens`` in which the selected positions are corrupted
-            labels: LongTensor of length ``len(tokens) + 2``. Entry ``i + 1`` holds the original id of token ``i``
-                    if it was selected and -1 otherwise (presumably the offset leaves room for the special
-                    tokens added by the tokenizer -- confirm against the caller)
-        Raises:
-            ValueError: If ``tokens`` is empty or ``mask_ratio`` is not positive. No position could ever be
-                        selected in that case and the resampling loop would never terminate
-        """
-        if len(tokens) == 0 or not mask_ratio > 0:
-            raise ValueError("Cannot apply masking: tokens must be non-empty and mask_ratio must be positive")
-        # The vocabulary does not change between attempts, so build it only once
-        vocab = list(tokenizer.get_vocab().keys())
-        # Resample until at least one position is selected
-        while True:
-            masked_tokens = copy.copy(tokens)
-            labels = torch.full((len(tokens) + 2,), -1, dtype=torch.long)
-            for i, token in enumerate(tokens):
-                prob = random.random()
-                if prob >= mask_ratio:
-                    continue
-                # Rescale to [0, 1) in order to choose the kind of corruption
-                prob /= mask_ratio
-                labels[i + 1] = tokenizer.convert_tokens_to_ids(token)
-                if prob < 0.8:
-                    # 80% chance to change to mask token
-                    if self.use_saprot:
-                        # Replace everything but the last character of the token with "#"
-                        token = "#" + token[-1]
-                    else:
-                        token = tokenizer.mask_token
-                elif prob < 0.9:
-                    # 10% chance to change to random token
-                    token = random.choice(vocab)
-                # Remaining 10% chance: keep current token
-                masked_tokens[i] = token
-            # Check if there is at least one masked token
-            if (labels != -1).any():
-                return masked_tokens, labels
-    def mlm_eval(self) -> float:
-        """
-        Measure how often the protein encoder recovers tokens corrupted by ``_apply_bert_mask``
-        Every rank scores its own slice of the proteins in ``self.uniprot2label`` with a mask ratio of 0.15
-        Returns:
-            Number of correct predictions divided by the number of selected tokens, both summed over the
-            output of ``self.padded_gather``
-        """
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-        entries = self.uniprot2label.values()
-        if not self.use_saprot:
-            proteins = [entry["seq"] for entry in entries]
-        else:
-            # Interleave every amino acid with the foldseek character at the same position
-            proteins = []
-            for entry in entries:
-                residues, struct_tokens = entry["seq"], entry["foldseek"]
-                assert len(residues) == len(struct_tokens)
-                proteins.append("".join(aa + st for aa, st in zip(residues, struct_tokens)))
-        # Each rank handles one contiguous slice
-        shard_size = math.ceil(len(proteins) / world_size)
-        shard = proteins[rank * shard_size: (rank + 1) * shard_size]
-        # Display the progress bar on the rank 0 process
-        iterator = tqdm(shard, desc="Computing mlm...") if self.trainer.local_rank == 0 else shard
-        tokenizer = self.protein_encoder.tokenizer
-        n_selected = torch.tensor([0], dtype=torch.long, device=self.device)
-        n_correct = torch.tensor([0], dtype=torch.long, device=self.device)
-        for protein in iterator:
-            pieces = tokenizer.tokenize(protein)
-            corrupted, labels = self._apply_bert_mask(pieces, tokenizer, 0.15)
-            inputs = tokenizer(" ".join(corrupted), return_tensors="pt")
-            inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
-            _, logits = self.protein_encoder(inputs, get_mask_logits=True)
-            labels = labels.to(self.device)
-            # Only positions with a label other than -1 are scored
-            is_target = labels != -1
-            predictions = logits.squeeze(0).argmax(dim=-1)[is_target]
-            n_selected += len(predictions)
-            n_correct += (predictions == labels[is_target]).sum()
-        # Gather all results
-        n_selected = self.padded_gather(n_selected).sum()
-        n_correct = self.padded_gather(n_correct).sum()
-        return (n_correct / n_selected).cpu().item()
-    def _load_eval_data(self, stage):
-        """
-        Load the evaluation annotations of one stage and store them on the module
-        Args:
-            stage: Prefix of the datamodule attribute ``{stage}_lmdb`` that holds the data directory,
-                   e.g. "valid" or "test"
-        """
-        # Plain attribute lookup instead of eval(): same result without executing a constructed string
-        lmdb_dir = getattr(self.trainer.datamodule, f"{stage}_lmdb")
-        uniprot2label_path = os.path.join(lmdb_dir, "uniprot2label.json")
-        label2text_path = os.path.join(lmdb_dir, "label2text.json")
-        swissprot_id_path = os.path.join(lmdb_dir, "swissprot_ids.tsv")
-        # Context managers make sure the json files are closed once they are parsed
-        with open(uniprot2label_path, "r") as r:
-            self.uniprot2label = json.load(r)
-        with open(label2text_path, "r") as r:
-            self.label2text = json.load(r)
-        # Every cell of the header-less tsv file is collected into one set
-        self.swissprot_ids = set(pd.read_csv(swissprot_id_path, sep="\t", header=None).values.flatten().tolist())
-        self.k = 3
-    def on_test_start(self):
-        """
-        Load the test data, run the retrieval (and optionally mlm) evaluation, then log and print the result
-        """
-        self._load_eval_data("test")
-        retrieval = self.retrieval_eval()
-        # Prefix every metric name with the stage
-        log_dict = {}
-        for name, value in retrieval.items():
-            log_dict["test_" + name] = value
-        if self.use_mlm_loss:
-            log_dict["test_mask_acc"] = self.mlm_eval()
-        self.log_info(log_dict)
-        print(log_dict)
-    def on_validation_start(self):
-        """
-        Load the validation data, run the retrieval (and optionally mlm) evaluation, log the result and
-        trigger the checkpoint check
-        """
-        # Clear the cache
-        torch.cuda.empty_cache()
-        self._load_eval_data("valid")
-        retrieval = self.retrieval_eval()
-        # Prefix every metric name with the stage
-        log_dict = {}
-        for name, value in retrieval.items():
-            log_dict["valid_" + name] = value
-        if self.use_mlm_loss:
-            log_dict["valid_mask_acc"] = self.mlm_eval()
-        self.log_info(log_dict)
-        # The current step is the monitored value and higher counts as better
-        self.check_save_condition(self.step, mode="max")
-    def test_step(self, batch, batch_idx):
-        """
-        No per-batch work is done: the test evaluation runs in ``on_test_start``
-        """
-        return None
-    def validation_step(self, batch, batch_idx):
-        """
-        No per-batch work is done: the validation evaluation runs in ``on_validation_start``
-        """
-        return None
-    def on_train_epoch_end(self):
-        """
-        Run the parent hook, then draw a new training subset when a fixed subset size is configured
-        """
-        super().on_train_epoch_end()
-        train_set = self.trainer.datamodule.train_dataset
-        # Re-sample the subset of the training data
-        if train_set.fixed_dataset_num is not None:
-            train_set.sample_subset()
-    # def test_epoch_end(self, outputs):
-    #     log_dict = self.get_log_dict("test")
-    #     log_dict["test_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
-    #
-    #     print(log_dict)
-    #     self.log_info(log_dict)
-    #
-    #     self.reset_metrics("test")
-    #
-    # def validation_epoch_end(self, outputs):
-    #     log_dict = self.get_log_dict("valid")
-    #     log_dict["valid_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
-    #
-    #     self.log_info(log_dict)
-    #     self.reset_metrics("valid")
-    #     self.check_save_condition(log_dict["valid_loss"], mode="min")

model/ProTrek/structure_encoder.py DELETED Viewed

@@ -1,86 +0,0 @@
-import torch
-from tqdm import tqdm
-from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer
-from torch.nn.functional import normalize
-class StructureEncoder(torch.nn.Module):
-    """
-    ESM encoder whose first-token hidden state is projected to ``out_dim`` as the structure representation
-    The ESM weights are created from the config only; no pretrained weights are loaded in this class
-    """
-    def __init__(self, config_path: str, out_dim: int, gradient_checkpointing: bool = False):
-        """
-        Args:
-            config_path: Path to the config file
-            out_dim: Output dimension of the structure representation
-            gradient_checkpointing: Whether to use gradient checkpointing
-        """
-        super().__init__()
-        config = EsmConfig.from_pretrained(config_path)
-        self.model = EsmForMaskedLM(config)
-        self.out = torch.nn.Linear(config.hidden_size, out_dim)
-        # Set gradient checkpointing
-        self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing
-        # Remove contact head
-        self.model.esm.contact_head = None
-        # Remove position embedding if the embedding type is ``rotary``
-        if config.position_embedding_type == "rotary":
-            self.model.esm.embeddings.position_embeddings = None
-        self.tokenizer = EsmTokenizer.from_pretrained(config_path)
-    def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """
-        Compute protein structure representation for the given proteins
-        Args:
-            proteins: A list of protein structural sequences
-            batch_size: Batch size for inference
-            verbose: Whether to print progress
-        Returns:
-            L2-normalized representations of shape [len(proteins), out_dim]
-        """
-        device = next(self.parameters()).device
-        # torch.cat fails on an empty list, so an empty input yields an empty [0, out_dim] tensor
-        if len(proteins) == 0:
-            return torch.empty((0, self.out.out_features), dtype=self.out.weight.dtype, device=device)
-        starts = range(0, len(proteins), batch_size)
-        iterator = tqdm(starts, desc="Computing protein embeddings") if verbose else starts
-        protein_repr = []
-        for i in iterator:
-            protein_inputs = self.tokenizer.batch_encode_plus(proteins[i:i + batch_size],
-                                                              return_tensors="pt",
-                                                              padding=True)
-            protein_inputs = {k: v.to(device) for k, v in protein_inputs.items()}
-            # Call the module instead of ``forward`` so that registered hooks are executed
-            output, _ = self(protein_inputs)
-            protein_repr.append(output)
-        protein_repr = torch.cat(protein_repr, dim=0)
-        return normalize(protein_repr, dim=-1)
-    def forward(self, inputs: dict, get_mask_logits: bool = False):
-        """
-        Encode protein structure into protein representation
-        Args:
-            inputs: A dictionary containing the following keys:
-                - input_ids: [batch, seq_len]
-                - attention_mask: [batch, seq_len]
-            get_mask_logits: Whether to return the logits for masked tokens
-        Returns:
-            protein_repr: [batch, protein_repr_dim]
-            mask_logits : [batch, seq_len, vocab_size], or None if ``get_mask_logits`` is False
-        """
-        last_hidden_state = self.model.esm(**inputs).last_hidden_state
-        # The hidden state of the first token is used as the representation of the whole input
-        reprs = self.out(last_hidden_state[:, 0, :])
-        # Get logits for masked tokens
-        mask_logits = self.model.lm_head(last_hidden_state) if get_mask_logits else None
-        return reprs, mask_logits

model/ProTrek/text_encoder.py DELETED Viewed

@@ -1,81 +0,0 @@
-import torch
-from tqdm import tqdm
-from torch.nn.functional import normalize
-from transformers import BertConfig, BertModel, BertTokenizer
-class TextEncoder(torch.nn.Module):
-    """
-    BERT encoder (without pooling layer) whose first-token hidden state is projected to ``out_dim``
-    """
-    def __init__(self,
-                 config_path: str,
-                 out_dim: int,
-                 load_pretrained: bool = True,
-                 gradient_checkpointing: bool = False):
-        """
-        Args:
-            config_path: Path to the config file
-            out_dim: Output dimension of the text representation
-            load_pretrained: Whether to load pretrained weights
-            gradient_checkpointing: Whether to enable gradient checkpointing
-        """
-        super().__init__()
-        config = BertConfig.from_pretrained(config_path)
-        if load_pretrained:
-            self.model = BertModel.from_pretrained(config_path, add_pooling_layer=False)
-        else:
-            self.model = BertModel(config, add_pooling_layer=False)
-        self.out = torch.nn.Linear(config.hidden_size, out_dim)
-        # Set gradient checkpointing
-        self.model.encoder.gradient_checkpointing = gradient_checkpointing
-        self.tokenizer = BertTokenizer.from_pretrained(config_path)
-    def get_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
-        """
-        Compute text representation for the given texts
-        Args:
-            texts: A list of strings
-            batch_size: Batch size for inference
-            verbose: Whether to print progress
-        Returns:
-            L2-normalized representations of shape [len(texts), out_dim]
-        """
-        device = next(self.parameters()).device
-        # torch.cat fails on an empty list, so an empty input yields an empty [0, out_dim] tensor
-        if len(texts) == 0:
-            return torch.empty((0, self.out.out_features), dtype=self.out.weight.dtype, device=device)
-        starts = range(0, len(texts), batch_size)
-        iterator = tqdm(starts, desc="Computing text embeddings") if verbose else starts
-        text_repr = []
-        for i in iterator:
-            # Texts longer than 512 tokens are truncated
-            text_inputs = self.tokenizer.batch_encode_plus(texts[i: i+batch_size],
-                                                           return_tensors="pt",
-                                                           truncation=True,
-                                                           max_length=512,
-                                                           padding=True)
-            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-            text_repr.append(self(text_inputs))
-        text_repr = torch.cat(text_repr, dim=0)
-        return normalize(text_repr, dim=-1)
-    def forward(self, inputs: dict):
-        """
-        Encode text into text representation
-        Args:
-            inputs: A dictionary containing the following keys:
-                - input_ids: [batch, seq_len]
-                - attention_mask: [batch, seq_len]
-                - token_type_ids: [batch, seq_len]
-        Returns:
-            text_repr: [batch, text_repr_dim]
-        """
-        # The hidden state of the first token is used as the representation of the whole text
-        first_token = self.model(**inputs).last_hidden_state[:, 0, :]
-        return self.out(first_token)

model/abstract_model.py DELETED Viewed

@@ -1,401 +0,0 @@
-import torch
-import abc
-import os
-import copy
-import pytorch_lightning as pl
-from utils.lr_scheduler import *
-from torch import distributed as dist
-class AbstractModel(pl.LightningModule):
-    """
-    Base LightningModule holding the optimizer, the lr scheduler, the step/epoch counters and the
-    checkpoint logic. Subclasses implement ``initialize_model``, ``forward``, ``initialize_metrics``
-    and ``loss_func``
-    """
-    def __init__(self,
-                 lr_scheduler_kwargs: dict = None,
-                 optimizer_kwargs: dict = None,
-                 save_path: str = None,
-                 from_checkpoint: str = None,
-                 load_prev_scheduler: bool = False,
-                 save_weights_only: bool = True,):
-        """
-        Args:
-            lr_scheduler_kwargs: Kwargs for lr_scheduler. Must contain "class" and "init_lr"
-            optimizer_kwargs: Kwargs for optimizer. Must contain "class" and "weight_decay"
-            save_path: Save trained model. The value is expanded like an f-string when a checkpoint is written
-            from_checkpoint: Load model from checkpoint
-            load_prev_scheduler: Whether load previous scheduler from checkpoint
-            save_weights_only: Whether save only weights or also optimizer and lr_scheduler
-        """
-        super().__init__()
-        self.initialize_model()
-        self.metrics = {}
-        for stage in ["train", "valid", "test"]:
-            stage_metrics = self.initialize_metrics(stage)
-            # Register metrics as attributes
-            for metric_name, metric in stage_metrics.items():
-                setattr(self, metric_name, metric)
-            self.metrics[stage] = stage_metrics
-        if lr_scheduler_kwargs is None:
-            # Default lr_scheduler
-            self.lr_scheduler_kwargs = {
-                "class": "ConstantLRScheduler",
-                "init_lr": 0,
-            }
-            print("No lr_scheduler_kwargs provided. The default learning rate is 0.")
-        else:
-            self.lr_scheduler_kwargs = lr_scheduler_kwargs
-        if optimizer_kwargs is None:
-            # Default optimizer
-            self.optimizer_kwargs = {
-                "class": "AdamW",
-                "betas": (0.9, 0.98),
-                "weight_decay": 0.01,
-            }
-            print("No optimizer_kwargs provided. The default optimizer is AdamW.")
-        else:
-            self.optimizer_kwargs = optimizer_kwargs
-        self.init_optimizers()
-        self.save_path = save_path
-        self.save_weights_only = save_weights_only
-        # temp_step is used for accumulating gradients
-        self.temp_step = 0
-        self.step = 0
-        self.epoch = 0
-        self.load_prev_scheduler = load_prev_scheduler
-        self.from_checkpoint = from_checkpoint
-        if from_checkpoint:
-            self.load_checkpoint(from_checkpoint)
-    @abc.abstractmethod
-    def initialize_model(self) -> None:
-        """
-        All model initialization should be done here
-        Note that the whole model must be named as "self.model" for model saving and loading
-        """
-        raise NotImplementedError
-    @abc.abstractmethod
-    def forward(self, *args, **kwargs):
-        """
-        Forward propagation
-        """
-        raise NotImplementedError
-    @abc.abstractmethod
-    def initialize_metrics(self, stage: str) -> dict:
-        """
-        Initialize metrics for each stage
-        Args:
-            stage: "train", "valid" or "test"
-        Returns:
-            A dictionary of metrics for the stage. Keys are metric names and values are metric objects
-        """
-        raise NotImplementedError
-    @abc.abstractmethod
-    def loss_func(self, stage: str, outputs, labels) -> torch.Tensor:
-        """
-        Args:
-            stage: "train", "valid" or "test"
-            outputs: model outputs for calculating loss
-            labels: labels for calculating loss
-        Returns:
-            loss
-        """
-        raise NotImplementedError
-    @staticmethod
-    def _get_rank() -> int:
-        """
-        Return the global rank, or 0 when torch.distributed is not available or not initialized
-        """
-        if dist.is_available() and dist.is_initialized():
-            return dist.get_rank()
-        return 0
-    @staticmethod
-    def load_weights(model, weights):
-        """
-        Copy every checkpoint entry whose name exists in ``model`` and report the names that do not match
-        Args:
-            model: Module to load the weights into
-            weights: Mapping from parameter name to tensor
-        """
-        model_dict = model.state_dict()
-        unused_params = []
-        for k, v in weights.items():
-            if k in model_dict:
-                model_dict[k] = v
-            else:
-                unused_params.append(k)
-        # Collected in a single pass instead of calling list.remove() for every loaded weight
-        missed_params = [k for k in model_dict if k not in weights]
-        if len(missed_params) > 0:
-            print(f"\033[31mSome weights of {type(model).__name__} were not "
-                  f"initialized from the model checkpoint: {missed_params}\033[0m")
-        if len(unused_params) > 0:
-            print(f"\033[31mSome weights of the model checkpoint were not used: {unused_params}\033[0m")
-        model.load_state_dict(model_dict)
-    def optimizer_step(
-        self,
-        epoch: int,
-        batch_idx: int,
-        optimizer,
-        optimizer_closure=None,
-    ) -> None:
-        """
-        Run the default optimizer step and count one global step every ``accumulate_grad_batches`` calls
-        """
-        super().optimizer_step(epoch, batch_idx, optimizer, optimizer_closure)
-        self.temp_step += 1
-        if self.temp_step == self.trainer.accumulate_grad_batches:
-            self.step += 1
-            self.temp_step = 0
-    # For pytorch-lightning 1.9.5
-    # def optimizer_step(
-    #     self,
-    #     epoch: int,
-    #     batch_idx: int,
-    #     optimizer,
-    #     optimizer_idx: int = 0,
-    #     optimizer_closure=None,
-    #     on_tpu: bool = False,
-    #     using_native_amp: bool = False,
-    #     using_lbfgs: bool = False,
-    # ) -> None:
-    #     super().optimizer_step(
-    #         epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
-    #     )
-    #     self.temp_step += 1
-    #     if self.temp_step == self.trainer.accumulate_grad_batches:
-    #         self.step += 1
-    #         self.temp_step = 0
-    def on_train_epoch_end(self):
-        """
-        Count the finished epoch
-        """
-        self.epoch += 1
-    def training_step(self, batch, batch_idx):
-        """
-        Compute and log the training loss of one batch
-        Args:
-            batch: A tuple ``(inputs, labels)`` where ``inputs`` is passed to ``forward`` as keyword arguments
-            batch_idx: Index of the batch
-        Returns:
-            loss
-        """
-        inputs, labels = batch
-        outputs = self(**inputs)
-        loss = self.loss_func('train', outputs, labels)
-        self.log("loss", loss, prog_bar=True)
-        return loss
-    def validation_step(self, batch, batch_idx):
-        """
-        Compute the validation loss of one batch and collect it in ``self.valid_outputs``
-        """
-        inputs, labels = batch
-        outputs = self(**inputs)
-        loss = self.loss_func('valid', outputs, labels)
-        self.valid_outputs.append(loss)
-        return loss
-    def test_step(self, batch, batch_idx):
-        """
-        Compute the test loss of one batch and collect it in ``self.test_outputs``
-        """
-        inputs, labels = batch
-        outputs = self(**inputs)
-        loss = self.loss_func('test', outputs, labels)
-        self.test_outputs.append(loss)
-        return loss
-    def on_train_start(self) -> None:
-        """
-        Restore the counters, best value, lr scheduler and optimizer state kept by ``load_checkpoint``
-        Raises:
-            Exception: If the previous state cannot be restored. The original error is attached as the cause
-        """
-        # Load previous scheduler
-        if getattr(self, "prev_schechuler", None) is None:
-            return
-        try:
-            self.step = self.prev_schechuler["global_step"]
-            self.epoch = self.prev_schechuler["epoch"]
-            self.best_value = self.prev_schechuler["best_value"]
-            self.lr_scheduler.load_state_dict(self.prev_schechuler["lr_scheduler"])
-            print(f"Previous training global step: {self.step}")
-            print(f"Previous training epoch: {self.epoch}")
-            print(f"Previous best value: {self.best_value}")
-            print(f"Previous lr_scheduler: {self.prev_schechuler['lr_scheduler']}")
-            # Load optimizer state
-            if hasattr(self.trainer.strategy, "deepspeed_engine"):
-                # For DeepSpeed strategy. A failure here is only reported, training continues
-                try:
-                    self.trainer.strategy.deepspeed_engine.load_checkpoint(self.from_checkpoint)
-                except Exception as e:
-                    print(e)
-            else:
-                # For DDP strategy
-                self.optimizer.load_state_dict(self.prev_schechuler["optimizer"])
-        except Exception as e:
-            print(e)
-            # Chain the original error so that its traceback is not lost
-            raise Exception("Error in loading previous scheduler. Please set load_prev_scheduler=False") from e
-    def on_validation_epoch_start(self) -> None:
-        """
-        Reset the list that collects the validation losses
-        """
-        self.valid_outputs = []
-    def on_test_epoch_start(self) -> None:
-        """
-        Reset the list that collects the test losses
-        """
-        self.test_outputs = []
-    def load_checkpoint(self, from_checkpoint: str) -> None:
-        """
-        Args:
-            from_checkpoint:  Path to checkpoint.
-        """
-        # If ``from_checkpoint`` is a directory, load the checkpoint in it
-        if os.path.isdir(from_checkpoint):
-            basename = os.path.basename(from_checkpoint)
-            from_checkpoint = os.path.join(from_checkpoint, f"{basename}.pt")
-        state_dict = torch.load(from_checkpoint, map_location=self.device)
-        self.load_weights(self.model, state_dict["model"])
-        if self.load_prev_scheduler:
-            # Everything except the weights is kept until ``on_train_start`` restores it
-            state_dict.pop("model")
-            self.prev_schechuler = state_dict
-    def save_checkpoint(self, save_path: str, save_info: dict = None, save_weights_only: bool = True) -> None:
-        """
-        Save model to save_path
-        Args:
-            save_path: Path to save model
-            save_info: Other info to save. The dictionary itself is left unchanged
-            save_weights_only: Whether only save model weights
-        """
-        save_dir = os.path.dirname(save_path)
-        # A bare file name has no directory part and os.makedirs("") would fail
-        if save_dir:
-            os.makedirs(save_dir, exist_ok=True)
-        # Copy so that the caller's ``save_info`` is not modified
-        state_dict = {} if save_info is None else dict(save_info)
-        state_dict["model"] = self.model.state_dict()
-        # Convert model weights to fp32
-        for k, v in state_dict["model"].items():
-            state_dict["model"][k] = v.float()
-        if not save_weights_only:
-            state_dict["global_step"] = self.step
-            state_dict["epoch"] = self.epoch
-            state_dict["best_value"] = getattr(self, "best_value", None)
-            state_dict["lr_scheduler"] = self.lr_schedulers().state_dict()
-            # If not using DeepSpeed, save optimizer state
-            if not hasattr(self.trainer.strategy, "deepspeed_engine"):
-                state_dict["optimizer"] = self.optimizers().optimizer.state_dict()
-        torch.save(state_dict, save_path)
-    def check_save_condition(self, now_value: float, mode: str, save_info: dict = None) -> None:
-        """
-        Check whether to save model. If save_path is not None and now_value is the best, save model.
-        Args:
-            now_value: Current metric value
-            mode: "min" or "max", meaning whether the lower the better or the higher the better
-            save_info: Other info to save
-        """
-        assert mode in ["min", "max"], "mode should be 'min' or 'max'"
-        if self.save_path is None:
-            return
-        # In case there are variables to be included in the save path
-        # NOTE(review): eval() executes whatever expression the configured path contains and breaks on a
-        # path with a single quote. Only pass trusted values as ``save_path``
-        save_path = eval(f"f'{self.save_path}'")
-        save_dir = os.path.dirname(save_path)
-        if save_dir:
-            os.makedirs(save_dir, exist_ok=True)
-        # Check whether to save model
-        best_value = getattr(self, "best_value", None)
-        if best_value is not None:
-            if (mode == "min" and now_value >= best_value) or (mode == "max" and now_value <= best_value):
-                return
-        self.best_value = now_value
-        # For DeepSpeed strategy
-        if hasattr(self.trainer.strategy, "deepspeed_engine"):
-            if not self.save_weights_only:
-                self.trainer.strategy.deepspeed_engine.save_checkpoint(save_path, tag="deepspeed_ckpt")
-            # Save a complete checkpoint
-            if self._get_rank() == 0:
-                basename = os.path.basename(save_path)
-                ckpt_path = os.path.join(save_path, f"{basename}.pt")
-                self.save_checkpoint(ckpt_path, save_info, self.save_weights_only)
-        # For normal situation
-        else:
-            if self._get_rank() == 0:
-                self.save_checkpoint(save_path, save_info, self.save_weights_only)
-    def reset_metrics(self, stage) -> None:
-        """
-        Reset metrics for given stage
-        Args:
-            stage: "train", "valid" or "test"
-        """
-        for metric in self.metrics[stage].values():
-            metric.reset()
-    def get_log_dict(self, stage: str) -> dict:
-        """
-        Get log dict for the stage
-        Args:
-            stage: "train", "valid" or "test"
-        Returns:
-            A dictionary of metrics for the stage. Keys are metric names and values are metric values
-        """
-        return {name: metric.compute() for name, metric in self.metrics[stage].items()}
-    def log_info(self, info: dict) -> None:
-        """
-        Record metrics during training and testing
-        Args:
-            info: dict of metrics. "learning_rate" and "epoch" are added to it on the logging rank
-        """
-        if getattr(self, "logger", None) is not None and self._get_rank() == 0:
-            info["learning_rate"] = self.lr_scheduler.get_last_lr()[0]
-            info["epoch"] = self.epoch
-            self.logger.log_metrics(info, step=self.step)
-    def init_optimizers(self):
-        """
-        Build ``self.optimizer`` and ``self.lr_scheduler`` from ``optimizer_kwargs`` and ``lr_scheduler_kwargs``
-        """
-        copy_optimizer_kwargs = copy.deepcopy(self.optimizer_kwargs)
-        # No decay for layer norm and bias
-        no_decay = ['LayerNorm.weight', 'bias']
-        weight_decay = copy_optimizer_kwargs.pop("weight_decay")
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
-             'weight_decay': weight_decay},
-            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
-             'weight_decay': 0.0}
-        ]
-        # Resolve the (possibly dotted) class name inside torch.optim with getattr instead of eval()
-        optimizer_cls = torch.optim
-        for attr in copy_optimizer_kwargs.pop('class').split("."):
-            optimizer_cls = getattr(optimizer_cls, attr)
-        self.optimizer = optimizer_cls(optimizer_grouped_parameters,
-                                       lr=self.lr_scheduler_kwargs['init_lr'],
-                                       **copy_optimizer_kwargs)
-        tmp_kwargs = copy.deepcopy(self.lr_scheduler_kwargs)
-        lr_scheduler = tmp_kwargs.pop("class")
-        # NOTE(review): eval() resolves the scheduler class by name in this module's namespace and
-        # executes the configured string. Only pass trusted values as "class"
-        self.lr_scheduler = eval(lr_scheduler)(self.optimizer, **tmp_kwargs)
-    def configure_optimizers(self):
-        """
-        Returns:
-            The optimizer together with the lr scheduler, which is stepped once per training step
-        """
-        return {"optimizer": self.optimizer,
-                "lr_scheduler": {"scheduler": self.lr_scheduler,
-                                 "interval": "step",
-                                 "frequency": 1}
-                }

model/model_interface.py DELETED Viewed

@@ -1,104 +0,0 @@
-import os
-import yaml
-import glob
-# register all available models through *_model.py files
-# def construct_model():
-#     model_dir = os.path.dirname(__file__)
-#
-#     # lists all model files
-#     model_list = []
-#     for root, _, names in os.walk(model_dir):
-#         for name in names:
-#             if name.endswith('_model.py'):
-#                 sub_dirs = root.replace(model_dir, '').split(os.sep)
-#                 model_list.append((sub_dirs, name[:-3]))
-#
-#     # load model_config.yaml, controlling which models to be loaded
-#     model_config = yaml.safe_load(open(f"{model_dir}/model_config.yaml", "r"))
-#
-#     if model_config["verbose"]:
-#         print("*" * 30 + f" Loading model " + "*" * 30)
-#
-#     # register models
-#     for sub_dirs, name in model_list:
-#         if name in model_config["models"]:
-#             if len(sub_dirs) > 1:
-#                 cmd = f"from {'.'.join(sub_dirs)} import {name}"
-#             else:
-#                 cmd = f"from . import {name}"
-#
-#             exec(cmd)
-#
-#             if model_config["verbose"]:
-#                 info = f"Loaded model: {name}"
-#                 print(f"\033[32m{info}\033[0m")
-#         else:
-#             if model_config["verbose"]:
-#                 info = f"Skipped model: {name}"
-#                 print(f"\033[31m{info}\033[0m")
-#
-#     if model_config["verbose"]:
-#         print("*" * 75)
-#
-#
-# # register function as a wrapper for all models
-# def register_model(cls):
-#     model_dict[cls.__name__] = cls
-#     return cls
-#
-#
-# model_dict = {}
-# construct_model()
-#
-#
-# class ModelInterface:
-#     @classmethod
-#     def get_available_models(cls):
-#         return model_dict.keys()
-#
-#     @classmethod
-#     def init_model(cls, model: str, **kwargs):
-#         """
-#
-#         Args:
-#            model   : Class name of model you want to use. Must be in model_dict.keys()
-#            **kwargs: Kwargs for model initialization
-#
-#         Returns: Corresponding model
-#
-#         """
-#         assert model in model_dict.keys(), f"class {model} doesn't exist!"
-#         return model_dict[model](**kwargs)
-########################################################################
-#                             Version 2                                #
-########################################################################
-# register function as a wrapper for all models
-# Classes recorded by ``register_model``, keyed by the name of the module that defines them
-_registered_models = {}
-def register_model(cls):
-    """
-    Class decorator that records ``cls`` as the model defined by its module
-    Args:
-        cls: Model class to register
-    Returns: ``cls`` unchanged
-    """
-    global now_cls
-    now_cls = cls
-    _registered_models[cls.__module__] = cls
-    return cls
-# Most recently registered class, kept as a fallback for classes that are not found by module name
-now_cls = None
-class ModelInterface:
-    @classmethod
-    def init_model(cls, model_py_path: str, **kwargs):
-        """
-        Args:
-            model_py_path: Py file Path of model you want to use.
-            **kwargs: Kwargs for model initialization
-        Returns: Corresponding model
-        """
-        # Imported locally so that the file-level imports stay untouched
-        import importlib
-        sub_dirs = model_py_path.split(os.sep)
-        # Relative import of the model module. Its ``register_model`` decorator runs on the first import
-        module = importlib.import_module("." + ".".join(sub_dirs), package=__package__)
-        # Look the class up by module name: a module that is already imported does not run its decorator
-        # again, so ``now_cls`` alone could still point to a class registered by another module
-        model_cls = _registered_models.get(module.__name__, now_cls)
-        return model_cls(**kwargs)