ypesk committed · verified
Commit 435ab1a · Parent(s): 80cfa93

Update tasks/text.py

Files changed (1): tasks/text.py (+184 −22)
tasks/text.py CHANGED
@@ -30,7 +30,13 @@ else:
     device = torch.device("cpu")


-MODEL = "ct" #mlp, ct, modern
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+
+
+MODEL = "modern-large" #mlp, ct, modern-base, modern-large, gte-base, gte-large

 class ConspiracyClassification(
     nn.Module,
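For reference, the added device block is behavior-equivalent to the usual one-liner; a minimal sketch, not part of the commit:

import torch

# pick the GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")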
@@ -65,26 +71,90 @@ class ConspiracyClassification(

         return outputs

-class CovidTwitterBertClassifier(
+class CTBERT(
     nn.Module,
     PyTorchModelHubMixin,
     # optionally, you can add metadata which gets pushed to the model card
 ):
     def __init__(self, num_classes):
         super().__init__()
-        self.n_classes = num_classes
         self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
         self.bert.cls.seq_relationship = nn.Linear(1024, num_classes)
-
-        self.sigmoid = nn.Sigmoid()

-    def forward(self, input_ids, token_type_ids, input_mask):
+    def forward(self, input_ids, input_mask, token_type_ids):
         outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask)
-
         logits = outputs[1]

         return logits
+
+class conspiracyModelBase(
+    nn.Module,
+    PyTorchModelHubMixin,
+    # optionally, you can add metadata which gets pushed to the model card
+):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.n_classes = num_classes
+        self.bert = ModernBertForSequenceClassification.from_pretrained('answerdotai/ModernBERT-base', num_labels=num_classes)
+
+    def forward(self, input_ids, input_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=input_mask)
+
+        return outputs.logits
+
+class conspiracyModelLarge(
+    nn.Module,
+    PyTorchModelHubMixin,
+    # optionally, you can add metadata which gets pushed to the model card
+):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.n_classes = num_classes
+        self.bert = ModernBertForSequenceClassification.from_pretrained('answerdotai/ModernBERT-large', num_labels=num_classes)
+
+    def forward(self, input_ids, input_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=input_mask)
+
+        return outputs.logits
+
+class gteModelLarge(
+    nn.Module,
+    PyTorchModelHubMixin,
+    # optionally, you can add metadata which gets pushed to the model card
+):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.n_classes = num_classes
+        #self.bert = ModernBertForSequenceClassification.from_pretrained('answerdotai/ModernBERT-large', num_labels=num_classes)
+        self.gte = AutoModel.from_pretrained('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
+        #self.cls = nn.Linear(768, num_classes)
+        self.cls = nn.Linear(1024, num_classes)
+
+    def forward(self, input_ids, input_mask, input_type_ids):
+        outputs = self.gte(input_ids=input_ids, attention_mask=input_mask, token_type_ids=input_type_ids)
+        embeddings = outputs.last_hidden_state[:, 0]
+        logits = self.cls(embeddings)
+        return logits
+
+class gteModel(
+    nn.Module,
+    PyTorchModelHubMixin,
+    # optionally, you can add metadata which gets pushed to the model card
+):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.n_classes = num_classes
+        #self.bert = ModernBertForSequenceClassification.from_pretrained('answerdotai/ModernBERT-large', num_labels=num_classes)
+        self.gte = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
+        self.cls = nn.Linear(768, num_classes)
+        #self.cls = nn.Linear(1024, num_classes)
+
+    def forward(self, input_ids, input_mask, input_type_ids):
+        outputs = self.gte(input_ids=input_ids, attention_mask=input_mask, token_type_ids=input_type_ids)
+        embeddings = outputs.last_hidden_state[:, 0]
+        logits = self.cls(embeddings)
+        return logits
+

 @router.post(ROUTE, tags=["Text Task"],
              description=DESCRIPTION)
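All of these wrappers inherit from_pretrained, save_pretrained, and push_to_hub from PyTorchModelHubMixin, which is how the ypesk/frugal-ai-* checkpoints below are produced and reloaded. A minimal round-trip sketch, assuming the classes above are importable from tasks.text; the repo id and class count here are hypothetical:

from tasks.text import conspiracyModelLarge

# init kwargs are serialized to config.json by the mixin
model = conspiracyModelLarge(num_classes=2)          # hypothetical class count
model.save_pretrained("modern-large-ckpt")           # writes weights + config
model.push_to_hub("your-username/your-repo")         # hypothetical repo id
reloaded = conspiracyModelLarge.from_pretrained("your-username/your-repo")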
@@ -120,28 +190,20 @@ async def evaluate_text(request: TextEvaluationRequest):
     # Split dataset
     train_test = dataset["train"]
     test_dataset = dataset["test"]
-
-    # Start tracking emissions
-    tracker.start()
-    tracker.start_task("inference")

-    #--------------------------------------------------------------------------------------------
-    # YOUR MODEL INFERENCE CODE HERE
-    # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
     if MODEL == "mlp":
         model = ConspiracyClassification.from_pretrained("ypesk/frugal-ai-mlp-baseline")
         model = model.to(device)
         emb_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
         batch_size = 6

         test_texts = torch.Tensor(emb_model.encode([t['quote'] for t in test_dataset]))
         test_data = TensorDataset(test_texts)
         test_sampler = SequentialSampler(test_data)
         test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

     elif MODEL == "ct":
-        model = CovidTwitterBertClassifier.from_pretrained("ypesk/ct-baseline")
+        model = CTBERT.from_pretrained("ypesk/frugal-ai-ct-bert-baseline")
         model = model.to(device)
         tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')
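Each transformer branch below builds its DataLoader the same way: tokenize to a fixed length, wrap the tensors, and batch sequentially. A condensed, self-contained sketch of the shared pattern (constants taken from the diff; the one-item test_dataset is a stand-in for the real split):

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
test_dataset = [{"quote": "example tweet"}]          # stand-in for the real split

# tokenize to fixed length, then wrap the tensors for sequential batching
enc = tokenizer([t['quote'] for t in test_dataset],
                max_length=256, padding='max_length', truncation=True)
test_data = TensorDataset(torch.tensor(enc['input_ids']),
                          torch.tensor(enc['attention_mask']))
test_dataloader = DataLoader(test_data,
                             sampler=SequentialSampler(test_data),
                             batch_size=12)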
 
@@ -161,18 +223,118 @@ async def evaluate_text(request: TextEvaluationRequest):

         test_sampler = SequentialSampler(test_data)
         test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+    elif MODEL == "modern-base":
+        model = conspiracyModelBase.from_pretrained("ypesk/frugal-ai-modern-base-baseline")
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
+
+        test_texts = [t['quote'] for t in test_dataset]
+
+        MAX_LEN = 256 #1024 # < m some tweets will be truncated
+
+        tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
+        test_input_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['attention_mask']
+
+        test_input_ids = torch.tensor(test_input_ids)
+        test_attention_mask = torch.tensor(test_attention_mask)
+
+        batch_size = 12
+        test_data = TensorDataset(test_input_ids, test_attention_mask)
+
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+    elif MODEL == "modern-large":
+        model = conspiracyModelLarge.from_pretrained("ypesk/frugal-ai-modern-large-baseline")
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-large")
+
+        test_texts = [t['quote'] for t in test_dataset]
+
+        MAX_LEN = 256 #1024 # < m some tweets will be truncated
+
+        tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
+        test_input_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['attention_mask']
+
+        test_input_ids = torch.tensor(test_input_ids)
+        test_attention_mask = torch.tensor(test_attention_mask)
+
+        batch_size = 12
+        test_data = TensorDataset(test_input_ids, test_attention_mask)
+
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+    elif MODEL == "gte-base":
+        model = gteModel.from_pretrained("ypesk/frugal-ai-gte-base-baseline")
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5')
+
+        test_texts = [t['quote'] for t in test_dataset]
+
+        MAX_LEN = 256 #1024 # < m some tweets will be truncated
+
+        tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
+        test_input_ids, test_attention_mask, test_token_type_ids = tokenized_test['input_ids'], tokenized_test['attention_mask'], tokenized_test['token_type_ids']
+
+        test_input_ids = torch.tensor(test_input_ids)
+        test_attention_mask = torch.tensor(test_attention_mask)
+        test_token_type_ids = torch.tensor(test_token_type_ids)
+
+        batch_size = 12
+        test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
+
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+    elif MODEL == "gte-large":
+        model = gteModel.from_pretrained("ypesk/frugal-ai-gte-large-baseline")
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-large-en-v1.5')
+
+        test_texts = [t['quote'] for t in test_dataset]
+
+        MAX_LEN = 256 #1024 # < m some tweets will be truncated
+
+        tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
+        test_input_ids, test_attention_mask, test_token_type_ids = tokenized_test['input_ids'], tokenized_test['attention_mask'], tokenized_test['token_type_ids']
+
+        test_input_ids = torch.tensor(test_input_ids)
+        test_attention_mask = torch.tensor(test_attention_mask)
+        test_token_type_ids = torch.tensor(test_token_type_ids)
+
+        batch_size = 12
+        test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
+
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+
+    # Start tracking emissions
+    tracker.start()
+    tracker.start_task("inference")

+    #--------------------------------------------------------------------------------------------
+    # YOUR MODEL INFERENCE CODE HERE
+    # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+    #--------------------------------------------------------------------------------------------
+
     model.eval()
-    predictions = []
     for batch in tqdm(test_dataloader):
         batch = tuple(t.to(device) for t in batch)
         with torch.no_grad():
             if MODEL == "mlp":
                 b_texts = batch[0]
                 logits = model(b_texts)
-            elif MODEL == "ct":
+            elif MODEL == "modern-base" or MODEL == "modern-large":
+                b_input_ids, b_input_mask = batch
+                logits = model(b_input_ids, b_input_mask)
+            elif MODEL == "gte-base" or MODEL == "gte-large" or MODEL == "ct":
                 b_input_ids, b_input_mask, b_token_type_ids = batch
-                logits = model(b_input_ids, b_token_type_ids, b_input_mask)
+                logits = model(b_input_ids, b_input_mask, b_token_type_ids)

             logits = logits.detach().cpu().numpy()
             predictions.extend(logits.argmax(1))
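The batch-unpacking branches mirror each forward() signature: mlp takes precomputed embeddings, modern-* take (input_ids, input_mask), and gte-*/ct also take token type ids. A standalone sketch of the same dispatch using membership tests instead of chained equality; not the commit's code:

import torch

def run_batch(model, batch, model_name):
    # dispatch a batch tuple to the matching forward() signature
    with torch.no_grad():
        if model_name == "mlp":
            return model(batch[0])                    # precomputed sentence embeddings
        if model_name in ("modern-base", "modern-large"):
            return model(*batch)                      # (input_ids, input_mask)
        if model_name in ("gte-base", "gte-large", "ct"):
            return model(*batch)                      # (input_ids, input_mask, token_type_ids)
        raise ValueError(f"unknown model: {model_name}")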
 
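The tracker calls now run after model and dataloader setup, so only the measured forward passes count toward the reported emissions. A minimal sketch of that ordering with codecarbon's task API; the surrounding names are assumptions, not code from this commit:

from codecarbon import EmissionsTracker

tracker = EmissionsTracker()
tracker.start()
tracker.start_task("inference")
# ... model.eval(); batched torch.no_grad() forward passes ...
inference_data = tracker.stop_task()   # per-task measurements
tracker.stop()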