text-ypesk

Sleeping

App Files Files Community

ypesk committed on Jan 28

Commit

4d8b8b9

verified ·

1 Parent(s): a71804f

Update tasks/text.py

Browse files

Files changed (1) hide show

tasks/text.py +19 -12

tasks/text.py CHANGED Viewed

@@ -3,6 +3,14 @@ from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
@@ -55,13 +63,16 @@ async def evaluate_text(request: TextEvaluationRequest):
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
     #--------------------------------------------------------------------------------------------
-    class CovidTwitterBertClassifier(nn.Module):
-        def __init__(self, n_classes):
             super().__init__()
-            self.n_classes = n_classes
             self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
-            self.bert.cls.seq_relationship = nn.Linear(1024, n_classes)
             self.sigmoid = nn.Sigmoid()
@@ -71,11 +82,7 @@ async def evaluate_text(request: TextEvaluationRequest):
             logits = outputs[1]
             return logits
-    model = CovidTwitterBertClassifier(8)
-    model.to(device)
-    model.load_state_dict(torch.load('ypesk/ct_baseline/CTBert_128_e15_0.692.pth'))
     model.eval()
@@ -83,7 +90,7 @@ async def evaluate_text(request: TextEvaluationRequest):
     test_texts = [t['quote'] for t in data_test]
-    MAX_LEN = 128 #1024 # < m some tweets will be truncated
     tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
     test_input_ids, test_token_type_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['token_type_ids'], tokenized_test['attention_mask']
@@ -92,7 +99,7 @@ async def evaluate_text(request: TextEvaluationRequest):
     test_input_ids = torch.tensor(test_input_ids)
     test_attention_mask = torch.tensor(test_attention_mask)
-    batch_size = 8 #
     test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
     test_sampler = SequentialSampler(test_data)

 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import random
+import numpy as np
+from huggingface_hub import PyTorchModelHubMixin
+import torch
+import torch.nn as nn
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertForPreTraining, BertModel, AutoTokenizer, BertForSequenceClassification, RobertaForSequenceClassification
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
     # YOUR MODEL INFERENCE CODE HERE
     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
     #--------------------------------------------------------------------------------------------
+    class CovidTwitterBertClassifier(
+        nn.Module,
+        PyTorchModelHubMixin,
+        # optionally, you can add metadata which gets pushed to the model card
+    ):
+        def __init__(self, num_classes):
             super().__init__()
+            self.n_classes = num_classes
             self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
+            self.bert.cls.seq_relationship = nn.Linear(1024, num_classes)
             self.sigmoid = nn.Sigmoid()
             logits = outputs[1]
             return logits
+    model = CovidTwitterBertClassifier.from_pretrained("ypesk/ct-baseline")
     model.eval()
     test_texts = [t['quote'] for t in data_test]
+    MAX_LEN = 256 #1024 # < m some tweets will be truncated
     tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
     test_input_ids, test_token_type_ids, test_attention_mask = tokenized_test['input_ids'], tokenized_test['token_type_ids'], tokenized_test['attention_mask']
     test_input_ids = torch.tensor(test_input_ids)
     test_attention_mask = torch.tensor(test_attention_mask)
+    batch_size = 12 #
     test_data = TensorDataset(test_input_ids, test_attention_mask, test_token_type_ids)
     test_sampler = SequentialSampler(test_data)