evgeniiarazum committed on
Commit
58e2728
·
verified ·
1 Parent(s): d753f94

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +61 -62
tasks/text.py CHANGED
@@ -13,20 +13,28 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
13
 
14
  router = APIRouter()
15
 
16
- MODEL_TYPE = "baseline"
17
- DESCRIPTIONS = {
18
- "distilbert_frugalai": "distilbert tuned on frugal ai data",
19
- "modernbert_frugalai": "distilbert tuned on frugal ai data",
20
- "mpnet_frugalai": "mpnet tuned on frugal ai data",
21
-
22
- }
23
  ROUTE = "/text"
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  class TextDataset(Dataset):
27
- def __init__(self, texts, tokenizer, max_length=256):
28
  self.texts = texts
29
- self.encodings = tokenizer(
30
  texts,
31
  truncation=True,
32
  padding=True,
@@ -35,43 +43,38 @@ class TextDataset(Dataset):
35
  )
36
 
37
  def __getitem__(self, idx):
38
- item = {key: val[idx] for key, val in self.encodings.items()}
39
  return item
40
 
41
  def __len__(self) -> int:
42
  return len(self.texts)
 
43
 
44
 
45
- def baseline_model(dataset_length: int):
46
- # Make random predictions (placeholder for actual model inference)
47
- # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
48
-
49
- # My favorite baseline is the most common class.
50
- predictions = [0] * dataset_length
51
-
52
- return predictions
53
-
54
-
55
- def bert_model(test_dataset: dict, model_type: str):
56
  texts = test_dataset["quote"]
57
 
58
- model_repo = f"evgeniiarazum/{MODEL_TYPE}"
59
- print(f"Loading from model_repo: {model_repo}")
60
- config = AutoConfig.from_pretrained(model_repo)
61
- model = AutoModelForSequenceClassification.from_pretrained(model_repo)
62
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
63
 
64
- if torch.cuda.is_available():
65
- device = torch.device("cuda")
66
  else:
67
- device = torch.device("cpu")
68
- print("Using device:", device)
 
 
 
69
  model = model.to(device)
 
 
70
  dataset = TextDataset(texts, tokenizer=tokenizer)
71
- dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
 
72
  model.eval()
73
  with torch.no_grad():
74
- print("Starting model run.")
75
  predictions = np.array([])
76
  for batch in dataloader:
77
  test_input_ids = batch["input_ids"].to(device)
@@ -79,21 +82,18 @@ def bert_model(test_dataset: dict, model_type: str):
79
  outputs = model(test_input_ids, test_attention_mask)
80
  p = torch.argmax(outputs.logits, dim=1)
81
  predictions = np.append(predictions, p.cpu().numpy())
82
- print("End of model run.")
83
-
 
84
  return predictions
85
 
86
 
87
  @router.post(ROUTE, tags=["Text Task"])
88
- async def evaluate_text(
89
- request: TextEvaluationRequest,
90
- model_type: str = MODEL_TYPE,
91
- # This should be an API query parameter, but it looks like the submission repo
92
- # https://huggingface.co/spaces/frugal-ai-challenge/submission-portal
93
- # is built in a way to not accept any other endpoints or parameters.
94
- ):
95
  """
96
  Evaluate text classification for climate disinformation detection.
 
97
  Current Model: Random Baseline
98
  - Makes random predictions from the label space (0-7)
99
  - Used as a baseline for comparison
@@ -110,7 +110,7 @@ async def evaluate_text(
110
  "4_solutions_harmful_unnecessary": 4,
111
  "5_science_unreliable": 5,
112
  "6_proponents_biased": 6,
113
- "7_fossil_fuels_needed": 7,
114
  }
115
 
116
  # Load and prepare the dataset
@@ -120,44 +120,43 @@ async def evaluate_text(
120
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
121
 
122
  # Split dataset
123
- train_test = dataset["train"].train_test_split(
124
- test_size=request.test_size, seed=request.test_seed
125
- )
126
  test_dataset = train_test["test"]
127
-
128
  # Start tracking emissions
129
  tracker.start()
130
  tracker.start_task("inference")
131
 
132
- # --------------------------------------------------------------------------------------------
133
  # YOUR MODEL INFERENCE CODE HERE
134
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
135
- # --------------------------------------------------------------------------------------------
136
-
 
137
  true_labels = test_dataset["label"]
138
- if model_type == "baseline":
 
139
  predictions = baseline_model(len(true_labels))
140
- elif model_type in ["distilbert_frugalai", "modernbert_frugalai", "mpnet_frugalai"]:
141
- predictions = bert_model(test_dataset, model_type)
142
- else:
143
- raise ValueError(model_type)
144
 
145
- # --------------------------------------------------------------------------------------------
146
  # YOUR MODEL INFERENCE STOPS HERE
147
- # --------------------------------------------------------------------------------------------
148
 
 
149
  # Stop tracking emissions
150
  emissions_data = tracker.stop_task()
151
-
152
  # Calculate accuracy
153
  accuracy = accuracy_score(true_labels, predictions)
154
-
155
  # Prepare results dictionary
156
  results = {
157
  "username": username,
158
  "space_url": space_url,
159
  "submission_timestamp": datetime.now().isoformat(),
160
- "model_description": DESCRIPTIONS[model_type],
161
  "accuracy": float(accuracy),
162
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
163
  "emissions_gco2eq": emissions_data.emissions * 1000,
@@ -166,8 +165,8 @@ async def evaluate_text(
166
  "dataset_config": {
167
  "dataset_name": request.dataset_name,
168
  "test_size": request.test_size,
169
- "test_seed": request.test_seed,
170
- },
171
  }
172
-
173
- return results
 
13
 
14
  router = APIRouter()
15
 
16
+ DESCRIPTION = "Random Baseline"
 
 
 
 
 
 
17
  ROUTE = "/text"
18
 
19
# Human-readable description for each model name the /text endpoint can be
# called with. The selected entry is reported as "model_description" in the
# evaluation results via a plain dict lookup, so every name that can reach
# inference needs a key here. The *_frugalai checkpoints accepted by
# bert_classifier (including the endpoint's default, "distilbert_frugalai")
# were previously missing, which made that lookup fail with KeyError.
models_descriptions = {
    "baseline": "random baseline",  # Baseline
    "tfidf_xgb": "TF-IDF vectorizer and XGBoost classifier",  # Submitted
    "bert_base_pruned": "Pruned BERT base model",  # Submitted
    "climate_bert_pruned": "Fine-tuned and pruned DistilRoBERTa pre-trained on climate texts",  # Not working
    "sbert_distilroberta": "Fine-tuned sentence transformer DistilRoBERTa",
    # Fine-tuned checkpoints that bert_classifier is able to load.
    "distilbert_frugalai": "DistilBERT fine-tuned on Frugal AI data",
    "deberta_frugalai": "DeBERTa fine-tuned on Frugal AI data",
    "modernbert_frugalai": "ModernBERT fine-tuned on Frugal AI data",
    "distilroberta_frugalai": "DistilRoBERTa fine-tuned on Frugal AI data",
}
26
+
27
+
28
def baseline_model(dataset_length: int, num_labels: int = 8):
    """Return uniformly random label predictions, one per sample.

    This is a placeholder for real model inference: it ignores the input
    texts entirely and only needs to know how many predictions to produce.

    Args:
        dataset_length: Number of predictions to generate.
        num_labels: Size of the label space; labels are drawn from
            ``0 .. num_labels - 1``. Defaults to 8, i.e. labels 0-7.

    Returns:
        A list of ``dataset_length`` integer labels.

    Raises:
        ValueError: If ``num_labels`` is smaller than 1.
    """
    if num_labels < 1:
        raise ValueError(f"num_labels must be at least 1, got {num_labels}")

    # random.randint is inclusive on both ends, hence the "- 1".
    return [random.randint(0, num_labels - 1) for _ in range(dataset_length)]
33
 
34
  class TextDataset(Dataset):
35
+ def __init__(self, texts, tokenizer, max_length=512):
36
  self.texts = texts
37
+ self.tokenized_texts = tokenizer(
38
  texts,
39
  truncation=True,
40
  padding=True,
 
43
  )
44
 
45
  def __getitem__(self, idx):
46
+ item = {key: val[idx] for key, val in self.tokenized_texts.items()}
47
  return item
48
 
49
  def __len__(self) -> int:
50
  return len(self.texts)
51
+
52
 
53
 
54
+ def bert_classifier(test_dataset: dict, model: str):
55
+ print("Starting BERT model run")
 
 
 
 
 
 
 
 
 
56
  texts = test_dataset["quote"]
57
 
58
+ model_repo = f"evgeniiarazum/{model}"
59
+
 
 
60
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
61
 
62
+ if model in ["distilbert_frugalai", "deberta_frugalai", "modernbert_frugalai", "distilroberta_frugalai"]:
63
+ model = AutoModelForSequenceClassification.from_pretrained(model_repo)
64
  else:
65
+ raise(ValueError)
66
+
67
+ # Use CUDA if available
68
+ device, _, _ = get_backend()
69
+
70
  model = model.to(device)
71
+
72
+ # Prepare dataset
73
  dataset = TextDataset(texts, tokenizer=tokenizer)
74
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
75
+
76
  model.eval()
77
  with torch.no_grad():
 
78
  predictions = np.array([])
79
  for batch in dataloader:
80
  test_input_ids = batch["input_ids"].to(device)
 
82
  outputs = model(test_input_ids, test_attention_mask)
83
  p = torch.argmax(outputs.logits, dim=1)
84
  predictions = np.append(predictions, p.cpu().numpy())
85
+
86
+ print("Finished BERT model run")
87
+
88
  return predictions
89
 
90
 
91
  @router.post(ROUTE, tags=["Text Task"])
92
+ async def evaluate_text(request: TextEvaluationRequest,
93
+ model: str = "distilbert_frugalai"):
 
 
 
 
 
94
  """
95
  Evaluate text classification for climate disinformation detection.
96
+
97
  Current Model: Random Baseline
98
  - Makes random predictions from the label space (0-7)
99
  - Used as a baseline for comparison
 
110
  "4_solutions_harmful_unnecessary": 4,
111
  "5_science_unreliable": 5,
112
  "6_proponents_biased": 6,
113
+ "7_fossil_fuels_needed": 7
114
  }
115
 
116
  # Load and prepare the dataset
 
120
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
121
 
122
  # Split dataset
123
+ train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
 
 
124
  test_dataset = train_test["test"]
125
+
126
  # Start tracking emissions
127
  tracker.start()
128
  tracker.start_task("inference")
129
 
130
+ #--------------------------------------------------------------------------------------------
131
  # YOUR MODEL INFERENCE CODE HERE
132
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
133
+ #--------------------------------------------------------------------------------------------
134
+
135
+ # Make random predictions (placeholder for actual model inference)
136
  true_labels = test_dataset["label"]
137
+
138
+ if model == "baseline":
139
  predictions = baseline_model(len(true_labels))
140
+ elif 'bert' in model:
141
+ predictions = bert_classifier(test_dataset, model)
 
 
142
 
143
+ #--------------------------------------------------------------------------------------------
144
  # YOUR MODEL INFERENCE STOPS HERE
145
+ #--------------------------------------------------------------------------------------------
146
 
147
+
148
  # Stop tracking emissions
149
  emissions_data = tracker.stop_task()
150
+
151
  # Calculate accuracy
152
  accuracy = accuracy_score(true_labels, predictions)
153
+
154
  # Prepare results dictionary
155
  results = {
156
  "username": username,
157
  "space_url": space_url,
158
  "submission_timestamp": datetime.now().isoformat(),
159
+ "model_description": models_descriptions[model],
160
  "accuracy": float(accuracy),
161
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
162
  "emissions_gco2eq": emissions_data.emissions * 1000,
 
165
  "dataset_config": {
166
  "dataset_name": request.dataset_name,
167
  "test_size": request.test_size,
168
+ "test_seed": request.test_seed
169
+ }
170
  }
171
+
172
+ return results