from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info

router = APIRouter()
DESCRIPTION = "modernBERT_finetuned"
ROUTE = "/text"
@router.post(ROUTE, tags=["Text Task"],
description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
"""
Evaluate text classification for climate disinformation detection.
Current Model: Random Baseline
- Makes random predictions from the label space (0-7)
- Used as a baseline for comparison
"""
# Get space info
username, space_url = get_space_info()
# Define the label mapping
LABEL_MAPPING = {
"0_not_relevant": 0,
"1_not_happening": 1,
"2_not_human": 2,
"3_not_bad": 3,
"4_solutions_harmful_unnecessary": 4,
"5_science_unreliable": 5,
"6_proponents_biased": 6,
"7_fossil_fuels_needed": 7
}
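    # Sketch (unused below): the inverse mapping is handy when turning integer
    # predictions back into human-readable label names for debugging.
    # ID_TO_LABEL = {v: k for k, v in LABEL_MAPPING.items()}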
# Load and prepare the dataset
dataset = load_dataset(request.dataset_name)
# Convert string labels to integers
dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
    # Only the test split is needed for evaluation; the train split is left unused
    test_dataset = dataset["test"]
# Start tracking emissions
tracker.start()
tracker.start_task("inference")
    #--------------------------------------------------------------------------------------------
    # YOUR MODEL INFERENCE CODE HERE
    # The fine-tuned ModernBERT inference below replaces the template's random baseline,
    # inside the pass where energy consumption and emissions are tracked.
    #--------------------------------------------------------------------------------------------
    true_labels = test_dataset["label"]
    # Template baseline (random predictions), superseded by the model inference below:
    # predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
    # Load the fine-tuned ModernBERT checkpoint and its base tokenizer
    path_model = 'MatthiasPicard/checkpoint4200_batch16_modern_bert_valloss_0.79_0.74acc'
    path_tokenizer = "answerdotai/ModernBERT-base"
    model = AutoModelForSequenceClassification.from_pretrained(path_model)
    tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)

    def preprocess_function(df):
        # Truncate to the model's max length; pad each mapped batch to its longest sequence
        return tokenizer(df["quote"], truncation=True, padding="longest")

    tokenized_test = test_dataset.map(preprocess_function, batched=True)
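    # Alternative (sketch): tokenize without padding and let the Trainer pad each
    # batch dynamically (it defaults to DataCollatorWithPadding when a tokenizer
    # is supplied), avoiding wasted compute on short quotes.
    # def preprocess_function(df):
    #     return tokenizer(df["quote"], truncation=True)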
    # training_args = torch.load("training_args.bin")
    # training_args.eval_strategy = 'no'
    # Inference-only Trainer: a small eval batch size keeps memory usage modest
    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir="tmp_trainer", per_device_eval_batch_size=4, report_to="none"),
        tokenizer=tokenizer,
    )
preds = trainer.predict(tokenized_test)
    # ------------------------------------------------------------------
    # Alternative inference path, kept for reference (not executed):
    # manual batched fp16 inference without the Trainer.
    # ------------------------------------------------------------------
    # path_model = 'MatthiasPi/modernbert_finetunedV1'
    # path_tokenizer = "answerdotai/ModernBERT-base"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained(path_model).to(device).eval()
# tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
# model.half()
# # Use optimized tokenization
# def preprocess_function(df):
# return tokenizer(df["quote"], truncation=True, padding="max_length")
# tokenized_test = test_dataset.map(preprocess_function, batched=True)
# # Convert dataset to PyTorch tensors for efficient inference
# def collate_fn(batch):
# input_ids = torch.tensor([example["input_ids"] for example in batch]).to(device)
# attention_mask = torch.tensor([example["attention_mask"] for example in batch]).to(device)
# return {"input_ids": input_ids, "attention_mask": attention_mask}
# Optimized inference function
# def predict(dataset, batch_size=16):
# all_preds = []
# with torch.no_grad(): # No gradient computation (saves energy)
# for batch in torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn):
# outputs = model(**batch)
# preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
# all_preds.extend(preds)
# return np.array(all_preds)
# Run inference
# predictions = predict(tokenized_test)
# print(predictions)
    # Trainer.predict returns a PredictionOutput whose `.predictions` field holds
    # the logits; take the argmax over the class dimension for hard labels
    predictions = np.argmax(preds.predictions, axis=-1)
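    # Optional (sketch): numerically stable softmax over the logits, e.g. to log
    # per-class confidence next to the hard argmax predictions.
    # logits = preds.predictions
    # probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    # probs /= probs.sum(axis=-1, keepdims=True)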
#--------------------------------------------------------------------------------------------
# YOUR MODEL INFERENCE STOPS HERE
#--------------------------------------------------------------------------------------------
# Stop tracking emissions
emissions_data = tracker.stop_task()
# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
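    # Optional (sketch): for an 8-way task a per-class breakdown is often more
    # informative than overall accuracy.
    # from sklearn.metrics import classification_report
    # print(classification_report(true_labels, predictions))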
# Prepare results dictionary
results = {
"username": username,
"space_url": space_url,
"submission_timestamp": datetime.now().isoformat(),
"model_description": DESCRIPTION,
"accuracy": float(accuracy),
"energy_consumed_wh": emissions_data.energy_consumed * 1000,
"emissions_gco2eq": emissions_data.emissions * 1000,
"emissions_data": clean_emissions_data(emissions_data),
"api_route": ROUTE,
"dataset_config": {
"dataset_name": request.dataset_name,
"test_size": request.test_size,
"test_seed": request.test_seed
}
}
    return results
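
# ---------------------------------------------------------------------------
# Local smoke test (sketch, not part of the deployed route). The request fields
# mirror the dataset_config echoed in `results`; the dataset name below is a
# hypothetical placeholder, not the actual challenge dataset id.
# ---------------------------------------------------------------------------
# if __name__ == "__main__":
#     import asyncio
#     sample_request = TextEvaluationRequest(
#         dataset_name="your-org/your-text-dataset",  # hypothetical placeholder
#         test_size=0.2,  # assumed field, echoed in the results payload above
#         test_seed=42,   # assumed field, echoed in the results payload above
#     )
#     print(asyncio.run(evaluate_text(sample_request)))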