MatthiasPi committed on
Commit
e39064f
·
verified ·
1 Parent(s): 8eb4396

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +47 -14
tasks/text.py CHANGED
@@ -65,27 +65,60 @@ async def evaluate_text(request: TextEvaluationRequest):
65
  # Make random predictions (placeholder for actual model inference)
66
  true_labels = test_dataset["label"]
67
  # predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  path_model = 'MatthiasPi/modernbert_finetunedV1'
69
  path_tokenizer = "answerdotai/ModernBERT-base"
70
 
71
- model = AutoModelForSequenceClassification.from_pretrained(path_model)
 
72
  tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
73
 
 
74
  def preprocess_function(df):
75
- return tokenizer(df["quote"], truncation=True)
 
76
  tokenized_test = test_dataset.map(preprocess_function, batched=True)
77
-
78
- # training_args = torch.load("training_args.bin")
79
- # training_args.eval_strategy='no'
80
-
81
- trainer = Trainer(
82
- model=model,
83
- # args=training_args,
84
- tokenizer=tokenizer
85
- )
86
-
87
- preds = trainer.predict(tokenized_test)
88
- predictions = np.array([np.argmax(x) for x in preds[0]])
 
 
 
 
 
 
 
 
 
89
 
90
  #--------------------------------------------------------------------------------------------
91
  # YOUR MODEL INFERENCE STOPS HERE
 
65
  # Make random predictions (placeholder for actual model inference)
66
  true_labels = test_dataset["label"]
67
  # predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
68
+ # path_model = 'MatthiasPi/modernbert_finetunedV1'
69
+ # path_tokenizer = "answerdotai/ModernBERT-base"
70
+
71
+ # model = AutoModelForSequenceClassification.from_pretrained(path_model)
72
+ # tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
73
+
74
+ # def preprocess_function(df):
75
+ # return tokenizer(df["quote"], truncation=True)
76
+ # tokenized_test = test_dataset.map(preprocess_function, batched=True)
77
+
78
+ # # training_args = torch.load("training_args.bin")
79
+ # # training_args.eval_strategy='no'
80
+
81
+ # trainer = Trainer(
82
+ # model=model,
83
+ # # args=training_args,
84
+ # tokenizer=tokenizer
85
+ # )
86
+
87
+ # preds = trainer.predict(tokenized_test)
88
+
89
  path_model = 'MatthiasPi/modernbert_finetunedV1'
90
  path_tokenizer = "answerdotai/ModernBERT-base"
91
 
92
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
93
+ model = AutoModelForSequenceClassification.from_pretrained(path_model).to(device).eval()
94
  tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
95
 
96
+ # Use optimized tokenization
97
  def preprocess_function(df):
98
+ return tokenizer(df["quote"], truncation=True, padding="max_length")
99
+
100
  tokenized_test = test_dataset.map(preprocess_function, batched=True)
101
+
102
+ # Convert dataset to PyTorch tensors for efficient inference
103
+ def collate_fn(batch):
104
+ input_ids = torch.tensor([example["input_ids"] for example in batch]).to(device)
105
+ attention_mask = torch.tensor([example["attention_mask"] for example in batch]).to(device)
106
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
107
+
108
+ # Optimized inference function
109
+ def predict(dataset):
110
+ all_preds = []
111
+ with torch.no_grad(): # No gradient computation (saves energy)
112
+ for batch in torch.utils.data.DataLoader(dataset, batch_size=len(dataset), collate_fn=collate_fn):
113
+ outputs = model(**batch)
114
+ preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
115
+ all_preds.extend(preds)
116
+ return np.array(all_preds)
117
+
118
+ # Run inference
119
+ predictions = predict(tokenized_test)
120
+
121
+ # predictions = np.array([np.argmax(x) for x in preds[0]])
122
 
123
  #--------------------------------------------------------------------------------------------
124
  # YOUR MODEL INFERENCE STOPS HERE