MatthiasPi committed on
Commit
e62e3eb
·
verified ·
1 Parent(s): 1f45c21

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +42 -42
tasks/text.py CHANGED
@@ -64,63 +64,63 @@ async def evaluate_text(request: TextEvaluationRequest):
64
 
65
  # Make random predictions (placeholder for actual model inference)
66
  true_labels = test_dataset["label"]
67
- # predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
68
- # path_model = 'MatthiasPi/modernbert_finetunedV1'
69
- # path_tokenizer = "answerdotai/ModernBERT-base"
70
 
71
- # model = AutoModelForSequenceClassification.from_pretrained(path_model)
72
- # tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
73
 
74
- # def preprocess_function(df):
75
- # return tokenizer(df["quote"], truncation=True)
76
- # tokenized_test = test_dataset.map(preprocess_function, batched=True)
77
 
78
- # # training_args = torch.load("training_args.bin")
79
- # # training_args.eval_strategy='no'
80
 
81
- # trainer = Trainer(
82
- # model=model,
83
- # # args=training_args,
84
- # tokenizer=tokenizer
85
- # )
86
 
87
- # preds = trainer.predict(tokenized_test)
88
 
89
- path_model = 'MatthiasPi/modernbert_finetunedV1'
90
- path_tokenizer = "answerdotai/ModernBERT-base"
91
 
92
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
93
- model = AutoModelForSequenceClassification.from_pretrained(path_model).to(device).eval()
94
- tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
95
 
96
- model.half()
97
 
98
- # Use optimized tokenization
99
- def preprocess_function(df):
100
- return tokenizer(df["quote"], truncation=True, padding="max_length")
101
 
102
- tokenized_test = test_dataset.map(preprocess_function, batched=True)
103
 
104
- # Convert dataset to PyTorch tensors for efficient inference
105
- def collate_fn(batch):
106
- input_ids = torch.tensor([example["input_ids"] for example in batch]).to(device)
107
- attention_mask = torch.tensor([example["attention_mask"] for example in batch]).to(device)
108
- return {"input_ids": input_ids, "attention_mask": attention_mask}
109
 
110
  # Optimized inference function
111
- def predict(dataset, batch_size=16):
112
- all_preds = []
113
- with torch.no_grad(): # No gradient computation (saves energy)
114
- for batch in torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn):
115
- outputs = model(**batch)
116
- preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
117
- all_preds.extend(preds)
118
- return np.array(all_preds)
119
 
120
  # Run inference
121
- predictions = predict(tokenized_test)
122
- print(predictions)
123
- # predictions = np.array([np.argmax(x) for x in preds[0]])
124
 
125
  #--------------------------------------------------------------------------------------------
126
  # YOUR MODEL INFERENCE STOPS HERE
 
64
 
65
  # Make random predictions (placeholder for actual model inference)
66
  true_labels = test_dataset["label"]
67
+ predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
68
+ path_model = 'MatthiasPi/modernbert_finetunedV1'
69
+ path_tokenizer = "answerdotai/ModernBERT-base"
70
 
71
+ model = AutoModelForSequenceClassification.from_pretrained(path_model)
72
+ tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
73
 
74
+ def preprocess_function(df):
75
+ return tokenizer(df["quote"], truncation=True)
76
+ tokenized_test = test_dataset.map(preprocess_function, batched=True)
77
 
78
+ # training_args = torch.load("training_args.bin")
79
+ # training_args.eval_strategy='no'
80
 
81
+ trainer = Trainer(
82
+ model=model,
83
+ # args=training_args,
84
+ tokenizer=tokenizer
85
+ )
86
 
87
+ preds = trainer.predict(tokenized_test)
88
 
89
+ # path_model = 'MatthiasPi/modernbert_finetunedV1'
90
+ # path_tokenizer = "answerdotai/ModernBERT-base"
91
 
92
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
93
+ # model = AutoModelForSequenceClassification.from_pretrained(path_model).to(device).eval()
94
+ # tokenizer = AutoTokenizer.from_pretrained(path_tokenizer)
95
 
96
+ # model.half()
97
 
98
+ # # Use optimized tokenization
99
+ # def preprocess_function(df):
100
+ # return tokenizer(df["quote"], truncation=True, padding="max_length")
101
 
102
+ # tokenized_test = test_dataset.map(preprocess_function, batched=True)
103
 
104
+ # # Convert dataset to PyTorch tensors for efficient inference
105
+ # def collate_fn(batch):
106
+ # input_ids = torch.tensor([example["input_ids"] for example in batch]).to(device)
107
+ # attention_mask = torch.tensor([example["attention_mask"] for example in batch]).to(device)
108
+ # return {"input_ids": input_ids, "attention_mask": attention_mask}
109
 
110
  # Optimized inference function
111
+ # def predict(dataset, batch_size=16):
112
+ # all_preds = []
113
+ # with torch.no_grad(): # No gradient computation (saves energy)
114
+ # for batch in torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn):
115
+ # outputs = model(**batch)
116
+ # preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
117
+ # all_preds.extend(preds)
118
+ # return np.array(all_preds)
119
 
120
  # Run inference
121
+ # predictions = predict(tokenized_test)
122
+ # print(predictions)
123
+ predictions = np.array([np.argmax(x) for x in preds[0]])
124
 
125
  #--------------------------------------------------------------------------------------------
126
  # YOUR MODEL INFERENCE STOPS HERE