Rcarvalo committed on
Commit
f19a99a
·
verified ·
1 Parent(s): c6e64ec

Update tasks/text.py

Browse files
Files changed (1) hide show
  1. tasks/text.py +43 -20
tasks/text.py CHANGED
@@ -3,15 +3,20 @@ from datetime import datetime
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
5
  import random
6
- import torch
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
- from transformers import AutoTokenizer,BertForSequenceClassification,AutoModelForSequenceClassification,Trainer, TrainingArguments,DataCollatorWithPadding
 
 
 
 
10
  import numpy as np
11
 
 
12
  router = APIRouter()
13
 
14
- DESCRIPTION = "ModernBert Baseline"
15
  ROUTE = "/text"
16
 
17
  @router.post(ROUTE, tags=["Text Task"],
@@ -46,8 +51,8 @@ async def evaluate_text(request: TextEvaluationRequest):
46
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
47
 
48
  # Split dataset
49
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
50
- test_dataset = train_test["test"]
51
 
52
  # Start tracking emissions
53
  tracker.start()
@@ -61,32 +66,50 @@ async def evaluate_text(request: TextEvaluationRequest):
61
  # Make random predictions (placeholder for actual model inference)
62
  true_labels = test_dataset["label"]
63
  predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
 
 
64
 
65
- #--------------------------------------------------------------------------------------------
66
- # YOUR MODEL INFERENCE STOPS HERE
67
- #--------------------------------------------------------------------------------------------
68
- ## Model loading
69
- model = AutoModelForSequenceClassification.from_pretrained("Rcarvalo/test_modernbert_finetuned_v2")
70
- tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
 
 
 
 
 
71
 
72
- ## Data prep
 
 
 
73
  def preprocess_function(df):
74
  return tokenizer(df["quote"], truncation=True)
75
  tokenized_test = test_dataset.map(preprocess_function, batched=True)
76
-
77
- ## Modify inference model
78
- training_args = torch.load("./tasks/utils/training_args.bin")
79
- training_args.eval_strategy='no'
80
-
81
  trainer = Trainer(
82
  model=model,
83
- args=training_args,
84
  tokenizer=tokenizer
85
  )
86
-
87
- ## prediction
88
  preds = trainer.predict(tokenized_test)
 
 
 
 
 
 
89
  predictions = np.array([np.argmax(x) for x in preds[0]])
 
 
 
 
 
90
 
91
  # Stop tracking emissions
92
  emissions_data = tracker.stop_task()
 
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
5
  import random
6
+
7
  from .utils.evaluation import TextEvaluationRequest
8
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
9
+
10
+ from peft import PeftModel
11
+ from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer, TrainingArguments,DataCollatorWithPadding, BitsAndBytesConfig
12
+ from datasets import Dataset
13
+ import torch
14
  import numpy as np
15
 
16
+
17
  router = APIRouter()
18
 
19
+ DESCRIPTION = "qwen_finetuned"
20
  ROUTE = "/text"
21
 
22
  @router.post(ROUTE, tags=["Text Task"],
 
51
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
52
 
53
  # Split dataset
54
+ train_test = dataset["train"]
55
+ test_dataset = dataset["test"]
56
 
57
  # Start tracking emissions
58
  tracker.start()
 
66
  # Make random predictions (placeholder for actual model inference)
67
  true_labels = test_dataset["label"]
68
  predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
69
+ path_adapter = 'MatthiasPicard/Qwen3B_model_test'
70
+ path_model = "Qwen/Qwen2.5-3B-Instruct"
71
 
72
+ bnb_config = BitsAndBytesConfig(
73
+ load_in_8bit=True
74
+ )
75
+
76
+ base_model = AutoModelForSequenceClassification.from_pretrained(
77
+ path_model,
78
+ num_labels=len(LABEL_MAPPING),
79
+ device_map="auto",
80
+ torch_dtype=torch.bfloat16,
81
+ quantization_config=bnb_config
82
+ )
83
 
84
+ model = PeftModel.from_pretrained(base_model, path_adapter)
85
+ model.eval()
86
+ tokenizer = AutoTokenizer.from_pretrained(path_model)
87
+
88
  def preprocess_function(df):
89
  return tokenizer(df["quote"], truncation=True)
90
  tokenized_test = test_dataset.map(preprocess_function, batched=True)
91
+
92
+ # training_args = torch.load("training_args.bin")
93
+ # training_args.eval_strategy='no'
94
+
 
95
  trainer = Trainer(
96
  model=model,
 
97
  tokenizer=tokenizer
98
  )
99
+
 
100
  preds = trainer.predict(tokenized_test)
101
+
102
+
103
+
104
+ # Run inference
105
+ # predictions = predict(tokenized_test)
106
+ # print(predictions)
107
  predictions = np.array([np.argmax(x) for x in preds[0]])
108
+
109
+ #--------------------------------------------------------------------------------------------
110
+ # YOUR MODEL INFERENCE STOPS HERE
111
+ #--------------------------------------------------------------------------------------------
112
+
113
 
114
  # Stop tracking emissions
115
  emissions_data = tracker.stop_task()