hbanduk committed (verified)
Commit 6e3a1c9 · Parent(s): 6842466

Update tasks/text.py

Files changed (1):
  1. tasks/text.py (+12 -58)
tasks/text.py CHANGED
@@ -71,69 +71,23 @@ async def evaluate_text(request: TextEvaluationRequest):
     MAX_LENGTH = 365
 
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)
-    #model.eval() # Set to evaluation mode
+    model.eval() # Set to evaluation mode
 
-    class QuotesDataset(Dataset):
-        def __init__(self, encodings, labels):
-            self.encodings = encodings
-            self.labels = labels
-
-        def __getitem__(self, idx):
-            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
-            return item
-
-        def __len__(self):
-            return len(self.labels)
-
-    def encode_data(tokenizer, texts, labels, max_length):
-        try:
-            if isinstance(texts, pd.Series):
-                texts = texts.tolist()
-            if isinstance(labels, pd.Series):
-                labels = labels.tolist()
-
-            encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
-            return QuotesDataset(encodings, labels)
-
-        except Exception as e:
-            print(f"Error during tokenization: {e}")
-            return None
-
-    val_dataset = encode_data(tokenizer, test_dataset['quote'], test_dataset['label'], MAX_LENGTH)
-    val_loader = DataLoader(val_dataset, batch_size= 16, shuffle=False)
-
 
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print(f"Using device: {device}")
-
-    def validate_model(model, val_loader, device):
-        model.eval()
-        predictions = []
-        with torch.no_grad():
-            for batch in val_loader:
-                batch = {k: v.to(device) for k, v in batch.items()}
-                outputs = model(**batch)
-                preds = torch.argmax(outputs.logits, dim=-1)
-                predictions.extend(preds.cpu().numpy())
-        return predictions
-
-
     # tokenize texts
-    #test_encodings = tokenizer(test_dataset["quote"], padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
-    #test_labels = torch.tensor(test_dataset["label"])
+    test_encodings = tokenizer(test_dataset["quote"], padding='max_length', truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
+    test_labels = torch.tensor(test_dataset["label"])
 
-    #test_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], test_labels)
-    #test_loader = DataLoader(test_dataset, batch_size=16)
+    # name the TensorDataset eval_dataset so test_dataset (and its "label" column) stays readable below
+    eval_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], test_labels)
+    test_loader = DataLoader(eval_dataset, batch_size=16)
 
-    #predictions = []
-    #with torch.no_grad():
-    #for batch in test_loader:
-    #    input_ids, attention_mask, labels = [x.to(device) for x in batch]
-    #    outputs = model(input_ids, attention_mask=attention_mask)
-    #    predictions = torch.argmax(outputs.logits, dim=1)
-
-    predictions = validate_model(model, val_loader, device)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    predictions = []
+    with torch.no_grad():
+        for batch in test_loader:
+            input_ids, attention_mask, labels = [x.to(device) for x in batch]
+            outputs = model(input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits, dim=1)
+            predictions.extend(preds.cpu().numpy())
+
     true_labels = test_dataset["label"]
 
     #--------------------------------------------------------------------------------------------
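
For reference, here is a minimal, self-contained sketch of the inference path this commit adopts: tokenize the whole test split up front, wrap the tensors in a TensorDataset, and classify with a plain torch.no_grad() loop, in place of the removed QuotesDataset/encode_data/validate_model machinery. The predict() wrapper and its model_repo parameter are illustrative assumptions, not part of tasks/text.py; the tokenizer settings (max_length=365, padding='max_length'), batch size 16, and loop body follow the diff.

# Sketch only: predict() and model_repo are hypothetical; tokenization
# settings, batch size, and the inference loop mirror the commit.
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MAX_LENGTH = 365  # same truncation length as tasks/text.py

def predict(model_repo: str, quotes: list[str]) -> list[int]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    model = AutoModelForSequenceClassification.from_pretrained(model_repo).to(device)
    model.eval()  # disable dropout so inference is deterministic

    # Tokenize the full set of texts up front, as the new code does
    enc = tokenizer(quotes, padding='max_length', truncation=True,
                    max_length=MAX_LENGTH, return_tensors='pt')
    loader = DataLoader(TensorDataset(enc["input_ids"], enc["attention_mask"]),
                        batch_size=16)

    predictions = []
    with torch.no_grad():  # no gradients needed for evaluation
        for input_ids, attention_mask in loader:
            outputs = model(input_ids.to(device),
                            attention_mask=attention_mask.to(device))
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
    return predictions

Uncommenting model.eval() is the substantive fix here: with dropout left active, the logits (and therefore the argmax predictions) vary from run to run.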