hbanduk committed on
Commit 4a96b36 · verified · 1 Parent(s): 843b402

Update tasks/text.py

Files changed (1):
  1. tasks/text.py +60 -12
tasks/text.py CHANGED
@@ -62,32 +62,80 @@ async def evaluate_text(request: TextEvaluationRequest):
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load model and tokenizer from Hugging Face Hub
MODEL_REPO = "ClimateDebunk/FineTunedDistilBert4SeqClass"

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)

- model.eval()  # Set to evaluation mode

- def preprocess(texts):
-     """Tokenize text inputs for DistilBERT."""
-     return tokenizer(texts, padding='max_length', truncation=True, max_length=365, return_tensors="pt")

- def predict(texts):
-     """Run inference using the fine-tuned DistilBERT model."""
-     inputs = preprocess(texts)
      with torch.no_grad():
-         outputs = model(**inputs)
-         predictions = torch.argmax(outputs.logits, dim=1).tolist()
      return predictions

- # Run inference
- texts = test_dataset["quote"]
- predictions = predict(texts)

true_labels = test_dataset["label"]
#--------------------------------------------------------------------------------------------
# YOUR MODEL INFERENCE STOPS HERE
#--------------------------------------------------------------------------------------------
 
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
+ from torch.utils.data import DataLoader, Dataset
+ import pandas as pd

# Load model and tokenizer from Hugging Face Hub
MODEL_REPO = "ClimateDebunk/FineTunedDistilBert4SeqClass"

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
+ MAX_LENGTH = 365
+ BATCH_SIZE = 16
+
model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)

+ class QuotesDataset(Dataset):
+     """Pairs tokenizer encodings with labels so a DataLoader can batch them."""
+     def __init__(self, encodings, labels):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __getitem__(self, idx):
+         # Encodings are already tensors (return_tensors='pt'), so index directly.
+         item = {key: val[idx] for key, val in self.encodings.items()}
+         item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
+         return item
+
+     def __len__(self):
+         return len(self.labels)
+
+ def encode_data(tokenizer, texts, labels, max_length):
+     """Tokenize the texts and wrap them, with their labels, in a QuotesDataset."""
+     try:
+         if isinstance(texts, pd.Series):
+             texts = texts.tolist()
+         if isinstance(labels, pd.Series):
+             labels = labels.tolist()
+
+         encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
+         return QuotesDataset(encodings, labels)
+
+     except Exception as e:
+         print(f"Error during tokenization: {e}")
+         return None
+
+ val_dataset = encode_data(tokenizer, test_dataset['quote'], test_dataset['label'], MAX_LENGTH)
+ val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+ print(f"Using device: {device}")

+ def validate_model(model, val_loader, device):
+     """Run batched inference and return the argmax prediction for every example."""
+     model.eval()  # Set to evaluation mode
+     predictions = []
      with torch.no_grad():
+         for batch in val_loader:
+             batch = {k: v.to(device) for k, v in batch.items()}
+             outputs = model(**batch)
+             preds = torch.argmax(outputs.logits, dim=-1)
+             predictions.extend(preds.cpu().numpy())
      return predictions

+ predictions = validate_model(model, val_loader, device)
true_labels = test_dataset["label"]
+
#--------------------------------------------------------------------------------------------
# YOUR MODEL INFERENCE STOPS HERE
#--------------------------------------------------------------------------------------------
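
Note: below is a minimal, self-contained sketch of the batched inference path this commit introduces, runnable outside the evaluation harness. The two quotes and their labels are invented placeholders for test_dataset["quote"] and test_dataset["label"], which the harness normally supplies; MODEL_REPO, the tokenizer settings, and the batching pattern are taken from the diff above.

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_REPO = "ClimateDebunk/FineTunedDistilBert4SeqClass"  # model repo from the commit
MAX_LENGTH = 365
BATCH_SIZE = 16

class QuotesDataset(Dataset):
    """Same wrapper as in the diff: pairs tokenizer encodings with labels."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Encodings are already tensors (return_tensors='pt'), so index directly.
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def main():
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Invented stand-ins for test_dataset["quote"] / test_dataset["label"].
    quotes = ["The climate has always changed.", "CO2 levels are at record highs."]
    labels = [1, 0]

    encodings = tokenizer(quotes, truncation=True, padding="max_length",
                          max_length=MAX_LENGTH, return_tensors="pt")
    loader = DataLoader(QuotesDataset(encodings, labels),
                        batch_size=BATCH_SIZE, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())

    accuracy = sum(p == t for p, t in zip(predictions, labels)) / len(labels)
    print(f"predictions={predictions} accuracy={accuracy:.2f}")

if __name__ == "__main__":
    main()

Padding every example to MAX_LENGTH mirrors the commit; if sequence lengths vary widely, a dynamic-padding collator such as transformers.DataCollatorWithPadding would reduce per-batch compute.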