frugal-ai-submission-template

Sleeping

App Files Community

Tonic committed on Feb 10

Commit

1f08781

unverified ·

1 Parent(s): 6af9c73

fix model loading

Browse files

Files changed (1) hide show

tasks/text.py +47 -88

tasks/text.py CHANGED Viewed

@@ -8,6 +8,8 @@ from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from huggingface_hub import login
 from dotenv import load_dotenv
@@ -18,45 +20,37 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
 load_dotenv()
 # Authenticate with Hugging Face
-HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
-# Disable torch compile
-os.environ["TORCH_COMPILE_DISABLE"] = "1"
 router = APIRouter()
-DESCRIPTION = "Climate Guard Toxic Agent is a ModernBERT fine-tuned for climate disinformation detection"
 ROUTE = "/text"
 MODEL_NAME = "Tonic/climate-guard-toxic-agent"
 class TextClassifier:
     def __init__(self):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         max_retries = 3
         for attempt in range(max_retries):
             try:
                 # Initialize tokenizer
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    MODEL_NAME,
-                    model_max_length=512,
-                    padding_side='right',
-                    truncation_side='right'
-                )
-                # Initialize model with basic configuration
                 self.model = AutoModelForSequenceClassification.from_pretrained(
                     MODEL_NAME,
                     num_labels=8,
-                    problem_type="single_label_classification",
-                    ignore_mismatched_sizes=True,
                     trust_remote_code=True
-                )
-                # Move model to device
-                self.model = self.model.to(self.device)
                 print("Model initialized successfully")
                 break
@@ -67,34 +61,32 @@ class TextClassifier:
                 print(f"Attempt {attempt + 1} failed, retrying... Error: {str(e)}")
                 time.sleep(1)
-    def process_batch(self, batch: List[str], batch_idx: int) -> Tuple[List[int], int]:
         """Process a batch of texts and return their predictions"""
         try:
-            print(f"Processing batch {batch_idx} with {len(batch)} items")
-            # Tokenize texts
             inputs = self.tokenizer(
-                batch,
                 padding=True,
                 truncation=True,
-                max_length=512,
                 return_tensors="pt"
-            ).to(self.device)
             # Get predictions
             with torch.no_grad():
                 outputs = self.model(**inputs)
-                predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
-            print(f"Completed batch {batch_idx} with {len(predictions)} predictions")
-            return predictions.tolist(), batch_idx
         except Exception as e:
-            print(f"Error in batch {batch_idx}: {str(e)}")
-            return [0] * len(batch), batch_idx
     def __del__(self):
-        # Clean up CUDA memory
         if hasattr(self, 'model'):
             del self.model
         if torch.cuda.is_available():
@@ -104,10 +96,8 @@ class TextClassifier:
 async def evaluate_text(request: TextEvaluationRequest):
     """Evaluate text classification for climate disinformation detection."""
-    # Get space info
     username, space_url = get_space_info()
-    # Define the label mapping
     LABEL_MAPPING = {
         "0_not_relevant": 0,
         "1_not_happening": 1,
@@ -120,76 +110,46 @@ async def evaluate_text(request: TextEvaluationRequest):
     }
     try:
-        # Load and prepare the dataset
-        dataset = load_dataset("QuotaClimat/frugalaichallenge-text-train", token=HF_TOKEN)
-        # Convert string labels to integers
-        def convert_label(example):
-            try:
-                return {"label": LABEL_MAPPING[example["label"]]}
-            except KeyError:
-                print(f"Warning: Unknown label {example['label']}")
-                return {"label": 0}
-        dataset = dataset.map(convert_label)
-        # Get test dataset
         test_dataset = dataset["test"]
         # Start tracking emissions
         tracker.start()
         tracker.start_task("inference")
         true_labels = test_dataset["label"]
-        # Initialize the model once
         classifier = TextClassifier()
-        # Prepare batches
-        batch_size = 16  # Reduced batch size for better stability
-        quotes = test_dataset["quote"]
-        num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
-        batches = [
-            quotes[i * batch_size:(i + 1) * batch_size]
-            for i in range(num_batches)
-        ]
-        # Initialize batch_results
-        batch_results = [[] for _ in range(num_batches)]
-        # Process batches in parallel
-        max_workers = min(os.cpu_count(), 4)
-        print(f"Processing with {max_workers} workers")
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_batch = {
-                executor.submit(classifier.process_batch, batch, idx): idx
-                for idx, batch in enumerate(batches)
-            }
-            for future in future_to_batch:
-                batch_idx = future_to_batch[future]
-                try:
-                    predictions, idx = future.result()
-                    if predictions:
-                        batch_results[idx] = predictions
-                        print(f"Stored results for batch {idx} ({len(predictions)} predictions)")
-                except Exception as e:
-                    print(f"Failed to get results for batch {batch_idx}: {e}")
-                    batch_results[batch_idx] = [0] * len(batches[batch_idx])
-        # Flatten predictions
-        predictions = []
-        for batch_preds in batch_results:
-            if batch_preds is not None:
-                predictions.extend(batch_preds)
         # Stop tracking emissions
         emissions_data = tracker.stop_task()
         # Calculate accuracy
-        accuracy = accuracy_score(true_labels, predictions)
-        print("accuracy:", accuracy)
         # Prepare results
         results = {
@@ -209,7 +169,6 @@ async def evaluate_text(request: TextEvaluationRequest):
             }
         }
-        print("results:", results)
         return results
     except Exception as e:

 from typing import List, Dict, Tuple
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from torch.utils.data import DataLoader
+from transformers import DataCollatorWithPadding
 from huggingface_hub import login
 from dotenv import load_dotenv
 load_dotenv()
 # Authenticate with Hugging Face
+HF_TOKEN = os.getenv('HF_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
 router = APIRouter()
+DESCRIPTION = "Climate Guard Toxic Agent is a ModernBERT for Climate Disinformation Detection"
 ROUTE = "/text"
 MODEL_NAME = "Tonic/climate-guard-toxic-agent"
+TOKENIZER_NAME = "answerdotai/ModernBERT-base"
 class TextClassifier:
     def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         max_retries = 3
         for attempt in range(max_retries):
             try:
                 # Initialize tokenizer
+                self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
+                # Initialize model
                 self.model = AutoModelForSequenceClassification.from_pretrained(
                     MODEL_NAME,
                     num_labels=8,
                     trust_remote_code=True
+                ).to(self.device)
+                # Convert to half precision
+                self.model = self.model.half()
+                self.model.eval()
                 print("Model initialized successfully")
                 break
                 print(f"Attempt {attempt + 1} failed, retrying... Error: {str(e)}")
                 time.sleep(1)
+    def process_batch(self, texts: List[str]) -> List[int]:
         """Process a batch of texts and return their predictions"""
         try:
+            # Tokenize
             inputs = self.tokenizer(
+                texts,
                 padding=True,
                 truncation=True,
                 return_tensors="pt"
+            )
+            # Move inputs to device
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
             # Get predictions
             with torch.no_grad():
                 outputs = self.model(**inputs)
+                predictions = torch.argmax(outputs.logits, dim=-1)
+            return predictions.cpu().numpy().tolist()
         except Exception as e:
+            print(f"Error in batch processing: {str(e)}")
+            return [0] * len(texts)
     def __del__(self):
         if hasattr(self, 'model'):
             del self.model
         if torch.cuda.is_available():
 async def evaluate_text(request: TextEvaluationRequest):
     """Evaluate text classification for climate disinformation detection."""
     username, space_url = get_space_info()
     LABEL_MAPPING = {
         "0_not_relevant": 0,
         "1_not_happening": 1,
     }
     try:
+        # Load dataset
+        dataset = load_dataset(request.dataset_name)
+        # Convert labels
+        dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
         test_dataset = dataset["test"]
         # Start tracking emissions
         tracker.start()
         tracker.start_task("inference")
+        # Get true labels
         true_labels = test_dataset["label"]
+        # Initialize model
         classifier = TextClassifier()
+        # Process in batches
+        batch_size = 16
+        data_collator = DataCollatorWithPadding(tokenizer=classifier.tokenizer)
+        # Create DataLoader
+        test_loader = DataLoader(
+            test_dataset,
+            batch_size=batch_size,
+            collate_fn=data_collator
+        )
+        # Get predictions
+        all_predictions = []
+        for batch in test_loader:
+            batch_texts = batch["quote"]
+            batch_preds = classifier.process_batch(batch_texts)
+            all_predictions.extend(batch_preds)
         # Stop tracking emissions
         emissions_data = tracker.stop_task()
         # Calculate accuracy
+        accuracy = accuracy_score(true_labels, all_predictions)
         # Prepare results
         results = {
             }
         }
         return results
     except Exception as e: