frugal-ai-submission-template

Sleeping

App Files Files Community

Tonic commited on Feb 10

Commit

45a2367

unverified ·

1 Parent(s): a036e74

improve text classifier

Browse files

Files changed (1) hide show

tasks/text.py +46 -23

tasks/text.py CHANGED Viewed

@@ -7,8 +7,7 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
-import torch.nn as nn
-from transformers import AutoTokenizer, pipeline
 from huggingface_hub import login
 from dotenv import load_dotenv
@@ -28,34 +27,44 @@ os.environ["TORCH_COMPILE_DISABLE"] = "1"
 router = APIRouter()
-DESCRIPTION = "Climate Guard Toxic Agent model for climate disinformation detection"
 ROUTE = "/text"
 class TextClassifier:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         max_retries = 3
-        model_name = "Tonic/climate-guard-toxic-agent"
         for attempt in range(max_retries):
             try:
-                # Initialize tokenizer first
                 self.tokenizer = AutoTokenizer.from_pretrained(
-                    model_name,
-                    model_max_length=512,  # Reduced from 8192
                     padding_side='right',
                     truncation_side='right'
                 )
-                # Use pipeline for simpler initialization
                 self.classifier = pipeline(
                     "text-classification",
-                    model=model_name,
                     tokenizer=self.tokenizer,
                     device=self.device,
                     max_length=512,
                     truncation=True,
-                    batch_size=32
                 )
                 print("Model initialized successfully")
@@ -69,22 +78,36 @@ class TextClassifier:
     def process_batch(self, batch: List[str], batch_idx: int) -> Tuple[List[int], int]:
         """Process a batch of texts and return their predictions"""
-        try:
-            print(f"Processing batch {batch_idx} with {len(batch)} items")
-            # Use pipeline for prediction
-            outputs = self.classifier(batch)
-            predictions = [int(output['label'].split('_')[0]) for output in outputs]
-            print(f"Completed batch {batch_idx} with {len(predictions)} predictions")
-            return predictions, batch_idx
-        except Exception as e:
-            print(f"Error in batch {batch_idx}: {str(e)}")
-            return [0] * len(batch), batch_idx
     def __del__(self):
         # Clean up CUDA memory
         if hasattr(self, 'classifier'):
             del self.classifier
         if torch.cuda.is_available():

 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 from huggingface_hub import login
 from dotenv import load_dotenv
 router = APIRouter()
+DESCRIPTION = "ModernBERT fine-tuned for climate disinformation detection"
 ROUTE = "/text"
+MODEL_NAME = "answerdotai/ModernBERT-base"
 class TextClassifier:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         max_retries = 3
         for attempt in range(max_retries):
             try:
+                # Initialize tokenizer
                 self.tokenizer = AutoTokenizer.from_pretrained(
+                    MODEL_NAME,
+                    model_max_length=512,
                     padding_side='right',
                     truncation_side='right'
                 )
+                # Initialize model with specific configuration
+                self.model = AutoModelForSequenceClassification.from_pretrained(
+                    MODEL_NAME,
+                    num_labels=8,
+                    problem_type="single_label_classification"
+                )
+                # Move model to appropriate device
+                self.model = self.model.to(self.device)
+                # Initialize pipeline with the model and tokenizer
                 self.classifier = pipeline(
                     "text-classification",
+                    model=self.model,
                     tokenizer=self.tokenizer,
                     device=self.device,
                     max_length=512,
                     truncation=True,
+                    batch_size=16
                 )
                 print("Model initialized successfully")
     def process_batch(self, batch: List[str], batch_idx: int) -> Tuple[List[int], int]:
         """Process a batch of texts and return their predictions"""
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                print(f"Processing batch {batch_idx} with {len(batch)} items")
+                # Process texts with error handling
+                predictions = []
+                for text in batch:
+                    try:
+                        result = self.classifier(text)
+                        pred_label = int(result[0]['label'].split('_')[0])
+                        predictions.append(pred_label)
+                    except Exception as e:
+                        print(f"Error processing text in batch {batch_idx}: {str(e)}")
+                        predictions.append(0)  # Default prediction
+                print(f"Completed batch {batch_idx} with {len(predictions)} predictions")
+                return predictions, batch_idx
+            except Exception as e:
+                if attempt == max_retries - 1:
+                    print(f"Final error in batch {batch_idx}: {str(e)}")
+                    return [0] * len(batch), batch_idx
+                print(f"Error in batch {batch_idx} (attempt {attempt + 1}): {str(e)}")
+                time.sleep(1)
     def __del__(self):
         # Clean up CUDA memory
+        if hasattr(self, 'model'):
+            del self.model
         if hasattr(self, 'classifier'):
             del self.classifier
         if torch.cuda.is_available():