frugal-ai-submission-template

Sleeping

App Files Files Community

Tonic committed on Feb 10

Commit

bc4f464

verified ·

1 Parent(s): 21262c6

use pipeline

Browse files

Files changed (1) hide show

tasks/text.py +14 -36

tasks/text.py CHANGED Viewed

@@ -7,7 +7,7 @@ import os
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from huggingface_hub import login
 from dotenv import load_dotenv
@@ -18,7 +18,7 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info, star
 load_dotenv()
 # Authenticate with Hugging Face
-HF_TOKEN = os.getenv('HF_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
@@ -38,26 +38,13 @@ class TextClassifier:
         for attempt in range(max_retries):
             try:
-                # Load config first
-                config = AutoConfig.from_pretrained(model_name)
-                # Initialize tokenizer with specific model type
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    model_name,
-                    model_max_length=512,
-                    padding_side='right',
-                    truncation_side='right'
-                )
-                # Initialize model with config
-                self.model = AutoModelForSequenceClassification.from_pretrained(
-                    model_name,
-                    config=config,
-                    torch_dtype=torch.float32
                 )
-                self.model.to(self.device)
-                self.model.eval()
                 print("Model initialized successfully")
                 break
@@ -72,18 +59,9 @@ class TextClassifier:
         try:
             print(f"Processing batch {batch_idx} with {len(batch)} items")
-            # Process entire batch at once
-            inputs = self.tokenizer(
-                batch,
-                return_tensors="pt",
-                truncation=True,
-                max_length=512,
-                padding='max_length'
-            ).to(self.device)
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-                predictions = torch.argmax(outputs.logits, dim=-1).tolist()
             print(f"Completed batch {batch_idx} with {len(predictions)} predictions")
             return predictions, batch_idx
@@ -112,13 +90,13 @@ async def evaluate_text(request: TextEvaluationRequest):
     }
     try:
-        # Load and prepare the dataset using the correct dataset name
-        dataset = load_dataset("QuotaClimat/frugalaichallenge-text-train", use_auth_token=HF_TOKEN)
         # Convert string labels to integers
         dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
-        # Split dataset according to request parameters
         test_dataset = dataset["test"]
         # Start tracking emissions

 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
+from transformers import pipeline
 from huggingface_hub import login
 from dotenv import load_dotenv
 load_dotenv()
 # Authenticate with Hugging Face
+HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
         for attempt in range(max_retries):
             try:
+                # Initialize pipeline
+                self.classifier = pipeline(
+                    "text-classification",
+                    model=model_name,
+                    device=self.device,
+                    batch_size=32
                 )
                 print("Model initialized successfully")
                 break
         try:
             print(f"Processing batch {batch_idx} with {len(batch)} items")
+            # Use pipeline for prediction
+            results = self.classifier(batch)
+            predictions = [int(result['label'].split('_')[0]) for result in results]
             print(f"Completed batch {batch_idx} with {len(predictions)} predictions")
             return predictions, batch_idx
     }
     try:
+        # Load and prepare the dataset
+        dataset = load_dataset("QuotaClimat/frugalaichallenge-text-train", token=HF_TOKEN)
         # Convert string labels to integers
         dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
+        # Split dataset
         test_dataset = dataset["test"]
         # Start tracking emissions