fix dataset loading

tasks/text.py  (+84 −53)
@@ -123,59 +123,90 @@ async def evaluate_text(request: TextEvaluationRequest):
         "7_fossil_fuels_needed": 7
     }
 
[53 deleted lines not shown]
+    try:
+        # Load and prepare the dataset
+        dataset = load_dataset("QuotaClimat/frugal-ai-challenge")
+
+        # Convert string labels to integers
+        dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
+        test_dataset = dataset["test"]
+
+        # Start tracking emissions
+        tracker.start()
+        tracker.start_task("inference")
 
+        true_labels = test_dataset["label"]
+
+        # Initialize the model once
+        classifier = TextClassifier()
+
+        # Prepare batches
+        batch_size = 32
+        quotes = test_dataset["quote"]
+        num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
+        batches = [
+            quotes[i * batch_size:(i + 1) * batch_size]
+            for i in range(num_batches)
+        ]
+
+        # Initialize batch_results
+        batch_results = [[] for _ in range(num_batches)]
+
+        # Process batches in parallel
+        max_workers = min(os.cpu_count(), 4)
+        print(f"Processing with {max_workers} workers")
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_batch = {
+                executor.submit(classifier.process_batch, batch, idx): idx
+                for idx, batch in enumerate(batches)
+            }
+
+            for future in future_to_batch:
+                batch_idx = future_to_batch[future]
+                try:
+                    predictions, idx = future.result()
+                    if predictions:
+                        batch_results[idx] = predictions
+                        print(f"Stored results for batch {idx} ({len(predictions)} predictions)")
+                except Exception as e:
+                    print(f"Failed to get results for batch {batch_idx}: {e}")
+                    batch_results[batch_idx] = [0] * len(batches[batch_idx])
+
+        # Flatten predictions
+        predictions = []
+        for batch_preds in batch_results:
+            if batch_preds is not None:
+                predictions.extend(batch_preds)
+
+        # Stop tracking emissions
+        emissions_data = tracker.stop_task()
+
+        # Calculate accuracy
+        accuracy = accuracy_score(true_labels, predictions)
+        print("accuracy:", accuracy)
+
+        # Prepare results
+        results = {
+            "username": username,
+            "space_url": space_url,
+            "submission_timestamp": datetime.now().isoformat(),
+            "model_description": DESCRIPTION,
+            "accuracy": float(accuracy),
+            "energy_consumed_wh": emissions_data.energy_consumed * 1000,
+            "emissions_gco2eq": emissions_data.emissions * 1000,
+            "emissions_data": clean_emissions_data(emissions_data),
+            "api_route": ROUTE,
+            "dataset_config": {
+                "dataset_name": request.dataset_name,
+                "test_size": request.test_size,
+                "test_seed": request.test_seed
+            }
+        }
 
+        print("results:", results)
+        return results
+
+    except Exception as e:
+        print(f"Error in evaluate_text: {str(e)}")
+        raise Exception(f"Failed to process request: {str(e)}")
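
The parallel section relies on a specific contract: `TextClassifier.process_batch(batch, idx)` must return the batch's predictions together with its index, since the collection loop unpacks `predictions, idx = future.result()`. The classifier itself is defined elsewhere in the Space and not shown in this diff; the following is a minimal, hypothetical sketch of the expected interface only:

```python
from typing import List, Tuple

class TextClassifier:
    """Hypothetical stand-in for the Space's real classifier (not in the diff)."""

    def process_batch(self, batch: List[str], idx: int) -> Tuple[List[int], int]:
        # Must return one integer label per quote plus the batch index,
        # so the caller can write results back to batch_results[idx].
        predictions = [0 for _ in batch]  # placeholder for real model inference
        return predictions, idx
```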
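
Two details of the batching code are worth noting. The `num_batches` expression is plain ceiling division, and the collection loop iterates `future_to_batch` in submission order, blocking on each future in turn; `concurrent.futures.as_completed` would yield futures as they finish instead. Because every result is written back to `batch_results[idx]`, the final prediction order is identical either way. A self-contained sketch of that variant, with a placeholder worker standing in for the classifier:

```python
import math
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_batch(batch, idx):
    # Placeholder worker standing in for classifier.process_batch.
    return [0] * len(batch), idx

quotes = [f"quote {i}" for i in range(100)]
batch_size = 32
num_batches = math.ceil(len(quotes) / batch_size)  # same as the // + remainder form
batches = [quotes[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]
batch_results = [[] for _ in range(num_batches)]

with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_batch = {
        executor.submit(process_batch, batch, idx): idx
        for idx, batch in enumerate(batches)
    }
    for future in as_completed(future_to_batch):  # yields futures as they finish
        predictions, idx = future.result()
        batch_results[idx] = predictions

assert [len(b) for b in batch_results] == [32, 32, 32, 4]  # 100 = 3*32 + 4
```

Note also the failure path in the diff: a batch whose future raises is backfilled with `[0] * len(batches[batch_idx])`, which keeps `predictions` aligned with `true_labels` for `accuracy_score`, at the cost of silently scoring that batch as class 0.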
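
The `* 1000` conversions in the results dict follow from codecarbon's units: `energy_consumed` is reported in kWh and `emissions` in kg CO2eq, so the payload carries Wh and g CO2eq. A minimal sketch of the task-based tracking pattern, assuming codecarbon 2.x's `EmissionsTracker` task API (in the Space, `tracker` is created elsewhere; the project name below is made up):

```python
from codecarbon import EmissionsTracker

tracker = EmissionsTracker(project_name="frugal-ai-text")  # hypothetical name

tracker.start()
tracker.start_task("inference")
# ... run the batched inference here ...
emissions_data = tracker.stop_task()  # EmissionsData for this task
tracker.stop()

# codecarbon reports energy in kWh and emissions in kg CO2eq.
print(f"energy: {emissions_data.energy_consumed * 1000:.2f} Wh")
print(f"emissions: {emissions_data.emissions * 1000:.4f} g CO2eq")
```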