Update tasks/text.py
tasks/text.py  CHANGED  (+7 -13)
@@ -2,8 +2,7 @@ from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-from sentence_transformers import SentenceTransformer
-import faiss
+from sentence_transformers import SentenceTransformer
 import numpy as np
 
 from .utils.emissions import clean_emissions_data, get_space_info, tracker
@@ -11,7 +10,7 @@ from .utils.evaluation import TextEvaluationRequest
 
 router = APIRouter()
 
-DESCRIPTION = "
+DESCRIPTION = "Efficient embedding-based classification with similarity threshold"
 ROUTE = "/text"
 
 # Load custom embedding model
@@ -41,16 +40,11 @@ class_descriptions = {
     "7_fossil_fuels_needed": "Fossil fuels have powered centuries of progress, lifted billions out of poverty, and remain the backbone of global energy, while alternatives, though promising, cannot yet match their scale, reliability, or affordability.",
 }
 
-# Precompute class embeddings
+# Precompute class embeddings (normalized for cosine similarity)
 class_labels = list(class_descriptions.keys())
 class_sentences = list(class_descriptions.values())
 class_embeddings = embedding_model.encode(class_sentences, batch_size=8, convert_to_numpy=True, normalize_embeddings=True)
 
-# Build FAISS index for efficient similarity search
-dimension = class_embeddings.shape[1]
-faiss_index = faiss.IndexFlatIP(dimension)  # Inner product = cosine similarity for normalized vectors
-faiss_index.add(class_embeddings)
-
 
 @router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 async def evaluate_text(request: TextEvaluationRequest):
@@ -81,10 +75,10 @@ async def evaluate_text(request: TextEvaluationRequest):
     # Batch embed all test dataset quotes
     test_embeddings = embedding_model.encode(test_dataset["quote"], batch_size=32, convert_to_numpy=True, normalize_embeddings=True)
 
-    #
-
-
-
+    # Compute cosine similarity in a single operation
+    similarity_matrix = np.dot(test_embeddings, class_embeddings.T)  # Efficient matrix multiplication
+    best_indices = similarity_matrix.argmax(axis=1)  # Get index of highest similarity for each test sample
+    best_similarities = similarity_matrix.max(axis=1)  # Get max similarity values
 
     # Apply threshold (0.9) for classification
     predictions = [
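For reference, here is a minimal self-contained sketch of the classification logic this commit moves to: because the embeddings are encoded with normalize_embeddings=True, cosine similarity reduces to a plain matrix product, which is why the FAISS IndexFlatIP and its build step could be dropped. The toy vectors, the label subset, and the FALLBACK_LABEL name below are illustrative assumptions; only the np.dot / argmax / max pattern and the 0.9 threshold come from the diff.

import numpy as np

# Stand-ins for the precomputed, L2-normalized embeddings
# (shapes mirror embedding_model.encode(..., normalize_embeddings=True)).
class_labels = ["0_not_relevant", "7_fossil_fuels_needed"]  # illustrative subset of class_descriptions keys
class_embeddings = np.array([[1.0, 0.0],
                             [0.0, 1.0]])                   # (num_classes, dim)
test_embeddings = np.array([[0.28, 0.96],
                            [0.80, 0.60]])                  # (num_quotes, dim), rows are unit-length

# Same operation the commit adds: one matrix multiplication replaces the FAISS search.
similarity_matrix = np.dot(test_embeddings, class_embeddings.T)  # (num_quotes, num_classes) cosine similarities
best_indices = similarity_matrix.argmax(axis=1)                  # best-matching class per quote
best_similarities = similarity_matrix.max(axis=1)                # its similarity score

# Apply the 0.9 threshold: keep the best class only when the match is confident,
# otherwise fall back to a default label (assumed here, not shown in the diff).
FALLBACK_LABEL = "0_not_relevant"
predictions = [
    class_labels[idx] if sim >= 0.9 else FALLBACK_LABEL
    for idx, sim in zip(best_indices, best_similarities)
]
print(predictions)  # ['7_fossil_fuels_needed', '0_not_relevant']

With only a handful of class vectors (the keys shown run up to 7_fossil_fuels_needed), the dense matrix product is at least as fast as an IndexFlatIP lookup and removes the faiss dependency and the index-building step entirely.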