Spaces:
Sleeping
Sleeping
File size: 3,140 Bytes
4d6e8c2 f6107f3 e2f75a8 4d6e8c2 e2f75a8 70f5f26 1c33274 70f5f26 e2f75a8 4d6e8c2 70f5f26 e2f75a8 f6107f3 e2f75a8 4d6e8c2 5d2f9b2 4d6e8c2 5d2f9b2 f6107f3 e2f75a8 f6107f3 5d2f9b2 4d6e8c2 70f5f26 f6107f3 70f5f26 4d6e8c2 5d2f9b2 4d6e8c2 f6107f3 5d2f9b2 4d6e8c2 70f5f26 4d6e8c2 1c33274 4d6e8c2 5d2f9b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info
router = APIRouter()
DESCRIPTION = "SVM Text Classifier with TF-IDF"
ROUTE = "/text_svm"
@router.post(ROUTE, tags=["Text Task"],
description=DESCRIPTION)
async def evaluate_text_svm(request: TextEvaluationRequest):
"""
Evaluate text classification for climate disinformation detection.
Current Model: SVM Classifier
- Uses TF-IDF for text vectorization
- Trains and evaluates a Support Vector Machine (SVM) model
"""
# Get space info
username, space_url = get_space_info()
# Define the label mapping
LABEL_MAPPING = {
"0_not_relevant": 0,
"1_not_happening": 1,
"2_not_human": 2,
"3_not_bad": 3,
"4_solutions_harmful_unnecessary": 4,
"5_science_unreliable": 5,
"6_proponents_biased": 6,
"7_fossil_fuels_needed": 7
}
# Load and prepare the dataset
dataset = load_dataset(request.dataset_name)
# Convert string labels to integers
dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
# Split dataset
train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
train_dataset = train_test["train"]
test_dataset = train_test["test"]
# Extract text and labels
train_texts = [x["text"] for x in train_dataset]
train_labels = [x["label"] for x in train_dataset]
test_texts = [x["text"] for x in test_dataset]
test_labels = [x["label"] for x in test_dataset]
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)
# Train SVM Classifier
model = SVC(kernel="linear", probability=True)
model.fit(train_vectors, train_labels)
# Start tracking emissions
tracker.start()
tracker.start_task("inference")
# Inference
predictions = model.predict(test_vectors)
# Stop tracking emissions
emissions_data = tracker.stop_task()
# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
# Prepare results dictionary
results = {
"username": username,
"space_url": space_url,
"submission_timestamp": datetime.now().isoformat(),
"model_description": DESCRIPTION,
"accuracy": float(accuracy),
"energy_consumed_wh": emissions_data.energy_consumed * 1000,
"emissions_gco2eq": emissions_data.emissions * 1000,
"emissions_data": clean_emissions_data(emissions_data),
"api_route": ROUTE,
"dataset_config": {
"dataset_name": request.dataset_name,
"test_size": request.test_size,
"test_seed": request.test_seed
}
}
return results
|