Spaces:
Sleeping
Sleeping
File size: 2,072 Bytes
73a393b 50944f0 924d4e1 50944f0 924d4e1 b89ac72 924d4e1 b89ac72 924d4e1 50944f0 b89ac72 50944f0 924d4e1 50944f0 924d4e1 b89ac72 fe81869 50944f0 b89ac72 50944f0 b89ac72 924d4e1 b89ac72 924d4e1 b89ac72 924d4e1 50944f0 924d4e1 b89ac72 924d4e1 b89ac72 73a393b 924d4e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import gradio as gr
import re
from datasets import load_dataset
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
# 1. Load dataset
dataset = load_dataset("sms_spam", split="train")
texts = dataset["sms"]


def _to_binary_label(label):
    """Map a raw dataset label to 1 (spam) or 0 (ham).

    The Hub's ``sms_spam`` card declares ``label`` as a ClassLabel
    (ham=0, spam=1), so rows normally arrive as integers; comparing them
    to the string "spam" would silently mark every message as ham.
    String labels are still accepted so either representation works.
    """
    if isinstance(label, str):
        return 1 if label.strip().lower() in ("spam", "1") else 0
    return 1 if int(label) == 1 else 0


labels = [_to_binary_label(label) for label in dataset["label"]]
label_counts = Counter(labels)
print("Label distribution:", label_counts)  # Debug check

# A single-class label list means the encoding above did not match the
# dataset; stop here with a clear message instead of failing later in fit().
if len(label_counts) < 2:
    raise ValueError(
        "Expected both spam and ham examples, got label counts: "
        f"{dict(label_counts)}"
    )
# 2. Clean text
def clean_text(text):
    """Normalize a message before it is vectorized.

    Lowercases ``text``, replaces every run of non-word characters
    (anything that is not a letter, digit or underscore) with a single
    space, and trims leading/trailing whitespace.

    Args:
        text: The raw message string.

    Returns:
        The normalized string; empty if ``text`` had no word characters.
    """
    text = text.lower()
    text = re.sub(r"\W+", " ", text)
    return text.strip()
# Normalize every message with the same cleaner that is applied at
# prediction time, so training and inference see identical text.
texts_cleaned = list(map(clean_text, texts))

# 3. Train/test split with stratification
# 20% is held out; stratifying on the labels keeps the spam/ham ratio the
# same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts_cleaned,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Debug check: class counts of each part of the split.
for split_caption, split_labels in (
    ("Train labels:", y_train),
    ("Test labels:", y_test),
):
    print(split_caption, Counter(split_labels))
# 4. Build model pipeline
# TF-IDF over unigrams and bigrams (English stop words removed, terms in
# more than 90% of messages dropped) feeding a class-weighted logistic
# regression.
tfidf_stage = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_df=0.9,
)
classifier_stage = LogisticRegression(max_iter=1000, class_weight="balanced")

# 5. Train model
# fit() returns the fitted pipeline itself, so build and train in one step.
model = make_pipeline(tfidf_stage, classifier_stage).fit(X_train, y_train)

# 6. Evaluate
# Accuracy on the held-out split.
y_pred = model.predict(X_test)
held_out_accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", held_out_accuracy)
# 7. Predict function
def predict_spam(message):
    """Classify one SMS message and format the result for display.

    Args:
        message: Raw text from the UI. ``None``, empty, or whitespace-only
            input is not classified.

    Returns:
        A string with the predicted label and the model's probability for
        that label, or a short prompt when there is nothing to classify.
    """
    # Nothing to classify: return a prompt instead of reporting a
    # confident prediction for an empty message.
    if message is None or not message.strip():
        return "Please enter a message to classify."

    cleaned = clean_text(message)

    # One inference call gives both the winning class and its probability.
    probabilities = model.predict_proba([cleaned])[0]
    best_index = int(probabilities.argmax())

    # Columns of predict_proba follow model.classes_, so look the label up
    # there rather than assuming the label value equals its column index.
    pred = model.classes_[best_index]
    prob = probabilities[best_index]

    # NOTE(review): the label prefixes look mis-encoded (probably emoji);
    # confirm the file's encoding. They are kept byte-for-byte here.
    label = "π« Spam" if pred == 1 else "π© Not Spam (Ham)"
    return f"{label} (Confidence: {prob:.2%})"
# 8. Gradio app
# A single multi-line text input wired to predict_spam, with the formatted
# prediction shown in a text output.
message_input = gr.Textbox(lines=4, label="Enter your SMS message")
prediction_output = gr.Text(label="Prediction")

iface = gr.Interface(
    fn=predict_spam,
    inputs=message_input,
    outputs=prediction_output,
    title="π¬ SMS Spam Detector (Improved)",
    description="Detect spam in SMS messages using Logistic Regression with TF-IDF bi-grams. Trained on a balanced dataset from Hugging Face.",
)

# Start the web server only when run as a script; no public share link.
if __name__ == "__main__":
    iface.launch(share=False)
|