File size: 2,072 Bytes
73a393b
50944f0
924d4e1
 
50944f0
 
924d4e1
 
b89ac72
924d4e1
 
b89ac72
924d4e1
50944f0
 
b89ac72
 
50944f0
 
 
 
 
924d4e1
50944f0
924d4e1
b89ac72
fe81869
 
 
50944f0
b89ac72
 
 
 
50944f0
 
 
 
b89ac72
 
924d4e1
 
b89ac72
924d4e1
 
 
b89ac72
924d4e1
50944f0
 
 
 
 
924d4e1
b89ac72
924d4e1
 
 
 
b89ac72
 
73a393b
 
 
924d4e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
import re
from datasets import load_dataset
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

# 1. Load dataset
dataset = load_dataset("sms_spam", split="train")
texts = dataset["sms"]

# The Hub's `sms_spam` dataset declares `label` as a ClassLabel
# (0 = ham, 1 = spam), so this column yields integer ids, not strings.
# Comparing only against "spam" would mark every message as ham and leave the
# classifier with a single class to fit. Accept the integer id as well as the
# string form so either representation maps to 1 = spam, 0 = ham.
labels = [1 if label in (1, "spam") else 0 for label in dataset["label"]]

print("Label distribution:", Counter(labels))  # Debug check

# 2. Clean text
def clean_text(text):
    """Return ``text`` normalised for vectorisation.

    The message is lower-cased, every run of non-word characters (regex
    ``\\W+``) is collapsed to a single space, and leading/trailing
    whitespace is trimmed.
    """
    return re.sub(r"\W+", " ", text.lower()).strip()

# Normalise every message once, before splitting.
texts_cleaned = list(map(clean_text, texts))

# 3. Hold out 20% of the data; stratify on the labels and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts_cleaned,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Debug check: class counts in each split
print("Train labels:", Counter(y_train))
print("Test labels:", Counter(y_test))

# 4. Build model pipeline
# Unigram + bigram TF-IDF features, English stop words removed, terms that
# appear in more than 90% of documents dropped.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_df=0.9,
)
# Logistic regression with class weights balanced by class frequency.
classifier = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
)
model = make_pipeline(tfidf, classifier)

# 5. Fit the whole pipeline on the training split
model.fit(X_train, y_train)

# 6. Score the held-out split
y_pred = model.predict(X_test)
held_out_accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", held_out_accuracy)

# 7. Predict function
def predict_spam(message):
    """Classify one SMS message and describe the result for display.

    Args:
        message: Raw message text as entered by the user. ``None``, empty,
            and whitespace-only values are accepted.

    Returns:
        A string containing the predicted label and the model's probability
        for that label formatted as a percentage, or a short prompt when
        there is no text to classify.
    """
    # Nothing to score: an empty message carries no features, and None has
    # no string methods for the cleaning step to call.
    if message is None or not message.strip():
        return "Please enter a message to classify."

    cleaned = clean_text(message)
    # A single predict_proba call yields both the decision and its
    # confidence, so the text is only vectorised once.
    probabilities = model.predict_proba([cleaned])[0]
    best = int(probabilities.argmax())
    # Translate the winning column back through classes_ instead of assuming
    # the class label doubles as the column index.
    pred = model.classes_[best]
    prob = probabilities[best]
    # The ham emoji was mojibake in the original literal; restored here.
    label = "🚫 Spam" if pred == 1 else "📩 Not Spam (Ham)"
    return f"{label} (Confidence: {prob:.2%})"

# 8. Gradio app
# Single-textbox UI: the entered message goes to predict_spam and the
# returned string is shown as the prediction.
iface = gr.Interface(
    fn=predict_spam,
    inputs=gr.Textbox(lines=4, label="Enter your SMS message"),
    outputs=gr.Text(label="Prediction"),
    # The title emoji was mojibake in the original literal; restored here.
    title="📬 SMS Spam Detector (Improved)",
    # NOTE(review): the text below calls the dataset "balanced", while the
    # classifier is configured with class_weight="balanced" — confirm which
    # is meant before rewording this user-facing string.
    description="Detect spam in SMS messages using Logistic Regression with TF-IDF bi-grams. Trained on a balanced dataset from Hugging Face.",
)

if __name__ == "__main__":
    # Start the app only when run as a script; share=False does not request
    # a public share link.
    iface.launch(share=False)