resolverkatla committed on
Commit
b89ac72
·
verified ·
1 Parent(s): 0f060c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -6,12 +6,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.linear_model import LogisticRegression
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.metrics import accuracy_score
 
9
 
10
  # 1. Load dataset
11
- dataset = load_dataset("ucirvine/sms_spam", split="train")
12
  texts = dataset["sms"]
13
  labels = [1 if label == "spam" else 0 for label in dataset["label"]]
14
 
 
 
15
  # 2. Clean text
16
  def clean_text(text):
17
  text = text.lower()
@@ -20,23 +23,28 @@ def clean_text(text):
20
 
21
  texts_cleaned = [clean_text(t) for t in texts]
22
 
23
- # 3. Train/test split (use stratified sampling!)
24
  X_train, X_test, y_train, y_test = train_test_split(
25
  texts_cleaned, labels, test_size=0.2, random_state=42, stratify=labels
26
  )
27
 
28
- # 4. Build model: TF-IDF + Logistic Regression
 
 
 
29
  model = make_pipeline(
30
  TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_df=0.9),
31
  LogisticRegression(max_iter=1000, class_weight="balanced")
32
  )
 
 
33
  model.fit(X_train, y_train)
34
 
35
- # 5. Show validation accuracy
36
  y_pred = model.predict(X_test)
37
  print("Validation Accuracy:", accuracy_score(y_test, y_pred))
38
 
39
- # 6. Prediction function
40
  def predict_spam(message):
41
  cleaned = clean_text(message)
42
  pred = model.predict([cleaned])[0]
@@ -44,13 +52,13 @@ def predict_spam(message):
44
  label = "🚫 Spam" if pred == 1 else "📩 Not Spam (Ham)"
45
  return f"{label} (Confidence: {prob:.2%})"
46
 
47
- # 7. Gradio UI
48
  iface = gr.Interface(
49
  fn=predict_spam,
50
  inputs=gr.Textbox(lines=4, label="Enter your SMS message"),
51
  outputs=gr.Text(label="Prediction"),
52
- title="📬 Improved SMS Spam Detector",
53
- description="Detects spam in SMS messages using Logistic Regression with TF-IDF bi-grams. Now with higher accuracy!"
54
  )
55
 
56
  if __name__ == "__main__":
 
6
  from sklearn.linear_model import LogisticRegression
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.metrics import accuracy_score
9
+ from collections import Counter
10
 
11
  # 1. Load dataset
12
+ dataset = load_dataset("sms_spam", split="train")
13
  texts = dataset["sms"]
14
  labels = [1 if label == "spam" else 0 for label in dataset["label"]]
15
 
16
+ print("Label distribution:", Counter(labels)) # Debug check
17
+
18
  # 2. Clean text
19
  def clean_text(text):
20
  text = text.lower()
 
23
 
24
  texts_cleaned = [clean_text(t) for t in texts]
25
 
26
+ # 3. Train/test split with stratification
27
  X_train, X_test, y_train, y_test = train_test_split(
28
  texts_cleaned, labels, test_size=0.2, random_state=42, stratify=labels
29
  )
30
 
31
+ print("Train labels:", Counter(y_train)) # Debug check
32
+ print("Test labels:", Counter(y_test)) # Debug check
33
+
34
+ # 4. Build model pipeline
35
  model = make_pipeline(
36
  TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_df=0.9),
37
  LogisticRegression(max_iter=1000, class_weight="balanced")
38
  )
39
+
40
+ # 5. Train model
41
  model.fit(X_train, y_train)
42
 
43
+ # 6. Evaluate
44
  y_pred = model.predict(X_test)
45
  print("Validation Accuracy:", accuracy_score(y_test, y_pred))
46
 
47
+ # 7. Predict function
48
  def predict_spam(message):
49
  cleaned = clean_text(message)
50
  pred = model.predict([cleaned])[0]
 
52
  label = "🚫 Spam" if pred == 1 else "📩 Not Spam (Ham)"
53
  return f"{label} (Confidence: {prob:.2%})"
54
 
55
+ # 8. Gradio app
56
  iface = gr.Interface(
57
  fn=predict_spam,
58
  inputs=gr.Textbox(lines=4, label="Enter your SMS message"),
59
  outputs=gr.Text(label="Prediction"),
60
+ title="📬 SMS Spam Detector (Improved)",
61
+ description="Detect spam in SMS messages using Logistic Regression with TF-IDF bi-grams. Trained on a balanced dataset from Hugging Face."
62
  )
63
 
64
  if __name__ == "__main__":