resolverkatla committed
Commit 64f61d8 · verified · 1 Parent(s): 755c66d

Update app.py

Files changed (1)
  1. app.py +76 -93
app.py CHANGED
@@ -1,116 +1,93 @@
 import gradio as gr
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import accuracy_score, classification_report
 from datasets import load_dataset # To load the dataset from Hugging Face
 
 # --- Data Loading and Preprocessing ---
 
-# Load the Titanic dataset from Hugging Face
-# This dataset is commonly available on HF and mirrors the Kaggle structure.
+# Load the julien-c/titanic-survival dataset from Hugging Face
 try:
-    # We load the 'train' split of the dataset
-    dataset = load_dataset("AbubakarJ/titanic", split="train")
-    df = pd.DataFrame(dataset) # Convert to pandas DataFrame
+    # This dataset typically contains 'text' (description) and 'label' (0=died, 1=survived)
+    dataset = load_dataset("julien-c/titanic-survival", split="train")
+    df = pd.DataFrame(dataset) # Convert the dataset to a pandas DataFrame
 except Exception as e:
-    # If the dataset cannot be loaded (e.g., internet issue on Space startup, or dataset changed)
-    # This will raise a RuntimeError which typically stops the Gradio app from launching
-    raise RuntimeError(f"Could not load Titanic dataset from Hugging Face: {e}. "
-                       "Please check the dataset name or your connection.")
+    gr.Warning(f"Failed to load dataset: {e}. Please check your internet connection or dataset name.")
+    # Provide a minimal fallback for local testing if HF dataset loading fails
+    df = pd.DataFrame({
+        'text': [
+            "A young boy, probably a steerage passenger. He doesn't look like he survived.",
+            "A first-class lady with a child. She likely survived due to priority.",
+            "Male, 30s, middle class. Probably didn't make it.",
+            "Female, 20s, dressed finely. Looks like she got on a lifeboat.",
+            "An elderly man, alone, traveling steerage."
+        ],
+        'label': ["0", "1", "0", "1", "0"] # 0 for died, 1 for survived
+    })
+
+
+# Ensure 'label' column is numeric
+df['label'] = pd.to_numeric(df['label'])
 
-# Drop irrelevant columns and 'PassengerId' which is not a feature
-# These columns are typically present in the full Kaggle Titanic dataset.
-df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
+# Define features (X) and target (y)
+X = df['text']
+y = df['label']
 
-# Handle missing 'Age' with median imputation
-df['Age'].fillna(df['Age'].median(), inplace=True)
+# Split data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-# Handle missing 'Fare' with median imputation (Fare can also have missing values sometimes)
-df['Fare'].fillna(df['Fare'].median(), inplace=True)
+# Initialize CountVectorizer
+# This converts text documents to a matrix of token counts
+vectorizer = CountVectorizer(stop_words='english', lowercase=True)
 
-# Handle missing 'Embarked' with mode imputation
-df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
+# Fit the vectorizer on the training data and transform both training and test data
+X_train_vectorized = vectorizer.fit_transform(X_train)
+X_test_vectorized = vectorizer.transform(X_test)
 
-# Convert categorical features to numerical using one-hot encoding
-# We drop 'Embarked_C' to avoid multicollinearity (as per common practice)
-df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
+# --- Model Training ---
 
+# Initialize and train the Multinomial Naive Bayes model
+model = MultinomialNB()
+model.fit(X_train_vectorized, y_train)
 
-# Define features (X) and target (y)
-X = df.drop('Survived', axis=1)
-y = df['Survived']
+# --- Model Evaluation (for display in app) ---
 
-# Split data into training and testing sets
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-# Train a RandomForestClassifier model
-model = RandomForestClassifier(n_estimators=100, random_state=42)
-model.fit(X_train, y_train)
-
-# Evaluate the model (for display purposes)
-y_pred = model.predict(X_test)
+y_pred = model.predict(X_test_vectorized)
 accuracy = accuracy_score(y_test, y_pred)
 accuracy_message = f"Model Accuracy on Test Set: {accuracy:.2f}"
 
 # --- Prediction Function for Gradio ---
-def predict_survival(pclass, sex, age, sibsp, parch, fare, embarked):
-    # Create a dictionary for the input values
-    input_dict = {
-        'Pclass': pclass,
-        'Age': age,
-        'SibSp': sibsp,
-        'Parch': parch,
-        'Fare': fare,
-        # These match the one-hot encoded columns created during training
-        'Sex_male': 1 if sex == 'male' else 0,
-        'Embarked_Q': 1 if embarked == 'Q' else 0, # Assuming 'Q' is 'Embarked_Q'
-        'Embarked_S': 1 if embarked == 'S' else 0 # Assuming 'S' is 'Embarked_S'
-    }
-
-    # Create a DataFrame from the input values
-    input_data = pd.DataFrame([input_dict])
-
-    # Ensure all columns expected by the model are present in the input_data, even if 0
-    # This handles cases where a category might not be present in a single input but was in training
-    for col in X.columns:
-        if col not in input_data.columns:
-            input_data[col] = 0
-
-    # Reorder columns to match the training data's column order
-    input_data = input_data[X.columns]
+def predict_survival_from_text(passenger_description):
+    # Transform the input description using the *trained* vectorizer
+    message_vectorized = vectorizer.transform([passenger_description])
 
     # Make prediction
-    prediction = model.predict(input_data)[0]
-    prediction_proba = model.predict_proba(input_data)[0]
+    prediction = model.predict(message_vectorized)[0]
+    prediction_proba = model.predict_proba(message_vectorized)[0]
 
-    if prediction == 1:
-        return f"Prediction: Survived ({prediction_proba[1]:.2%} confidence)", "green"
-    else:
-        return f"Prediction: Did Not Survive ({prediction_proba[0]:.2%} confidence)", "red"
+    if prediction == 1: # 1 corresponds to 'survived'
+        return f"Prediction: SURVIVED ({prediction_proba[1]:.2%} confidence)", "green"
+    else: # 0 corresponds to 'died'
+        return f"Prediction: DID NOT SURVIVE ({prediction_proba[0]:.2%} confidence)", "red"
 
 # --- Gradio Interface ---
-# CSS to style the output textbox background
 with gr.Blocks(css=".green {background-color: #e6ffe6 !important;}.red {background-color: #ffe6e6 !important;}") as demo:
     gr.Markdown(
         """
-        # Titanic Survival Predictor
-        Enter passenger details to predict their survival on the Titanic.
+        # Titanic Survival Predictor (Text-based)
+        Enter a textual description of a passenger to predict their survival on the Titanic.
+        This model uses text classification techniques.
        """
     )
-    gr.Markdown(f"### Model Performance: {accuracy_message}")
-
-    with gr.Row():
-        pclass_input = gr.Radio(choices=[1, 2, 3], label="Pclass", value=3)
-        sex_input = gr.Radio(choices=['male', 'female'], label="Sex", value='male')
-        age_input = gr.Slider(minimum=0.5, maximum=80, value=30, label="Age", step=0.5)
-    with gr.Row():
-        sibsp_input = gr.Number(label="SibSp (Siblings/Spouses Aboard)", value=0)
-        parch_input = gr.Number(label="Parch (Parents/Children Aboard)", value=0)
-        fare_input = gr.Number(label="Fare", value=30.0)
-    with gr.Row():
-        embarked_input = gr.Radio(choices=['C', 'Q', 'S'], label="Embarked (Port of Embarkation)", value='S')
+    gr.Markdown(f"### {accuracy_message}")
 
+    description_input = gr.Textbox(
+        label="Enter Passenger Description",
+        lines=5,
+        placeholder="e.g., 'A young woman from first class, traveling alone.'"
+    )
     predict_btn = gr.Button("Predict Survival")
     output_text = gr.Textbox(label="Survival Prediction", interactive=False)
     # This label is used internally to get the color, its content is not directly shown
@@ -126,8 +103,8 @@ with gr.Blocks(css=".green {background-color: #e6ffe6 !important;}.red {backgrou
         return gr.Textbox(value=text, label="Survival Prediction", interactive=False)
 
     predict_btn.click(
-        fn=predict_survival,
-        inputs=[pclass_input, sex_input, age_input, sibsp_input, parch_input, fare_input, embarked_input],
+        fn=predict_survival_from_text,
+        inputs=description_input,
        outputs=[output_text, output_color_indicator]
     ).then(
         fn=update_output_style,
@@ -135,19 +112,25 @@ with gr.Blocks(css=".green {background-color: #e6ffe6 !important;}.red {backgrou
         outputs=output_text
     )
 
+    gr.Examples(
+        examples=[
+            "A wealthy first-class woman with her child. She was probably on a lifeboat.",
+            "An old man, alone, traveling in third class. He likely did not survive.",
+            "A young male crew member.",
+            "A small child from steerage."
+        ],
+        inputs=description_input,
+        outputs=[output_text, output_color_indicator],
+        fn=predict_survival_from_text,
+        cache_examples=True # Caches the output for examples for faster loading
+    )
+
     gr.Markdown(
         """
         ---
-        **Feature Definitions:**
-        * **Pclass:** Passenger Class (1 = 1st, 2 = 2nd, 3 = 3rd)
-        * **Sex:** Sex (male/female)
-        * **Age:** Age in years
-        * **SibSp:** Number of siblings/spouses aboard the Titanic
-        * **Parch:** Number of parents/children aboard the Titanic
-        * **Fare:** Passenger fare
-        * **Embarked:** Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
-
-        *Note: The dataset is loaded directly from Hugging Face's `datasets` library ([AbubakarJ/titanic](https://huggingface.co/datasets/AbubakarJ/titanic)). Missing 'Age', 'Fare', and 'Embarked' values are imputed. Categorical features are one-hot encoded.*
+        *This model uses a Multinomial Naive Bayes classifier. It is trained on the 'text' descriptions from the
+        [julien-c/titanic-survival](https://huggingface.co/datasets/julien-c/titanic-survival) dataset
+        to predict survival ('0' for died, '1' for survived). Text is preprocessed using CountVectorizer.*
         """
     )
 
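A quick way to sanity-check the new data dependency before relying on the Space build: a minimal sketch, assuming (as the comments in this commit do) that the julien-c/titanic-survival train split exposes 'text' and 'label' columns.

from datasets import load_dataset
import pandas as pd

# Pre-flight check: confirm the dataset loads and carries the columns app.py expects.
ds = load_dataset("julien-c/titanic-survival", split="train")
df = pd.DataFrame(ds)
print(df.columns.tolist())         # expected to include 'text' and 'label'
print(df['label'].value_counts())  # both classes (0 = died, 1 = survived) should appear

If loading fails, the except branch falls back to the five-row DataFrame defined in this commit, which keeps the app running but means the reported test-set accuracy is computed on a single held-out example.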