Upload loan.py
loan.py
ADDED
@@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
"""loan.py"""

# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import ipywidgets as widgets         # Needed for the Output/HBox/VBox widgets used below
from IPython.display import display  # Needed to render the styled DataFrames
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gradio as gr
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from datasets import load_dataset  # Hugging Face datasets library

# Suppress specific FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset directly from Hugging Face
dataset = load_dataset("AnguloM/loan_data")

# Access the train split and convert it to a pandas DataFrame
df_train = pd.DataFrame(dataset['train'])

# Hold out 20% of the rows as a test set
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)
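
# Quick sanity check (an illustrative addition): confirm the 80/20 split sizes
print(f"Train rows: {len(df_train)}, test rows: {len(df_test)}")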

# Create a summary DataFrame with data types and non-null counts
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train)
})

# Calculate the percentage of non-null values in each column
info_df['Non-Null Percentage'] = (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'

# Style the table
info_df_styled = info_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Apply a background gradient only to the numerical columns
info_df_styled = info_df_styled.background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")

# Create a widget to display the styled table
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Widget for the missing-values message
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values per column:\033[0m\n{df_train.isnull().sum()}")

# Display both widgets (table and missing-values message) side by side
display(widgets.HBox([table_widget, message_widget]))
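
# Note: ipywidgets output renders only in a Jupyter-style front-end; when loan.py
# runs as a plain script, these widgets produce no visible output.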

# Convert the repayment-status column to categorical
df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

# Select only numeric columns for the correlation matrix calculation
df_numeric = df_train.select_dtypes(include=[float, int])

# Create a 1x2 grid: correlation matrix and target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot the correlation matrix
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Plot the distribution of loan repayment status
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Show the plots
plt.tight_layout()  # Adjust the layout to avoid overlapping
plt.show()

# One-hot encoding for categorical columns and scaling for numeric columns
# Prepare data for training
data = df_train.copy()

# Separate features (X) and target (y); the approval flag 'credit.policy' is the
# target for this model, not the 'not.fully.paid' column explored above
X = data.drop('credit.policy', axis=1)  # Drop the target column
y = data['credit.policy']               # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
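
# Possible refinement (not applied here): passing stratify=y to train_test_split
# keeps the class ratio consistent across the train and test sets.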

# Preprocessing pipeline (scaling numeric features and encoding categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
                                   'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
                                   'delinq.2yrs', 'pub.rec']),
        ('cat', OneHotEncoder(), ['purpose'])  # Ensure 'purpose' is one-hot encoded
    ]
)

# Create an imbalanced-learn pipeline that chains SMOTE with logistic regression
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),                             # First, preprocess the data (scale numeric, encode categorical)
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),   # Apply SMOTE to balance the dataset
    ('classifier', LogisticRegression(max_iter=1000000))        # Logistic Regression classifier
])
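
# With a float sampling_strategy, SMOTE oversamples the minority class until it
# reaches that fraction of the majority class (0.5 here, i.e. a 1:2 ratio) rather
# than full balance. Because SMOTE sits inside the pipeline, it runs only during
# fit; the test data is never resampled.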

# Train the model with the full pipeline (preprocessing + SMOTE + model training)
imb_model_pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Adjust the decision threshold to improve recall of the positive class
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
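
# A minimal sketch (added for illustration): scan a few candidate thresholds to
# see the precision/recall trade-off behind the 0.3 choice above.
from sklearn.metrics import precision_score, recall_score
for t in (0.2, 0.3, 0.4, 0.5):
    preds = (y_pred_proba >= t).astype(int)
    print(f"threshold={t:.1f}  precision={precision_score(y_test, preds):.3f}  "
          f"recall={recall_score(y_test, preds):.3f}")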

# Evaluate the model using a classification report
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

# Convert the classification report to a DataFrame and style it for display
classification_df = pd.DataFrame(classification_rep).transpose()
classification_df_styled = classification_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Display the classification report as a styled table in a widget
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)
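
# Note: AUC-ROC is computed from the raw probabilities rather than the thresholded
# predictions, since it measures ranking quality across all possible thresholds.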

# Widget for the AUC-ROC score
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Display both widgets (table and AUC-ROC message) stacked vertically
display(widgets.VBox([table_widget, auc_widget]))

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_adjusted)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Download the pre-trained pipeline published on the Hugging Face Hub
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence",
                             filename="loan_approval_pipeline.pkl")
pipeline = joblib.load(model_path)
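
# The Gradio app below serves this downloaded pipeline, not the imb_model_pipeline
# trained above. To serve the freshly trained model instead, you could persist it
# first, e.g. joblib.dump(imb_model_pipeline, "loan_approval_pipeline.pkl").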

# Prediction function for the Gradio app
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    # Prepare the input as a single-row DataFrame with the training column names
    input_data = pd.DataFrame([[int_rate, installment, log_annual_inc, dti, fico,
                                days_with_cr_line, revol_bal, revol_util,
                                inq_last_6mths, delinq_2yrs, pub_rec, purpose]],
                              columns=['int.rate', 'installment', 'log.annual.inc',
                                       'dti', 'fico', 'days.with.cr.line', 'revol.bal',
                                       'revol.util', 'inq.last.6mths', 'delinq.2yrs',
                                       'pub.rec', 'purpose'])
    # Make the loan approval prediction
    result = pipeline.predict(input_data)
    return result[0]
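
# Illustrative call (hypothetical input values), kept commented out so the script
# does not execute it on import:
# predict_approval(0.12, 250.0, 11.0, 15.0, 700, 4500.0, 12000.0, 45.0, 1, 0, 0,
#                  "debt_consolidation")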

# Create input components for the Gradio interface
inputs = [
    gr.Slider(0.0, 25.0, step=0.1, label="Interest Rate (%)"),
    gr.Slider(0.0, 1000.0, step=10.0, label="Installment Amount"),
    gr.Slider(0.0, 15.0, step=0.1, label="Log of Annual Income"),
    gr.Slider(0.0, 50.0, step=0.1, label="Debt-to-Income Ratio"),
    gr.Slider(300, 850, step=1, label="FICO Credit Score"),
    gr.Slider(0.0, 50000.0, step=100.0, label="Days with Credit Line"),
    gr.Slider(0.0, 100000.0, step=500.0, label="Revolving Balance"),
    gr.Slider(0.0, 150.0, step=0.1, label="Revolving Utilization (%)"),
    gr.Slider(0, 10, step=1, label="Recent Inquiries (Last 6 Months)"),
    gr.Slider(0, 10, step=1, label="Delinquencies in Last 2 Years"),
    gr.Slider(0, 5, step=1, label="Public Records"),
    gr.Dropdown(["credit_card", "debt_consolidation", "educational",
                 "home_improvement", "major_purchase", "small_business",
                 "other"], label="Loan Purpose")
]

# Create and launch the Gradio interface for loan approval prediction
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)
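
# Note: share=True asks Gradio for a temporary public gradio.live URL; omit it to
# serve the app locally only.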