AnguloM committed (verified) · Commit 3dfb3f9 · Parent(s): 8ed568b

Upload loan.py

Files changed (1): loan.py (+212, -0)
loan.py ADDED
@@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
"""loan.py"""

# Import necessary libraries
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import gradio as gr
import ipywidgets as widgets  # Required by the Output/HBox/VBox widgets used below
from IPython.display import display  # Required to render the styled DataFrames
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from datasets import load_dataset  # Hugging Face datasets library

# Suppress specific FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset directly from Hugging Face
dataset = load_dataset("AnguloM/loan_data")

# Access the train split and convert it to a pandas DataFrame
df_train = pd.DataFrame(dataset['train'])

# Hold out 20% of the rows as a test set
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)

# Create a summary DataFrame with data types and non-null counts
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train)
})

# Calculate the percentage of non-null values in each column
info_df['Non-Null Percentage'] = (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'

# Style the table
info_df_styled = info_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Apply a background gradient to the numerical columns only
info_df_styled = info_df_styled.background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")

# Widget to display the styled table
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Widget for the per-column missing-value counts
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values per column:\033[0m\n{df_train.isnull().sum()}")

# Display both widgets (table and missing-values message) side by side
display(widgets.HBox([table_widget, message_widget]))

# Convert the target column to categorical
df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

# Select only numeric columns for the correlation matrix calculation
df_numeric = df_train.select_dtypes(include=[float, int])

# Create a 1x2 grid of subplots for the correlation matrix and target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot the correlation matrix
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Plot the distribution of loan repayment status (target variable)
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Adjust the layout to avoid overlap and show the plots
plt.tight_layout()
plt.show()

# One-hot encode the categorical column and scale the numeric columns
# Prepare data for training
data = df_train.copy()

# Separate features (X) and target (y)
X = data.drop('credit.policy', axis=1)  # Drop the target column
y = data['credit.policy']               # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline (scaling numeric features and encoding categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
                                   'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
                                   'delinq.2yrs', 'pub.rec']),
        ('cat', OneHotEncoder(), ['purpose'])  # 'purpose' is the only categorical feature
    ]
)

# Build an imbalanced-learn pipeline that chains preprocessing, SMOTE, and Logistic Regression
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),                             # Scale numeric, encode categorical
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),   # Oversample the minority class to half the majority size
    ('classifier', LogisticRegression(max_iter=1000000))        # Logistic Regression classifier
])
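
# --- Added sketch (not in the original upload): a quick check of what
# sampling_strategy=0.5 means. SMOTE oversamples the minority class until it
# reaches half the size of the majority class; the counts printed below
# should confirm that on this training split.
X_pre_check = preprocessor.fit_transform(X_train, y_train)
X_res_check, y_res_check = SMOTE(random_state=42, sampling_strategy=0.5).fit_resample(X_pre_check, y_train)
print("Class balance before SMOTE:\n", y_train.value_counts())
print("Class balance after SMOTE:\n", pd.Series(y_res_check).value_counts())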

# Train the model with the full pipeline (preprocessing + SMOTE + model training)
imb_model_pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Adjust the decision threshold to improve recall of the positive class
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
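
# --- Added sketch (not in the original upload): instead of hard-coding
# threshold = 0.3, one could read a threshold off the precision/recall
# trade-off. The example below finds the largest threshold that still reaches
# a target recall of 0.9; the 0.9 figure is an illustrative assumption.
from sklearn.metrics import precision_recall_curve

precisions, recalls, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
candidates = pr_thresholds[recalls[:-1] >= 0.9]  # recalls has one more entry than thresholds
if candidates.size:
    print(f"Largest threshold with recall >= 0.9: {candidates.max():.3f}")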

# Evaluate the model with a classification report
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

# Convert the classification report to a styled DataFrame for display
classification_df = pd.DataFrame(classification_rep).transpose()
classification_df_styled = classification_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Display the classification report as a styled table in a widget
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Widget for the AUC-ROC score
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Display both widgets (table and AUC-ROC message) stacked vertically
display(widgets.VBox([table_widget, auc_widget]))

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_adjusted)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
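
# --- Added note (assumption, not in the original upload): the script trains
# imb_model_pipeline above but never saves it; the pickle downloaded below is
# a separately hosted artifact. It was presumably produced by something like
#   joblib.dump(imb_model_pipeline, "loan_approval_pipeline.pkl")
# followed by an upload to the Hub, but that provenance is not confirmed here.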

# Download the pretrained pipeline from the Hugging Face Hub
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence",
    filename="loan_approval_pipeline.pkl"
)
pipeline = joblib.load(model_path)

# Prediction function
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    # Prepare the input as a single-row DataFrame with the training column names
    input_data = pd.DataFrame([[int_rate, installment, log_annual_inc, dti, fico,
                                days_with_cr_line, revol_bal, revol_util,
                                inq_last_6mths, delinq_2yrs, pub_rec, purpose]],
                              columns=['int.rate', 'installment', 'log.annual.inc',
                                       'dti', 'fico', 'days.with.cr.line', 'revol.bal',
                                       'revol.util', 'inq.last.6mths', 'delinq.2yrs',
                                       'pub.rec', 'purpose'])
    # Make the loan approval prediction
    result = pipeline.predict(input_data)
    return result[0]
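
# --- Added usage example (not in the original upload): a direct call to
# predict_approval with made-up but plausible feature values.
example_result = predict_approval(11.2, 250.0, 10.5, 15.0, 700,
                                  4500.0, 12000.0, 45.0, 1, 0, 0,
                                  "debt_consolidation")
print("Example prediction:", example_result)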

# Create input components for the Gradio interface
inputs = [
    gr.Slider(0.0, 25.0, step=0.1, label="Interest Rate (%)"),
    gr.Slider(0.0, 1000.0, step=10.0, label="Installment Amount"),
    gr.Slider(0.0, 15.0, step=0.1, label="Log of Annual Income"),
    gr.Slider(0.0, 50.0, step=0.1, label="Debt-to-Income Ratio"),
    gr.Slider(300, 850, step=1, label="FICO Credit Score"),
    gr.Slider(0.0, 50000.0, step=100.0, label="Days with Credit Line"),
    gr.Slider(0.0, 100000.0, step=500.0, label="Revolving Balance"),
    gr.Slider(0.0, 150.0, step=0.1, label="Revolving Utilization (%)"),
    gr.Slider(0, 10, step=1, label="Recent Inquiries (Last 6 Months)"),
    gr.Slider(0, 10, step=1, label="Delinquencies in Last 2 Years"),
    gr.Slider(0, 5, step=1, label="Public Records"),
    gr.Dropdown(["credit_card", "debt_consolidation", "educational",
                 "home_improvement", "major_purchase", "small_business",
                 "other"], label="Loan Purpose")
]

# Launch the Gradio interface for loan approval prediction
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)
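
# Note: share=True also creates a temporary public gradio.live link in addition
# to the local server; omit it to keep the app local-only.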