carlosgonzalezmartinez committed
Commit 5896b2f · verified · 1 Parent(s): 39722e7

Upload 3 files

Files changed (3)
  1. model (3).joblib +3 -0
  2. requirements (1).txt +1 -0
  3. train.py +98 -0
model (3).joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd88a95d898fdfeaa52814163167680b1069d479b399777f32c33fb800a9e6c2
+ size 4550
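Note: model (3).joblib is tracked with Git LFS, so the hunk above contains only the pointer file (oid and size), not the serialized model itself. After cloning, running git lfs pull fetches the actual 4,550-byte binary.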
requirements (1).txt ADDED
@@ -0,0 +1 @@
+ scikit-learn==1.2.2
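The pin matters here: a joblib/pickle artifact is generally only safe to load under the scikit-learn version that produced it, so install this exact version (pip install -r "requirements (1).txt") before calling joblib.load on the model, or expect version-mismatch warnings and possible unpickling failures.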
train.py ADDED
@@ -0,0 +1,98 @@
+ import joblib
+ import pandas as pd
+
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.compose import make_column_transformer
+ from sklearn.pipeline import make_pipeline
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score, classification_report
+
+ from warnings import filterwarnings
+ filterwarnings('ignore')
+
+ # Load the forest-health dataset (Colab-style path from the original run).
+ df = pd.read_csv("/content/forest_health_data_with_target.csv")
+
+ target = 'Health_Status'
+ numeric_features = [
+     'Latitude',
+     'Longitude',
+     'DBH',
+     'Tree_Height',
+     'Crown_Width_North_South',
+     'Crown_Width_East_West',
+     'Slope',
+     'Elevation',
+     'Temperature',
+     'Humidity',
+     'Soil_TN',
+     'Soil_TP',
+     'Soil_AP',
+     'Soil_AN',
+     'Menhinick_Index',
+     'Gleason_Index',
+     'Fire_Risk_Index'
+ ]
+
+ print("Creating data subsets")
+
+ X = df[numeric_features]
+ y = df[target]
+
+ # Hold out 20% of the rows for a final test-set evaluation.
+ Xtrain, Xtest, ytrain, ytest = train_test_split(
+     X, y,
+     test_size=0.2,
+     random_state=42
+ )
+
+ # Standardize every numeric feature before the linear model.
+ preprocessor = make_column_transformer(
+     (StandardScaler(), numeric_features),
+ )
+
+ model_logistic_regression = LogisticRegression(n_jobs=-1)
+
+ print("Estimating Best Model Pipeline")
+
+ model_pipeline = make_pipeline(
+     preprocessor,
+     model_logistic_regression
+ )
+
+ # Search over the inverse regularization strength C.
+ param_distribution = {
+     "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1]
+ }
+
+ rand_search_cv = RandomizedSearchCV(
+     model_pipeline,
+     param_distribution,
+     n_iter=3,
+     cv=3,
+     random_state=42
+ )
+
+ rand_search_cv.fit(Xtrain, ytrain)
+
+ print("Logging Metrics")
+ print(f"Best CV accuracy: {rand_search_cv.best_score_}")
+
+ # Evaluate the tuned pipeline on the held-out test set, which the
+ # original script split off but never used.
+ ypred = rand_search_cv.best_estimator_.predict(Xtest)
+ print(f"Test accuracy: {accuracy_score(ytest, ypred)}")
+ print(classification_report(ytest, ypred))
+
+ print("Serializing Model")
+
+ saved_model_path = "model.joblib"
+ joblib.dump(rand_search_cv.best_estimator_, saved_model_path)
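For completeness, a minimal inference sketch against the uploaded artifact. This is not part of the commit: it assumes model.joblib has already been fetched via LFS, and every feature value below is a made-up placeholder; only the column names (taken from numeric_features in train.py) are real.

    import joblib
    import pandas as pd

    # Load the fitted pipeline (StandardScaler + tuned LogisticRegression).
    model = joblib.load("model.joblib")

    # Hypothetical single observation; column names must match the
    # numeric_features list in train.py exactly. Values are placeholders.
    sample = pd.DataFrame([{
        'Latitude': 25.0, 'Longitude': 60.0, 'DBH': 30.0, 'Tree_Height': 12.0,
        'Crown_Width_North_South': 4.0, 'Crown_Width_East_West': 4.5,
        'Slope': 10.0, 'Elevation': 300.0, 'Temperature': 22.0,
        'Humidity': 55.0, 'Soil_TN': 1.2, 'Soil_TP': 0.4, 'Soil_AP': 10.0,
        'Soil_AN': 50.0, 'Menhinick_Index': 1.5, 'Gleason_Index': 2.0,
        'Fire_Risk_Index': 0.3
    }])

    print(model.predict(sample))        # predicted Health_Status label
    print(model.predict_proba(sample))  # per-class probabilities

Because the preprocessing is serialized together with the classifier, callers pass raw feature values and never re-implement the scaling step.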