"""Train, evaluate and serialize a logistic-regression forest-health model.

The script reads a CSV of forest measurements, standard-scales a fixed list
of numeric columns, tunes the regularization strength ``C`` of a
``LogisticRegression`` with a small randomized search, reports both the
cross-validated score and the held-out test metrics, and writes the best
fitted pipeline to ``model.joblib``.
"""

from warnings import filterwarnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# NOTE(review): this silences every warning for the whole process, including
# any raised by scikit-learn while fitting. Consider narrowing it to specific
# categories once the pipeline is stable.
filterwarnings('ignore')

# NOTE(review): absolute path; looks like a notebook working directory.
# Confirm it exists wherever this script is run.
df = pd.read_csv("/content/forest_health_data_with_target.csv")

# Column to predict.
target = 'Health_Status'

# Columns used as model inputs; every one of them is standard-scaled below.
numeric_features = [
    'Latitude',
    'Longitude',
    'DBH',
    'Tree_Height',
    'Crown_Width_North_South',
    'Crown_Width_East_West',
    'Slope',
    'Elevation',
    'Temperature',
    'Humidity',
    'Soil_TN',
    'Soil_TP',
    'Soil_AP',
    'Soil_AN',
    'Menhinick_Index',
    'Gleason_Index',
    'Fire_Risk_Index',
]

print("Creating data subsets")

X = df[numeric_features]
y = df[target]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
)

model_logistic_regression = LogisticRegression(n_jobs=-1)

print("Estimating Best Model Pipeline")

# Scaling lives inside the pipeline so that, during cross-validation, the
# scaler is re-fitted on each training fold only.
model_pipeline = make_pipeline(
    preprocessor,
    model_logistic_regression,
)

# make_pipeline names each step after its lowercased class name, hence the
# "logisticregression__" prefix used to address the classifier's parameter.
param_distribution = {
    "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1],
}

# Tries 3 of the 5 candidate C values, each scored with 3-fold CV.
rand_search_cv = RandomizedSearchCV(
    model_pipeline,
    param_distribution,
    n_iter=3,
    cv=3,
    random_state=42,
)

rand_search_cv.fit(Xtrain, ytrain)

print("Logging Metrics")
# best_score_ is the mean cross-validated score of the winning candidate,
# i.e. the same number that was used to select it.
print(f"Accuracy: {rand_search_cv.best_score_}")

# Also score the refitted best pipeline on the held-out rows, which played no
# part in fitting or model selection.
test_predictions = rand_search_cv.predict(Xtest)
print(f"Test Accuracy: {accuracy_score(ytest, test_predictions)}")
print(classification_report(ytest, test_predictions))

print("Serializing Model")

saved_model_path = "model.joblib"

# Persist the whole pipeline (scaler + classifier) so the saved model can be
# applied directly to raw feature columns.
joblib.dump(rand_search_cv.best_estimator_, saved_model_path)