rahul2001 committed on
Commit
0aa9c34
·
1 Parent(s): e22877e

Data transformation

Browse files
EDA.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
artifact/Preprocessor.pkl ADDED
Binary file (3.48 kB). View file
 
model_training.ipynb CHANGED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "1.1 Import Data and Required Packages\n",
15
+ "Importing Pandas, Numpy, Matplotlib, Seaborn and Warnings Library.\n",
16
+ "# Basic Import"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 16,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# Basic Import\n",
26
+ "import numpy as np\n",
27
+ "import pandas as pd\n",
28
+ "import matplotlib.pyplot as plt \n",
29
+ "import seaborn as sns\n",
30
+ "# Modelling\n",
31
+ "from sklearn.metrics import mean_squared_error, r2_score\n",
32
+ "from sklearn.neighbors import KNeighborsRegressor\n",
33
+ "from sklearn.tree import DecisionTreeRegressor\n",
34
+ "from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor\n",
35
+ "from sklearn.svm import SVR\n",
36
+ "from sklearn.linear_model import LinearRegression, Ridge,Lasso\n",
37
+ "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
38
+ "from sklearn.model_selection import RandomizedSearchCV\n",
39
+ "from catboost import CatBoostRegressor\n",
40
+ "from xgboost import XGBRegressor\n",
41
+ "import warnings"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 3,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "df = pd.read_csv(\"artifact/raw.csv\")"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 4,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "gender => ['female' 'male']\n",
63
+ "\n",
64
+ "race_ethnicity => ['group B' 'group C' 'group A' 'group D' 'group E']\n",
65
+ "\n",
66
+ "parental_level_of_education => [\"bachelor's degree\" 'some college' \"master's degree\" \"associate's degree\"\n",
67
+ " 'high school' 'some high school']\n",
68
+ "\n",
69
+ "lunch => ['standard' 'free/reduced']\n",
70
+ "\n",
71
+ "test_preparation_course => ['none' 'completed']\n",
72
+ "\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "for i in df.columns:\n",
78
+ " if df[i].dtype == \"object\":\n",
79
+ " print(\"{} =>\".format(i),df[i].unique())\n",
80
+ " print(\"\")"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 7,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "X = df.drop(columns=['math_score'],axis=1)\n",
90
+ "y = df[\"math_score\"]"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 8,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "# Create Column Transformer with 2 types of transformers\n",
100
+ "num_features = X.select_dtypes(exclude=\"object\").columns\n",
101
+ "cat_features = X.select_dtypes(include=\"object\").columns\n",
102
+ "\n",
103
+ "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
104
+ "from sklearn.compose import ColumnTransformer\n",
105
+ "\n",
106
+ "numeric_transformer = StandardScaler()\n",
107
+ "oh_transformer = OneHotEncoder()\n",
108
+ "\n",
109
+ "preprocessor = ColumnTransformer(\n",
110
+ " [\n",
111
+ " (\"OneHotEncoder\", oh_transformer, cat_features),\n",
112
+ " (\"StandardScaler\", numeric_transformer, num_features), \n",
113
+ " ]\n",
114
+ ")"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 9,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "X = preprocessor.fit_transform(X)"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 12,
129
+ "metadata": {},
130
+ "outputs": [
131
+ {
132
+ "data": {
133
+ "text/plain": [
134
+ "(1000, 19)"
135
+ ]
136
+ },
137
+ "execution_count": 12,
138
+ "metadata": {},
139
+ "output_type": "execute_result"
140
+ }
141
+ ],
142
+ "source": [
143
+ "X.shape"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 13,
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "data": {
153
+ "text/plain": [
154
+ "((800, 19), (200, 19))"
155
+ ]
156
+ },
157
+ "execution_count": 13,
158
+ "metadata": {},
159
+ "output_type": "execute_result"
160
+ }
161
+ ],
162
+ "source": [
163
+ "# separate dataset into train and test\n",
164
+ "from sklearn.model_selection import train_test_split\n",
165
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)\n",
166
+ "X_train.shape, X_test.shape"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "***Create an Evaluate Function to give all metrics after model Training***"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 14,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "def evaluate_model(true, predicted):\n",
183
+ " mae = mean_absolute_error(true, predicted)\n",
184
+ " mse = mean_squared_error(true, predicted)\n",
185
+ " rmse = np.sqrt(mean_squared_error(true, predicted))\n",
186
+ " r2_square = r2_score(true, predicted)\n",
187
+ " return mae, rmse, r2_square"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 17,
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "name": "stdout",
197
+ "output_type": "stream",
198
+ "text": [
199
+ "Linear Regression\n",
200
+ "Model performance for Training set\n",
201
+ "- Root Mean Squared Error: 5.3243\n",
202
+ "- Mean Absolute Error: 4.2671\n",
203
+ "- R2 Score: 0.8743\n",
204
+ "----------------------------------\n",
205
+ "Model performance for Test set\n",
206
+ "- Root Mean Squared Error: 5.3960\n",
207
+ "- Mean Absolute Error: 4.2158\n",
208
+ "- R2 Score: 0.8803\n",
209
+ "===================================\n",
210
+ "\n",
211
+ "\n",
212
+ "Lasso\n",
213
+ "Model performance for Training set\n",
214
+ "- Root Mean Squared Error: 6.5938\n",
215
+ "- Mean Absolute Error: 5.2063\n",
216
+ "- R2 Score: 0.8071\n",
217
+ "----------------------------------\n",
218
+ "Model performance for Test set\n",
219
+ "- Root Mean Squared Error: 6.5197\n",
220
+ "- Mean Absolute Error: 5.1579\n",
221
+ "- R2 Score: 0.8253\n",
222
+ "===================================\n",
223
+ "\n",
224
+ "\n",
225
+ "Ridge\n",
226
+ "Model performance for Training set\n",
227
+ "- Root Mean Squared Error: 5.3233\n",
228
+ "- Mean Absolute Error: 4.2650\n",
229
+ "- R2 Score: 0.8743\n",
230
+ "----------------------------------\n",
231
+ "Model performance for Test set\n",
232
+ "- Root Mean Squared Error: 5.3904\n",
233
+ "- Mean Absolute Error: 4.2111\n",
234
+ "- R2 Score: 0.8806\n",
235
+ "===================================\n",
236
+ "\n",
237
+ "\n",
238
+ "K-Neighbors Regressor\n",
239
+ "Model performance for Training set\n",
240
+ "- Root Mean Squared Error: 5.7077\n",
241
+ "- Mean Absolute Error: 4.5167\n",
242
+ "- R2 Score: 0.8555\n",
243
+ "----------------------------------\n",
244
+ "Model performance for Test set\n",
245
+ "- Root Mean Squared Error: 7.2530\n",
246
+ "- Mean Absolute Error: 5.6210\n",
247
+ "- R2 Score: 0.7838\n",
248
+ "===================================\n",
249
+ "\n",
250
+ "\n",
251
+ "Decision Tree\n",
252
+ "Model performance for Training set\n",
253
+ "- Root Mean Squared Error: 0.2795\n",
254
+ "- Mean Absolute Error: 0.0187\n",
255
+ "- R2 Score: 0.9997\n",
256
+ "----------------------------------\n",
257
+ "Model performance for Test set\n",
258
+ "- Root Mean Squared Error: 7.7785\n",
259
+ "- Mean Absolute Error: 6.2350\n",
260
+ "- R2 Score: 0.7514\n",
261
+ "===================================\n",
262
+ "\n",
263
+ "\n",
264
+ "Random Forest Regressor\n",
265
+ "Model performance for Training set\n",
266
+ "- Root Mean Squared Error: 2.2860\n",
267
+ "- Mean Absolute Error: 1.8215\n",
268
+ "- R2 Score: 0.9768\n",
269
+ "----------------------------------\n",
270
+ "Model performance for Test set\n",
271
+ "- Root Mean Squared Error: 5.9993\n",
272
+ "- Mean Absolute Error: 4.6304\n",
273
+ "- R2 Score: 0.8521\n",
274
+ "===================================\n",
275
+ "\n",
276
+ "\n",
277
+ "XGBRegressor\n",
278
+ "Model performance for Training set\n",
279
+ "- Root Mean Squared Error: 1.0073\n",
280
+ "- Mean Absolute Error: 0.6875\n",
281
+ "- R2 Score: 0.9955\n",
282
+ "----------------------------------\n",
283
+ "Model performance for Test set\n",
284
+ "- Root Mean Squared Error: 6.4733\n",
285
+ "- Mean Absolute Error: 5.0577\n",
286
+ "- R2 Score: 0.8278\n",
287
+ "===================================\n",
288
+ "\n",
289
+ "\n",
290
+ "CatBoosting Regressor\n",
291
+ "Model performance for Training set\n",
292
+ "- Root Mean Squared Error: 3.0427\n",
293
+ "- Mean Absolute Error: 2.4054\n",
294
+ "- R2 Score: 0.9589\n",
295
+ "----------------------------------\n",
296
+ "Model performance for Test set\n",
297
+ "- Root Mean Squared Error: 6.0086\n",
298
+ "- Mean Absolute Error: 4.6125\n",
299
+ "- R2 Score: 0.8516\n",
300
+ "===================================\n",
301
+ "\n",
302
+ "\n",
303
+ "AdaBoost Regressor\n",
304
+ "Model performance for Training set\n",
305
+ "- Root Mean Squared Error: 5.7923\n",
306
+ "- Mean Absolute Error: 4.7185\n",
307
+ "- R2 Score: 0.8512\n",
308
+ "----------------------------------\n",
309
+ "Model performance for Test set\n",
310
+ "- Root Mean Squared Error: 5.9460\n",
311
+ "- Mean Absolute Error: 4.6538\n",
312
+ "- R2 Score: 0.8547\n",
313
+ "===================================\n",
314
+ "\n",
315
+ "\n"
316
+ ]
317
+ }
318
+ ],
319
+ "source": [
320
+ "models = {\n",
321
+ " \"Linear Regression\": LinearRegression(),\n",
322
+ " \"Lasso\": Lasso(),\n",
323
+ " \"Ridge\": Ridge(),\n",
324
+ " \"K-Neighbors Regressor\": KNeighborsRegressor(),\n",
325
+ " \"Decision Tree\": DecisionTreeRegressor(),\n",
326
+ " \"Random Forest Regressor\": RandomForestRegressor(),\n",
327
+ " \"XGBRegressor\": XGBRegressor(), \n",
328
+ " \"CatBoosting Regressor\": CatBoostRegressor(verbose=False),\n",
329
+ " \"AdaBoost Regressor\": AdaBoostRegressor()\n",
330
+ "}\n",
331
+ "model_list = []\n",
332
+ "r2_list =[]\n",
333
+ "\n",
334
+ "for i in range(len(list(models))):\n",
335
+ " model = list(models.values())[i]\n",
336
+ " model.fit(X_train, y_train) # Train model\n",
337
+ "\n",
338
+ " # Make predictions\n",
339
+ " y_train_pred = model.predict(X_train)\n",
340
+ " y_test_pred = model.predict(X_test)\n",
341
+ " \n",
342
+ " # Evaluate Train and Test dataset\n",
343
+ " model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)\n",
344
+ "\n",
345
+ " model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)\n",
346
+ "\n",
347
+ " \n",
348
+ " print(list(models.keys())[i])\n",
349
+ " model_list.append(list(models.keys())[i])\n",
350
+ " \n",
351
+ " print('Model performance for Training set')\n",
352
+ " print(\"- Root Mean Squared Error: {:.4f}\".format(model_train_rmse))\n",
353
+ " print(\"- Mean Absolute Error: {:.4f}\".format(model_train_mae))\n",
354
+ " print(\"- R2 Score: {:.4f}\".format(model_train_r2))\n",
355
+ "\n",
356
+ " print('----------------------------------')\n",
357
+ " \n",
358
+ " print('Model performance for Test set')\n",
359
+ " print(\"- Root Mean Squared Error: {:.4f}\".format(model_test_rmse))\n",
360
+ " print(\"- Mean Absolute Error: {:.4f}\".format(model_test_mae))\n",
361
+ " print(\"- R2 Score: {:.4f}\".format(model_test_r2))\n",
362
+ " r2_list.append(model_test_r2)\n",
363
+ " \n",
364
+ " print('='*35)\n",
365
+ " print('\\n')"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "metadata": {},
371
+ "source": [
372
+ "***Results***"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 18,
378
+ "metadata": {},
379
+ "outputs": [
380
+ {
381
+ "data": {
382
+ "text/html": [
383
+ "<div>\n",
384
+ "<style scoped>\n",
385
+ " .dataframe tbody tr th:only-of-type {\n",
386
+ " vertical-align: middle;\n",
387
+ " }\n",
388
+ "\n",
389
+ " .dataframe tbody tr th {\n",
390
+ " vertical-align: top;\n",
391
+ " }\n",
392
+ "\n",
393
+ " .dataframe thead th {\n",
394
+ " text-align: right;\n",
395
+ " }\n",
396
+ "</style>\n",
397
+ "<table border=\"1\" class=\"dataframe\">\n",
398
+ " <thead>\n",
399
+ " <tr style=\"text-align: right;\">\n",
400
+ " <th></th>\n",
401
+ " <th>Model Name</th>\n",
402
+ " <th>R2_Score</th>\n",
403
+ " </tr>\n",
404
+ " </thead>\n",
405
+ " <tbody>\n",
406
+ " <tr>\n",
407
+ " <th>2</th>\n",
408
+ " <td>Ridge</td>\n",
409
+ " <td>0.880593</td>\n",
410
+ " </tr>\n",
411
+ " <tr>\n",
412
+ " <th>0</th>\n",
413
+ " <td>Linear Regression</td>\n",
414
+ " <td>0.880345</td>\n",
415
+ " </tr>\n",
416
+ " <tr>\n",
417
+ " <th>8</th>\n",
418
+ " <td>AdaBoost Regressor</td>\n",
419
+ " <td>0.854710</td>\n",
420
+ " </tr>\n",
421
+ " <tr>\n",
422
+ " <th>5</th>\n",
423
+ " <td>Random Forest Regressor</td>\n",
424
+ " <td>0.852094</td>\n",
425
+ " </tr>\n",
426
+ " <tr>\n",
427
+ " <th>7</th>\n",
428
+ " <td>CatBoosting Regressor</td>\n",
429
+ " <td>0.851632</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>6</th>\n",
433
+ " <td>XGBRegressor</td>\n",
434
+ " <td>0.827797</td>\n",
435
+ " </tr>\n",
436
+ " <tr>\n",
437
+ " <th>1</th>\n",
438
+ " <td>Lasso</td>\n",
439
+ " <td>0.825320</td>\n",
440
+ " </tr>\n",
441
+ " <tr>\n",
442
+ " <th>3</th>\n",
443
+ " <td>K-Neighbors Regressor</td>\n",
444
+ " <td>0.783813</td>\n",
445
+ " </tr>\n",
446
+ " <tr>\n",
447
+ " <th>4</th>\n",
448
+ " <td>Decision Tree</td>\n",
449
+ " <td>0.751354</td>\n",
450
+ " </tr>\n",
451
+ " </tbody>\n",
452
+ "</table>\n",
453
+ "</div>"
454
+ ],
455
+ "text/plain": [
456
+ " Model Name R2_Score\n",
457
+ "2 Ridge 0.880593\n",
458
+ "0 Linear Regression 0.880345\n",
459
+ "8 AdaBoost Regressor 0.854710\n",
460
+ "5 Random Forest Regressor 0.852094\n",
461
+ "7 CatBoosting Regressor 0.851632\n",
462
+ "6 XGBRegressor 0.827797\n",
463
+ "1 Lasso 0.825320\n",
464
+ "3 K-Neighbors Regressor 0.783813\n",
465
+ "4 Decision Tree 0.751354"
466
+ ]
467
+ },
468
+ "execution_count": 18,
469
+ "metadata": {},
470
+ "output_type": "execute_result"
471
+ }
472
+ ],
473
+ "source": [
474
+ "pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=[\"R2_Score\"],ascending=False)\n"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": null,
480
+ "metadata": {},
481
+ "outputs": [],
482
+ "source": []
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": null,
487
+ "metadata": {},
488
+ "outputs": [],
489
+ "source": []
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": []
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": null,
501
+ "metadata": {},
502
+ "outputs": [],
503
+ "source": []
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "metadata": {},
509
+ "outputs": [],
510
+ "source": []
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "metadata": {},
516
+ "outputs": [],
517
+ "source": []
518
+ }
519
+ ],
520
+ "metadata": {
521
+ "kernelspec": {
522
+ "display_name": "ml-project",
523
+ "language": "python",
524
+ "name": "python3"
525
+ },
526
+ "language_info": {
527
+ "codemirror_mode": {
528
+ "name": "ipython",
529
+ "version": 3
530
+ },
531
+ "file_extension": ".py",
532
+ "mimetype": "text/x-python",
533
+ "name": "python",
534
+ "nbconvert_exporter": "python",
535
+ "pygments_lexer": "ipython3",
536
+ "version": "3.11.4"
537
+ },
538
+ "orig_nbformat": 4
539
+ },
540
+ "nbformat": 4,
541
+ "nbformat_minor": 2
542
+ }
requirements.txt CHANGED
@@ -5,4 +5,5 @@ matplotlib
5
  scikit-learn
6
  catboost
7
  xgboost
 
8
  -e .
 
5
  scikit-learn
6
  catboost
7
  xgboost
8
+ dill
9
  -e .
src/Components/Data_ingestation.py CHANGED
@@ -6,8 +6,7 @@ from src.logger import logging
6
  import pandas as pd
7
  from sklearn.model_selection import train_test_split
8
  from dataclasses import dataclass
9
-
10
-
11
  @dataclass
12
  class Data_ingestion_config:
13
  train_data_path: str = os.path.join("artifact","train.csv")
@@ -46,8 +45,10 @@ class Data_ingestion:
46
 
47
  if __name__ == "__main__":
48
  obj = Data_ingestion()
49
- obj.intiate_data_ingestion()
50
 
 
 
51
 
52
 
53
 
 
6
  import pandas as pd
7
  from sklearn.model_selection import train_test_split
8
  from dataclasses import dataclass
9
+ from data_transformation import Data_transformation
 
10
  @dataclass
11
  class Data_ingestion_config:
12
  train_data_path: str = os.path.join("artifact","train.csv")
 
45
 
46
  if __name__ == "__main__":
47
  obj = Data_ingestion()
48
+ train_data,test_data = obj.intiate_data_ingestion()
49
 
50
+ data_trans = Data_transformation()
51
+ data_trans.initiate_data_transformation(train_data,test_data)
52
 
53
 
54
 
src/Components/data_transformation.py CHANGED
@@ -11,21 +11,97 @@ from sklearn.preprocessing import OneHotEncoder,StandardScaler
11
  from src.exception import CustomException
12
  from src.logger import logging
13
 
14
- from Data_ingestation import Data_ingestion
 
15
 
16
  @dataclass
17
 
18
  class Data_transformation_config:
19
- Preprpcessor_obj_file = os.path.join("artifact","Preprocessor.pkl")
20
 
21
  class Data_transformation:
22
  def __init__(self) -> None:
23
  self.data_transformation_config = Data_transformation_config()
24
  def get_data_transformer_object(self):
25
  try:
26
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  except Exception as e:
28
  raise CustomException(e,sys)
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
11
  from src.exception import CustomException
12
  from src.logger import logging
13
 
14
+
15
+ from src.utils import save_object
16
 
@dataclass
class Data_transformation_config:
    """Configuration for the data-transformation step.

    The attribute is annotated so that ``@dataclass`` registers it as a real
    field (un-annotated class attributes are ignored by ``dataclass``); it
    can therefore be overridden via the constructor and shows up in ``repr``.
    The default value is unchanged.
    """

    # Path the fitted preprocessing object is pickled to.
    Preprocessor_obj_file: str = os.path.join("artifact", "Preprocessor.pkl")
21
 
class Data_transformation:
    """Build, fit and persist the preprocessing pipeline for the score data.

    ``get_data_transformer_object`` assembles an unfitted ``ColumnTransformer``;
    ``initiate_data_transformation`` fits it on the training CSV, applies it to
    the test CSV and pickles the fitted object to the path held in
    ``self.data_transformation_config``.
    """

    def __init__(self) -> None:
        self.data_transformation_config = Data_transformation_config()

    def get_data_transformer_object(self):
        """Return the unfitted preprocessing ``ColumnTransformer``.

        Numerical columns are median-imputed and standard-scaled; categorical
        columns are imputed with the most frequent value, one-hot encoded and
        then scaled without centering.

        Raises:
            CustomException: wraps any exception raised while building the
                transformer.
        """
        try:
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )
            # NOTE(review): OneHotEncoder() is left at its defaults, which in
            # scikit-learn means a category not seen during fit raises at
            # transform time - confirm that is the intended behaviour for
            # inference, otherwise consider handle_unknown="ignore".
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    # with_mean=False: scale only, do not center the encoded
                    # output.
                    ("scaler", StandardScaler(with_mean=False)),
                ]
            )
            logging.info(f"Categorical Columns:{categorical_columns}")
            logging.info(f"Numerical Columns:{numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns),
                ]
            )
            return preprocessor
        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the train CSV and transform both splits.

        Args:
            train_path: path of the training CSV, read with ``pd.read_csv``.
            test_path: path of the test CSV, read with ``pd.read_csv``.

        Returns:
            A 3-tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed input features with the ``math_score``
            target appended as the last column; ``preprocessor_path`` is the
            file the fitted preprocessor was pickled to.

        Raises:
            CustomException: wraps any exception raised while reading,
                transforming or saving.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Read train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessor_obj = self.get_data_transformer_object()

            target_column_name = "math_score"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Fit on the training split only; the test split is transformed
            # with the statistics learned from training data.
            input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessor_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.Preprocessor_obj_file,
                obj=preprocessor_obj,
            )
            # Logged only after save_object returned, so the message is never
            # emitted for a save that failed.
            logging.info("Saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.Preprocessor_obj_file,
            )
        except Exception as e:
            raise CustomException(e, sys)
107
 
src/utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import dill
7
+ import pickle
8
+ from sklearn.metrics import r2_score
9
+ from sklearn.model_selection import GridSearchCV
10
+
11
+ from src.exception import CustomException
12
+
def save_object(file_path , obj):
    """Pickle ``obj`` to ``file_path``, creating the parent directory if needed.

    Args:
        file_path: destination path of the pickle file.
        obj: object to serialise with ``pickle.dump``.

    Raises:
        CustomException: wraps any exception raised while creating the
            directory or writing the file.
    """
    try:
        dir_path = os.path.dirname(file_path)

        # os.path.dirname() returns "" for a bare file name and
        # os.makedirs("") raises, so only create a directory when the path
        # actually has one.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)
    except Exception as e:
        raise CustomException(e, sys)
23
+