antitheft159 committed on
Commit
bf55579
·
verified ·
1 Parent(s): f8c1fbb

Upload 1040_249_949.py

Browse files
Files changed (1) hide show
  1. 1040_249_949.py +76 -0
1040_249_949.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """1040_249_949
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1T8VCDZs5tRg-mTI4qNqCct_92fcd_7Rl
8
+ """
9
+
10
+ import pandas as pd
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ import numpy as np
14
+ import warnings as w
15
+ w.filterwarnings('ignore')
16
+
17
# Load the job-postings dataset from the Colab upload location.
df = pd.read_csv('//content/1000_ml_jobs_us.csv')

# Notebook-style inspection: as bare expressions these only render output
# when they are the last statement of a notebook cell.
df.head()

df.isnull().sum()

# Remove columns that are not referenced anywhere later in this script.
df.drop(columns=['company_website', 'company_description', 'job_description_text', 'Unnamed: 0'], inplace=True)

# Impute missing values with each column's most frequent value.
# The mode is computed first and then passed to fillna(); taking .mode()[0]
# of the filled column instead would assign one scalar to every row and
# erase all variation in that column.
for column in ('company_address_locality', 'company_address_region', 'seniority_level'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

df.info()

# Parse the posting date strings into datetime64 values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
32
+
33
def _fresh_axes():
    """Return the axes of a newly created figure."""
    _, axes = plt.subplots()
    return axes


# Every chart is drawn on its own figure. Series.plot() reuses the current
# axes when no `ax` is supplied, so consecutive calls in a plain script would
# otherwise pile the bar, barh and pie charts on top of each other.
df['company_address_locality'].value_counts().head(10).plot(
    kind='bar', title='Top 10 Localities', ax=_fresh_axes())

df['company_address_region'].value_counts().head(10).plot(
    kind='bar', title='Top 10 Regions', ax=_fresh_axes())

df['company_name'].value_counts().head(10).plot(
    kind='barh', title='Top 10 Hiring Companies', ax=_fresh_axes())

df['seniority_level'].value_counts().plot(
    kind='pie', autopct='%1.1f%%', title='Seniority Level Distribution', ax=_fresh_axes())

df['job_title'].value_counts().head(15).plot(
    kind='bar', title='Top 15 Job Titles', ax=_fresh_axes())

# Render all figures; outside a notebook nothing is displayed without this.
plt.show()
42
+
43
+ import pandas as pd
44
+ from sklearn.preprocessing import LabelEncoder
45
+ from sklearn.model_selection import train_test_split
46
+ from sklearn.ensemble import RandomForestClassifier
47
+ from sklearn.metrics import classification_report, accuracy_score
48
+ import warnings as w
49
+ w.filterwarnings('ignore')
50
+
51
# `df` is the DataFrame prepared by the earlier loading/cleaning steps.

le = LabelEncoder()

# Turn each categorical feature into integer codes. Values are converted to
# strings first so the encoder sees a uniform dtype.
categorical_columns = (
    'job_posted_date',
    'company_address_locality',
    'company_address_region',
    'company_name',
    'job_title',
)
for column in categorical_columns:
    as_text = df[column].astype(str)
    df[column] = le.fit_transform(as_text)

# Features are every column except the target; the target is integer-encoded
# with the same encoder object (refit on the target values).
X = df.drop(columns='seniority_level')
y = le.fit_transform(df['seniority_level'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest on the training split (fit() returns the estimator).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print(report)