Upload 1040_249_949.py
Browse files- 1040_249_949.py +76 -0
1040_249_949.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""1040_249_949
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1T8VCDZs5tRg-mTI4qNqCct_92fcd_7Rl
|
8 |
+
"""
|
9 |
+
|
10 |
+
import pandas as pd
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import seaborn as sns
|
13 |
+
import numpy as np
|
14 |
+
import warnings as w
|
15 |
+
w.filterwarnings('ignore')
|
16 |
+
|
17 |
+
# Load the job-postings dataset from the Colab upload location.
df = pd.read_csv('//content/1000_ml_jobs_us.csv')

# Notebook-style inspection: preview rows and per-column missing-value counts.
df.head()

df.isnull().sum()

# Drop the URL / free-text columns and 'Unnamed: 0' (presumably a saved index
# column -- confirm against the CSV); none of them are used further down.
df.drop(columns=['company_website', 'company_description', 'job_description_text', 'Unnamed: 0'], inplace=True)

# Impute missing categorical values with each column's most frequent value.
# The mode has to be computed first and passed *into* fillna. The previous
# seniority_level line applied `.mode()[0]` to the result of fillna instead,
# which produced a single scalar and overwrote every row of the column with
# it, leaving the prediction target constant.
for col in ('company_address_locality', 'company_address_region', 'seniority_level'):
    df[col] = df[col].fillna(df[col].mode()[0])

df.info()

# Parse posting dates into datetime64 values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
|
32 |
+
|
33 |
+
# Frequency charts for the main categorical columns.
# Each entry: (column, how many top values to keep or None for all, plot kwargs).
chart_specs = [
    ('company_address_locality', 10, {'kind': 'bar', 'title': 'Top 10 Localities'}),
    ('company_address_region', 10, {'kind': 'bar', 'title': 'Top 10 Regions'}),
    ('company_name', 10, {'kind': 'barh', 'title': 'Top 10 Hiring Companies'}),
    ('seniority_level', None, {'kind': 'pie', 'autopct': '%1.1f%%', 'title': 'Seniority Level Distribution'}),
    ('job_title', 15, {'kind': 'bar', 'title': 'Top 15 Job Titles'}),
]

for column, top_n, plot_kwargs in chart_specs:
    counts = df[column].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    # Give every chart its own figure and axes. When no `ax` is passed,
    # consecutive Series.plot calls outside a notebook cell draw onto the
    # current axes, so the five charts would be painted over one another.
    _, ax = plt.subplots()
    counts.plot(ax=ax, **plot_kwargs)
|
42 |
+
|
43 |
+
import pandas as pd
|
44 |
+
from sklearn.preprocessing import LabelEncoder
|
45 |
+
from sklearn.model_selection import train_test_split
|
46 |
+
from sklearn.ensemble import RandomForestClassifier
|
47 |
+
from sklearn.metrics import classification_report, accuracy_score
|
48 |
+
import warnings as w
|
49 |
+
w.filterwarnings('ignore')
|
50 |
+
|
51 |
+
# Load data (assuming the previous steps for loading and cleaning the data were successful)
|
52 |
+
# df=pd.read_csv('//content/1000_ml_jobs_us.csv')
|
53 |
+
# ... (previous data cleaning and preparation steps) ...
|
54 |
+
|
55 |
+
# Feature columns that are converted to integer codes before training.
categorical_features = [
    'job_posted_date',
    'company_address_locality',
    'company_address_region',
    'company_name',
    'job_title',
]

# Use one encoder per feature so each column's value <-> code mapping stays
# available afterwards. Refitting a single shared encoder for every column
# throws away all mappings except the last one fitted.
feature_encoders = {}
for col in categorical_features:
    feature_encoders[col] = LabelEncoder()
    # astype(str) lets non-string columns (e.g. parsed dates) be encoded too.
    df[col] = feature_encoders[col].fit_transform(df[col].astype(str))

# Define features (X) and target (y) after encoding.
# `le` ends up fitted on the target only, so le.inverse_transform() can turn
# predicted codes back into seniority labels.
le = LabelEncoder()
X = df.drop('seniority_level', axis=1)
y = le.fit_transform(df['seniority_level'])

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

# Report per-class metrics under the original seniority names rather than
# integer codes. `labels` is passed explicitly so the report still lines up
# with `target_names` when some class is absent from the test split.
class_names = [str(name) for name in le.classes_]
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=class_names,
))
|