Spaces:
Sleeping
Sleeping
File size: 5,723 Bytes
bcb3e72 c1f7985 bcb3e72 cacd96a bcb3e72 1f044af bcb3e72 1f044af bcb3e72 1f044af bcb3e72 1f044af d2ecae9 6a482cd a4b0912 bcb3e72 a4b0912 d2ecae9 bcb3e72 a4b0912 bcb3e72 a4b0912 d2ecae9 bcb3e72 d2ecae9 bcb3e72 d2ecae9 bcb3e72 a0c13e7 d2ecae9 a4b0912 d2ecae9 a4b0912 d2ecae9 bcb3e72 2d02f7a bcb3e72 6a482cd bcb3e72 6a482cd bcb3e72 6a482cd d3a67dc 3010a2d f46476b 3010a2d bc23151 bcb3e72 d3a67dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import pandas as pd
import numpy as np
import streamlit as st
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import logging
import warnings
import time
# Quiet urllib3: drop its UserWarnings and only let CRITICAL log records through.
warnings.filterwarnings("ignore", category=UserWarning, module="urllib3")
logging.getLogger("urllib3").setLevel(logging.CRITICAL)

# Module-level Nominatim client, identified by this application's user agent.
geolocator = Nominatim(user_agent="job_recommendation_system")
def get_coordinates(location):
    """Resolve a free-text location to ``(latitude, longitude)`` via Nominatim.

    The query is tried in progressively looser forms: the cleaned-up input,
    then its first comma-separated part, then its last comma-separated part.
    If every form misses, the cleaned-up input is retried up to five times
    with a two-second pause between attempts.

    Args:
        location: Location text such as ``"Jakarta, Indonesia"``. Hyphens are
            treated as commas, and everything from a "Metropolitan Area" or
            "Region" label onwards is dropped before querying.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when the input
        is blank or no query produced a result.
    """
    # Local import keeps this block self-contained; geopy is already a
    # dependency of this module.
    from geopy.exc import GeocoderServiceError

    def _lookup(query):
        """Geocode *query*, treating blank input and service errors as a miss."""
        if not query:
            return None
        try:
            return geolocator.geocode(query)
        except GeocoderServiceError as exc:
            # Timeouts / unavailable service are transient: report them and
            # let the fallback and retry logic below carry on.
            print(f"Gagal menghubungi layanan geocoding untuk {query}: {exc}")
            return None

    if not location or not location.strip():
        return None, None

    location = location.replace("-", ",").strip()

    # Keep the full place name in front of the label, so multi-word names
    # ("New York Metropolitan Area") are not cut down to their first word.
    for label in ("Metropolitan Area", "Region"):
        if label in location:
            place_name = location.split(label)[0].strip(" ,")
            if place_name:
                location = place_name

    match = _lookup(location)
    if match:
        return match.latitude, match.longitude

    print(f"Koordinat untuk {location} tidak ditemukan. Mencoba alternatif.")
    parts = location.split(',')
    match = _lookup(parts[0].strip())  # first part: city or area
    if not match:
        match = _lookup(parts[-1].strip())  # last part: country

    attempts = 0
    while not match and attempts < 5:
        print(f"Mencoba ulang untuk lokasi {location}...")
        time.sleep(2)  # pause between attempts
        match = _lookup(location)
        attempts += 1

    if match:
        return match.latitude, match.longitude

    print(f"Koordinat untuk {location} atau alternatif tidak dapat ditemukan.")
    return None, None
# Job listings used by the app, read at import time from a CSV in the working
# directory. The recommender below requires 'latitude' and 'longitude' columns.
sample_data = pd.read_csv('job_data_with_coordinates.csv')
# 1. Vectorize skills with CountVectorizer
def vectorize_skills(skills, all_skills):
    """Encode ``skills`` as token-count vectors over the ``all_skills`` vocabulary.

    A new ``CountVectorizer`` is fitted on ``all_skills`` on every call and
    then used to transform ``skills``; the transformed matrix is returned.
    """
    fitted = CountVectorizer().fit(all_skills)
    return fitted.transform(skills)
# 2. Compute cosine similarity
def calculate_cosine_similarity(user_skills_tfidf, job_skills_tfidf):
    """Return the pairwise cosine similarity of the two vector sets.

    Both arguments are handed to ``cosine_similarity`` unchanged, user
    vectors first and job vectors second, and its result is returned as-is.
    """
    similarity_matrix = cosine_similarity(user_skills_tfidf, job_skills_tfidf)
    return similarity_matrix
# 3. Compute the distance between a job location and the user location
def calculate_distance(job_coords, user_coords):
    """Return the geodesic distance in kilometres between two coordinate pairs.

    Args:
        job_coords: ``(latitude, longitude)`` of the job.
        user_coords: ``(latitude, longitude)`` of the user.

    Returns:
        The distance in km, or ``float('inf')`` when either pair is missing,
        contains ``None``, or is rejected by geopy as invalid.
    """
    try:
        # Reject missing coordinates explicitly instead of handing them to
        # geopy. NOTE(review): geopy's Point appears to coerce a missing
        # coordinate to 0.0 rather than raising, which would yield a
        # plausible-looking but wrong distance - confirm against the
        # installed version.
        for coords in (job_coords, user_coords):
            if coords is None or any(value is None for value in coords):
                return float('inf')
        return geodesic(job_coords, user_coords).km
    except (ValueError, TypeError):  # invalid or malformed coordinates
        return float('inf')
# Log-based distance normalisation
def normalize_distance(distances):
    """Map distances to scores via ``1 / (1 + log1p(distance))``.

    A distance of 0 maps to 1.0 and an infinite distance maps to 0.0; the
    result has whatever array-like type NumPy produces for the input.
    """
    damped = np.log1p(distances)  # log(1 + distance), defined at distance 0
    denominator = 1 + damped
    return 1 / denominator
# Rank jobs for a user by skill similarity weighted by proximity
def prepare_and_recommend(df, user_skills, user_location):
    """Return the five best-scoring jobs for the given skills and location.

    Each job is scored as ``(1.5 * cosine_similarity) * (1.0 *
    normalized_distance)``, rounded to two decimals, where the similarity
    compares the user's skills with the job's ``skills`` text and the
    normalised distance comes from the geodesic distance between the job and
    the geocoded user location. Jobs without coordinates, and every job when
    the user location cannot be geocoded, get an infinite distance and
    therefore a score of 0.

    Args:
        df: Job listings with at least ``job_link``, ``title``, ``company``,
            ``location``, ``skills``, ``latitude`` and ``longitude`` columns.
            The frame is not modified.
        user_skills: The user's skills as a single text string.
        user_location: Free-text location passed to ``get_coordinates``.

    Returns:
        A DataFrame with the columns ``job_link``, ``title``, ``company``,
        ``location``, ``distance (km)`` and ``final score``, holding at most
        five rows sorted by descending score. It is empty (but still has
        those columns) when ``df`` has no rows.

    Raises:
        ValueError: If ``df`` lacks a ``latitude`` or ``longitude`` column.
    """
    result_columns = ['job_link', 'title', 'company', 'location', 'distance (km)', 'final score']

    # 1. The dataset must already carry job coordinates.
    if 'latitude' not in df or 'longitude' not in df:
        raise ValueError("Dataset harus memiliki kolom latitude dan longitude")

    # Nothing to rank: return an empty frame that still has the expected
    # columns, and do it before the vectoriser sees an empty corpus.
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # Work on a copy so the caller's frame does not accumulate helper columns.
    df = df.copy()

    # 2. Vectorise the skills; missing skill text counts as "no skills".
    job_skills = df['skills'].fillna('').astype(str)
    all_skills = job_skills.tolist()
    user_skills_vtr = vectorize_skills([user_skills], all_skills)
    job_skills_vtr = vectorize_skills(job_skills, all_skills)

    # 3. Cosine similarity between the user and every job.
    cosine_similarities = calculate_cosine_similarity(user_skills_vtr, job_skills_vtr)
    df['cosine_similarity'] = cosine_similarities[0]

    # 4. Distance between each job and the user. A failed geocode comes back
    #    as (None, None); treat that as "infinitely far" for every job rather
    #    than passing None coordinates into the distance calculation.
    user_coords = get_coordinates(user_location)
    user_located = user_coords is not None and None not in user_coords
    df['distance (km)'] = [
        calculate_distance((lat, lon), user_coords)
        if user_located and pd.notna(lat) and pd.notna(lon)
        else float('inf')
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]

    # 5. Normalise the distances.
    df['normalized_distance'] = normalize_distance(df['distance (km)'])

    # 6. Final score.
    raw_score = (1.5 * df['cosine_similarity']) * (1.0 * df['normalized_distance'])
    df['final score'] = raw_score.round(2)

    # 7. Keep the five highest-scoring jobs.
    top_jobs = df.sort_values(by='final score', ascending=False).head(5)
    return top_jobs[result_columns]
# Streamlit app: collect skills and a location, then show the ranked jobs.
st.title('Job Findr')
st.write('Find your job with ease.')

user_skills = st.text_input('Enter your skills (comma-separated):')
user_location = st.text_input('Job location:')

if st.button('Get Recommendations'):
    # Whitespace-only input is as good as no input.
    skills_query = user_skills.strip()
    location_query = user_location.strip()

    if not (skills_query and location_query):
        st.warning('Please enter your skills and location.')
    else:
        recommended_jobs = prepare_and_recommend(sample_data, skills_query, location_query)

        if recommended_jobs.empty:
            st.warning('Tidak ditemukan pekerjaan yang sesuai dengan keterampilan dan lokasi Anda.')
        elif recommended_jobs['final score'].max() < 0.02:
            # Even the best match scores close to zero: tell the user instead
            # of listing irrelevant jobs.
            st.warning('Maaf, hasil rekomendasi mungkin kurang relevan dengan keterampilan dan lokasi Anda. Silakan coba mengubah keterampilan atau lokasi Anda.')
        else:
            columns = ['Rank', 'title', 'job_link', 'company', 'location', 'distance (km)']
            # assign() builds a new frame, so the recommender's result is not
            # modified in place; the index is reset once for a clean display.
            recommended_jobs = (
                recommended_jobs
                .assign(Rank=range(1, len(recommended_jobs) + 1))[columns]
                .reset_index(drop=True)
            )
            st.dataframe(recommended_jobs)
|