dielz's picture
fix typo
2d02f7a verified
import logging
import time
import warnings

import numpy as np
import pandas as pd
import streamlit as st
from geopy.distance import geodesic
from geopy.exc import GeocoderServiceError
from geopy.geocoders import Nominatim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Silence urllib3 log records below CRITICAL and ignore its UserWarning messages.
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore", category=UserWarning, module="urllib3")
# Module-level Nominatim geocoder client, identified by a custom user agent.
geolocator = Nominatim(user_agent="job_recommendation_system")
def _geocode_or_none(query):
    """Geocode *query*; return ``None`` when nothing matches or the service fails."""
    try:
        return geolocator.geocode(query)
    except GeocoderServiceError as exc:
        # Timeouts, rate limiting and service outages are treated like a miss
        # so the caller's fallback and retry logic still runs instead of the
        # exception propagating up and crashing the app.
        print(f"Permintaan geocoding untuk {query} gagal: {exc}")
        return None


def get_coordinates(location):
    """Resolve a free-text location to ``(latitude, longitude)``.

    The lookup proceeds in stages and stops at the first hit:

    1. The location itself, after replacing ``-`` with ``,`` and, for names
       containing "Metropolitan Area" or "Region", keeping only the first
       space-separated word.
    2. The part before the first comma.
    3. The part after the last comma.
    4. Up to five more attempts at the location from step 1, two seconds
       apart.

    Args:
        location: Location name as typed by the user or stored in the data.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when every
        attempt fails.
    """
    location = location.replace("-", ",")
    if "Metropolitan Area" in location or "Region" in location:
        # NOTE(review): this keeps only the first word, so multi-word cities
        # are truncated (e.g. "New York Metropolitan Area" -> "New").
        location = location.split(" ")[0]

    location_obj = _geocode_or_none(location)
    if location_obj:
        return location_obj.latitude, location_obj.longitude

    print(f"Koordinat untuk {location} tidak ditemukan. Mencoba alternatif.")

    # Fall back to the first comma-separated part, then to the last one.
    city_or_area = location.split(',')[0].strip()
    location_obj = _geocode_or_none(city_or_area)
    if not location_obj:
        country = location.split(',')[-1].strip()
        location_obj = _geocode_or_none(country)

    # Last resort: retry the full location a few times with a delay.
    retry_count = 0
    while not location_obj and retry_count < 5:
        print(f"Mencoba ulang untuk lokasi {location}...")
        time.sleep(2)
        location_obj = _geocode_or_none(location)
        retry_count += 1

    if location_obj:
        return location_obj.latitude, location_obj.longitude

    print(f"Koordinat untuk {location} atau alternatif tidak dapat ditemukan.")
    return None, None
# Job listings, loaded at import time from a CSV path relative to the working directory.
sample_data = pd.read_csv('job_data_with_coordinates.csv')
# 1. Vectorize skills with CountVectorizer
def vectorize_skills(skills, all_skills):
    """Turn skill strings into token-count vectors.

    A fresh ``CountVectorizer`` learns its vocabulary from ``all_skills`` and
    is then used to encode ``skills``.

    Args:
        skills: Iterable of skill strings to encode.
        all_skills: Iterable of skill strings that defines the vocabulary.

    Returns:
        The matrix produced by ``CountVectorizer.transform`` for ``skills``.
    """
    count_model = CountVectorizer()
    count_model.fit(all_skills)
    return count_model.transform(skills)
# 2. Compute cosine similarity
def calculate_cosine_similarity(user_skills_tfidf, job_skills_tfidf):
    """Return the pairwise cosine similarity between two sets of skill vectors.

    Args:
        user_skills_tfidf: Vectorized user skills (one row per user).
        job_skills_tfidf: Vectorized job skills (one row per job).

    Returns:
        The similarity matrix from scikit-learn's ``cosine_similarity``, with
        user rows along the first axis and job rows along the second.
    """
    similarity_matrix = cosine_similarity(user_skills_tfidf, job_skills_tfidf)
    return similarity_matrix
# 3. Compute the distance between two locations
def calculate_distance(job_coords, user_coords):
    """Return the geodesic distance in kilometres between two coordinates.

    Args:
        job_coords: ``(latitude, longitude)`` of the job.
        user_coords: ``(latitude, longitude)`` of the user.

    Returns:
        The distance in kilometres, or ``float('inf')`` when either
        coordinate is missing (``None``) or rejected by geopy as invalid.
    """
    for coords in (job_coords, user_coords):
        # A failed geocode yields (None, None). Treat that as "unknown"
        # rather than passing it on.
        # NOTE(review): geopy appears to coerce None components to 0.0, which
        # would silently measure from latitude/longitude 0 - confirm against
        # geopy.point.Point.
        if coords is None or any(value is None for value in coords):
            return float('inf')

    try:
        return geodesic(job_coords, user_coords).km
    except ValueError:  # Invalid coordinates (e.g. out of range or non-finite)
        return float('inf')
# Log-based distance normalization
def normalize_distance(distances):
    """Map distances to a closeness score that decreases as distance grows.

    The score is ``1 / (1 + log1p(distance))``: a distance of 0 maps to 1 and
    an infinite distance maps to 0.

    Args:
        distances: A scalar or array-like of non-negative distances.

    Returns:
        The closeness score(s), in the shape NumPy produces for the input.
    """
    # log1p(d) == log(1 + d), which is well-defined at d == 0.
    damped = np.log1p(distances)
    return 1 / (damped + 1)
# Score every job by skill similarity and proximity, then keep the best five
def prepare_and_recommend(df, user_skills, user_location):
    """Return the five best-matching jobs for a user's skills and location.

    Each job is scored as ``1.5 * cosine_similarity * normalized_distance``
    (rounded to two decimals), where the similarity compares the user's
    skills with the job's ``skills`` text and the normalized distance comes
    from ``normalize_distance`` applied to the job-to-user distance in km.
    Jobs without coordinates, and all jobs when the user location cannot be
    geocoded, get an infinite distance and therefore a score of 0.

    The input frame is not modified.

    Args:
        df: Job listings with at least the columns ``skills``, ``latitude``,
            ``longitude``, ``job_link``, ``title``, ``company`` and
            ``location``.
        user_skills: The user's skills as a single string.
        user_location: Free-text location passed to ``get_coordinates``.

    Returns:
        A DataFrame of at most five rows, sorted by descending score, with
        the columns ``job_link``, ``title``, ``company``, ``location``,
        ``distance (km)`` and ``final score``. It is empty (with the same
        columns) when ``df`` has no rows.

    Raises:
        ValueError: If ``df`` lacks a ``latitude`` or ``longitude`` column.
    """
    result_columns = ['job_link', 'title', 'company', 'location', 'distance (km)', 'final score']

    # 1. The dataset must already carry job coordinates
    if 'latitude' not in df or 'longitude' not in df:
        raise ValueError("Dataset harus memiliki kolom latitude dan longitude")

    # Nothing to rank: return early, before fitting a vocabulary on no
    # documents or selecting columns from a column-less frame.
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # Work on a copy so the caller's frame does not accumulate score columns.
    df = df.copy()

    # 2. Vectorize skills (missing skill text counts as an empty document)
    job_skills = df['skills'].fillna('')
    all_skills = job_skills.tolist()
    user_skills_vtr = vectorize_skills([user_skills], all_skills)
    job_skills_vtr = vectorize_skills(job_skills, all_skills)

    # 3. Cosine similarity between the user and every job
    cosine_similarities = calculate_cosine_similarity(user_skills_vtr, job_skills_vtr)
    df['cosine_similarity'] = cosine_similarities[0]

    # 4. Distance between each job and the user
    user_lat, user_lon = get_coordinates(user_location)
    if user_lat is None or user_lon is None:
        # The user location could not be geocoded, so every distance is unknown.
        distances = [float('inf')] * len(df)
    else:
        user_coords = (user_lat, user_lon)
        distances = [
            calculate_distance((lat, lon), user_coords)
            if pd.notna(lat) and pd.notna(lon)
            else float('inf')
            for lat, lon in zip(df['latitude'], df['longitude'])
        ]
    df['distance (km)'] = distances

    # 5. Normalize distances
    df['normalized_distance'] = normalize_distance(df['distance (km)'])

    # 6. Final score
    df['final score'] = (1.5 * df['cosine_similarity']) * (1.0 * df['normalized_distance'])
    df['final score'] = df['final score'].round(2)

    # 7. Sort and keep the five highest-scoring jobs
    top_jobs = df.sort_values(by='final score', ascending=False).head(5)

    # Return an independent frame so callers can add columns to it freely.
    return top_jobs[result_columns].copy()
# Streamlit app: collect skills and a location, then show the ranked jobs
st.title('Job Findr')
st.write('Find your job with ease.')

user_skills = st.text_input('Enter your skills (comma-separated):')
user_location = st.text_input('Job location:')

if st.button('Get Recommendations'):
    if not (user_skills and user_location):
        # Both inputs are required before anything is computed.
        st.warning('Please enter your skills and location.')
    else:
        recommended_jobs = prepare_and_recommend(sample_data, user_skills, user_location)

        if recommended_jobs.empty:
            st.warning('Tidak ditemukan pekerjaan yang sesuai dengan keterampilan dan lokasi Anda.')
        elif recommended_jobs['final score'].max() < 0.02:
            # Even the best match scores too low to be worth showing.
            st.warning('Maaf, hasil rekomendasi mungkin kurang relevan dengan keterampilan dan lokasi Anda. Silakan coba mengubah keterampilan atau lokasi Anda.')
        else:
            # Number the rows 1..n in score order and show only the display columns.
            columns = ['Rank', 'title', 'job_link', 'company', 'location', 'distance (km)']
            recommended_jobs = recommended_jobs.assign(Rank=range(1, len(recommended_jobs) + 1))
            recommended_jobs = recommended_jobs[columns].reset_index(drop=True)
            st.dataframe(recommended_jobs)