Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
import streamlit as st | |
from geopy.geocoders import Nominatim | |
from geopy.distance import geodesic | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import logging | |
import warnings | |
import time | |
# Keep urllib3 quiet: ignore the UserWarnings it emits and let only CRITICAL
# records through its logger.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="urllib3",
)
logging.getLogger("urllib3").setLevel(logging.CRITICAL)

# Module-level Nominatim geocoder client.
# NOTE(review): get_coordinates() builds its own client, so this instance
# looks unused within the visible code -- confirm before removing.
geolocator = Nominatim(user_agent="job_recommendation_system")
def get_coordinates(location):
    """Resolve a free-text location to a ``(latitude, longitude)`` pair.

    The lookup goes through Nominatim and degrades step by step:

    1. the full (normalised) location string,
    2. the part before the first comma,
    3. the part after the last comma,
    4. up to five more attempts on the full string, two seconds apart.

    Geocoding service errors (timeouts, rate limiting, service unavailable)
    are treated like "not found" so that the fallbacks and the retry loop
    get a chance to recover from transient failures.

    Args:
        location: Free-text place name, e.g. ``"Jakarta, Indonesia"``.

    Returns:
        ``(latitude, longitude)`` on success, ``(None, None)`` when the
        location is empty, not a string, or cannot be resolved.
    """
    # Nothing to look up: return before any network call or retry delay.
    if not isinstance(location, str) or not location.strip():
        return None, None

    # Local import; geopy is already a dependency of this module.
    from geopy.exc import GeocoderServiceError

    geolocator = Nominatim(user_agent="job_recommendation_system")

    def _geocode(query):
        # A failed request is reported as "no result" instead of aborting,
        # so the caller can fall back or retry.
        try:
            return geolocator.geocode(query)
        except GeocoderServiceError:
            return None

    # Dashes are turned into commas so the comma-based fallbacks below
    # can split the string into its parts.
    location = location.replace("-", ",")
    if "Metropolitan Area" in location or "Region" in location:
        # Keep only the leading word as the main city name.
        # NOTE(review): multi-word city names are truncated to their first
        # word here -- confirm this is intended.
        location = location.split(" ")[0]

    location_obj = _geocode(location)
    if location_obj:
        return location_obj.latitude, location_obj.longitude

    print(f"Koordinat untuk {location} tidak ditemukan. Mencoba alternatif.")

    # Fallback 1: first comma-separated part (the city or area).
    location_obj = _geocode(location.split(',')[0].strip())
    if not location_obj:
        # Fallback 2: last comma-separated part (the country).
        location_obj = _geocode(location.split(',')[-1].strip())

    # Last resort: retry the full string a few times with a pause between
    # attempts.
    retry_count = 0
    while not location_obj and retry_count < 5:
        print(f"Mencoba ulang untuk lokasi {location}...")
        time.sleep(2)
        location_obj = _geocode(location)
        retry_count += 1

    if location_obj:
        return location_obj.latitude, location_obj.longitude

    print(f"Koordinat untuk {location} atau alternatif tidak dapat ditemukan.")
    return None, None
# Job postings dataset, read once when the script runs. It is the frame
# handed to prepare_and_recommend(), which requires 'latitude' and
# 'longitude' columns.
sample_data = pd.read_csv("job_data_with_coordinates.csv")
# 1. Vectorize skills with CountVectorizer
def vectorize_skills(skills, all_skills):
    """Encode *skills* as token-count vectors over the vocabulary of *all_skills*.

    A fresh ``CountVectorizer`` is fitted on ``all_skills`` on every call and
    then used to transform ``skills``.

    Args:
        skills: Iterable of skill strings to encode.
        all_skills: Iterable of skill strings that defines the vocabulary.

    Returns:
        The matrix produced by ``CountVectorizer.transform`` for ``skills``.
    """
    fitted = CountVectorizer().fit(all_skills)
    return fitted.transform(skills)
# 2. Compute cosine similarity
def calculate_cosine_similarity(user_skills_tfidf, job_skills_tfidf):
    """Return the pairwise cosine similarity between two sets of vectors.

    Thin wrapper around scikit-learn's ``cosine_similarity``. Despite the
    ``_tfidf`` suffix in the parameter names, the caller in this file passes
    CountVectorizer count vectors.

    Args:
        user_skills_tfidf: Vectors for the user's skills (rows of the result).
        job_skills_tfidf: Vectors for the job skills (columns of the result).

    Returns:
        The similarity matrix returned by ``cosine_similarity``.
    """
    similarity_matrix = cosine_similarity(user_skills_tfidf, job_skills_tfidf)
    return similarity_matrix
# 3. Compute the distance between two locations
def calculate_distance(job_coords, user_coords):
    """Return the geodesic distance in kilometres between two coordinates.

    Args:
        job_coords: ``(latitude, longitude)`` of the job posting.
        user_coords: ``(latitude, longitude)`` of the user.

    Returns:
        The distance in km, or ``float('inf')`` when either coordinate is
        missing (``None`` or containing ``None``) or rejected by geopy as
        invalid.
    """
    def _is_missing(coords):
        # True when the pair is absent or has an unresolved component,
        # e.g. the (None, None) returned by a failed geocoding lookup.
        if coords is None:
            return True
        try:
            return any(part is None for part in coords)
        except TypeError:
            # Not iterable: leave the validation to geopy below.
            return False

    # Missing coordinates must never reach geopy: depending on the version
    # they may raise an uncaught error or be measured from a bogus point
    # (presumably coerced to 0.0 -- confirm against the installed geopy).
    if _is_missing(job_coords) or _is_missing(user_coords):
        return float('inf')

    try:
        return geodesic(job_coords, user_coords).km
    except ValueError:  # Invalid coordinates
        return float('inf')
# Log-based distance normalisation
def normalize_distance(distances):
    """Map distances to a closeness score via ``1 / (1 + log1p(d))``.

    A distance of 0 yields 1.0; the score decreases as the distance grows
    and an infinite distance yields 0.0.

    Args:
        distances: A scalar or array-like of distances.

    Returns:
        The scores, in whatever container NumPy produces for the input.
    """
    damped = np.log1p(distances)  # log(1 + distance)
    return 1 / (1 + damped)
def prepare_and_recommend(df, user_skills, user_location):
    """Score the job postings in *df* for a user and return the five best.

    Every posting gets
    ``final score = (1.5 * cosine_similarity) * (1.0 * normalized_distance)``
    rounded to two decimals, where the similarity compares ``user_skills``
    with the posting's ``skills`` text and the distance is measured between
    the posting's coordinates and the geocoded ``user_location``.

    Args:
        df: Job postings with at least the columns ``skills``, ``latitude``,
            ``longitude``, ``job_link``, ``title``, ``company`` and
            ``location``. The frame is not modified.
        user_skills: The user's skills as one string.
        user_location: Free-text location, resolved via ``get_coordinates``.

    Returns:
        A DataFrame with the columns ``job_link``, ``title``, ``company``,
        ``location``, ``distance (km)`` and ``final score``, holding at most
        five rows ordered by descending score. When ``df`` has no rows an
        empty frame with those columns is returned.

    Raises:
        ValueError: If ``df`` has no ``latitude`` or ``longitude`` column.
    """
    result_columns = [
        'job_link', 'title', 'company', 'location',
        'distance (km)', 'final score',
    ]

    # 1. The dataset must carry coordinates.
    if 'latitude' not in df or 'longitude' not in df:
        raise ValueError("Dataset harus memiliki kolom latitude dan longitude")

    # Nothing to rank: hand back an empty result with the expected columns
    # instead of fitting a vectorizer on an empty corpus.
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # Work on a copy so the caller's frame does not gain the helper columns.
    df = df.copy()

    # 2. Vectorize the skills. Missing skill entries become empty strings
    #    because the vectorizer only accepts text.
    job_skills = df['skills'].fillna('').astype(str)
    all_skills = job_skills.tolist()
    user_skills_vtr = vectorize_skills([user_skills], all_skills)
    job_skills_vtr = vectorize_skills(job_skills, all_skills)

    # 3. Cosine similarity between the user and every posting.
    cosine_similarities = calculate_cosine_similarity(user_skills_vtr, job_skills_vtr)
    df['cosine_similarity'] = cosine_similarities[0]

    # 4. Distance between each posting and the user.
    user_coords = get_coordinates(user_location)
    if user_coords is None or any(part is None for part in user_coords):
        # The user's location could not be resolved, so every distance is
        # unknown; infinity drives the normalized distance (and the score)
        # to zero rather than measuring from a bogus point.
        distances = [float('inf')] * len(df)
    else:
        distances = [
            calculate_distance((lat, lon), user_coords)
            if pd.notna(lat) and pd.notna(lon)
            else float('inf')
            for lat, lon in zip(df['latitude'], df['longitude'])
        ]
    df['distance (km)'] = distances

    # 5. Normalize the distance.
    df['normalized_distance'] = normalize_distance(df['distance (km)'])

    # 6. Final score.
    df['final score'] = (
        (1.5 * df['cosine_similarity']) * (1.0 * df['normalized_distance'])
    ).round(2)

    # 7. Keep the five best postings.
    top_jobs = df.sort_values(by='final score', ascending=False).head(5)
    return top_jobs[result_columns]
# Streamlit app: collect the user's skills and location, then show the
# recommended jobs as a ranked table.
st.title('Job Findr')
st.write('Find your job with ease.')

user_skills = st.text_input('Enter your skills (comma-separated):')
user_location = st.text_input('Job location:')

if st.button('Get Recommendations'):
    # Whitespace-only input carries no information; treat it as missing
    # rather than sending it to the recommender and the geocoder.
    skills_query = user_skills.strip()
    location_query = user_location.strip()

    if not (skills_query and location_query):
        st.warning('Please enter your skills and location.')
    else:
        recommended_jobs = prepare_and_recommend(sample_data, skills_query, location_query)

        if recommended_jobs.empty:
            st.warning('Tidak ditemukan pekerjaan yang sesuai dengan keterampilan dan lokasi Anda.')
        elif recommended_jobs['final score'].max() < 0.02:
            # Even the best match scores close to zero.
            st.warning('Maaf, hasil rekomendasi mungkin kurang relevan dengan keterampilan dan lokasi Anda. Silakan coba mengubah keterampilan atau lokasi Anda.')
        else:
            columns = ['Rank', 'title', 'job_link', 'company', 'location', 'distance (km)']
            # assign() returns a new frame, so the rank is not written onto
            # a slice of the recommender's result (which can trigger pandas'
            # SettingWithCopyWarning); the index is reset once for display.
            recommended_jobs = (
                recommended_jobs
                .assign(Rank=range(1, len(recommended_jobs) + 1))
                .reset_index(drop=True)
            )[columns]
            st.dataframe(recommended_jobs)