import json
import os
import time
from urllib.parse import urlparse
import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Set up the OpenAI API key from the OPENAI environment variable
# (the openai client is configured here but not called anywhere below)
openai.api_key = os.getenv("OPENAI")
# Initialize the follow-up number (incremented once per scraped vacancy)
followup_number = 0
# Start from page 4 of the results listing
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
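# --- Hedged sketch, not part of the scraping loop below ----------------------
# The OpenAI key is configured above but never used in this script. If the
# intent is to score how well a scraped vacancy matches a candidate profile,
# one possible approach with the pre-1.0 openai SDK is sketched here. The
# function name, model choice and prompt are assumptions, and nothing below
# calls this helper.
def score_vacancy_match(vacancy_text: str, candidate_profile: str) -> str:
    """Ask the chat completion API for a rough 0-100 match score (sketch only)."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # assumed model; any chat-capable model would do
        messages=[
            {
                "role": "user",
                "content": (
                    "Rate from 0 to 100 how well this candidate profile matches "
                    f"the vacancy.\n\nVacancy:\n{vacancy_text}\n\n"
                    f"Candidate profile:\n{candidate_profile}\n\n"
                    "Answer with the number only."
                ),
            }
        ],
    )
    return response["choices"][0]["message"]["content"].strip()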
while True:
    try:
        # Set up the webdriver
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=s)
        # The URL of the results page with the vacancy buttons
        url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
        # Navigate to the page
        driver.get(url)
        # Wait for the page to load
        time.sleep(5)
        # Find the vacancy buttons (their count determines how many details to visit)
        buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                )
            )
        )
        for i in range(len(buttons)):
            # Find the buttons again to avoid a StaleElementReferenceException
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.XPATH,
                        '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                    )
                )
            )
            # Click the button
            buttons[i].click()
            # Wait for the detail page to load
            time.sleep(5)
            # Get the page source
            html = driver.page_source
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # Extract the relevant items of the vacancy
            vacancy_detail = {}
            # Extract the job position
            vacancy_detail["Position"] = soup.select_one(
                ".vacancy-detail__content__header__position"
            ).text.strip()
            # Extract the location
            vacancy_detail["Location"] = soup.select_one(
                ".vacancy-detail__content__header__location a"
            ).text.strip()
            # Extract the description
            description = soup.select_one(
                ".vacancy-detail__content__body__description__details"
            ).get_text(separator=" ")
            vacancy_detail["Description"] = description.strip()
            # Extract the profile details
            profile_details = soup.select(
                ".vacancy-detail__content__body__profile__details li"
            )
            vacancy_detail["Profile"] = [
                detail.text.strip() for detail in profile_details
            ]
            # Extract the list of competences
            competences = soup.select(
                ".vacancy-detail__content__body__competences__details li"
            )
            vacancy_detail["Competences"] = [
                competence.text.strip() for competence in competences
            ]
            # Extract the offer details
            offer_details = soup.select(
                ".vacancy-detail__content__body__offer__details li"
            )
            vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
            # Add the webpage and the follow-up number
            vacancy_detail["Webpage"] = driver.current_url
            vacancy_detail["Followup_Number"] = followup_number
            # Use the final part of the URL as the vacancy id
            parsed_url = urlparse(driver.current_url)
            vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
            # Add the full URL of the webpage
            vacancy_detail["Full_URL"] = driver.current_url
            # Concatenate all the vacancy details into a single string
            vacancy_detail[
                "Vacancy"
            ] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
            # Add the entire dictionary as a JSON string under the key "Vacancy_JSON"
            vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
            # Increment the follow-up number
            followup_number += 1
            # Print the page number, follow-up number and vacancy detail
            print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
            print(vacancy_detail)
            # Append the vacancy detail to the CSV file
            # (the header is written only when the file does not exist yet)
            df = pd.DataFrame([vacancy_detail])
            df.to_csv(
                csv_file, mode="a", header=not os.path.exists(csv_file), index=False
            )
            # Go back to the list page
            driver.back()
            time.sleep(5)
        # Go to the next page
        page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        driver.quit()
        time.sleep(60)
        page_number += 1
    except Exception as e:
        print(
            f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
        )
        time.sleep(60)
        page_number += 1
    # Close the driver so the next page starts with a fresh browser session
    driver.quit()
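# Note: the loop above never exits, so anything placed after it will not run.
# To work with the scraped data afterwards (e.g. in a notebook or a separate
# script), the CSV written above can be read back and, if the scraper was run
# several times, deduplicated on the Vacancy_Id column:
#
#     df = pd.read_csv("vacancies.csv").drop_duplicates(subset="Vacancy_Id")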