import json
import os
import time
from urllib.parse import urlparse
import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Set up the OpenAI API key from the OPENAI environment variable
# (the openai client is configured here but not called anywhere below)
openai.api_key = os.getenv("OPENAI")
# Initialize the follow-up number (incremented once per scraped vacancy)
followup_number = 0
# Start from page 4 of the results listing
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
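# --- Hedged sketch, not part of the scraping loop below ----------------------
# The OpenAI key is configured above but never used in this script. If the
# intent is to score how well a scraped vacancy matches a candidate profile,
# one possible approach with the pre-1.0 openai SDK is sketched here. The
# function name, model choice and prompt are assumptions, and nothing below
# calls this helper.
def score_vacancy_match(vacancy_text: str, candidate_profile: str) -> str:
    """Ask the chat completion API for a rough 0-100 match score (sketch only)."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # assumed model; any chat-capable model would do
        messages=[
            {
                "role": "user",
                "content": (
                    "Rate from 0 to 100 how well this candidate profile matches "
                    f"the vacancy.\n\nVacancy:\n{vacancy_text}\n\n"
                    f"Candidate profile:\n{candidate_profile}\n\n"
                    "Answer with the number only."
                ),
            }
        ],
    )
    return response["choices"][0]["message"]["content"].strip()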
while True:
    try:
        # Set up the webdriver
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=s)
        # The URL of the results page with the vacancy buttons
        url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
        # Navigate to the page
        driver.get(url)
        # Wait for the page to load
        time.sleep(5)
        # Find the vacancy buttons (their count determines how many details to visit)
        buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                )
            )
        )
        for i in range(len(buttons)):
            # Find the buttons again to avoid a StaleElementReferenceException
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.XPATH,
                        '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                    )
                )
            )
            # Click the button
            buttons[i].click()
            # Wait for the detail page to load
            time.sleep(5)
            # Get the page source
            html = driver.page_source
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # Extract the relevant items of the vacancy
            vacancy_detail = {}
            # Extract the job position
            vacancy_detail["Position"] = soup.select_one(
                ".vacancy-detail__content__header__position"
            ).text.strip()
            # Extract the location
            vacancy_detail["Location"] = soup.select_one(
                ".vacancy-detail__content__header__location a"
            ).text.strip()
            # Extract the description
            description = soup.select_one(
                ".vacancy-detail__content__body__description__details"
            ).get_text(separator=" ")
            vacancy_detail["Description"] = description.strip()
            # Extract the profile details
            profile_details = soup.select(
                ".vacancy-detail__content__body__profile__details li"
            )
            vacancy_detail["Profile"] = [
                detail.text.strip() for detail in profile_details
            ]
            # Extract the list of competences
            competences = soup.select(
                ".vacancy-detail__content__body__competences__details li"
            )
            vacancy_detail["Competences"] = [
                competence.text.strip() for competence in competences
            ]
            # Extract the offer details
            offer_details = soup.select(
                ".vacancy-detail__content__body__offer__details li"
            )
            vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
            # Add the webpage and the follow-up number
            vacancy_detail["Webpage"] = driver.current_url
            vacancy_detail["Followup_Number"] = followup_number
            # Use the final part of the URL as the vacancy id
            parsed_url = urlparse(driver.current_url)
            vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
            # Add the full URL of the webpage
            vacancy_detail["Full_URL"] = driver.current_url
            # Concatenate all the vacancy details into a single string
            vacancy_detail[
                "Vacancy"
            ] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
            # Add the entire dictionary as a JSON string under the key "Vacancy_JSON"
            vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
            # Increment the follow-up number
            followup_number += 1
            # Print the page number, follow-up number and vacancy detail
            print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
            print(vacancy_detail)
            # Append the vacancy detail to the CSV file
            # (the header is written only when the file does not exist yet)
            df = pd.DataFrame([vacancy_detail])
            df.to_csv(
                csv_file, mode="a", header=not os.path.exists(csv_file), index=False
            )
            # Go back to the list page
            driver.back()
            time.sleep(5)
        # Go to the next page
        page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        driver.quit()
        time.sleep(60)
        page_number += 1
    except Exception as e:
        print(
            f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
        )
        time.sleep(60)
        page_number += 1
    # Close the driver so the next page starts with a fresh browser session
    driver.quit()
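# Note: the loop above never exits, so anything placed after it will not run.
# To work with the scraped data afterwards (e.g. in a notebook or a separate
# script), the CSV written above can be read back and, if the scraper was run
# several times, deduplicated on the Vacancy_Id column:
#
#     df = pd.read_csv("vacancies.csv").drop_duplicates(subset="Vacancy_Id")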