PaulMartrenchar committed on
Commit
cd58373
·
1 Parent(s): 1e0326f

First version

Browse files
Files changed (5) hide show
  1. WelcomeToTheJungle.py +265 -0
  2. app.py +34 -0
  3. jobspy_indeed.py +206 -0
  4. jobspy_linkedin.py +213 -0
  5. requirements.txt +4 -0
WelcomeToTheJungle.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ from datetime import datetime
4
+ import warnings
5
+ from mistralai import Mistral, SDKError
6
+ from time import sleep
7
+ from bs4 import BeautifulSoup
8
+ from markdownify import markdownify
9
+
10
+ warnings.filterwarnings("ignore")
11
+ import os
12
+
13
+ models = ["mistral-small-2409", "open-mistral-nemo"]
14
+
15
+ import random
16
def get_model():
    """Pick one of the configured Mistral model names at random."""
    # `models` is the module-level list of candidate model identifiers.
    chosen = random.choice(models)
    return chosen
18
+
19
def call_ai(prompt, json_mode):
    """Send `prompt` to the model, retrying once after an SDKError.

    Any other exception raised by the first attempt propagates unchanged,
    and so does any exception raised by the retry.
    """
    try:
        answer = _call_ai(prompt, json_mode)
    except SDKError:
        # Pause before the single retry.
        sleep(11)
        answer = _call_ai(prompt, json_mode)
    return answer
29
+
30
def _call_ai(prompt, json_mode):
    """Run one chat completion and return the text of the first choice.

    Sleeps 1.1 s before every request, reads the API key from the
    MISTRAL_KEY environment variable, and requests a JSON-object response
    format when `json_mode` is truthy.
    """
    sleep(1.1)
    client = Mistral(api_key=os.environ['MISTRAL_KEY'])

    request_options = {"response_format": {"type": "json_object"}} if json_mode else {}
    user_message = {"role": "user", "content": prompt}

    chat_response = client.chat.complete(
        model=get_model(),
        messages=[user_message],
        **request_options,
    )
    return chat_response.choices[0].message.content
50
+
51
def get_offer_information(company, offer):
    """Extract structured offer data, retrying once on malformed JSON.

    Only json.JSONDecodeError triggers the second attempt; every other
    exception, and any failure of the retry itself, propagates.
    """
    try:
        info = _get_offer_information(company, offer)
    except json.decoder.JSONDecodeError:
        # The reply could not be parsed as JSON: ask once more.
        info = _get_offer_information(company, offer)
    return info
60
+
61
def _get_offer_information(company, offer):
    """Ask the model to summarise a job offer and return the parsed answer.

    Args:
        company: Company name inserted into the prompt.
        offer: Text of the job offer appended to the prompt.

    Returns:
        A dict that always contains the keys company_description,
        position_summary, language_requirements, experience_requirements,
        is_an_internship, salary_range and should_apply. Keys the model
        omitted are filled with neutral defaults.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    prompt = """This is a job offer from the company '{}', make a JSON with this information:
- company_description (string): a description of the company in less than 15 words.
- position_summary (string): a summary of the role in 3 bullet points
- language_requirements (string): the language requirements in French and English
- experience_requirements (string): the experience requirements
- is_an_internship (Boolean): true if it's an internship, false otherwise
- salary_range (string): the salary range in yearly salary if stated, write 'unknown' otherwise
- should_apply (Boolean): True if the offer requires up to 2 years of work experience and does not ask for other languages than English, French, Hindi or Nepali

Be concise in each answer. Answer in English.

Example:
{{
'company_description': 'Galileo Global Education: A leading international network of higher education institutions.',
'position_summary': 'Project Manager Marketing and Communication: Develop brand experience, manage marketing/communication plan, ensure brand image, monitor e-reputation, create content, and collaborate with digital team.',
'language_requirements': 'French Fluent and English Native',
'experience_requirements': 'Previous experience in a similar role, preferably in an agency.',
'is_an_internship': false,
'salary_range': '€38,000-€42,000',
'should_apply': true,
}}

Offer:
{}""".format(company, offer)
    result = call_ai(prompt, True)
    obj = json.loads(result)
    print(obj)

    # A syntactically valid reply that is not a JSON object (e.g. a list)
    # cannot carry the expected keys; fall back to the defaults instead of
    # failing with a TypeError while filling them in.
    if not isinstance(obj, dict):
        obj = {}

    defaults = {
        "company_description": "",
        "position_summary": "",
        "language_requirements": "",
        "experience_requirements": "",
        "is_an_internship": False,
        "salary_range": "",
        "should_apply": True,
    }
    for key, value in defaults.items():
        obj.setdefault(key, value)

    return obj
106
+
107
def get_offer(url):
    """Download an offer page and return the text of its position section.

    Args:
        url: Address of the offer page.

    Returns:
        The stripped text of the element with id "the-position-section",
        passed through markdownify. Returns "" when the response status is
        not 200 or when the page has no such element.
    """
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept because the original code relies on it, but should be revisited.
    # The timeout prevents a stalled server from blocking the search forever.
    response = requests.get(url, verify=False, timeout=30)
    if response.status_code != 200:
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    section = soup.find('div', {'id': 'the-position-section'})
    if section is None:
        # Page layout without the expected section: nothing to analyse.
        return ""

    return markdownify(section.text.strip())
120
+
121
def get_extra_information_from_ai(company, url):
    """Fetch the offer text at `url` and have the model analyse it for `company`."""
    return get_offer_information(company, get_offer(url))
124
+
125
def get_salary(job):
    """Return the salary range extracted by the model, or "" when unknown.

    Args:
        job: Mapping whose "ai_result" entry holds a "salary_range" value.

    Returns:
        The salary text, or "" when the value is None, empty or "unknown"
        (case-insensitive). Non-string values are converted with str().
    """
    salary = job["ai_result"]["salary_range"]
    if salary is None:
        return ""
    if not isinstance(salary, str):
        # The value comes from parsed JSON, so it may be a number or another
        # non-string type; calling .lower() on it would raise.
        return str(salary)
    if salary.lower() in ("", "unknown"):
        return ""
    return salary
129
+
130
def format_should_apply(should_apply):
    """Return a star prefix for offers flagged as worth applying to, else ""."""
    return "⭐ " if should_apply else ""
134
+
135
def get_logo(job):
    """Return the job's logo URL, or a placeholder image when it is missing.

    A value whose text form is "nan", "None", "null" or "" is treated as
    missing, so an absent logo never ends up as the image source.
    """
    logo = job["logo_photo_url"]
    if "{}".format(logo) in ("nan", "None", "null", ""):
        return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
    return logo
139
+
140
def format_str_or_list(sum):
    """Turn a model answer (string or list) into an HTML fragment.

    Args:
        sum: A string, a list of items, or any other value. The parameter
            name shadows the builtin but is kept for compatibility.

    Returns:
        For a string: the HTML-escaped text with newlines replaced by
        "<br />". For a list: a "<ul>" whose items are HTML-escaped.
        Any other value is returned unchanged.
    """
    from html import escape

    # The text originates from a language model fed with scraped pages, so
    # it is escaped before being embedded in the page markup.
    if isinstance(sum, str):
        return escape(sum).replace("\n", "<br />")
    if isinstance(sum, list):
        items = "".join("<li>{}</li>".format(escape("{}".format(item))) for item in sum)
        return "<ul>" + items + "</ul>"
    return sum
146
+
147
def html_format_job(job):
    """Render one job as an HTML card.

    Args:
        job: Mapping with the keys "organization_logo_url", "URL", "name",
            "organization_name", "published_at" and "ai_result".

    Returns:
        The HTML markup of the card as a single string.
    """
    from html import escape

    def esc(value):
        # Format first so that non-string values (None, numbers) work too.
        return escape("{}".format(value))

    ai_result = job["ai_result"]
    # Scraped and model-generated values are escaped before interpolation.
    # format_should_apply and format_str_or_list return markup, so their
    # results are inserted as-is.
    parts = [
        # open box
        "<div class='job'>",
        # logo
        "<div class='logobox'><img src='{}' alt='Logo' class='logo'></div>".format(
            esc(job["organization_logo_url"])),
        # text part
        "<div style='flex: 5; padding: 10px;'>",
        "<h3><a href='{}' target='_blank'>{}{}</a></h3>".format(
            esc(job["URL"]), format_should_apply(ai_result["should_apply"]), esc(job["name"])),
        "<p>{} ({}) - published at {}</p>".format(
            esc(job["organization_name"]), esc(ai_result["company_description"]), esc(job["published_at"])),
        "<p><h4>Position: {}</h4>{}</p>".format(
            esc(get_salary(job)), format_str_or_list(ai_result["position_summary"])),
        "<p><h4>Language:</h4>{}</p>".format(format_str_or_list(ai_result["language_requirements"])),
        "<p><h4>Experience:</h4>{}</p>".format(format_str_or_list(ai_result["experience_requirements"])),
        # close text part
        "</div>",
        # close box
        "</div>",
    ]
    return " ".join(parts)
164
+
165
def filterout_jobs(jobs, job_filter, job_filter_negative):
    """Keep the jobs whose title passes the keyword filters and that are not internships.

    A title passes when it contains none of `job_filter_negative` and at
    least one of `job_filter`. Matching is by lowercase substring, so a
    keyword such as "intern" also matches "international". Each job that
    passes gets an "ai_result" entry from the model analysis; the job is
    kept only when that analysis reports is_an_internship == False.
    """
    kept = []
    for job in jobs:
        title = job["name"].lower()
        if any(word in title for word in job_filter_negative):
            continue
        if not any(word in title for word in job_filter):
            continue
        job["ai_result"] = get_extra_information_from_ai(job["organization_name"], job["URL"])
        # Equality with False is intentional: a missing/None flag is not kept.
        if job["ai_result"]["is_an_internship"] == False:
            kept.append(job)
    return kept
174
+
175
def html_format_page(jobs, job_filter, job_filter_negative):
    """Filter `jobs` and render the survivors as one complete HTML page."""
    page_head = "<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"
    cards = [html_format_job(job) for job in filterout_jobs(jobs, job_filter, job_filter_negative)]
    # Fragments are joined with single spaces: head, one card per job, tail.
    return " ".join([page_head, *cards, "</body></html>"])
182
+
183
def get_jobs(search_term):
    """Query the Welcome to the Jungle Algolia index for `search_term`.

    Args:
        search_term: Free-text query; it is lowercased and percent-encoded
            before being placed in the Algolia `query` parameter.

    Returns:
        A list of dicts, one per hit, with the keys "name", "slug",
        "published_at" ("dd/mm/YYYY HH:MM:SS" or the string "None"),
        "organization_name", "organization_size", "organization_logo_url",
        "organization_slug", "objectID" and "URL".

    Raises:
        requests.HTTPError: when the search endpoint answers with an error
            status.
    """
    from urllib.parse import quote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Origin": "https://www.welcometothejungle.com",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "cross-site",
        "content-type": "application/x-www-form-urlencoded",
        "Referer": "https://www.welcometothejungle.com/",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "x-algolia-agent": "Algolia for JavaScript (4.14.3); Browser (lite); JS Helper (3.11.2); react (17.0.2); react-instantsearch (6.38.3)",
        "x-algolia-api-key": "02f0d440abc99cae37e126886438b266",
        "x-algolia-application-id": "CSEKHVMS53"
    }

    # Percent-encode the whole term (not only spaces) so that characters
    # such as '&', '"' or '#' cannot corrupt the params string.
    encoded_term = quote(search_term.lower(), safe="")

    # Already URL-encoded Algolia search parameters, split at '&' boundaries
    # for readability only.
    params = (
        "analyticsTags=%5B%22page%3Ajobs_index%22%2C%22language%3Aen%22%5D"
        "&aroundLatLng=48.85718%2C2.34141&aroundPrecision=20000&aroundRadius=20000"
        "&attributesToHighlight=%5B%22name%22%5D"
        "&attributesToRetrieve=%5B%22_geoloc%22%2C%22contract_type%22%2C%22experience_level_minimum%22%2C%22name%22%2C%22objectID%22%2C%22office%22%2C%22offices%22%2C%22organization.logo.url%22%2C%22organization.name%22%2C%22organization.reference%22%2C%22organization.slug%22%2C%22organization.website_organization%22%2C%22organization.descriptions%22%2C%22organization.has_default_job%22%2C%22promoted%22%2C%22published_at%22%2C%22reference%22%2C%22remote%22%2C%22slug%22%2C%22website%22%2C%22contract_type_names.en%22%2C%22organization.cover_image.en.small.url%22%2C%22organization.size.en%22%2C%22profession.category.en%22%2C%22profession.name.en%22%2C%22sectors_name.en%22%5D"
        "&clickAnalytics=true"
        "&facetFilters=%5B%5B%22contract_type_names.en%3AFull-Time%22%5D%2C%5B%22language%3Aen%22%5D%5D"
        "&facets=%5B%22offices.country_code%22%2C%22offices.state%22%2C%22offices.district%22%2C%22offices.location%22%2C%22online%22%2C%22organization.name%22%2C%22remote%22%2C%22contract_type_names.en%22%2C%22sectors_name.en.Advertising%20%2F%20Marketing%20%2F%20Agency%22%2C%22sectors_name.en.Architecture%22%2C%22sectors_name.en.Banking%20%2F%20Insurance%20%2F%20Finance%22%2C%22sectors_name.en.Consulting%20%2F%20Audit%22%2C%22sectors_name.en.Corporate%20Services%22%2C%22sectors_name.en.Culture%20%2F%20Media%20%2F%20Entertainment%22%2C%22sectors_name.en.Distribution%22%2C%22sectors_name.en.Education%20%2F%20Training%20%2F%20Recruitment%22%2C%22sectors_name.en.Engineering%22%2C%22sectors_name.en.Fashion%20%2F%20Luxury%20%2F%20Beauty%20%2F%20Lifestyle%22%2C%22sectors_name.en.Food%20and%20Beverage%22%2C%22sectors_name.en.Health%20%2F%20Social%20%2F%20Environment%22%2C%22sectors_name.en.Hotel%20%2F%20Tourism%20%2F%20Leisure%22%2C%22sectors_name.en.Industry%22%2C%22sectors_name.en.Legal%20%2F%20Law%22%2C%22sectors_name.en.Mobility%20%2F%20Transport%22%2C%22sectors_name.en.Nonprofit%20%2F%20Association%22%2C%22sectors_name.en.Public%20Administration%22%2C%22sectors_name.en.Real%20Estate%22%2C%22sectors_name.en.Tech%22%2C%22sectors.parent.en%22%2C%22profession_name.en.Audit%20%2F%20Finance%20%2F%20Insurance%22%2C%22profession_name.en.Business%22%2C%22profession_name.en.Consulting%22%2C%22profession_name.en.Customer%20Service%22%2C%22profession_name.en.Design%22%2C%22profession_name.en.Fashion%22%2C%22profession_name.en.Health%20%2F%20Medical%20%2F%20Social%22%2C%22profession_name.en.Hospitality%20%2F%20Restaurant%20services%22%2C%22profession_name.en.Industry%22%2C%22profession_name.en.Marketing%20%2F%20Communications%22%2C%22profession_name.en.Media%22%2C%22profession_name.en.Real%20Estate%22%2C%22profession_name.en.Retail%22%2C%22profession_name.en.Support%20Roles%22%2C%22profession_name.en.Tech%22%2C%22profession_name.en.Tourism%22%2C%22profession.category.en%22%2C%22experience_level_minimum%22%2C%22organization.size.en%22%2C%22language%22%5D"
        "&filters=website.reference%3Awttj_fr&getRankingInfo=true"
        "&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&highlightPreTag=%3Cais-highlight-0000000000%3E"
        "&hitsPerPage=80&maxValuesPerFacet=999"
        "&numericFilters=%5B%22experience_level_minimum%3E%3D0%22%2C%22experience_level_minimum%3C%3D2%22%5D"
        "&page=0&query=" + encoded_term +
        "&tagFilters=&userToken=00c5e1a5-e384-4def-bae4-1d466974cc2d"
    )

    # Serialising with json.dumps guarantees a well-formed request body.
    data = json.dumps({
        "requests": [{
            "indexName": "wk_cms_jobs_production_published_at_desc",
            "params": params,
        }]
    })

    # The original URL contained '^%^' sequences (cmd.exe escaping left over
    # from a copied curl command); they are replaced by plain percent-escapes.
    url = "https://csekhvms53-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.14.3)%3B%20Browser%20(lite)%3B%20JS%20Helper%20(3.11.2)%3B%20react%20(17.0.2)%3B%20react-instantsearch%20(6.38.3)&x-algolia-api-key=02f0d440abc99cae37e126886438b266&x-algolia-application-id=CSEKHVMS53&search_origin=jobs_search_client"

    # NOTE(review): verify=False disables TLS certificate verification; kept
    # from the original code but should be revisited.
    response = requests.post(url, headers=headers, data=data, verify=False, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()

    # parse result
    hits = json.loads(response.text)["results"][0]["hits"]
    jobs = []
    for hit in hits:
        organization = hit["organization"]
        job = {"name": hit["name"], "slug": hit["slug"]}

        if hit["published_at"] is not None:
            published_at = datetime.strptime(hit["published_at"], '%Y-%m-%dT%H:%M:%S.%f%z')
            job["published_at"] = published_at.strftime("%d/%m/%Y %H:%M:%S")
        else:
            job["published_at"] = "None"

        job["organization_name"] = organization["name"]
        size = organization.get("size")
        job["organization_size"] = size["en"] if size is not None else ""
        job["organization_logo_url"] = organization["logo"]["url"]
        job["organization_slug"] = organization["website_organization"]["slug"]
        job["objectID"] = hit["objectID"]
        job["URL"] = "https://www.welcometothejungle.com/en/companies/{}/jobs/{}?o={}".format(
            job["organization_slug"], job["slug"], job["objectID"])
        jobs.append(job)

    return jobs
241
+
242
def wtoj_get_html():
    """Search Welcome to the Jungle and return the results as an HTML page.

    Runs the fixed list of searches, removes duplicate offers by URL, sorts
    the offers from newest to oldest and renders those that pass the title
    keyword filters.

    Returns:
        The HTML page as a string.
    """
    from datetime import datetime

    search_terms = ['content writer', 'Marketing', "Communication", 'Business development', "SEO"]

    seen_urls = set()
    unique_objects = []
    for term in search_terms:
        for obj in get_jobs(term):
            if obj["URL"] not in seen_urls:
                seen_urls.add(obj["URL"])
                unique_objects.append(obj)

    def published_key(job):
        # "published_at" is stored as a day-first string, which does not sort
        # chronologically as text; parse it back for the comparison. Offers
        # without a parsable date sort as the oldest.
        try:
            return datetime.strptime(job["published_at"], "%d/%m/%Y %H:%M:%S")
        except (TypeError, ValueError):
            return datetime.min

    jobs = sorted(unique_objects, key=published_key, reverse=True)

    # filter on the job title
    job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
    job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]

    return html_format_page(jobs, job_filter, job_filter_negative)
265
+
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+
4
+ from jobspy_indeed import indeed_get_html
5
+ from WelcomeToTheJungle import wtoj_get_html
6
+ from jobspy_linkedin import linkedin_get_html
7
+
8
def search_jobs(api_key, platform):
    """Run the job search for the selected platform and return its HTML.

    Args:
        api_key: Mistral API key typed by the user.
        platform: One of "Indeed", "Welcome to the jungle" or "LinkedIn".

    Returns:
        The HTML page produced by the platform-specific search.

    Raises:
        gr.Error: when the API key is missing/blank or no known platform
            is selected.
    """
    # Treat None and whitespace-only input as missing, and drop whitespace
    # that commonly surrounds a pasted key.
    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("API key is required")

    # NOTE(review): the key is stored in a process-wide environment variable,
    # so concurrent users of the same process share it.
    os.environ['MISTRAL_KEY'] = api_key

    handlers = {
        "Indeed": indeed_get_html,
        "Welcome to the jungle": wtoj_get_html,
        "LinkedIn": linkedin_get_html,
    }
    handler = handlers.get(platform)
    if handler is None:
        raise gr.Error("No platform selected")
    return handler()
19
+
20
+
21
# Input widgets: the user's API key and the job platform to query.
api_key = gr.Textbox(label="API key")
platform = gr.Radio(choices=["Welcome to the jungle", "Indeed", "LinkedIn"], label="Platform")
# Output area; the initial value is a few line breaks.
output_html = gr.HTML(label="Result", value="<html><br/><br/><br/><br/></html>")
# Wire the widgets to search_jobs; flagging and the clear button are disabled.
demo = gr.Interface(
    fn=search_jobs,
    inputs=[api_key, platform],
    outputs=[output_html],
    flagging_mode="never",
    show_progress="full",
    clear_btn=None,
    title="Job search"
)

# Launch the interface as soon as this module is executed.
demo.launch()
jobspy_indeed.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import warnings
3
+ from mistralai import Mistral, SDKError
4
+ from time import sleep
5
+
6
+ from jobspy import scrape_jobs
7
+
8
+ warnings.filterwarnings("ignore")
9
+ import os
10
+
11
+ models = ["mistral-small-2409", "open-mistral-nemo"]
12
+
13
+ import random
14
def get_model():
    """Pick one of the configured Mistral model names at random."""
    # `models` is the module-level list of candidate model identifiers.
    chosen = random.choice(models)
    return chosen
16
+
17
def call_ai(prompt, json_mode):
    """Send `prompt` to the model, retrying once after an SDKError.

    Any other exception raised by the first attempt propagates unchanged,
    and so does any exception raised by the retry.
    """
    try:
        answer = _call_ai(prompt, json_mode)
    except SDKError:
        # Pause before the single retry.
        sleep(11)
        answer = _call_ai(prompt, json_mode)
    return answer
27
+
28
def _call_ai(prompt, json_mode):
    """Run one chat completion and return the text of the first choice.

    Sleeps 1.1 s before every request, reads the API key from the
    MISTRAL_KEY environment variable, and requests a JSON-object response
    format when `json_mode` is truthy.
    """
    sleep(1.1)
    client = Mistral(api_key=os.environ['MISTRAL_KEY'])

    request_options = {"response_format": {"type": "json_object"}} if json_mode else {}
    user_message = {"role": "user", "content": prompt}

    chat_response = client.chat.complete(
        model=get_model(),
        messages=[user_message],
        **request_options,
    )
    return chat_response.choices[0].message.content
48
+
49
def get_offer_information(company, offer):
    """Extract structured offer data, retrying once on malformed JSON.

    Only json.JSONDecodeError triggers the second attempt; every other
    exception, and any failure of the retry itself, propagates.
    """
    try:
        info = _get_offer_information(company, offer)
    except json.decoder.JSONDecodeError:
        # The reply could not be parsed as JSON: ask once more.
        info = _get_offer_information(company, offer)
    return info
58
+
59
def _get_offer_information(company, offer):
    """Ask the model to summarise a job offer and return the parsed answer.

    Args:
        company: Company name inserted into the prompt.
        offer: Text of the job offer appended to the prompt.

    Returns:
        A dict that always contains the keys company_description,
        position_summary, language_requirements, experience_requirements,
        is_an_internship, salary_range and should_apply. Keys the model
        omitted are filled with neutral defaults.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    prompt = """This is a job offer from the company '{}', make a JSON with this information:
- company_description (string): a description of the company in less than 15 words.
- position_summary (string): a summary of the role in 3 bullet points
- language_requirements (string): the language requirements in French and English
- experience_requirements (string): the experience requirements
- is_an_internship (Boolean): true if it's an internship, false otherwise
- salary_range (string): the salary range in yearly salary if stated, write 'unknown' otherwise
- should_apply (Boolean): True if the offer requires up to 2 years of work experience and does not ask for other languages than English, French, Hindi or Nepali

Be concise in each answer. Answer in English.

Example:
{{
'company_description': 'Galileo Global Education: A leading international network of higher education institutions.',
'position_summary': 'Project Manager Marketing and Communication: Develop brand experience, manage marketing/communication plan, ensure brand image, monitor e-reputation, create content, and collaborate with digital team.',
'language_requirements': 'French Fluent and English Native',
'experience_requirements': 'Previous experience in a similar role, preferably in an agency.',
'is_an_internship': false,
'salary_range': '€38,000-€42,000',
'should_apply': true,
}}

Offer:
{}""".format(company, offer)
    result = call_ai(prompt, True)
    obj = json.loads(result)
    print(obj)

    # A syntactically valid reply that is not a JSON object (e.g. a list)
    # cannot carry the expected keys; fall back to the defaults instead of
    # failing with a TypeError while filling them in.
    if not isinstance(obj, dict):
        obj = {}

    defaults = {
        "company_description": "",
        "position_summary": "",
        "language_requirements": "",
        "experience_requirements": "",
        "is_an_internship": False,
        "salary_range": "",
        "should_apply": True,
    }
    for key, value in defaults.items():
        obj.setdefault(key, value)

    return obj
104
+
105
def get_job_url(job):
    """Return the direct job URL when present, else the listing URL.

    A direct URL whose text form is "", "null", "nan" or "None" is treated
    as missing, so a NaN/None cell never becomes the link target.
    """
    direct = job["job_url_direct"]
    if "{}".format(direct) in ("", "null", "nan", "None"):
        return job["job_url"]
    return direct
109
+
110
def get_company_url(job):
    """Return the company's own URL when present, else its listing URL.

    A direct URL whose text form is "", "null", "nan" or "None" is treated
    as missing, so a NaN/None cell never becomes the link target.
    """
    direct = job["company_url_direct"]
    if "{}".format(direct) in ("", "null", "nan", "None"):
        return job["company_url"]
    return direct
114
+
115
def get_salary(job):
    """Return the salary text to display for a job.

    When the scraped "min_amount" is present the result is
    "<min>-<max><currency>". Otherwise the model-extracted salary range is
    used, with None, "" and "unknown" (case-insensitive) mapped to "".
    """
    if "{}".format(job["min_amount"]) in ("nan", "None"):
        salary = job["ai_result"]["salary_range"]
        if salary is None:
            return ""
        if not isinstance(salary, str):
            # The value comes from parsed JSON, so it may be a number or
            # another non-string type; calling .lower() on it would raise.
            return str(salary)
        if salary.lower() in ("", "unknown"):
            return ""
        return salary
    return "{}-{}{}".format(job["min_amount"], job["max_amount"], job["currency"])
121
+
122
def format_should_apply(should_apply):
    """Return a star HTML entity prefix for recommended offers, else ""."""
    return "&#x2B50; " if should_apply else ""
126
+
127
def get_logo(job):
    """Return the job's logo URL, or a placeholder image when it is missing.

    A value whose text form is "nan", "None", "null" or "" is treated as
    missing, so an absent logo never ends up as the image source.
    """
    logo = job["logo_photo_url"]
    if "{}".format(logo) in ("nan", "None", "null", ""):
        return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
    return logo
131
+
132
def format_str_or_list(sum):
    """Turn a model answer (string or list) into an HTML fragment.

    Args:
        sum: A string, a list of items, or any other value. The parameter
            name shadows the builtin but is kept for compatibility.

    Returns:
        For a string: the HTML-escaped text with newlines replaced by
        "<br />". For a list: a "<ul>" whose items are HTML-escaped.
        Any other value is returned unchanged.
    """
    from html import escape

    # The text originates from a language model fed with scraped pages, so
    # it is escaped before being embedded in the page markup.
    if isinstance(sum, str):
        return escape(sum).replace("\n", "<br />")
    if isinstance(sum, list):
        items = "".join("<li>{}</li>".format(escape("{}".format(item))) for item in sum)
        return "<ul>" + items + "</ul>"
    return sum
138
+
139
def html_format_job(job):
    """Render one scraped job as an HTML card.

    Args:
        job: Row-like mapping with the keys "title", "company",
            "date_posted" and "ai_result", plus the keys read by
            get_logo, get_job_url, get_company_url and get_salary.

    Returns:
        The HTML markup of the card as a single string.
    """
    from html import escape

    def esc(value):
        # Format first so that non-string values (None, numbers) work too.
        return escape("{}".format(value))

    ai_result = job["ai_result"]

    # A missing posting date has no strftime(); show "?" instead of failing.
    posted = job["date_posted"]
    if "{}".format(posted) in ("nan", "None", "NaT"):
        posted_text = "?"
    else:
        posted_text = posted.strftime("%d/%m/%Y")

    # Scraped and model-generated values are escaped before interpolation.
    # format_should_apply and format_str_or_list return markup, so their
    # results are inserted as-is.
    parts = [
        # open box
        "<div class='job'>",
        # logo
        "<div class='logobox'><img src='{}' alt='No logo' class='logo'></div>".format(esc(get_logo(job))),
        # text part
        "<div style='flex: 5; padding: 10px;'>",
        "<h3><a href='{}' target='_blank'>{}{}</a></h3>".format(
            esc(get_job_url(job)), format_should_apply(ai_result["should_apply"]), esc(job["title"])),
        "<p><a href='{}' target='_blank'>{}</a> ({}) - published at {}</p>".format(
            esc(get_company_url(job)), esc(job["company"]), esc(ai_result["company_description"]), posted_text),
        "<p><h4>Position: {}</h4>{}</p>".format(
            esc(get_salary(job)), format_str_or_list(ai_result["position_summary"])),
        "<p><h4>Language:</h4>{}</p>".format(format_str_or_list(ai_result["language_requirements"])),
        "<p><h4>Experience:</h4>{}</p>".format(format_str_or_list(ai_result["experience_requirements"])),
        # close text part
        "</div>",
        # close box
        "</div>",
    ]
    return " ".join(parts)
156
+
157
def filterout_jobs(jobs, job_filter, job_filter_negative):
    """Keep the rows whose title passes the keyword filters and that are not internships.

    `jobs` is iterated with iterrows(). A title passes when it contains none
    of `job_filter_negative` and at least one of `job_filter`. Matching is by
    lowercase substring, so a keyword such as "intern" also matches
    "international". Each passing row gets an "ai_result" entry from the
    model analysis of its description and is kept only when that analysis
    reports is_an_internship == False.
    """
    kept = []
    for _, job in jobs.iterrows():
        title = job["title"].lower()
        if any(word in title for word in job_filter_negative):
            continue
        if not any(word in title for word in job_filter):
            continue
        job["ai_result"] = get_offer_information(job["company"], job["description"])
        # Equality with False is intentional: a missing/None flag is not kept.
        if job["ai_result"]["is_an_internship"] == False:
            kept.append(job)
    return kept
166
+
167
def html_format_page(jobs, job_filter, job_filter_negative):
    """Filter `jobs` and render the survivors as one complete HTML page."""
    page_head = "<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"
    cards = [html_format_job(job) for job in filterout_jobs(jobs, job_filter, job_filter_negative)]
    # Fragments are joined with single spaces: head, one card per job, tail.
    return " ".join([page_head, *cards, "</body></html>"])
174
+
175
def get_jobs(search_term, results_wanted):
    """Scrape full-time Indeed offers around Paris for `search_term`.

    Returns whatever scrape_jobs returns for up to `results_wanted` results.
    """
    # No `hours_old` limit is passed, so results are not restricted by age.
    scrape_options = dict(
        site_name=["indeed"],
        search_term=search_term,
        location="Paris, France",
        job_type="fulltime",
        results_wanted=results_wanted,
        country_indeed='France',  # only needed for indeed / glassdoor
        enforce_annual_salary=True,
        linkedin_fetch_description=False,
    )
    return scrape_jobs(**scrape_options)
188
+
189
def indeed_get_html():
    """Search Indeed and return the filtered results as an HTML page.

    Runs the fixed list of searches (50 results each), merges them, drops
    duplicate ids, sorts by posting date (newest first) and renders the
    offers that pass the title keyword filters.
    """
    searches = ['"content writer"', '"Digital Marketing"', "Communication", '"Business development"', "SEO"]
    frames = [get_jobs(term, 50) for term in searches]

    import pandas as pd
    merged = pd.concat(frames, ignore_index=True)
    deduplicated = merged.drop_duplicates(subset='id')
    jobs = deduplicated.sort_values(by='date_posted', ascending=False)

    # filter on the job title
    job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
    job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]

    return html_format_page(jobs, job_filter, job_filter_negative)
206
+
jobspy_linkedin.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import warnings
3
+ import datetime
4
+ from mistralai import Mistral, SDKError
5
+ from time import sleep
6
+
7
+ from jobspy import scrape_jobs
8
+
9
+ warnings.filterwarnings("ignore")
10
+ import os
11
+
12
+ models = ["mistral-small-2409", "open-mistral-nemo"]
13
+
14
+ import random
15
def get_model():
    """Pick one of the configured Mistral model names at random."""
    # `models` is the module-level list of candidate model identifiers.
    chosen = random.choice(models)
    return chosen
17
+
18
def call_ai(prompt, json_mode):
    """Send `prompt` to the model, retrying once after an SDKError.

    Any other exception raised by the first attempt propagates unchanged,
    and so does any exception raised by the retry.
    """
    try:
        answer = _call_ai(prompt, json_mode)
    except SDKError:
        # Pause before the single retry.
        sleep(11)
        answer = _call_ai(prompt, json_mode)
    return answer
28
+
29
def _call_ai(prompt, json_mode):
    """Run one chat completion and return the text of the first choice.

    Sleeps 1.1 s before every request, reads the API key from the
    MISTRAL_KEY environment variable, and requests a JSON-object response
    format when `json_mode` is truthy.
    """
    sleep(1.1)
    client = Mistral(api_key=os.environ['MISTRAL_KEY'])

    request_options = {"response_format": {"type": "json_object"}} if json_mode else {}
    user_message = {"role": "user", "content": prompt}

    chat_response = client.chat.complete(
        model=get_model(),
        messages=[user_message],
        **request_options,
    )
    return chat_response.choices[0].message.content
49
+
50
def get_offer_information(company, offer):
    """Extract structured offer data, retrying once on malformed JSON.

    Only json.JSONDecodeError triggers the second attempt; every other
    exception, and any failure of the retry itself, propagates.
    """
    try:
        info = _get_offer_information(company, offer)
    except json.decoder.JSONDecodeError:
        # The reply could not be parsed as JSON: ask once more.
        info = _get_offer_information(company, offer)
    return info
59
+
60
def _get_offer_information(company, offer):
    """Ask the model to summarise a job offer and return the parsed answer.

    Args:
        company: Company name inserted into the prompt.
        offer: Text of the job offer appended to the prompt.

    Returns:
        A dict that always contains the keys company_description,
        position_summary, language_requirements, experience_requirements,
        is_an_internship, salary_range and should_apply. Keys the model
        omitted are filled with neutral defaults.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    prompt = """This is a job offer from the company '{}', make a JSON with this information:
- company_description (string): a description of the company in less than 15 words.
- position_summary (string): a summary of the role in 3 bullet points
- language_requirements (string): the language requirements in French and English
- experience_requirements (string): the experience requirements
- is_an_internship (Boolean): true if it's an internship, false otherwise
- salary_range (string): the salary range in yearly salary if stated, write 'unknown' otherwise
- should_apply (Boolean): True if the offer requires up to 2 years of work experience and does not ask for other languages than English, French, Hindi or Nepali

Be concise in each answer. Answer in English.

Example:
{{
'company_description': 'Galileo Global Education: A leading international network of higher education institutions.',
'position_summary': 'Project Manager Marketing and Communication: Develop brand experience, manage marketing/communication plan, ensure brand image, monitor e-reputation, create content, and collaborate with digital team.',
'language_requirements': 'French Fluent and English Native',
'experience_requirements': 'Previous experience in a similar role, preferably in an agency.',
'is_an_internship': false,
'salary_range': '€38,000-€42,000',
'should_apply': true,
}}

Offer:
{}""".format(company, offer)
    result = call_ai(prompt, True)
    obj = json.loads(result)
    print(obj)

    # A syntactically valid reply that is not a JSON object (e.g. a list)
    # cannot carry the expected keys; fall back to the defaults instead of
    # failing with a TypeError while filling them in.
    if not isinstance(obj, dict):
        obj = {}

    defaults = {
        "company_description": "",
        "position_summary": "",
        "language_requirements": "",
        "experience_requirements": "",
        "is_an_internship": False,
        "salary_range": "",
        "should_apply": True,
    }
    for key, value in defaults.items():
        obj.setdefault(key, value)

    return obj
105
+
106
def get_job_url(job):
    """Return the direct job URL when present, else the listing URL.

    A direct URL whose text form is "", "null", "nan" or "None" is treated
    as missing, so an empty or NaN/None cell never becomes the link target.
    """
    direct = job["job_url_direct"]
    if "{}".format(direct) in ("", "null", "nan", "None"):
        return job["job_url"]
    return direct
110
+
111
def get_company_url(job):
    """Return the company's own URL when present, else its listing URL.

    A direct URL whose text form is "", "null", "nan" or "None" is treated
    as missing, so an empty or NaN/None cell never becomes the link target.
    """
    direct = job["company_url_direct"]
    if "{}".format(direct) in ("", "null", "nan", "None"):
        return job["company_url"]
    return direct
115
+
116
def get_salary(job):
    """Return the salary text to display for a job.

    When the scraped "min_amount" is present the result is
    "<min>-<max><currency>". Otherwise the model-extracted salary range is
    used, with None, "" and "unknown" (case-insensitive) mapped to "".
    """
    if "{}".format(job["min_amount"]) in ("nan", "None"):
        salary = job["ai_result"]["salary_range"]
        if salary is None:
            return ""
        if not isinstance(salary, str):
            # The value comes from parsed JSON, so it may be a number or
            # another non-string type; calling .lower() on it would raise.
            return str(salary)
        if salary.lower() in ("", "unknown"):
            return ""
        return salary
    return "{}-{}{}".format(job["min_amount"], job["max_amount"], job["currency"])
122
+
123
def format_should_apply(should_apply):
    """Return a star HTML entity prefix for recommended offers, else ""."""
    return "&#x2B50; " if should_apply else ""
127
+
128
def get_logo(job):
    """Return the job's logo URL, or a placeholder image when it is missing.

    A value whose text form is "nan", "None", "null" or "" is treated as
    missing, so an absent logo never ends up as the image source.
    """
    logo = job["logo_photo_url"]
    if "{}".format(logo) in ("nan", "None", "null", ""):
        return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
    return logo
132
+
133
def format_str_or_list(sum):
    """Turn a model answer (string or list) into an HTML fragment.

    Args:
        sum: A string, a list of items, or any other value. The parameter
            name shadows the builtin but is kept for compatibility.

    Returns:
        For a string: the HTML-escaped text with newlines replaced by
        "<br />". For a list: a "<ul>" whose items are HTML-escaped.
        Any other value is returned unchanged.
    """
    from html import escape

    # The text originates from a language model fed with scraped pages, so
    # it is escaped before being embedded in the page markup.
    if isinstance(sum, str):
        return escape(sum).replace("\n", "<br />")
    if isinstance(sum, list):
        items = "".join("<li>{}</li>".format(escape("{}".format(item))) for item in sum)
        return "<ul>" + items + "</ul>"
    return sum
139
+
140
def format_posted_date(date):
    """Format a posting date as "dd/mm/YYYY", or "?" when it is missing.

    Args:
        date: A date/datetime-like object with strftime(), a string holding
            a Unix timestamp in seconds, or a missing value.

    Returns:
        The formatted date, or "?" when the text form of `date` is "nan",
        "None", "NaT" or "".
    """
    if "{}".format(date) in ("nan", "None", "NaT", ""):
        return "?"
    if isinstance(date, str):
        # String values are interpreted as Unix timestamps in seconds.
        return datetime.datetime.fromtimestamp(int(date)).strftime("%d/%m/%Y")
    return date.strftime("%d/%m/%Y")
146
+
147
def html_format_job(job):
    """Render one scraped job as an HTML card.

    Args:
        job: Row-like mapping with the keys "title", "company",
            "date_posted" and "ai_result", plus the keys read by
            get_logo, get_job_url, get_company_url and get_salary.

    Returns:
        The HTML markup of the card as a single string.
    """
    from html import escape

    def esc(value):
        # Format first so that non-string values (None, numbers) work too.
        return escape("{}".format(value))

    ai_result = job["ai_result"]
    # Scraped and model-generated values are escaped before interpolation.
    # format_should_apply and format_str_or_list return markup, so their
    # results are inserted as-is.
    parts = [
        # open box
        "<div class='job'>",
        # logo
        "<div class='logobox'><img src='{}' alt='No logo' class='logo'></div>".format(esc(get_logo(job))),
        # text part
        "<div style='flex: 5; padding: 10px;'>",
        "<h3><a href='{}' target='_blank'>{}{}</a></h3>".format(
            esc(get_job_url(job)), format_should_apply(ai_result["should_apply"]), esc(job["title"])),
        "<p><a href='{}' target='_blank'>{}</a> ({}) - published at {}</p>".format(
            esc(get_company_url(job)), esc(job["company"]), esc(ai_result["company_description"]),
            esc(format_posted_date(job["date_posted"]))),
        "<p><h4>Position: {}</h4>{}</p>".format(
            esc(get_salary(job)), format_str_or_list(ai_result["position_summary"])),
        "<p><h4>Language:</h4>{}</p>".format(format_str_or_list(ai_result["language_requirements"])),
        "<p><h4>Experience:</h4>{}</p>".format(format_str_or_list(ai_result["experience_requirements"])),
        # close text part
        "</div>",
        # close box
        "</div>",
    ]
    return " ".join(parts)
164
+
165
def filterout_jobs(jobs, job_filter, job_filter_negative):
    """Keep the rows whose title passes the keyword filters and that are not internships.

    `jobs` is iterated with iterrows(). A title passes when it contains none
    of `job_filter_negative` and at least one of `job_filter`. Matching is by
    lowercase substring, so a keyword such as "intern" also matches
    "international". Rows whose description renders as "null", "nan" or
    "None" are skipped. Each remaining row gets an "ai_result" entry from the
    model analysis and is kept only when is_an_internship == False.
    """
    missing_markers = ["null", "nan", "None"]
    kept = []
    for _, job in jobs.iterrows():
        title = job["title"].lower()
        if any(word in title for word in job_filter_negative):
            continue
        if not any(word in title for word in job_filter):
            continue
        if "{}".format(job["description"]) in missing_markers:
            continue
        job["ai_result"] = get_offer_information(job["company"], job["description"])
        # Equality with False is intentional: a missing/None flag is not kept.
        if job["ai_result"]["is_an_internship"] == False:
            kept.append(job)
    return kept
174
+
175
def html_format_page(jobs, job_filter, job_filter_negative):
    """Filter `jobs` and render the survivors as one complete HTML page."""
    page_head = "<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"
    cards = [html_format_job(job) for job in filterout_jobs(jobs, job_filter, job_filter_negative)]
    # Fragments are joined with single spaces: head, one card per job, tail.
    return " ".join([page_head, *cards, "</body></html>"])
182
+
183
+
184
def get_jobs(search_term, results_wanted):
    """Scrape full-time LinkedIn offers around Paris for `search_term`.

    Descriptions are fetched as well (linkedin_fetch_description=True).
    Returns whatever scrape_jobs returns for up to `results_wanted` results.
    """
    # No `hours_old` limit is passed, so results are not restricted by age.
    scrape_options = dict(
        site_name=["linkedin"],
        search_term=search_term,
        location="Paris, France",
        job_type="fulltime",
        results_wanted=results_wanted,
        linkedin_fetch_description=True,
        enforce_annual_salary=True,
    )
    return scrape_jobs(**scrape_options)
195
+
196
def linkedin_get_html():
    """Search LinkedIn and return the filtered results as an HTML page.

    Runs the fixed list of searches (50 results each), merges them, drops
    duplicate ids, sorts by posting date (newest first) and renders the
    offers that pass the title keyword filters.
    """
    searches = ['"content writer"', '"Digital Marketing"', "Communication", '"Business development"', "SEO"]
    frames = [get_jobs(term, 50) for term in searches]

    import pandas as pd
    merged = pd.concat(frames, ignore_index=True)
    deduplicated = merged.drop_duplicates(subset='id')
    jobs = deduplicated.sort_values(by='date_posted', ascending=False)

    # filter on the job title
    job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
    job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]

    return html_format_page(jobs, job_filter, job_filter_negative)
213
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ mistralai
2
+ jobspy
3
+ markdownify
4
+ beautifulsoup4