PaulMartrenchar committed on
Commit
ce02035
·
1 Parent(s): 768c1ef

Move HTML formatting and job filtering out of the job retrievers and into app.py

Browse files
Files changed (5) hide show
  1. WelcomeToTheJungle.py +2 -33
  2. ai_manager.py +4 -4
  3. app.py +48 -9
  4. jobspy_indeed.py +2 -40
  5. jobspy_linkedin.py +2 -38
WelcomeToTheJungle.py CHANGED
@@ -4,7 +4,6 @@ from datetime import datetime
4
  import warnings
5
  from bs4 import BeautifulSoup
6
  from markdownify import markdownify
7
- from ai_manager import get_offer_information
8
  from typing import List
9
  from JobDescription import JobDescription
10
 
@@ -29,25 +28,6 @@ def get_logo(job):
29
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
30
  return job["logo_photo_url"]
31
 
32
- def filterout_jobs(jobs, job_filter, job_filter_negative):
33
- selected_jobs = []
34
- for job in jobs:
35
- if not any(item in job["name"].lower() for item in job_filter_negative) and any(item in job["name"].lower() for item in job_filter):
36
- selected_jobs.append(job)
37
-
38
- return selected_jobs
39
-
40
- def html_format_page(jobs):
41
- result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
42
- if len(jobs) > 0:
43
- for job in jobs:
44
- if job.ai_result["is_an_internship"] == False:
45
- result.append(job.to_html())
46
- else:
47
- result.append("No job found")
48
- result.append("</body></html>")
49
- return " ".join(result)
50
-
51
  def get_jobs(search_term):
52
  headers = {
53
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
@@ -110,17 +90,13 @@ def get_jobs(search_term):
110
 
111
  return jobs
112
 
113
-
114
- def get_filtered_jobs(search_term)-> List[JobDescription]:
115
  unique_objects = get_jobs(search_term)
116
 
117
  jobs = sorted(unique_objects, key=lambda x: x["published_at"], reverse=True)
118
 
119
  #filter on the job description
120
- job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
121
- job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
122
-
123
- selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
124
 
125
  result = []
126
  for job in selected_jobs:
@@ -132,10 +108,3 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
132
  result.append(job_desc)
133
 
134
  return result
135
-
136
- def wtoj_get_html(search_term):
137
- jobs = get_filtered_jobs(search_term)
138
- for job in jobs:
139
- job.ai_result = get_offer_information(job.company, job.job_description)
140
-
141
- return html_format_page(jobs)
 
4
  import warnings
5
  from bs4 import BeautifulSoup
6
  from markdownify import markdownify
 
7
  from typing import List
8
  from JobDescription import JobDescription
9
 
 
28
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
29
  return job["logo_photo_url"]
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def get_jobs(search_term):
32
  headers = {
33
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
 
90
 
91
  return jobs
92
 
93
+ def wtoj_get_jobs(search_term)-> List[JobDescription]:
 
94
  unique_objects = get_jobs(search_term)
95
 
96
  jobs = sorted(unique_objects, key=lambda x: x["published_at"], reverse=True)
97
 
98
  #filter on the job description
99
+
 
 
 
100
 
101
  result = []
102
  for job in selected_jobs:
 
108
  result.append(job_desc)
109
 
110
  return result
 
 
 
 
 
 
 
ai_manager.py CHANGED
@@ -41,17 +41,17 @@ def _call_ai(prompt, json_mode):
41
 
42
  return chat_response.choices[0].message.content
43
 
44
- def get_offer_information(company, offer):
45
  try:
46
- return _get_offer_information(company, offer)
47
  except json.decoder.JSONDecodeError as e:
48
  #try again once
49
- return _get_offer_information(company, offer)
50
  except Exception as e:
51
  # Throw the error if it's not an SDKError
52
  raise
53
 
54
- def _get_offer_information(company, offer):
55
  prompt = """This is a job offer from the company '{}', make a JSON with this information:
56
  - company_description (string): a description of the company in less than 15 words.
57
  - position_summary (string): a summary of the role in 3 bullet points
 
41
 
42
  return chat_response.choices[0].message.content
43
 
44
+ def get_extra_information(company, offer):
45
  try:
46
+ return _get_extra_information(company, offer)
47
  except json.decoder.JSONDecodeError as e:
48
  #try again once
49
+ return _get_extra_information(company, offer)
50
  except Exception as e:
51
  # Throw the error if it's not an SDKError
52
  raise
53
 
54
+ def _get_extra_information(company, offer):
55
  prompt = """This is a job offer from the company '{}', make a JSON with this information:
56
  - company_description (string): a description of the company in less than 15 words.
57
  - position_summary (string): a summary of the role in 3 bullet points
app.py CHANGED
@@ -1,21 +1,60 @@
1
  import gradio as gr
2
  from datetime import datetime
 
 
3
 
4
- from jobspy_indeed import indeed_get_html
5
- from WelcomeToTheJungle import wtoj_get_html
6
- from jobspy_linkedin import linkedin_get_html
 
7
 
8
- def search_jobs(raw_search_term, platform):
 
 
 
 
 
 
 
 
 
 
 
9
  current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
10
- search_term = '"' + raw_search_term + '"'
11
  print(f"{current_datetime} - new search: {raw_search_term} on {platform}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  if platform == "Indeed":
13
- return indeed_get_html(search_term)
14
  elif platform == "Welcome to the jungle":
15
- return wtoj_get_html(search_term)
16
  elif platform == "LinkedIn":
17
- return linkedin_get_html(search_term)
18
- raise gr.Error("No platform selected")
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  search_textbox = gr.Radio(choices=["Content writer", "Digital Marketing", "Communication", "Business development", "SEO"], label="Search")
 
1
  import gradio as gr
2
  from datetime import datetime
3
+ from typing import List
4
+ from JobDescription import JobDescription
5
 
6
+ from jobspy_indeed import indeed_get_jobs
7
+ from WelcomeToTheJungle import wtoj_get_jobs
8
+ from jobspy_linkedin import linkedin_get_jobs
9
+ from ai_manager import get_extra_information
10
 
11
+ def html_format_page(jobs : List[JobDescription]):
12
+ result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
13
+ if len(jobs) > 0:
14
+ for job in jobs:
15
+ if job.ai_result["is_an_internship"] == False:
16
+ result.append(job.to_html())
17
+ else:
18
+ result.append("No job found")
19
+ result.append("</body></html>")
20
+ return " ".join(result)
21
+
22
+ def log_start(raw_search_term, platform):
23
  current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
24
  print(f"{current_datetime} - new search: {raw_search_term} on {platform}")
25
+
26
+ def filterout_jobs(jobs, job_filter, job_filter_negative):
27
+ selected_jobs = []
28
+ for job in jobs:
29
+ if not any(item in job["name"].lower() for item in job_filter_negative) and any(item in job["name"].lower() for item in job_filter):
30
+ selected_jobs.append(job)
31
+
32
+ return selected_jobs
33
+
34
+ def search_jobs(raw_search_term, platform):
35
+ log_start(raw_search_term, platform)
36
+ search_term = '"' + raw_search_term + '"'
37
+
38
+ jobs = []
39
  if platform == "Indeed":
40
+ jobs = indeed_get_jobs(search_term)
41
  elif platform == "Welcome to the jungle":
42
+ jobs = wtoj_get_jobs(search_term)
43
  elif platform == "LinkedIn":
44
+ jobs = linkedin_get_jobs(search_term)
45
+ else:
46
+ raise gr.Error("No platform selected")
47
+
48
+ job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
49
+ job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
50
+
51
+ selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
52
+
53
+ for job in selected_jobs:
54
+ job.ai_result = get_extra_information(job.company, job.job_description)
55
+
56
+ return html_format_page(jobs)
57
+
58
 
59
 
60
  search_textbox = gr.Radio(choices=["Content writer", "Digital Marketing", "Communication", "Business development", "SEO"], label="Search")
jobspy_indeed.py CHANGED
@@ -1,10 +1,6 @@
1
- import warnings
2
  from jobspy import scrape_jobs
3
  from typing import List
4
 
5
- warnings.filterwarnings("ignore")
6
-
7
- from ai_manager import get_offer_information
8
  from JobDescription import JobDescription
9
 
10
  def get_job_url(job):
@@ -30,25 +26,6 @@ def get_logo(job):
30
  except:
31
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
32
 
33
- def filterout_jobs(jobs, job_filter, job_filter_negative):
34
- selected_jobs = []
35
- for index, job in jobs.iterrows():
36
- if not any(item in job["title"].lower() for item in job_filter_negative) and any(item in job["title"].lower() for item in job_filter):
37
- selected_jobs.append(job)
38
-
39
- return selected_jobs
40
-
41
- def html_format_page(jobs : List[JobDescription]):
42
- result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
43
- if len(jobs) > 0:
44
- for job in jobs:
45
- if job.ai_result["is_an_internship"] == False:
46
- result.append(job.to_html())
47
- else:
48
- result.append("No job found")
49
- result.append("</body></html>")
50
- return " ".join(result)
51
-
52
  def get_jobs(search_term, results_wanted):
53
  return scrape_jobs(
54
  site_name=["indeed"],#, "linkedin", "glassdoor"],
@@ -63,17 +40,11 @@ def get_jobs(search_term, results_wanted):
63
  linkedin_fetch_description=False, # get more info such as full description, direct job url for linkedin (slower)
64
  )
65
 
66
- def get_filtered_jobs(search_term)-> List[JobDescription]:
67
  jobs = get_jobs(search_term, 50)
68
 
69
- #filter on the job description
70
- job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
71
- job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
72
-
73
- selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
74
-
75
  result = []
76
- for job in selected_jobs:
77
  job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
78
  job_description=job["description"])
79
  job_desc.published_at=job["date_posted"]
@@ -83,12 +54,3 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
83
 
84
  return result
85
 
86
-
87
-
88
- def indeed_get_html(search_term):
89
- jobs = get_filtered_jobs(search_term)
90
- for job in jobs:
91
- job.ai_result = get_offer_information(job.company, job.job_description)
92
-
93
- return html_format_page(jobs)
94
-
 
 
1
  from jobspy import scrape_jobs
2
  from typing import List
3
 
 
 
 
4
  from JobDescription import JobDescription
5
 
6
  def get_job_url(job):
 
26
  except:
27
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def get_jobs(search_term, results_wanted):
30
  return scrape_jobs(
31
  site_name=["indeed"],#, "linkedin", "glassdoor"],
 
40
  linkedin_fetch_description=False, # get more info such as full description, direct job url for linkedin (slower)
41
  )
42
 
43
+ def indeed_get_jobs(search_term)-> List[JobDescription]:
44
  jobs = get_jobs(search_term, 50)
45
 
 
 
 
 
 
 
46
  result = []
47
+ for job in jobs:
48
  job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
49
  job_description=job["description"])
50
  job_desc.published_at=job["date_posted"]
 
54
 
55
  return result
56
 
 
 
 
 
 
 
 
 
 
jobspy_linkedin.py CHANGED
@@ -1,12 +1,8 @@
1
- import warnings
2
  from typing import List
3
  from JobDescription import JobDescription
4
 
5
  from jobspy import scrape_jobs
6
 
7
- warnings.filterwarnings("ignore")
8
- from ai_manager import get_offer_information
9
-
10
  def get_job_url(job):
11
  if "{}".format(job["job_url_direct"]) in ["null", "nan", "None"]:
12
  return job["job_url"]
@@ -30,26 +26,6 @@ def get_logo(job):
30
  except:
31
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
32
 
33
- def filterout_jobs(jobs, job_filter, job_filter_negative):
34
- selected_jobs = []
35
- for index, job in jobs.iterrows():
36
- if not any(item in job["title"].lower() for item in job_filter_negative) and any(item in job["title"].lower() for item in job_filter) and "{}".format(job["description"]) not in ["null", "nan", "None"]:
37
- selected_jobs.append(job)
38
-
39
- return selected_jobs
40
-
41
- def html_format_page(jobs : List[JobDescription]):
42
- result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
43
- if len(jobs) > 0:
44
- for job in jobs:
45
- if job.ai_result["is_an_internship"] == False:
46
- result.append(job.to_html())
47
- else:
48
- result.append("No job found")
49
- result.append("</body></html>")
50
- return " ".join(result)
51
-
52
-
53
  def get_jobs(search_term, results_wanted):
54
  return scrape_jobs(
55
  site_name=["linkedin"],#, "linkedin", "glassdoor"],
@@ -62,17 +38,11 @@ def get_jobs(search_term, results_wanted):
62
  enforce_annual_salary=True,
63
  )
64
 
65
- def get_filtered_jobs(search_term)-> List[JobDescription]:
66
  jobs = get_jobs(search_term, 50)
67
 
68
- #filter on the job description
69
- job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
70
- job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
71
-
72
- selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
73
-
74
  result = []
75
- for job in selected_jobs:
76
  job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
77
  job_description=job["description"])
78
  job_desc.published_at=job["date_posted"]
@@ -82,10 +52,4 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
82
 
83
  return result
84
 
85
- def linkedin_get_html(search_term):
86
- jobs = get_filtered_jobs(search_term)
87
- for job in jobs:
88
- job.ai_result = get_offer_information(job.company, job.job_description)
89
-
90
- return html_format_page(jobs)
91
 
 
 
1
  from typing import List
2
  from JobDescription import JobDescription
3
 
4
  from jobspy import scrape_jobs
5
 
 
 
 
6
  def get_job_url(job):
7
  if "{}".format(job["job_url_direct"]) in ["null", "nan", "None"]:
8
  return job["job_url"]
 
26
  except:
27
  return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def get_jobs(search_term, results_wanted):
30
  return scrape_jobs(
31
  site_name=["linkedin"],#, "linkedin", "glassdoor"],
 
38
  enforce_annual_salary=True,
39
  )
40
 
41
+ def linkedin_get_jobs(search_term)-> List[JobDescription]:
42
  jobs = get_jobs(search_term, 50)
43
 
 
 
 
 
 
 
44
  result = []
45
+ for job in jobs:
46
  job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
47
  job_description=job["description"])
48
  job_desc.published_at=job["date_posted"]
 
52
 
53
  return result
54
 
 
 
 
 
 
 
55