Spaces:
Sleeping
Sleeping
Commit
·
ce02035
1
Parent(s):
768c1ef
Move HTML formatting and job filtering out of the job retrievers and into app.py
Browse files- WelcomeToTheJungle.py +2 -33
- ai_manager.py +4 -4
- app.py +48 -9
- jobspy_indeed.py +2 -40
- jobspy_linkedin.py +2 -38
WelcomeToTheJungle.py
CHANGED
@@ -4,7 +4,6 @@ from datetime import datetime
|
|
4 |
import warnings
|
5 |
from bs4 import BeautifulSoup
|
6 |
from markdownify import markdownify
|
7 |
-
from ai_manager import get_offer_information
|
8 |
from typing import List
|
9 |
from JobDescription import JobDescription
|
10 |
|
@@ -29,25 +28,6 @@ def get_logo(job):
|
|
29 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
30 |
return job["logo_photo_url"]
|
31 |
|
32 |
-
def filterout_jobs(jobs, job_filter, job_filter_negative):
|
33 |
-
selected_jobs = []
|
34 |
-
for job in jobs:
|
35 |
-
if not any(item in job["name"].lower() for item in job_filter_negative) and any(item in job["name"].lower() for item in job_filter):
|
36 |
-
selected_jobs.append(job)
|
37 |
-
|
38 |
-
return selected_jobs
|
39 |
-
|
40 |
-
def html_format_page(jobs):
|
41 |
-
result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
|
42 |
-
if len(jobs) > 0:
|
43 |
-
for job in jobs:
|
44 |
-
if job.ai_result["is_an_internship"] == False:
|
45 |
-
result.append(job.to_html())
|
46 |
-
else:
|
47 |
-
result.append("No job found")
|
48 |
-
result.append("</body></html>")
|
49 |
-
return " ".join(result)
|
50 |
-
|
51 |
def get_jobs(search_term):
|
52 |
headers = {
|
53 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
|
@@ -110,17 +90,13 @@ def get_jobs(search_term):
|
|
110 |
|
111 |
return jobs
|
112 |
|
113 |
-
|
114 |
-
def get_filtered_jobs(search_term)-> List[JobDescription]:
|
115 |
unique_objects = get_jobs(search_term)
|
116 |
|
117 |
jobs = sorted(unique_objects, key=lambda x: x["published_at"], reverse=True)
|
118 |
|
119 |
#filter on the job description
|
120 |
-
|
121 |
-
job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
|
122 |
-
|
123 |
-
selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
|
124 |
|
125 |
result = []
|
126 |
for job in selected_jobs:
|
@@ -132,10 +108,3 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
|
|
132 |
result.append(job_desc)
|
133 |
|
134 |
return result
|
135 |
-
|
136 |
-
def wtoj_get_html(search_term):
|
137 |
-
jobs = get_filtered_jobs(search_term)
|
138 |
-
for job in jobs:
|
139 |
-
job.ai_result = get_offer_information(job.company, job.job_description)
|
140 |
-
|
141 |
-
return html_format_page(jobs)
|
|
|
4 |
import warnings
|
5 |
from bs4 import BeautifulSoup
|
6 |
from markdownify import markdownify
|
|
|
7 |
from typing import List
|
8 |
from JobDescription import JobDescription
|
9 |
|
|
|
28 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
29 |
return job["logo_photo_url"]
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
def get_jobs(search_term):
|
32 |
headers = {
|
33 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
|
|
|
90 |
|
91 |
return jobs
|
92 |
|
93 |
+
def wtoj_get_jobs(search_term)-> List[JobDescription]:
|
|
|
94 |
unique_objects = get_jobs(search_term)
|
95 |
|
96 |
jobs = sorted(unique_objects, key=lambda x: x["published_at"], reverse=True)
|
97 |
|
98 |
#filter on the job description
|
99 |
+
|
|
|
|
|
|
|
100 |
|
101 |
result = []
|
102 |
for job in selected_jobs:
|
|
|
108 |
result.append(job_desc)
|
109 |
|
110 |
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ai_manager.py
CHANGED
@@ -41,17 +41,17 @@ def _call_ai(prompt, json_mode):
|
|
41 |
|
42 |
return chat_response.choices[0].message.content
|
43 |
|
44 |
-
def
|
45 |
try:
|
46 |
-
return
|
47 |
except json.decoder.JSONDecodeError as e:
|
48 |
#try again once
|
49 |
-
return
|
50 |
except Exception as e:
|
51 |
# Throw the error if it's not an SDKError
|
52 |
raise
|
53 |
|
54 |
-
def
|
55 |
prompt = """This is a job offer from the company '{}', make a JSON with this information:
|
56 |
- company_description (string): a description of the company in less than 15 words.
|
57 |
- position_summary (string): a summary of the role in 3 bullet points
|
|
|
41 |
|
42 |
return chat_response.choices[0].message.content
|
43 |
|
44 |
+
def get_extra_information(company, offer):
    """Return the AI-extracted information for a job offer, retrying once.

    Delegates to ``_get_extra_information``. If that call raises
    ``json.decoder.JSONDecodeError`` the call is repeated exactly one more
    time.

    Args:
        company: Company name, forwarded unchanged.
        offer: Job offer text, forwarded unchanged.

    Returns:
        Whatever ``_get_extra_information`` returns.

    Raises:
        json.decoder.JSONDecodeError: If the second attempt also fails to
            decode.
        Exception: Any other error from ``_get_extra_information`` is
            propagated untouched (no retry).
    """
    try:
        return _get_extra_information(company, offer)
    except json.decoder.JSONDecodeError:
        # Try again once; a failure of this second attempt is not caught.
        return _get_extra_information(company, offer)
|
53 |
|
54 |
+
def _get_extra_information(company, offer):
|
55 |
prompt = """This is a job offer from the company '{}', make a JSON with this information:
|
56 |
- company_description (string): a description of the company in less than 15 words.
|
57 |
- position_summary (string): a summary of the role in 3 bullet points
|
app.py
CHANGED
@@ -1,21 +1,60 @@
|
|
1 |
import gradio as gr
|
2 |
from datetime import datetime
|
|
|
|
|
3 |
|
4 |
-
from jobspy_indeed import
|
5 |
-
from WelcomeToTheJungle import
|
6 |
-
from jobspy_linkedin import
|
|
|
7 |
|
8 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
10 |
-
search_term = '"' + raw_search_term + '"'
|
11 |
print(f"{current_datetime} - new search: {raw_search_term} on {platform}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
if platform == "Indeed":
|
13 |
-
|
14 |
elif platform == "Welcome to the jungle":
|
15 |
-
|
16 |
elif platform == "LinkedIn":
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
search_textbox = gr.Radio(choices=["Content writer", "Digital Marketing", "Communication", "Business development", "SEO"], label="Search")
|
|
|
1 |
import gradio as gr
|
2 |
from datetime import datetime
|
3 |
+
from typing import List
|
4 |
+
from JobDescription import JobDescription
|
5 |
|
6 |
+
from jobspy_indeed import indeed_get_jobs
|
7 |
+
from WelcomeToTheJungle import wtoj_get_jobs
|
8 |
+
from jobspy_linkedin import linkedin_get_jobs
|
9 |
+
from ai_manager import get_extra_information
|
10 |
|
11 |
+
def html_format_page(jobs: "List[JobDescription]"):
    """Render the given jobs as one standalone HTML page.

    A job is rendered through its ``to_html()`` method only when
    ``job.ai_result["is_an_internship"]`` equals ``False``. When nothing is
    left to display -- the input is empty, or every job was skipped -- the
    page body contains the text "No job found" instead of being empty.

    Args:
        jobs: Job descriptions exposing an ``ai_result`` mapping with an
            ``"is_an_internship"`` key and a ``to_html()`` method.

    Returns:
        The complete page as a single string, fragments joined by spaces.
    """
    page_head = "<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"
    # Deliberately `== False` rather than `not ...`: a flag without a
    # definite answer (e.g. None) keeps the job hidden, as it always was.
    rendered = [
        job.to_html()
        for job in jobs
        if job.ai_result["is_an_internship"] == False  # noqa: E712
    ]
    if not rendered:
        rendered = ["No job found"]
    return " ".join([page_head, *rendered, "</body></html>"])
|
21 |
+
|
22 |
+
def log_start(raw_search_term, platform):
    """Print a timestamped line announcing that a new search has started.

    The line has the form
    ``YYYY-MM-DD HH:MM:SS - new search: <raw_search_term> on <platform>``,
    using the current local time.

    Args:
        raw_search_term: Search term as received, before any quoting.
        platform: Name of the platform being searched.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{timestamp} - new search: {raw_search_term} on {platform}")
|
25 |
+
|
26 |
+
def filterout_jobs(jobs, job_filter, job_filter_negative):
    """Keep the jobs whose title matches the positive filter only.

    A job is kept when its lower-cased title contains at least one entry of
    ``job_filter`` and none of the entries of ``job_filter_negative``.
    Matching is plain substring containment, so the filter entries are
    expected to be lower-case already.

    Args:
        jobs: Iterable of jobs; each is either a mapping with a ``"name"``
            key or an object with a ``title`` attribute.
        job_filter: Substrings of which at least one must appear.
        job_filter_negative: Substrings of which none may appear.

    Returns:
        A new list with the kept jobs, in their original order.
    """
    selected_jobs = []
    for job in jobs:
        # Lower-case once per job instead of once per filter entry.
        title = _job_title(job).lower()
        if any(item in title for item in job_filter_negative):
            continue
        if any(item in title for item in job_filter):
            selected_jobs.append(job)
    return selected_jobs


def _job_title(job):
    """Return a job's title from a ``"name"`` key or a ``title`` attribute."""
    try:
        return job["name"]
    except (TypeError, KeyError):
        # NOTE(review): presumably JobDescription keeps its `title=`
        # constructor argument as `.title` -- confirm against the class.
        title = getattr(job, "title", None)
        if title is None:
            # Neither form is available: surface the original lookup error.
            raise
        return title
|
33 |
+
|
34 |
+
def search_jobs(raw_search_term, platform):
    """Search one platform and return the matching jobs as an HTML page.

    Steps: log the request, fetch the jobs from the selected platform, keep
    those whose title passes the keyword filters, attach the AI-extracted
    information to each kept job, and render the kept jobs.

    Args:
        raw_search_term: Search term as chosen by the user.
        platform: One of "Indeed", "Welcome to the jungle" or "LinkedIn".

    Returns:
        The HTML page produced by ``html_format_page``.

    Raises:
        gr.Error: If ``platform`` is none of the supported values.
    """
    log_start(raw_search_term, platform)
    # Wrapped in double quotes before being sent to the retrievers
    # (presumably so the platforms search for the exact phrase).
    search_term = '"' + raw_search_term + '"'

    if platform == "Indeed":
        jobs = indeed_get_jobs(search_term)
    elif platform == "Welcome to the jungle":
        jobs = wtoj_get_jobs(search_term)
    elif platform == "LinkedIn":
        jobs = linkedin_get_jobs(search_term)
    else:
        raise gr.Error("No platform selected")

    job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
    job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]

    selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)

    for job in selected_jobs:
        job.ai_result = get_extra_information(job.company, job.job_description)

    # Render the filtered list, not the raw one: only the selected jobs had
    # `ai_result` filled in above, and rendering reads it for every job.
    return html_format_page(selected_jobs)
|
57 |
+
|
58 |
|
59 |
|
60 |
search_textbox = gr.Radio(choices=["Content writer", "Digital Marketing", "Communication", "Business development", "SEO"], label="Search")
|
jobspy_indeed.py
CHANGED
@@ -1,10 +1,6 @@
|
|
1 |
-
import warnings
|
2 |
from jobspy import scrape_jobs
|
3 |
from typing import List
|
4 |
|
5 |
-
warnings.filterwarnings("ignore")
|
6 |
-
|
7 |
-
from ai_manager import get_offer_information
|
8 |
from JobDescription import JobDescription
|
9 |
|
10 |
def get_job_url(job):
|
@@ -30,25 +26,6 @@ def get_logo(job):
|
|
30 |
except:
|
31 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
32 |
|
33 |
-
def filterout_jobs(jobs, job_filter, job_filter_negative):
|
34 |
-
selected_jobs = []
|
35 |
-
for index, job in jobs.iterrows():
|
36 |
-
if not any(item in job["title"].lower() for item in job_filter_negative) and any(item in job["title"].lower() for item in job_filter):
|
37 |
-
selected_jobs.append(job)
|
38 |
-
|
39 |
-
return selected_jobs
|
40 |
-
|
41 |
-
def html_format_page(jobs : List[JobDescription]):
|
42 |
-
result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
|
43 |
-
if len(jobs) > 0:
|
44 |
-
for job in jobs:
|
45 |
-
if job.ai_result["is_an_internship"] == False:
|
46 |
-
result.append(job.to_html())
|
47 |
-
else:
|
48 |
-
result.append("No job found")
|
49 |
-
result.append("</body></html>")
|
50 |
-
return " ".join(result)
|
51 |
-
|
52 |
def get_jobs(search_term, results_wanted):
|
53 |
return scrape_jobs(
|
54 |
site_name=["indeed"],#, "linkedin", "glassdoor"],
|
@@ -63,17 +40,11 @@ def get_jobs(search_term, results_wanted):
|
|
63 |
linkedin_fetch_description=False, # get more info such as full description, direct job url for linkedin (slower)
|
64 |
)
|
65 |
|
66 |
-
def
|
67 |
jobs = get_jobs(search_term, 50)
|
68 |
|
69 |
-
#filter on the job description
|
70 |
-
job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
|
71 |
-
job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
|
72 |
-
|
73 |
-
selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
|
74 |
-
|
75 |
result = []
|
76 |
-
for job in
|
77 |
job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
|
78 |
job_description=job["description"])
|
79 |
job_desc.published_at=job["date_posted"]
|
@@ -83,12 +54,3 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
|
|
83 |
|
84 |
return result
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
def indeed_get_html(search_term):
|
89 |
-
jobs = get_filtered_jobs(search_term)
|
90 |
-
for job in jobs:
|
91 |
-
job.ai_result = get_offer_information(job.company, job.job_description)
|
92 |
-
|
93 |
-
return html_format_page(jobs)
|
94 |
-
|
|
|
|
|
1 |
from jobspy import scrape_jobs
|
2 |
from typing import List
|
3 |
|
|
|
|
|
|
|
4 |
from JobDescription import JobDescription
|
5 |
|
6 |
def get_job_url(job):
|
|
|
26 |
except:
|
27 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def get_jobs(search_term, results_wanted):
|
30 |
return scrape_jobs(
|
31 |
site_name=["indeed"],#, "linkedin", "glassdoor"],
|
|
|
40 |
linkedin_fetch_description=False, # get more info such as full description, direct job url for linkedin (slower)
|
41 |
)
|
42 |
|
43 |
+
def indeed_get_jobs(search_term)-> List[JobDescription]:
|
44 |
jobs = get_jobs(search_term, 50)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
result = []
|
47 |
+
for job in jobs:
|
48 |
job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
|
49 |
job_description=job["description"])
|
50 |
job_desc.published_at=job["date_posted"]
|
|
|
54 |
|
55 |
return result
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
jobspy_linkedin.py
CHANGED
@@ -1,12 +1,8 @@
|
|
1 |
-
import warnings
|
2 |
from typing import List
|
3 |
from JobDescription import JobDescription
|
4 |
|
5 |
from jobspy import scrape_jobs
|
6 |
|
7 |
-
warnings.filterwarnings("ignore")
|
8 |
-
from ai_manager import get_offer_information
|
9 |
-
|
10 |
def get_job_url(job):
|
11 |
if "{}".format(job["job_url_direct"]) in ["null", "nan", "None"]:
|
12 |
return job["job_url"]
|
@@ -30,26 +26,6 @@ def get_logo(job):
|
|
30 |
except:
|
31 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
32 |
|
33 |
-
def filterout_jobs(jobs, job_filter, job_filter_negative):
|
34 |
-
selected_jobs = []
|
35 |
-
for index, job in jobs.iterrows():
|
36 |
-
if not any(item in job["title"].lower() for item in job_filter_negative) and any(item in job["title"].lower() for item in job_filter) and "{}".format(job["description"]) not in ["null", "nan", "None"]:
|
37 |
-
selected_jobs.append(job)
|
38 |
-
|
39 |
-
return selected_jobs
|
40 |
-
|
41 |
-
def html_format_page(jobs : List[JobDescription]):
|
42 |
-
result = ["<html><head><style>.job{display: flex;width:70%;margin: 5px auto;border: 1px solid;border-radius: 5px;}.logobox{flex: 1;display: flex;align-items: center;justify-content: center;}.logo{width:100px;height:100px}h4{margin: 2px;}</style></head><body>"]
|
43 |
-
if len(jobs) > 0:
|
44 |
-
for job in jobs:
|
45 |
-
if job.ai_result["is_an_internship"] == False:
|
46 |
-
result.append(job.to_html())
|
47 |
-
else:
|
48 |
-
result.append("No job found")
|
49 |
-
result.append("</body></html>")
|
50 |
-
return " ".join(result)
|
51 |
-
|
52 |
-
|
53 |
def get_jobs(search_term, results_wanted):
|
54 |
return scrape_jobs(
|
55 |
site_name=["linkedin"],#, "linkedin", "glassdoor"],
|
@@ -62,17 +38,11 @@ def get_jobs(search_term, results_wanted):
|
|
62 |
enforce_annual_salary=True,
|
63 |
)
|
64 |
|
65 |
-
def
|
66 |
jobs = get_jobs(search_term, 50)
|
67 |
|
68 |
-
#filter on the job description
|
69 |
-
job_filter = ["marketing", "communication", "community", "business development", "experience", "social media", "brand", "ppc", "seo", "sea", "ads", "user acquisition", "adops", "consultant"]
|
70 |
-
job_filter_negative = ["stage", "stagiaire", "alternant", "alternance", "intern", "internship", "apprenti"]
|
71 |
-
|
72 |
-
selected_jobs = filterout_jobs(jobs, job_filter, job_filter_negative)
|
73 |
-
|
74 |
result = []
|
75 |
-
for job in
|
76 |
job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
|
77 |
job_description=job["description"])
|
78 |
job_desc.published_at=job["date_posted"]
|
@@ -82,10 +52,4 @@ def get_filtered_jobs(search_term)-> List[JobDescription]:
|
|
82 |
|
83 |
return result
|
84 |
|
85 |
-
def linkedin_get_html(search_term):
|
86 |
-
jobs = get_filtered_jobs(search_term)
|
87 |
-
for job in jobs:
|
88 |
-
job.ai_result = get_offer_information(job.company, job.job_description)
|
89 |
-
|
90 |
-
return html_format_page(jobs)
|
91 |
|
|
|
|
|
1 |
from typing import List
|
2 |
from JobDescription import JobDescription
|
3 |
|
4 |
from jobspy import scrape_jobs
|
5 |
|
|
|
|
|
|
|
6 |
def get_job_url(job):
|
7 |
if "{}".format(job["job_url_direct"]) in ["null", "nan", "None"]:
|
8 |
return job["job_url"]
|
|
|
26 |
except:
|
27 |
return "https://e7.pngegg.com/pngimages/153/807/png-clipart-timer-clock-computer-icons-unknown-planet-digital-clock-time.png"
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def get_jobs(search_term, results_wanted):
|
30 |
return scrape_jobs(
|
31 |
site_name=["linkedin"],#, "linkedin", "glassdoor"],
|
|
|
38 |
enforce_annual_salary=True,
|
39 |
)
|
40 |
|
41 |
+
def linkedin_get_jobs(search_term)-> List[JobDescription]:
|
42 |
jobs = get_jobs(search_term, 50)
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
result = []
|
45 |
+
for job in jobs:
|
46 |
job_desc = JobDescription(title=job["title"], company=job["company"], url=get_job_url(job), company_url=get_company_url(job),
|
47 |
job_description=job["description"])
|
48 |
job_desc.published_at=job["date_posted"]
|
|
|
52 |
|
53 |
return result
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|