MoRa2001 committed
Commit 2ef1980 · verified · 1 Parent(s): e240f5a

Upload 6 files

Files changed (7):
  1. .gitattributes +1 -0
  2. README-file.md +39 -0
  3. all.csv +3 -0
  4. app.py +62 -0
  5. functions.py +123 -0
  6. requirements.txt +9 -0
  7. run.sh +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ all.csv filter=lfs diff=lfs merge=lfs -text
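For context, the added line is exactly what `git lfs track` writes into .gitattributes; a minimal sketch of how this change is typically produced locally (assuming Git LFS is installed):

```bash
git lfs install          # one-time Git LFS setup per machine
git lfs track "all.csv"  # appends the filter line above to .gitattributes
git add .gitattributes all.csv
git commit -m "Upload 6 files"
```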
README-file.md ADDED
@@ -0,0 +1,39 @@
+ # JobMatcher-Intelligent-Job-Matching-System
+
+ JobMatcher automates the job search by scraping postings from LinkedIn, Indeed, Bayt, and Wuzzuf. It uses transformer models to summarize a user's CV and computes cosine similarity against the scraped listings, presenting personalized job recommendations.
+
+ ## Job Matcher
+
+ Job Matcher is a Python project that helps users find suitable job opportunities by scraping data from multiple job websites: LinkedIn, Indeed, Bayt, and Wuzzuf. The project performs data scraping, preprocessing, and feature engineering, consolidating the information into a single CSV file. Users upload their CV, which is summarized using a transformer model; the summary is then compared against the job data via cosine similarity to surface the most relevant opportunities.
+
+ ## Features
+
+ - **Data Scraping:** Collect job data from LinkedIn, Indeed, Bayt, and Wuzzuf.
+ - **Data Processing:** Apply preprocessing and feature engineering for improved data quality.
+ - **User CV Input:** Summarize user CVs using transformer models.
+ - **Job Matching:** Calculate cosine similarity to identify the most relevant job opportunities.
+
+ ## Deploying with FastAPI
+
+ 1. **Run the FastAPI application:**
+
+ ```bash
+ uvicorn fastapi_app:app --reload
+ ```
+
+ ## Connecting with the Streamlit App
+
+ 2. **Run the Streamlit app:**
+
+ ```bash
+ streamlit run streamlit_app.py
+ ```
+
+ ## Requirements
+
+ - **Python 3.8:** The project targets Python 3.8.
+ - **requirements.txt:** Install the project dependencies with:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
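The matching pipeline the README describes (summarize the CV, then rank jobs by cosine similarity over TF-IDF vectors, as implemented in functions.py below) can be seen in isolation; a minimal, self-contained sketch, with toy job rows standing in for all.csv:

```python
# Minimal sketch of the matching core: TF-IDF vectors + cosine similarity.
# The toy `jobs` list stands in for the scraped data in all.csv.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

jobs = [
    "Backend engineer: Python, Django, PostgreSQL",
    "Frontend developer: React.js, TypeScript",
    "Data scientist: scikit-learn, pandas, NLP",
]
cv_summary = "Python developer experienced with Django and pandas"

vectorizer = TfidfVectorizer(stop_words="english")
job_vects = vectorizer.fit_transform(jobs)    # one row per job posting
cv_vect = vectorizer.transform([cv_summary])  # one row for the CV summary

scores = cosine_similarity(cv_vect, job_vects).flatten()
for i in np.argsort(scores)[::-1]:            # best match first
    print(f"{scores[i]:.3f}  {jobs[i]}")
```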
all.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43ab2086422880bdf67ec6407af77ed6810b14775c38cb5fa3ac4e183c0764d9
+ size 17639766
app.py ADDED
@@ -0,0 +1,62 @@
+ import streamlit as st
+ from functions import extract_text_from_pdf, git_most_similar_job, generate_gemini_content
+ import os
+
+ #################### Web ############################################################
+ st.title("Jobs Suitable for Each CV")
+
+ info = """- The user uploads a CV in PDF format and chooses how many jobs to return;
+ we then display the jobs most suitable for that CV from the following recruitment sites:
+ LinkedIn, Wuzzuf, Indeed, and Bayt."""
+ st.write(f":blue[{info}]")
+
+ note = "Note: the jobs currently available are in the fields of programming and technology only."
+ st.write(f":red[{note}]")
+
+
+ # Set the overall layout width
+ st.markdown(
+     """
+     <style>
+     .dataframe {
+         max-width: 3000px;
+         margin: auto;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+ ###############################################################################################
+
+
+ # Main flow: upload a CV, choose how many jobs to return, then rank the listings
+ uploaded_file = st.file_uploader("Choose a CV file", "pdf")
+ number_of_jobs = st.number_input("Number of Jobs", min_value=1, step=1, max_value=2000)
+ submit = st.button("Get jobs")
+
+ if submit and uploaded_file and number_of_jobs:
+     st.subheader("The most recommended jobs are:")
+     pdf_content = uploaded_file.read()  # Read the uploaded file's bytes
+
+     pdf_text = extract_text_from_pdf(pdf_content)
+     cv_summary = generate_gemini_content(transcript_text=pdf_text)
+     data_df = git_most_similar_job(cv_summarize=cv_summary, number_of_jobs=number_of_jobs)
+
+
+
+     st.data_editor(
+         data_df,
+         column_config={
+             "job_link": st.column_config.LinkColumn(
+                 "Job Link",
+                 help=f"The top {number_of_jobs} job links",
+                 validate=r"^https?://",  # accept any http(s) link, not just *.streamlit.app
+                 max_chars=1000,
+             ),
+         },
+         hide_index=True,
+     )
+     st.success(f"The top {number_of_jobs} jobs")
+
+ else:
+     st.warning("Please upload a PDF file and enter the number of jobs.")
functions.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import tempfile
+ import fitz  # PyMuPDF
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ import pandas as pd
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from dotenv import load_dotenv
+ import google.generativeai as genai
+
+ load_dotenv()  # Load environment variables from .env
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+
+ def extract_text_from_pdf(pdf_content):
+     """
+     Extracts text content from a PDF file.
+     Parameters:
+     - pdf_content (bytes): Bytes-like object containing the content of the PDF file.
+     Returns:
+     - str: Extracted text content from the PDF file.
+     """
+     text = ''
+     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+         temp_file.write(pdf_content)
+         temp_path = temp_file.name
+
+     pdf_document = fitz.open(temp_path)
+     for page_number in range(pdf_document.page_count):
+         page = pdf_document[page_number]
+         text += page.get_text()
+
+     pdf_document.close()  # Close the PDF document explicitly
+     os.remove(temp_path)  # Remove the temporary file after use
+     return text.replace("\xa0", "")
+
+
+ def generate_gemini_content(transcript_text):
+     """
+     Generates a CV summary from the input text using Google's Gemini Pro model.
+     Parameters:
+     - transcript_text (str): Text to be summarized.
+     Returns:
+     - str: Generated summary.
+     """
+     prompt = """
+     Instructions:
+     Please provide a concise summary of your relevant experience, skills,
+     and qualifications in the field of programming and technology.
+     Highlight your practical experience, technological proficiencies, technical skills, soft skills,
+     proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
+     Additionally, include your location of residence and any other relevant details related to the programming industry
+     to facilitate accurate matching with job descriptions.
+     Example summary:
+     "Experienced software engineer with proficiency in Python, JavaScript, and Java.
+     Skilled in developing web applications using React.js and Django frameworks.
+     Strong problem-solving and communication skills. Located in New York City,
+     seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
+     CV is:
+     """
+     model = genai.GenerativeModel("gemini-pro")
+     response = model.generate_content(prompt + transcript_text)
+     return response.text
+
+
+ def git_indices(data, cv_vect, df_vect):
+     """
+     Computes cosine similarity between the CV vector and the job-description vectors.
+     Parameters:
+     - data (str): Input data (kept for signature compatibility; unused).
+     - cv_vect (scipy.sparse.csr_matrix): Vector representation of the CV summary (one row).
+     - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.
+     Returns:
+     - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
+     """
+     distances = cosine_similarity(cv_vect, df_vect).flatten()
+     indices = np.argsort(distances)[::-1]  # Most similar first
+     return indices
+
+
+ def fit_data(csv_path: str):
+     """
+     Reads job-description data from a CSV file and creates TF-IDF vectors.
+     Parameters:
+     - csv_path (str): Path to the CSV file containing job descriptions.
+     Returns:
+     - pandas.DataFrame: DataFrame containing job descriptions.
+     - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer.
+     - scipy.sparse.csr_matrix: TF-IDF vectors of the job descriptions.
+     """
+     df = pd.read_csv(csv_path)
+     x = df["concatenated_column"]
+
+     vectorizer = TfidfVectorizer(stop_words='english')
+
+     vectorizer.fit(x)
+     df_vect = vectorizer.transform(x)
+
+     return df, vectorizer, df_vect
+
+ df, vectorizer, df_vect = fit_data(os.path.join(os.getcwd(), "all.csv"))
+
+
+
+ def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
+     """
+     Finds the job descriptions most similar to the input CV summary.
+     Parameters:
+     - cv_summarize (str): Summary of the CV.
+     - number_of_jobs (int): Number of similar job descriptions to return.
+     Returns:
+     - pandas.DataFrame: DataFrame containing the most similar job descriptions.
+     """
+     cv_vect = vectorizer.transform([cv_summarize])
+     indices = git_indices(data=cv_summarize, cv_vect=cv_vect, df_vect=df_vect)
+
+     prediction_data = df.iloc[indices[:number_of_jobs]]
+
+     print("All done\n\n")  # Simple progress marker
+
+     return prediction_data
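As an aside, PyMuPDF can open a document directly from bytes, which would let `extract_text_from_pdf` skip the temporary file entirely; a hedged sketch of that variant (not part of this commit):

```python
import fitz  # PyMuPDF

def extract_text_from_pdf_bytes(pdf_content: bytes) -> str:
    """Hypothetical variant of extract_text_from_pdf without a temp file."""
    text = ""
    # fitz.open accepts an in-memory stream when given an explicit filetype
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
        for page in pdf_document:
            text += page.get_text()
    return text.replace("\xa0", "")
```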
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas==2.1.4
+ pandas-flavor==0.6.0
+ PyMuPDF==1.23.8
+ PyMuPDFb==1.23.7
+ pypdf==4.1.0
+ PyPDF2==3.0.1
+ scikit-learn==1.4.2
+ streamlit==1.34.0
+ google-generativeai==0.4.1
run.sh ADDED
@@ -0,0 +1 @@
+ streamlit run app.py
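For completeness, a typical local setup before invoking run.sh might look like the following (assuming Python 3.8 per the README, and a GOOGLE_API_KEY placed in a .env file as functions.py expects; the key value is a placeholder):

```bash
python3.8 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
echo "GOOGLE_API_KEY=<your-key>" > .env  # read by load_dotenv() in functions.py
bash run.sh                              # launches the Streamlit app
```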