MoRa2001 committed on
Commit
cca1a92
·
verified ·
1 Parent(s): 3420843

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +123 -122
functions.py CHANGED
@@ -1,123 +1,124 @@
1
- import os
2
- import tempfile
3
- import fitz
4
- from sklearn.metrics.pairwise import cosine_similarity
5
- import numpy as np
6
- import pandas as pd
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from dotenv import load_dotenv
9
- import google.generativeai as genai
10
-
11
- load_dotenv() ## Load all the environment variables
12
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
13
-
14
- def extract_text_from_pdf(pdf_content):
15
- """
16
- Extracts text content from a PDF file.
17
- Parameters:
18
- - pdf_content (bytes): Bytes-like object containing the content of the PDF file.
19
- Returns:
20
- - str: Extracted text content from the PDF file.
21
- """
22
- text = ''
23
- with tempfile.NamedTemporaryFile(delete=False) as temp_file:
24
- temp_file.write(pdf_content)
25
- temp_path = temp_file.name
26
-
27
- pdf_document = fitz.open(temp_path)
28
- for page_number in range(pdf_document.page_count):
29
- page = pdf_document[page_number]
30
- text += page.get_text()
31
-
32
- pdf_document.close() # Close the PDF document explicitly
33
- os.remove(temp_path) # Remove the temporary file after use
34
- return str(text.replace("\xa0", ""))
35
-
36
-
37
- def generate_gemini_content(transcript_text):
38
- """
39
- Generates a summary based on the input text using Google's Gemini Pro model.
40
- Parameters:
41
- - transcript_text (str): Text to be summarized.
42
- Returns:
43
- - str: Generated summary.
44
- """
45
- prompt = """
46
- Instructions:
47
- Please provide a concise summary of your relevant experience, skills,
48
- and qualifications in the field of programming and technology.
49
- Highlight your practical experience, technological proficiencies, technical skills, soft skills,
50
- proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
51
- Additionally, include your location of residence and any other relevant details related to the programming industry
52
- to facilitate accurate matching with job descriptions.
53
- Example summary:
54
- "Experienced software engineer with proficiency in Python, JavaScript, and Java.
55
- Skilled in developing web applications using React.js and Django frameworks.
56
- Strong problem-solving and communication skills. Located in New York City,
57
- seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
58
- CV is :
59
- """
60
- model = genai.GenerativeModel("gemini-pro")
61
- response = model.generate_content(prompt + transcript_text)
62
- return response.text
63
-
64
-
65
- def git_indices(data, cv_vect, df_vect):
66
- """
67
- Computes cosine similarity between the vector representation of the input data and the vector representations of job descriptions.
68
- Parameters:
69
- - data (str): Input data.
70
- - cv_vect (numpy.ndarray): Vector representation of the input data.
71
- - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.
72
- Returns:
73
- - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
74
- """
75
- for i in range(0, len([data])):
76
- distances = cosine_similarity(cv_vect[i], df_vect).flatten()
77
- indices = np.argsort(distances)[::-1]
78
- return indices
79
-
80
-
81
- def fit_data(csv_path: str):
82
- """
83
- Reads and preprocesses job description data from a CSV file and creates TF-IDF vectors.
84
- Parameters:
85
- - csv_path (str): Path to the CSV file containing job descriptions.
86
- Returns:
87
- - pandas.DataFrame: DataFrame containing job descriptions.
88
- - sklearn.feature_extraction.text.TfidfVectorizer: TF-IDF vectorizer object.
89
- - scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions.
90
- """
91
- df = pd.read_csv(csv_path)
92
- x = df["concatenated_column"]
93
- y = df["label"]
94
-
95
- vectorizer = TfidfVectorizer(stop_words='english')
96
-
97
- vectorizer.fit(x)
98
- df_vect = vectorizer.transform(x)
99
-
100
- return df, vectorizer, df_vect
101
-
102
- df, vectorizer, df_vect = fit_data(os.path.join(os.getcwd(), "all.csv") )
103
-
104
-
105
-
106
- def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
107
- """
108
- Finds the most similar job descriptions to the input CV summary.
109
- Parameters:
110
- - cv_summarize (str): Summary of the CV.
111
- - number_of_jobs (int): Number of similar job descriptions to return.
112
- Returns:
113
- - pandas.DataFrame: DataFrame containing the most similar job descriptions.
114
- """
115
- cv_vect = vectorizer.transform([cv_summarize])
116
- indices = git_indices(data=cv_summarize, cv_vect=cv_vect, df_vect=df_vect)
117
-
118
- prediction_data = df.iloc[indices[:number_of_jobs]]
119
-
120
- # Check if all threads have finished
121
- print("ALL Done \n\n")
122
-
 
123
  return prediction_data
 
1
+ import os
2
+ import tempfile
3
+ import fitz
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from dotenv import load_dotenv
9
+ import google.generativeai as genai
10
+
11
# Load environment variables via python-dotenv, then configure the Gemini
# client with the key read from GOOGLE_API_KEY.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
13
+
14
def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.

    The bytes are written to a temporary file, the file is opened with
    PyMuPDF (``fitz``), and the text of every page is concatenated in page
    order. The temporary file is removed before returning, even when
    opening or parsing the document raises.

    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.

    Returns:
    - str: Extracted text content from the PDF file, with non-breaking
      space characters (U+00A0) removed.
    """
    # delete=False: the file has to outlive this handle so it can be
    # reopened by path below; cleanup is done explicitly in `finally`.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(pdf_content)

        # The context manager closes the document before the file is
        # removed, including when a page fails to parse.
        with fitz.open(temp_path) as pdf_document:
            text = "".join(page.get_text() for page in pdf_document)
    finally:
        os.remove(temp_path)

    return text.replace("\xa0", "")
35
+
36
+
37
def generate_gemini_content(transcript_text):
    """
    Summarizes CV text with Google's "gemini-pro" generative model.

    A fixed instruction prompt is placed in front of the supplied text and
    the combined string is sent to the model.

    Parameters:
    - transcript_text (str): Text to be summarized; appended to the prompt.

    Returns:
    - str: The ``text`` attribute of the model's response.
    """
    instructions = """
Instructions:
Please provide a concise summary of your relevant experience, skills,
and qualifications in the field of programming and technology.
Highlight your practical experience, technological proficiencies, technical skills, soft skills,
proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
Additionally, include your location of residence and any other relevant details related to the programming industry
to facilitate accurate matching with job descriptions.
Example summary:
"Experienced software engineer with proficiency in Python, JavaScript, and Java.
Skilled in developing web applications using React.js and Django frameworks.
Strong problem-solving and communication skills. Located in New York City,
seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
CV is :
"""
    gemini = genai.GenerativeModel("gemini-pro")
    request = instructions + transcript_text
    return gemini.generate_content(request).text
63
+
64
+
65
def git_indices(data, cv_vect, df_vect):
    """
    Ranks job descriptions by cosine similarity to the input vector.

    Parameters:
    - data (str): Input data. Not used in the computation; kept in the
      signature so existing callers that pass it keep working.
    - cv_vect: Vector representation of the input data. Only its first row
      (``cv_vect[0]``) is compared; expected to be 2-D, e.g. the output of
      ``TfidfVectorizer.transform`` on a one-element list.
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.

    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
    """
    # A single document is ranked per call, so the first row is the query.
    distances = cosine_similarity(cv_vect[0], df_vect).flatten()
    # argsort is ascending; reverse it to put the best match first.
    return np.argsort(distances)[::-1]
79
+
80
+
81
def fit_data(csv_path: str):
    """
    Reads job description data from a CSV file and creates TF-IDF vectors.

    The "concatenated_column" column is used as the text corpus and is
    removed from the returned DataFrame.

    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions.

    Returns:
    - pandas.DataFrame: DataFrame containing job descriptions, without
      "concatenated_column".
    - sklearn.feature_extraction.text.TfidfVectorizer: TF-IDF vectorizer
      fitted on the corpus (English stop words removed).
    - scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions.

    Raises:
    - KeyError: If the CSV has no "concatenated_column" column.
    """
    df = pd.read_csv(csv_path)
    # pop() hands back the text column and drops it from df in one step.
    corpus = df.pop("concatenated_column")

    vectorizer = TfidfVectorizer(stop_words='english')
    # Learn the vocabulary and vectorize the corpus in a single pass.
    df_vect = vectorizer.fit_transform(corpus)

    return df, vectorizer, df_vect
102
+
103
# Build the job DataFrame, vectorizer, and TF-IDF matrix once, when this
# module is executed, from "all.csv" in the current working directory.
df, vectorizer, df_vect = fit_data(
    os.path.join(os.getcwd(), "all.csv")
)
104
+
105
+
106
+
107
def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Finds the most similar job descriptions to the input CV summary.

    The summary is vectorized with the module-level ``vectorizer`` and
    ranked against the module-level ``df_vect``; the best-matching rows of
    the module-level ``df`` are returned.

    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.
      Values below zero are treated as zero.

    Returns:
    - pandas.DataFrame: DataFrame containing the most similar job
      descriptions, best match first.
    """
    cv_vect = vectorizer.transform([cv_summarize])
    indices = git_indices(data=cv_summarize, cv_vect=cv_vect, df_vect=df_vect)

    # A negative count would make the slice drop rows from the end of the
    # ranking instead of limiting it, so clamp it to zero.
    limit = max(number_of_jobs, 0)
    prediction_data = df.iloc[indices[:limit]]

    # Completion marker written to stdout once the ranking is ready.
    print("ALL Done \n\n")

    return prediction_data