Update functions.py
Browse files- functions.py +123 -122
functions.py
CHANGED
|
@@ -1,123 +1,124 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import tempfile
|
| 3 |
-
import fitz
|
| 4 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
-
import numpy as np
|
| 6 |
-
import pandas as pd
|
| 7 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
-
import google.generativeai as genai
|
| 10 |
-
|
| 11 |
-
load_dotenv() ## Load all the environment variables
|
| 12 |
-
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
| 13 |
-
|
| 14 |
-
def extract_text_from_pdf(pdf_content):
|
| 15 |
-
"""
|
| 16 |
-
Extracts text content from a PDF file.
|
| 17 |
-
Parameters:
|
| 18 |
-
- pdf_content (bytes): Bytes-like object containing the content of the PDF file.
|
| 19 |
-
Returns:
|
| 20 |
-
- str: Extracted text content from the PDF file.
|
| 21 |
-
"""
|
| 22 |
-
text = ''
|
| 23 |
-
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
| 24 |
-
temp_file.write(pdf_content)
|
| 25 |
-
temp_path = temp_file.name
|
| 26 |
-
|
| 27 |
-
pdf_document = fitz.open(temp_path)
|
| 28 |
-
for page_number in range(pdf_document.page_count):
|
| 29 |
-
page = pdf_document[page_number]
|
| 30 |
-
text += page.get_text()
|
| 31 |
-
|
| 32 |
-
pdf_document.close() # Close the PDF document explicitly
|
| 33 |
-
os.remove(temp_path) # Remove the temporary file after use
|
| 34 |
-
return str(text.replace("\xa0", ""))
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def generate_gemini_content(transcript_text):
|
| 38 |
-
"""
|
| 39 |
-
Generates a summary based on the input text using Google's Gemini Pro model.
|
| 40 |
-
Parameters:
|
| 41 |
-
- transcript_text (str): Text to be summarized.
|
| 42 |
-
Returns:
|
| 43 |
-
- str: Generated summary.
|
| 44 |
-
"""
|
| 45 |
-
prompt = """
|
| 46 |
-
Instructions:
|
| 47 |
-
Please provide a concise summary of your relevant experience, skills,
|
| 48 |
-
and qualifications in the field of programming and technology.
|
| 49 |
-
Highlight your practical experience, technological proficiencies, technical skills, soft skills,
|
| 50 |
-
proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
|
| 51 |
-
Additionally, include your location of residence and any other relevant details related to the programming industry
|
| 52 |
-
to facilitate accurate matching with job descriptions.
|
| 53 |
-
Example summary:
|
| 54 |
-
"Experienced software engineer with proficiency in Python, JavaScript, and Java.
|
| 55 |
-
Skilled in developing web applications using React.js and Django frameworks.
|
| 56 |
-
Strong problem-solving and communication skills. Located in New York City,
|
| 57 |
-
seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
|
| 58 |
-
CV is :
|
| 59 |
-
"""
|
| 60 |
-
model = genai.GenerativeModel("gemini-pro")
|
| 61 |
-
response = model.generate_content(prompt + transcript_text)
|
| 62 |
-
return response.text
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def git_indices(data, cv_vect, df_vect):
|
| 66 |
-
"""
|
| 67 |
-
Computes cosine similarity between the vector representation of the input data and the vector representations of job descriptions.
|
| 68 |
-
Parameters:
|
| 69 |
-
- data (str): Input data.
|
| 70 |
-
- cv_vect (numpy.ndarray): Vector representation of the input data.
|
| 71 |
-
- df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.
|
| 72 |
-
Returns:
|
| 73 |
-
- numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
|
| 74 |
-
"""
|
| 75 |
-
for i in range(0, len([data])):
|
| 76 |
-
distances = cosine_similarity(cv_vect[i], df_vect).flatten()
|
| 77 |
-
indices = np.argsort(distances)[::-1]
|
| 78 |
-
return indices
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def fit_data(csv_path: str):
|
| 82 |
-
"""
|
| 83 |
-
Reads and preprocesses job description data from a CSV file and creates TF-IDF vectors.
|
| 84 |
-
Parameters:
|
| 85 |
-
- csv_path (str): Path to the CSV file containing job descriptions.
|
| 86 |
-
Returns:
|
| 87 |
-
- pandas.DataFrame: DataFrame containing job descriptions.
|
| 88 |
-
- sklearn.feature_extraction.text.TfidfVectorizer: TF-IDF vectorizer object.
|
| 89 |
-
- scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions.
|
| 90 |
-
"""
|
| 91 |
-
df = pd.read_csv(csv_path)
|
| 92 |
-
x = df["concatenated_column"]
|
| 93 |
-
y = df["label"]
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
| 123 |
return prediction_data
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
import fitz
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import google.generativeai as genai
|
| 10 |
+
|
# Module import side effects: read the local .env file so GOOGLE_API_KEY is
# available, then configure the Gemini client with it.
# NOTE(review): if GOOGLE_API_KEY is unset, os.getenv returns None and the
# failure surfaces later at the first API call — confirm this is acceptable.
load_dotenv() ## Load all the environment variables
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.

    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.

    Returns:
    - str: Extracted text content from the PDF file, with non-breaking
      spaces ("\xa0") removed.
    """
    text = ''
    # fitz.open() is given a path here, so spill the bytes to a temporary file.
    # delete=False because the file is reopened by name after this handle
    # closes (required on Windows, where an open temp file cannot be reopened).
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(pdf_content)
        temp_path = temp_file.name

    # try/finally so the temp file is removed even when fitz fails to parse
    # the PDF — the original leaked the file on any exception.
    try:
        pdf_document = fitz.open(temp_path)
        try:
            for page_number in range(pdf_document.page_count):
                page = pdf_document[page_number]
                text += page.get_text()
        finally:
            pdf_document.close()  # Close the PDF document explicitly
    finally:
        os.remove(temp_path)  # Remove the temporary file after use
    return text.replace("\xa0", "")
| 35 |
+
|
| 36 |
+
|
def generate_gemini_content(transcript_text):
    """
    Generates a summary based on the input text using Google's Gemini Pro model.

    Parameters:
    - transcript_text (str): Text to be summarized (here, the extracted CV text).

    Returns:
    - str: Generated summary.

    Raises:
    - google.api_core exceptions / ValueError from the genai client on API
      failure or blocked content (response.text raises when no candidate
      text exists) — callers should be prepared for that.
    """
    # NOTE(review): the prompt is phrased as instructions to the CV's author,
    # but it is sent to the model with the CV appended after "CV is :" — the
    # model treats it as a summarization instruction. Confirm wording is
    # intentional before editing.
    prompt = """
    Instructions:
    Please provide a concise summary of your relevant experience, skills,
    and qualifications in the field of programming and technology.
    Highlight your practical experience, technological proficiencies, technical skills, soft skills,
    proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
    Additionally, include your location of residence and any other relevant details related to the programming industry
    to facilitate accurate matching with job descriptions.
    Example summary:
    "Experienced software engineer with proficiency in Python, JavaScript, and Java.
    Skilled in developing web applications using React.js and Django frameworks.
    Strong problem-solving and communication skills. Located in New York City,
    seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
    CV is :
    """
    model = genai.GenerativeModel("gemini-pro")
    response = model.generate_content(prompt + transcript_text)
    return response.text
| 63 |
+
|
| 64 |
+
|
def git_indices(data, cv_vect, df_vect):
    """
    Computes cosine similarity between the vector representation of the input
    data and the vector representations of job descriptions.

    Parameters:
    - data (str): Input data (unused; retained for backward compatibility
      with existing keyword-argument callers).
    - cv_vect: Vector representation of the input data (single-row matrix,
      e.g. from TfidfVectorizer.transform on a one-element list).
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.

    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order
      of similarity.
    """
    # The original wrapped this in `for i in range(0, len([data]))`, which
    # always executes exactly once (len([data]) == 1) — compute the single
    # similarity row directly. cv_vect[0] mirrors the original cv_vect[i].
    distances = cosine_similarity(cv_vect[0], df_vect).flatten()
    # argsort is ascending; reverse for most-similar-first ordering.
    return np.argsort(distances)[::-1]
| 79 |
+
|
| 80 |
+
|
def fit_data(csv_path: str):
    """
    Reads and preprocesses job description data from a CSV file and creates
    TF-IDF vectors.

    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions. Must
      contain a "concatenated_column" text column.

    Returns:
    - pandas.DataFrame: DataFrame containing job descriptions (with the raw
      "concatenated_column" text column dropped).
    - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer.
    - scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions, row-aligned
      with the returned DataFrame.
    """
    df = pd.read_csv(csv_path)
    # Grab the corpus before dropping the column; the Series stays valid.
    # (The original also read df["label"] into an unused variable — removed.)
    corpus = df["concatenated_column"]
    df.drop("concatenated_column", axis=1, inplace=True)

    vectorizer = TfidfVectorizer(stop_words='english')
    # fit + transform over the same corpus collapses to fit_transform.
    df_vect = vectorizer.fit_transform(corpus)

    return df, vectorizer, df_vect
# Import-time side effect: load and vectorize the whole job-description corpus
# once, shared by git_most_similar_job below.
# NOTE(review): assumes "all.csv" sits in the process's current working
# directory — TODO confirm this holds for every entry point that imports this
# module, otherwise import itself raises FileNotFoundError.
df, vectorizer, df_vect = fit_data(os.path.join(os.getcwd(), "all.csv") )
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Finds the most similar job descriptions to the input CV summary.

    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.

    Returns:
    - pandas.DataFrame: Rows of the module-level `df` for the top matches,
      ordered from most to least similar.
    """
    # Project the CV summary into the TF-IDF space fitted at import time.
    cv_vect = vectorizer.transform([cv_summarize])
    indices = git_indices(data=cv_summarize, cv_vect=cv_vect, df_vect=df_vect)

    # iloc on the ranked indices preserves most-similar-first ordering.
    # (Removed a leftover debug print and a stale comment about threads —
    # nothing in this module is threaded.)
    prediction_data = df.iloc[indices[:number_of_jobs]]

    return prediction_data