|
import os
|
|
import tempfile
|
|
import fitz
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from dotenv import load_dotenv
|
|
import google.generativeai as genai
|
|
|
|
# Load variables from a local .env file (if present) into the process
# environment before any configuration below reads them.
load_dotenv()
|
|
# Configure the Gemini client with the key taken from the GOOGLE_API_KEY
# environment variable; os.getenv returns None when the variable is unset.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
|
|
|
def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.

    The bytes are written to a temporary file, the file is opened with
    PyMuPDF (fitz), and the text of every page is concatenated in page order.
    The document is closed and the temporary file is removed even when
    opening or reading the PDF raises.

    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.

    Returns:
    - str: Extracted text content from the PDF file, with non-breaking
      spaces (U+00A0) removed.
    """
    # delete=False: the file has to outlive its handle so that fitz can
    # reopen it by path; it is removed explicitly in the outer `finally`.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_path = temp_file.name
    try:
        # Close the handle before fitz opens the path.
        with temp_file:
            temp_file.write(pdf_content)

        pdf_document = fitz.open(temp_path)
        try:
            text = "".join(
                pdf_document[page_number].get_text()
                for page_number in range(pdf_document.page_count)
            )
        finally:
            pdf_document.close()
    finally:
        os.remove(temp_path)

    return text.replace("\xa0", "")
|
|
|
|
|
|
def generate_gemini_content(transcript_text):
    """
    Asks the "gemini-pro" generative model for a summary of the given CV text.

    A fixed instruction prompt (with an example summary) is prepended to the
    input and the combined text is sent to the model in a single request.

    Parameters:
    - transcript_text (str): Text to be summarized; appended to the prompt.

    Returns:
    - str: The text of the model's response.
    """
    instructions = """
Instructions:
Please provide a concise summary of your relevant experience, skills,
and qualifications in the field of programming and technology.
Highlight your practical experience, technological proficiencies, technical skills, soft skills,
proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
Additionally, include your location of residence and any other relevant details related to the programming industry
to facilitate accurate matching with job descriptions.
Example summary:
"Experienced software engineer with proficiency in Python, JavaScript, and Java.
Skilled in developing web applications using React.js and Django frameworks.
Strong problem-solving and communication skills. Located in New York City,
seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
CV is :
"""
    gemini = genai.GenerativeModel("gemini-pro")
    reply = gemini.generate_content(instructions + transcript_text)
    return reply.text
|
|
|
|
|
|
def git_indices(data, cv_vect, df_vect):
    """
    Ranks job descriptions by cosine similarity to a single CV vector.

    Only the first row of `cv_vect` is compared against `df_vect`.

    Parameters:
    - data (str): Input data. Not used by the computation; kept so that
      existing callers passing it (positionally or by keyword) keep working.
    - cv_vect: Vector representation of the input data; row 0 is used.
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.

    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
    """
    # cosine_similarity returns a (1, n_jobs) matrix; flatten it to 1-D.
    similarities = cosine_similarity(cv_vect[0], df_vect).flatten()
    # argsort is ascending, so reverse it to put the best match first.
    return np.argsort(similarities)[::-1]
|
|
|
|
|
|
def fit_data(csv_path: str):
    """
    Reads job description data from a CSV file and creates TF-IDF vectors.

    The text used for vectorization is taken from the "concatenated_column"
    column; English stop words are removed by the vectorizer.

    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions.

    Returns:
    - pandas.DataFrame: DataFrame containing job descriptions.
    - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer object.
    - scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions.

    Raises:
    - KeyError: If the CSV has no "concatenated_column" column.
    """
    df = pd.read_csv(csv_path)

    vectorizer = TfidfVectorizer(stop_words='english')
    # fit_transform is equivalent to fit() followed by transform() on the
    # same data, done in one pass.
    df_vect = vectorizer.fit_transform(df["concatenated_column"])

    return df, vectorizer, df_vect
|
|
|
|
# Build the job corpus once at import time: read "all.csv" from the current
# working directory and keep the DataFrame, the fitted vectorizer and the
# TF-IDF matrix as module-level globals (read by git_most_similar_job).
df, vectorizer, df_vect = fit_data(os.path.join(os.getcwd(), "all.csv") )
|
|
|
|
|
|
|
|
def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Returns the job rows that best match a CV summary.

    The summary is vectorized with the module-level `vectorizer`, ranked
    against the module-level `df_vect` via `git_indices`, and the leading
    `number_of_jobs` rows are selected from the module-level `df`.

    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.

    Returns:
    - pandas.DataFrame: Rows of `df` for the most similar job descriptions,
      best match first.
    """
    summary_vector = vectorizer.transform([cv_summarize])
    ranked = git_indices(
        data=cv_summarize,
        cv_vect=summary_vector,
        df_vect=df_vect,
    )

    # Keep only the leading entries of the similarity ranking.
    top_positions = ranked[:number_of_jobs]
    best_matches = df.iloc[top_positions]

    print("ALL Done \n\n")

    return best_matches