changes in datapreprocessing
#2
by
Samay42
- opened
app.py
CHANGED
@@ -3,9 +3,14 @@ import fitz
|
|
3 |
import google.generativeai as genai
|
4 |
from dotenv import load_dotenv
|
5 |
import os
|
|
|
|
|
|
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
-
|
|
|
9 |
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
10 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
11 |
|
@@ -17,6 +22,32 @@ def extract_text_from_pdf(file):
|
|
17 |
text += page.get_text()
|
18 |
return text
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def get_gemini_response(prompt):
|
21 |
"""Function to load Google Gemini model and provide queries as response."""
|
22 |
response = model.generate_content([prompt])
|
@@ -24,34 +55,22 @@ def get_gemini_response(prompt):
|
|
24 |
|
25 |
def analyze_resume(text):
|
26 |
prompt = (
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
"### Summary\n"
|
44 |
-
"This resume highlights a candidate with strong foundational skills in [Key Fields/Domains]. They have demonstrated technical proficiency in [Primary Technical Skills] and possess significant experience in [Type of Experience (e.g., internships, projects)]. The candidate has completed notable projects such as [Relevant Projects] and holds certifications in [Relevant Certifications]. Their primary areas of interest include [Primary Interests], demonstrating a strong alignment with their technical expertise.\n\n"
|
45 |
-
|
46 |
-
"### Percentage Distribution of Fields/Domains\n"
|
47 |
-
"- **[Field/Domain 1]**: X%\n"
|
48 |
-
" - Keywords: [Relevant Keywords]\n"
|
49 |
-
"- **[Field/Domain 2]**: Y%\n"
|
50 |
-
" - Keywords: [Relevant Keywords]\n"
|
51 |
-
"- **[Field/Domain 3]**: Z%\n"
|
52 |
-
" - Keywords: [Relevant Keywords]\n"
|
53 |
-
)
|
54 |
-
|
55 |
analysis = get_gemini_response(prompt + text)
|
56 |
return analysis
|
57 |
|
@@ -64,7 +83,7 @@ def extract_domains_from_analysis(analysis):
|
|
64 |
)
|
65 |
domain_response = get_gemini_response(domain_prompt)
|
66 |
return domain_response.strip()
|
67 |
-
|
68 |
def generate_mcq_questions(analysis, selected_domain):
|
69 |
num_questions = 20 # Default to 20 questions
|
70 |
prompt_template = (
|
@@ -140,14 +159,17 @@ def generate_coding_questions(analysis, selected_domain):
|
|
140 |
def generate_interview_questions(analysis, selected_domain):
|
141 |
num_questions = 20 # Default to 20 questions
|
142 |
prompt_template = (
|
143 |
-
f"Based on the candidate's resume and the identified skills, experience, and education, generate a set of {num_questions} interview questions "
|
144 |
f"that assess their fit for the position at our company. The questions should cover topics such as problem-solving abilities, leadership skills, "
|
145 |
f"communication skills, cultural fit, etc. Additionally, include follow-up questions to probe deeper into the candidate's responses and evaluate their thought process. "
|
146 |
-
f"The questions should be specific to the field of {selected_domain}."
|
|
|
|
|
147 |
)
|
148 |
questions = get_gemini_response(prompt_template)
|
149 |
return questions
|
150 |
|
|
|
151 |
def split_questions_answers(quiz_response):
|
152 |
"""Function that splits the questions and answers from the quiz response."""
|
153 |
if "Answers:" in quiz_response:
|
@@ -165,18 +187,80 @@ def main():
|
|
165 |
if 'selected_domain' not in st.session_state:
|
166 |
st.session_state.selected_domain = "DSA"
|
167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
uploaded_file = st.file_uploader("Upload your resume PDF", type="pdf")
|
169 |
-
if uploaded_file is not None:
|
|
|
170 |
resume_text = extract_text_from_pdf(uploaded_file)
|
171 |
|
172 |
-
st.
|
173 |
-
|
|
|
|
|
|
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
st.write("Analysis Result:")
|
176 |
-
st.write(analysis)
|
177 |
|
178 |
# Extract domains from the analysis
|
179 |
-
domain_response = extract_domains_from_analysis(analysis)
|
180 |
domains = [domain.strip() for domain in domain_response.split(',')]
|
181 |
default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
|
182 |
|
@@ -199,11 +283,11 @@ def main():
|
|
199 |
if st.button("Generate Questions"):
|
200 |
if question_type == "Technical Round":
|
201 |
if technical_subtype == "MCQs":
|
202 |
-
quiz_response = generate_mcq_questions(analysis, st.session_state.selected_domain)
|
203 |
else:
|
204 |
-
quiz_response = generate_coding_questions(analysis, st.session_state.selected_domain)
|
205 |
else:
|
206 |
-
quiz_response = generate_interview_questions(analysis, st.session_state.selected_domain)
|
207 |
|
208 |
questions, answers = split_questions_answers(quiz_response)
|
209 |
|
@@ -215,4 +299,4 @@ def main():
|
|
215 |
st.write(answers)
|
216 |
|
217 |
if __name__ == "__main__":
|
218 |
-
main()
|
|
|
3 |
import google.generativeai as genai
|
4 |
from dotenv import load_dotenv
|
5 |
import os
|
6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
+
from langchain.vectorstores import Chroma
|
8 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
9 |
+
from langchain.schema import Document
|
10 |
|
11 |
load_dotenv()
|
12 |
+
|
13 |
+
# Initialize Google Gemini Model
|
14 |
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
15 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
16 |
|
|
|
22 |
text += page.get_text()
|
23 |
return text
|
24 |
|
25 |
+
def preprocess_and_chunk(resume_text):
    """Split the resume text into chunks and index them in a Chroma vector store.

    Args:
        resume_text: Plain text of the resume. ``None`` is treated as an
            empty string.

    Returns:
        A ``Chroma`` vector store bound to the "full_documents" collection,
        with the chunks produced from ``resume_text`` added to it. When the
        text yields no chunks the store is returned without adding anything.
    """
    # Wrap the raw text in a Document so the splitter can process it.
    source_docs = [Document(page_content=resume_text or "")]

    # Split the text into chunks of at most 1000 characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    chunks = text_splitter.split_documents(source_docs)

    # Create the vector store backed by a CPU sentence-transformers embedder.
    vectorstore = Chroma(
        collection_name="full_documents",
        embedding_function=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        ),
    )

    # A resume with no extractable text (e.g. a scanned PDF) produces zero
    # chunks; skip the insert instead of handing the store an empty batch.
    # NOTE(review): the underlying store is presumed to reject empty batches
    # -- confirm against the installed chromadb version.
    if chunks:
        vectorstore.add_documents(chunks)

    return vectorstore
|
45 |
+
|
46 |
+
def retrieve_relevant_text(query, vectorstore):
    """Run a similarity search for ``query`` against ``vectorstore``.

    Args:
        query: Search text handed to the store.
        vectorstore: Object exposing ``similarity_search(query)``.

    Returns:
        Whatever ``vectorstore.similarity_search(query)`` returns, unchanged.
    """
    return vectorstore.similarity_search(query)
|
50 |
+
|
51 |
def get_gemini_response(prompt):
|
52 |
"""Function to load Google Gemini model and provide queries as response."""
|
53 |
response = model.generate_content([prompt])
|
|
|
55 |
|
56 |
def analyze_resume(text):
    """Ask the Gemini model for a structured analysis of a resume.

    The prompt lists three requested outputs (a summary, a percentage
    distribution of fields/domains with keywords, and a brief per-domain
    explanation), shows an example of the expected output format, and then
    appends the resume text under an explicit "Resume text:" heading.

    Args:
        text: Resume text to analyze.

    Returns:
        The value returned by ``get_gemini_response`` for the built prompt.
    """
    prompt = (
        "You are an expert resume analyst. Analyze the following resume text and provide the following:\n\n"
        "1. A brief summary of the resume, including the candidate's main interests and the fields they are most passionate about.\n"
        "2. A detailed percentage distribution of fields/domains present in the resume, with keywords extracted from the resume. Ensure the total sums up to 100%.\n"
        # The trailing newlines keep item 3 from running into the example intro.
        "3. An overall explanation for each domain in brief according to the resume.\n\n"
        "Here is an example of how the output should look like:\n\n"
        "### Summary\n"
        "The resume indicates a strong background and interest in machine learning, data science, and software development. The candidate has worked on several projects involving machine learning algorithms, data preprocessing, and building software applications. They have demonstrated proficiency in Python, Java, and various machine learning frameworks. The candidate is passionate about solving complex problems using AI and has a keen interest in continuing to develop their skills in this area.\n\n"
        "### Percentage Distribution of Fields/Domains\n"
        "Note: only include standard technologies as keywords.\n"
        "- *Machine Learning (ML)*: 40%\n"
        " - Keywords: Algorithms, Keras, PyTorch, Scikit-Learn, Predictive Models\n"
        "- *Data Science (DS)*: 30%\n"
        " - Keywords: Data Analysis, Pandas, NumPy, Visualization, Statistical Methods\n"
        "- *Software Development (SD)*: 30%\n"
        " - Keywords: Python, Java, Software Engineering, APIs, Git\n"
        # Explicit delimiter so the model can tell the example output apart
        # from the actual resume that follows.
        "\nResume text:\n"
    )
    analysis = get_gemini_response(prompt + text)
    return analysis
|
76 |
|
|
|
83 |
)
|
84 |
domain_response = get_gemini_response(domain_prompt)
|
85 |
return domain_response.strip()
|
86 |
+
|
87 |
def generate_mcq_questions(analysis, selected_domain):
|
88 |
num_questions = 20 # Default to 20 questions
|
89 |
prompt_template = (
|
|
|
159 |
def generate_interview_questions(analysis, selected_domain):
    """Generate interview-round questions for a candidate in one domain.

    Builds a single prompt that embeds the resume analysis and the selected
    domain, then sends it to ``get_gemini_response``.

    Args:
        analysis: Resume analysis text, interpolated into the prompt.
        selected_domain: Field the questions must be restricted to.

    Returns:
        The value returned by ``get_gemini_response`` for the built prompt.
    """
    num_questions = 20  # Default to 20 questions
    # Every fragment except the last ends with a space: adjacent string
    # literals are concatenated verbatim, so a missing space fuses sentences.
    prompt_template = (
        f"Based on the candidate's resume {analysis} and the identified skills, experience, and education, generate a set of {num_questions} interview questions "
        "that assess their fit for the position at our company. The questions should cover topics such as problem-solving abilities, leadership skills, "
        "communication skills, cultural fit, etc. Additionally, include follow-up questions to probe deeper into the candidate's responses and evaluate their thought process. "
        f"Note: The questions should be specific to the field of {selected_domain}. "
        f"Questions should be both knowledge base and industry application level also which include practical application of knowledge but only based on {selected_domain}. "
        f"You need to take reference from the resume analysis but the questions generated should be strictly based on {selected_domain}."
    )
    questions = get_gemini_response(prompt_template)
    return questions
|
171 |
|
172 |
+
|
173 |
def split_questions_answers(quiz_response):
|
174 |
"""Function that splits the questions and answers from the quiz response."""
|
175 |
if "Answers:" in quiz_response:
|
|
|
187 |
if 'selected_domain' not in st.session_state:
|
188 |
st.session_state.selected_domain = "DSA"
|
189 |
|
190 |
+
if 'analysis' not in st.session_state:
|
191 |
+
st.session_state.analysis = None
|
192 |
+
|
193 |
+
if 'uploaded_file' not in st.session_state:
|
194 |
+
st.session_state.uploaded_file = None
|
195 |
+
|
196 |
uploaded_file = st.file_uploader("Upload your resume PDF", type="pdf")
|
197 |
+
if uploaded_file is not None and uploaded_file != st.session_state.uploaded_file:
|
198 |
+
st.session_state.uploaded_file = uploaded_file
|
199 |
resume_text = extract_text_from_pdf(uploaded_file)
|
200 |
|
201 |
+
with st.spinner("Analyzing Resume..."):
|
202 |
+
st.session_state.analysis = analyze_resume(resume_text)
|
203 |
+
|
204 |
+
st.write("Analysis Result:")
|
205 |
+
st.write(st.session_state.analysis)
|
206 |
|
207 |
+
# Extract domains from the analysis
|
208 |
+
domain_response = extract_domains_from_analysis(st.session_state.analysis)
|
209 |
+
domains = [domain.strip() for domain in domain_response.split(',')]
|
210 |
+
default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
|
211 |
+
|
212 |
+
def update_selected_domain():
|
213 |
+
st.session_state.selected_domain = st.session_state.domain_select
|
214 |
+
|
215 |
+
st.selectbox(
|
216 |
+
"Select Domain for Questions:",
|
217 |
+
default_domains,
|
218 |
+
key="domain_select",
|
219 |
+
index=default_domains.index(st.session_state.selected_domain) if st.session_state.selected_domain in default_domains else 0,
|
220 |
+
on_change=update_selected_domain
|
221 |
+
)
|
222 |
+
|
223 |
+
question_type = st.selectbox("Select Question Type:", ["Technical Round", "Interview Round"])
|
224 |
+
|
225 |
+
if question_type == "Technical Round":
|
226 |
+
technical_subtype = st.selectbox("Select Technical Round Type:", ["MCQs", "Coding Challenges"])
|
227 |
+
|
228 |
+
if st.button("Generate Questions"):
|
229 |
+
if question_type == "Technical Round":
|
230 |
+
if technical_subtype == "MCQs":
|
231 |
+
quiz_response = generate_mcq_questions(st.session_state.analysis, st.session_state.selected_domain)
|
232 |
+
else:
|
233 |
+
quiz_response = generate_coding_questions(st.session_state.analysis, st.session_state.selected_domain)
|
234 |
+
else:
|
235 |
+
quiz_response = generate_interview_questions(st.session_state.analysis, st.session_state.selected_domain)
|
236 |
+
|
237 |
+
questions, answers = split_questions_answers(quiz_response)
|
238 |
+
|
239 |
+
st.write("Generated Questions:")
|
240 |
+
st.write(questions)
|
241 |
+
|
242 |
+
if st.button("Show Answers"):
|
243 |
+
st.write("Answers:")
|
244 |
+
st.write(answers)
|
245 |
+
|
246 |
+
# Process and store resume text in vectorstore
|
247 |
+
vectorstore = preprocess_and_chunk(resume_text)
|
248 |
+
|
249 |
+
# Query for relevant text
|
250 |
+
query = st.text_input("Enter a query to search relevant text:", "")
|
251 |
+
if query:
|
252 |
+
with st.spinner("Retrieving relevant text..."):
|
253 |
+
relevant_texts = retrieve_relevant_text(query, vectorstore)
|
254 |
+
st.write("Relevant Texts:")
|
255 |
+
for text in relevant_texts:
|
256 |
+
st.write(text.page_content)
|
257 |
+
|
258 |
+
elif uploaded_file is not None and uploaded_file == st.session_state.uploaded_file:
|
259 |
st.write("Analysis Result:")
|
260 |
+
st.write(st.session_state.analysis)
|
261 |
|
262 |
# Extract domains from the analysis
|
263 |
+
domain_response = extract_domains_from_analysis(st.session_state.analysis)
|
264 |
domains = [domain.strip() for domain in domain_response.split(',')]
|
265 |
default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
|
266 |
|
|
|
283 |
if st.button("Generate Questions"):
|
284 |
if question_type == "Technical Round":
|
285 |
if technical_subtype == "MCQs":
|
286 |
+
quiz_response = generate_mcq_questions(st.session_state.analysis, st.session_state.selected_domain)
|
287 |
else:
|
288 |
+
quiz_response = generate_coding_questions(st.session_state.analysis, st.session_state.selected_domain)
|
289 |
else:
|
290 |
+
quiz_response = generate_interview_questions(st.session_state.analysis, st.session_state.selected_domain)
|
291 |
|
292 |
questions, answers = split_questions_answers(quiz_response)
|
293 |
|
|
|
299 |
st.write(answers)
|
300 |
|
301 |
if __name__ == "__main__":
|
302 |
+
main()
|