Changes in data preprocessing

#2
by Samay42 - opened
Files changed (1) hide show
  1. app.py +125 -41
app.py CHANGED
@@ -3,9 +3,14 @@ import fitz
3
  import google.generativeai as genai
4
  from dotenv import load_dotenv
5
  import os
 
 
 
 
6
 
7
  load_dotenv()
8
- # Initialize the Google Gemini model
 
9
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
10
  model = genai.GenerativeModel('gemini-1.5-flash')
11
 
@@ -17,6 +22,32 @@ def extract_text_from_pdf(file):
17
  text += page.get_text()
18
  return text
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def get_gemini_response(prompt):
21
  """Function to load Google Gemini model and provide queries as response."""
22
  response = model.generate_content([prompt])
@@ -24,34 +55,22 @@ def get_gemini_response(prompt):
24
 
25
  def analyze_resume(text):
26
  prompt = (
27
- "You are an expert resume analyst with a focus on extracting precise, relevant information. Analyze the following resume text and provide the following detailed outputs:\n\n"
28
-
29
- "1. **Summary**:\n"
30
- " - Provide a concise and accurate summary of the resume.\n"
31
- " - Focus on the candidate's most important foundational skills, key technical competencies, and significant work experiences.\n"
32
- " - Highlight their primary areas of interest, notable projects, and any certifications directly relevant to their field.\n"
33
- " - Ensure that the summary directly aligns with the key details provided in the resume, avoiding any interpretation or unnecessary extrapolation. Maintain a neutral, objective tone.\n\n"
34
-
35
- "2. **Percentage Distribution of Fields/Domains**:\n"
36
- " - Break down the resume content into specific fields or domains, with a detailed percentage distribution.\n"
37
- " - Extract and list relevant keywords from each field/domain.\n"
38
- " - Ensure that the total distribution sums up to 100%.\n"
39
- " - Keywords should be limited to standard technical terms or industry-specific phrases, with no addition of non-essential details.\n"
40
- " - Ensure that the percentage distribution and keywords are strictly based on the content present in the resume text.\n\n"
41
-
42
- "### Example Format:\n\n"
43
- "### Summary\n"
44
- "This resume highlights a candidate with strong foundational skills in [Key Fields/Domains]. They have demonstrated technical proficiency in [Primary Technical Skills] and possess significant experience in [Type of Experience (e.g., internships, projects)]. The candidate has completed notable projects such as [Relevant Projects] and holds certifications in [Relevant Certifications]. Their primary areas of interest include [Primary Interests], demonstrating a strong alignment with their technical expertise.\n\n"
45
-
46
- "### Percentage Distribution of Fields/Domains\n"
47
- "- **[Field/Domain 1]**: X%\n"
48
- " - Keywords: [Relevant Keywords]\n"
49
- "- **[Field/Domain 2]**: Y%\n"
50
- " - Keywords: [Relevant Keywords]\n"
51
- "- **[Field/Domain 3]**: Z%\n"
52
- " - Keywords: [Relevant Keywords]\n"
53
- )
54
-
55
  analysis = get_gemini_response(prompt + text)
56
  return analysis
57
 
@@ -64,7 +83,7 @@ def extract_domains_from_analysis(analysis):
64
  )
65
  domain_response = get_gemini_response(domain_prompt)
66
  return domain_response.strip()
67
-
68
  def generate_mcq_questions(analysis, selected_domain):
69
  num_questions = 20 # Default to 20 questions
70
  prompt_template = (
@@ -140,14 +159,17 @@ def generate_coding_questions(analysis, selected_domain):
140
  def generate_interview_questions(analysis, selected_domain):
141
  num_questions = 20 # Default to 20 questions
142
  prompt_template = (
143
- f"Based on the candidate's resume and the identified skills, experience, and education, generate a set of {num_questions} interview questions "
144
  f"that assess their fit for the position at our company. The questions should cover topics such as problem-solving abilities, leadership skills, "
145
  f"communication skills, cultural fit, etc. Additionally, include follow-up questions to probe deeper into the candidate's responses and evaluate their thought process. "
146
- f"The questions should be specific to the field of {selected_domain}."
 
 
147
  )
148
  questions = get_gemini_response(prompt_template)
149
  return questions
150
 
 
151
  def split_questions_answers(quiz_response):
152
  """Function that splits the questions and answers from the quiz response."""
153
  if "Answers:" in quiz_response:
@@ -165,18 +187,80 @@ def main():
165
  if 'selected_domain' not in st.session_state:
166
  st.session_state.selected_domain = "DSA"
167
 
 
 
 
 
 
 
168
  uploaded_file = st.file_uploader("Upload your resume PDF", type="pdf")
169
- if uploaded_file is not None:
 
170
  resume_text = extract_text_from_pdf(uploaded_file)
171
 
172
- st.write("Analyzing Resume...")
173
- analysis = analyze_resume(resume_text)
 
 
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  st.write("Analysis Result:")
176
- st.write(analysis)
177
 
178
  # Extract domains from the analysis
179
- domain_response = extract_domains_from_analysis(analysis)
180
  domains = [domain.strip() for domain in domain_response.split(',')]
181
  default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
182
 
@@ -199,11 +283,11 @@ def main():
199
  if st.button("Generate Questions"):
200
  if question_type == "Technical Round":
201
  if technical_subtype == "MCQs":
202
- quiz_response = generate_mcq_questions(analysis, st.session_state.selected_domain)
203
  else:
204
- quiz_response = generate_coding_questions(analysis, st.session_state.selected_domain)
205
  else:
206
- quiz_response = generate_interview_questions(analysis, st.session_state.selected_domain)
207
 
208
  questions, answers = split_questions_answers(quiz_response)
209
 
@@ -215,4 +299,4 @@ def main():
215
  st.write(answers)
216
 
217
  if __name__ == "__main__":
218
- main()
 
3
  import google.generativeai as genai
4
  from dotenv import load_dotenv
5
  import os
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.embeddings import HuggingFaceEmbeddings
9
+ from langchain.schema import Document
10
 
11
  load_dotenv()
12
+
13
+ # Initialize Google Gemini Model
14
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
15
  model = genai.GenerativeModel('gemini-1.5-flash')
16
 
 
22
  text += page.get_text()
23
  return text
24
 
25
def preprocess_and_chunk(resume_text):
    """Split the resume text into chunks and index them in a Chroma vector store.

    Args:
        resume_text: Plain text extracted from the resume.

    Returns:
        The Chroma vector store (collection "full_documents") that the chunks
        were added to. No documents are added when the text yields no chunks.
    """
    # The splitter works on Document objects, so wrap the raw text first.
    docs = [Document(page_content=resume_text)]

    # Split the text into chunks (chunk_size=1000).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    docs = text_splitter.split_documents(docs)

    # Embeddings are computed on CPU with the MiniLM sentence-transformer.
    vectorstore = Chroma(
        collection_name="full_documents",
        embedding_function=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        ),
    )

    # A resume with no extractable text (e.g. an image-only PDF) produces no
    # chunks. NOTE(review): Chroma is expected to reject an empty batch, so
    # skip the insert instead of letting the app crash -- confirm against the
    # installed chromadb version.
    if docs:
        vectorstore.add_documents(docs)

    return vectorstore
45
+
46
def retrieve_relevant_text(query, vectorstore):
    """Return the vector store's similarity-search matches for *query*."""
    return vectorstore.similarity_search(query)
50
+
51
  def get_gemini_response(prompt):
52
  """Function to load Google Gemini model and provide queries as response."""
53
  response = model.generate_content([prompt])
 
55
 
56
  def analyze_resume(text):
57
  prompt = (
58
+ "You are an expert resume analyst. Analyze the following resume text and provide the following:\n\n"
59
+ "1. A brief summary of the resume, including the candidate's main interests and the fields they are most passionate about.\n"
60
+ "2. A detailed percentage distribution of fields/domains present in the resume, with keywords extracted from the resume. Ensure the total sums up to 100%.\n"
61
+ "3. A overall explanation for each domain in brief according to the resume."
62
+ "Here is an example of how the output should look like:\n\n"
63
+ "### Summary\n"
64
+ "The resume indicates a strong background and interest in machine learning, data science, and software development. The candidate has worked on several projects involving machine learning algorithms, data preprocessing, and building software applications. They have demonstrated proficiency in Python, Java, and various machine learning frameworks. The candidate is passionate about solving complex problems using AI and has a keen interest in continuing to develop their skills in this area.\n\n"
65
+ "### Percentage Distribution of Fields/Domains\n"
66
+ "Note: only include standard technologies as keywords.\n"
67
+ "- *Machine Learning (ML)*: 40%\n"
68
+ " - Keywords: Algorithms, Keras, PyTorch, Scikit-Learn, Predictive Models\n"
69
+ "- *Data Science (DS)*: 30%\n"
70
+ " - Keywords: Data Analysis, Pandas, NumPy, Visualization, Statistical Methods\n"
71
+ "- *Software Development (SD)*: 30%\n"
72
+ " - Keywords: Python, Java, Software Engineering, APIs, Git\n"
73
+ )
 
 
 
 
 
 
 
 
 
 
 
 
74
  analysis = get_gemini_response(prompt + text)
75
  return analysis
76
 
 
83
  )
84
  domain_response = get_gemini_response(domain_prompt)
85
  return domain_response.strip()
86
+
87
  def generate_mcq_questions(analysis, selected_domain):
88
  num_questions = 20 # Default to 20 questions
89
  prompt_template = (
 
159
  def generate_interview_questions(analysis, selected_domain):
160
  num_questions = 20 # Default to 20 questions
161
  prompt_template = (
162
+ f"Based on the candidate's resume {analysis} and the identified skills, experience, and education, generate a set of {num_questions} interview questions "
163
  f"that assess their fit for the position at our company. The questions should cover topics such as problem-solving abilities, leadership skills, "
164
  f"communication skills, cultural fit, etc. Additionally, include follow-up questions to probe deeper into the candidate's responses and evaluate their thought process. "
165
+ f"Note:The questions should be specific to the field of {selected_domain}."
166
+ f"Questions should be both knowledge base and industry application level also which include practical application of knowledge but only based on {selected_domain}."
167
+ f"You need to take reference from the resume analysis but the questions generated should be strictly based on {selected_domain}."
168
  )
169
  questions = get_gemini_response(prompt_template)
170
  return questions
171
 
172
+
173
  def split_questions_answers(quiz_response):
174
  """Function that splits the questions and answers from the quiz response."""
175
  if "Answers:" in quiz_response:
 
187
  if 'selected_domain' not in st.session_state:
188
  st.session_state.selected_domain = "DSA"
189
 
190
+ if 'analysis' not in st.session_state:
191
+ st.session_state.analysis = None
192
+
193
+ if 'uploaded_file' not in st.session_state:
194
+ st.session_state.uploaded_file = None
195
+
196
  uploaded_file = st.file_uploader("Upload your resume PDF", type="pdf")
197
+ if uploaded_file is not None and uploaded_file != st.session_state.uploaded_file:
198
+ st.session_state.uploaded_file = uploaded_file
199
  resume_text = extract_text_from_pdf(uploaded_file)
200
 
201
+ with st.spinner("Analyzing Resume..."):
202
+ st.session_state.analysis = analyze_resume(resume_text)
203
+
204
+ st.write("Analysis Result:")
205
+ st.write(st.session_state.analysis)
206
 
207
+ # Extract domains from the analysis
208
+ domain_response = extract_domains_from_analysis(st.session_state.analysis)
209
+ domains = [domain.strip() for domain in domain_response.split(',')]
210
+ default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
211
+
212
+ def update_selected_domain():
213
+ st.session_state.selected_domain = st.session_state.domain_select
214
+
215
+ st.selectbox(
216
+ "Select Domain for Questions:",
217
+ default_domains,
218
+ key="domain_select",
219
+ index=default_domains.index(st.session_state.selected_domain) if st.session_state.selected_domain in default_domains else 0,
220
+ on_change=update_selected_domain
221
+ )
222
+
223
+ question_type = st.selectbox("Select Question Type:", ["Technical Round", "Interview Round"])
224
+
225
+ if question_type == "Technical Round":
226
+ technical_subtype = st.selectbox("Select Technical Round Type:", ["MCQs", "Coding Challenges"])
227
+
228
+ if st.button("Generate Questions"):
229
+ if question_type == "Technical Round":
230
+ if technical_subtype == "MCQs":
231
+ quiz_response = generate_mcq_questions(st.session_state.analysis, st.session_state.selected_domain)
232
+ else:
233
+ quiz_response = generate_coding_questions(st.session_state.analysis, st.session_state.selected_domain)
234
+ else:
235
+ quiz_response = generate_interview_questions(st.session_state.analysis, st.session_state.selected_domain)
236
+
237
+ questions, answers = split_questions_answers(quiz_response)
238
+
239
+ st.write("Generated Questions:")
240
+ st.write(questions)
241
+
242
+ if st.button("Show Answers"):
243
+ st.write("Answers:")
244
+ st.write(answers)
245
+
246
+ # Process and store resume text in vectorstore
247
+ vectorstore = preprocess_and_chunk(resume_text)
248
+
249
+ # Query for relevant text
250
+ query = st.text_input("Enter a query to search relevant text:", "")
251
+ if query:
252
+ with st.spinner("Retrieving relevant text..."):
253
+ relevant_texts = retrieve_relevant_text(query, vectorstore)
254
+ st.write("Relevant Texts:")
255
+ for text in relevant_texts:
256
+ st.write(text.page_content)
257
+
258
+ elif uploaded_file is not None and uploaded_file == st.session_state.uploaded_file:
259
  st.write("Analysis Result:")
260
+ st.write(st.session_state.analysis)
261
 
262
  # Extract domains from the analysis
263
+ domain_response = extract_domains_from_analysis(st.session_state.analysis)
264
  domains = [domain.strip() for domain in domain_response.split(',')]
265
  default_domains = ["DSA", "DBMS", "Programming Basics"] + domains
266
 
 
283
  if st.button("Generate Questions"):
284
  if question_type == "Technical Round":
285
  if technical_subtype == "MCQs":
286
+ quiz_response = generate_mcq_questions(st.session_state.analysis, st.session_state.selected_domain)
287
  else:
288
+ quiz_response = generate_coding_questions(st.session_state.analysis, st.session_state.selected_domain)
289
  else:
290
+ quiz_response = generate_interview_questions(st.session_state.analysis, st.session_state.selected_domain)
291
 
292
  questions, answers = split_questions_answers(quiz_response)
293
 
 
299
  st.write(answers)
300
 
301
  if __name__ == "__main__":
302
+ main()