import streamlit as st
import pandas as pd
import PyPDF2
import os
from google.oauth2 import service_account
import gspread
from pydantic import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import time
import re
# ──────────────────────────────────────────────────────────────────────────────
# 1) ENVIRONMENT VARIABLES / SECRETS
#
# On Hugging Face Spaces:
#   - Go to your Space's Settings → Secrets and add:
#       • OPENAI_API_KEY = your-openai-key
#       • GOOGLE_API_KEY = your-google-key (if you use any Google LLM)
#   - If you also need a Google Service Account JSON, either:
#       a) Commit it (careful: the repo is public by default – only do so if it's non-sensitive!),
#       b) Or add it as "Repository Files" via the "Files & versions" tab,
#       c) Or load it from a Secret (a sketch of this appears below, after
#          initialize_google_sheets).
#
# In the code below, we assume the service-account JSON is committed under:
#   └─ synapse-recruitment-34e7b48899b4.json
#
# If you instead want to load it from a single-line environment variable
# (requires `import json`), you can do:
#   service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON"))
#   creds = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
#
# For now, we simply use:
#   SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
#
# and expect that file to be present at the top level of your repo/Space.
# ──────────────────────────────────────────────────────────────────────────────
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not OPENAI_API_KEY:
    st.warning("⚠️ OPENAI_API_KEY is not set. The LLM calls will fail unless you add it under Secrets.")
# ──────────────────────────────────────────────────────────────────────────────
# 2) Pydantic models for structured output
# ──────────────────────────────────────────────────────────────────────────────
class CandidateProfile(BaseModel):
    name: str = Field(description="Name of the candidate")
    location: str = Field(description="The location of the candidate.")
    skills: List[str] = Field(description="List of individual skills of the candidate")
    ideal_jobs: str = Field(description="Description of ideal jobs for the candidate based on past experience.")
    yoe: str = Field(description="Years of experience of the candidate.")
    experience: str = Field(description="A brief summary of the candidate's past experience.")
class Job(BaseModel):
job_title: str = Field(description="The title of the job.")
company: str = Field(description="The company offering the job.")
location: str = Field(description="The location of the job.")
skills: List[str] = Field(description="List of skills required for the job.")
description: str = Field(description="A brief description of the job.")
relevance_score: float = Field(description="Relevance score of the job to the candidate's resume.")
    justification: str = Field(description="Reason for giving this relevance score and which areas the candidate needs to improve.")
# ──────────────────────────────────────────────────────────────────────────────
# 3) Helper: parse a comma-separated "Tech Stack" string into a Python set
# ──────────────────────────────────────────────────────────────────────────────
def parse_tech_stack(stack):
if pd.isna(stack) or stack == "" or stack is None:
return set()
if isinstance(stack, set):
return stack
try:
# If it's literally a Python‐set string like "{'python','django'}"
if isinstance(stack, str) and stack.startswith("{") and stack.endswith("}"):
items = stack.strip("{}").split(",")
return set(item.strip().strip("'\"").lower() for item in items if item.strip())
# Otherwise assume comma‐separated values
return set(s.strip().lower() for s in str(stack).split(",") if s.strip())
except Exception as e:
st.error(f"Error parsing tech stack: {e}")
return set()
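# Example (illustrative): both parse_tech_stack("{'Python', 'Django'}") and
# parse_tech_stack("Python, Django") return {"python", "django"}.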
# ──────────────────────────────────────────────────────────────────────────────
# 4) Google Sheets initialization (Service Account JSON must be present in repo)
# ──────────────────────────────────────────────────────────────────────────────
def initialize_google_sheets():
SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
if not os.path.exists(SERVICE_ACCOUNT_FILE):
st.error(f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'.\n"
"Either commit it into the repo or load from a Secret.")
return None
try:
creds = service_account.Credentials.from_service_account_file(
SERVICE_ACCOUNT_FILE, scopes=SCOPES
)
return gspread.authorize(creds)
except Exception as e:
st.error(f"Failed to load Google Service Account credentials: {e}")
return None
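# Alternative loader, per option (c) in the header notes: build the credentials
# from a single-line JSON secret instead of a committed key file. This is a
# minimal sketch; the GOOGLE_SERVICE_ACCOUNT_JSON secret name is an assumption
# and this function is not called elsewhere in the app.
def initialize_google_sheets_from_env():
    import json  # local import; only needed for this optional path
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
    raw = os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON", "")
    if not raw:
        st.error("GOOGLE_SERVICE_ACCOUNT_JSON is not set.")
        return None
    try:
        info = json.loads(raw)
        creds = service_account.Credentials.from_service_account_info(info, scopes=SCOPES)
        return gspread.authorize(creds)
    except Exception as e:
        st.error(f"Failed to load credentials from the environment: {e}")
        return None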
def load_jobs_data():
gc = initialize_google_sheets()
if gc is None:
return None
try:
# NOTE: Replace this key with your actual spreadsheet key
SPREADSHEET_KEY = "1BZlvbtFyiQ9Pgr_lpepDJua1ZeVEqrCLjssNd6OiG9k"
worksheet = gc.open_by_key(SPREADSHEET_KEY).worksheet("paraform_jobs_formatted")
all_values = worksheet.get_all_values()
if not all_values or len(all_values) < 2:
st.warning("No data found in the Jobs sheet.")
return None
        df = pd.DataFrame(all_values[1:], columns=all_values[0]).fillna("")
        if "Tech Stack" not in df.columns:
            st.error("The Jobs sheet is missing the expected 'Tech Stack' column.")
            return None
        # Add a "parsed_stack" column so we can pre-filter by skill overlap
        df["parsed_stack"] = df["Tech Stack"].apply(parse_tech_stack)
return df
except Exception as e:
st.error(f"Error loading jobs data from Google Sheets: {e}")
return None
# ──────────────────────────────────────────────────────────────────────────────
# 5) PDF → plain text
# ──────────────────────────────────────────────────────────────────────────────
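# Note: upstream development of PyPDF2 has moved to the pypdf package; the
# PdfReader / extract_text API used below is the same in both, so only the
# import would change if you migrate.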
def extract_text_from_pdf(pdf_file):
try:
reader = PyPDF2.PdfReader(pdf_file)
full_text = ""
for page in reader.pages:
text = page.extract_text()
if text:
full_text += text + "\n"
return full_text
except Exception as e:
st.error(f"Failed to read PDF: {e}")
return ""
# ──────────────────────────────────────────────────────────────────────────────
# 6) Call GPT‐4o‐mini to extract structured fields from resume text
# ──────────────────────────────────────────────────────────────────────────────
def structure_resume_data(resume_text: str) -> CandidateProfile:
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.0,
        max_retries=2,
    )
    sum_llm = llm.with_structured_output(CandidateProfile)
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helper that extracts structured data from a resume."),
        ("human", "Extract the following fields from this resume:\n{resume_text}\n"
                  "If any field is missing, return 'Unknown'.")
    ])
    try:
        parsed = (prompt | sum_llm).invoke({"resume_text": resume_text})
        return parsed
    except Exception as e:
        st.error(f"Failed to extract structure from resume: {e}")
        # Return a fallback with 'Unknown' fields
        return CandidateProfile(
            name="Unknown",
            location="Unknown",
            skills=[],
            ideal_jobs="Unknown",
            yoe="Unknown",
            experience="Unknown"
        )
# ──────────────────────────────────────────────────────────────────────────────
# 7) Evaluate jobs: pre-filter by requiring at least two overlapping skills,
#    then run an LLM loop (with a "Stop" check on each iteration)
# ──────────────────────────────────────────────────────────────────────────────
def eval_jobs(jobs_df: pd.DataFrame, resume_text: str) -> pd.DataFrame:
"""
1) Extract candidate info (list of skills, etc.)
2) Build a skill‐set from response.skills
3) Pre‐filter all jobs so that job’s Tech Stack has β‰₯2 skills in common
4) For that filtered subset, run an LLM evaluation loop
– on each iteration, check `st.session_state.evaluation_running`:
if it has become False, break out immediately.
5) Return a DataFrame of top‐10 results (or empty if none).
"""
response = structure_resume_data(resume_text)
candidate_skills = set(skill.lower() for skill in response.skills)
    # Count how many of the candidate's skills appear in each job's tech stack,
    # reusing the skill sets already parsed in load_jobs_data.
    jobs_df["matching_skills"] = jobs_df["parsed_stack"].apply(
        lambda job_skills: len(candidate_skills & job_skills)
    )
filtered = jobs_df[jobs_df["matching_skills"] >= 2].copy()
if filtered.empty:
st.warning("No jobs passed the 2-skill pre-filter.")
return pd.DataFrame()
# Build a candidate_text blob for the LLM to consume
candidate_text = (
f"{response.name} {response.location} "
f"{', '.join(response.skills)} {response.ideal_jobs} "
f"{response.yoe} {response.experience}"
)
# LLM setup for job‐evaluation
llm = ChatOpenAI(
model="gpt-4o-mini",
temperature=0.0,
max_retries=2,
)
eval_llm = llm.with_structured_output(Job)
system_msg = (
"You are an expert recruiter. First, filter by location & experience. "
"Then pick jobs that match the candidate’s skills & background. "
"Finally, assign a relevance score (0–10)."
)
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_msg),
        ("human", "Evaluate Job: {job_text}\nCandidate: {candidate_text}\n"
                  "Return JSON with job_title, company, location, skills, description, "
                  "relevance_score, justification.")
    ])
chain = prompt | eval_llm
jobs_for_eval = filtered[["Company", "Role", "Locations", "parsed_stack", "YOE", "matching_skills"]]
results = []
progress_bar = st.progress(0)
status_text = st.empty()
total = len(jobs_for_eval)
for i, row in enumerate(jobs_for_eval.itertuples(), start=1):
        # If the user clicked "Stop Evaluation" → evaluation_running = False
if not st.session_state.evaluation_running:
status_text.text("⏸️ Evaluation halted by user.")
break
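        # In practice, clicking any widget makes Streamlit interrupt the current
        # script run and rerun from the top, so a mid-run Stop click usually
        # ends this loop via the rerun rather than via the flag check above.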
progress_bar.progress(i / total)
status_text.text(f"Evaluating job {i}/{total}: {row.Role} at {row.Company}")
job_text = " ".join([
row.Role,
row.Company,
row.Locations,
", ".join(row.parsed_stack),
str(row.YOE)
])
try:
eval_job = chain.invoke({
"job_text": job_text,
"candidate_text": candidate_text
})
except Exception as e:
st.error(f"LLM failed on job #{i}: {e}")
# Skip this job and continue
continue
results.append({
"job_title": eval_job.job_title,
"company": eval_job.company,
"location": eval_job.location,
"skills": eval_job.skills,
"description": eval_job.description,
"relevance_score": eval_job.relevance_score,
"matching_skills": row.matching_skills
})
# Simulate a delay so you can see the Stop button in action
time.sleep(0.5)
progress_bar.empty()
status_text.empty()
if not results:
return pd.DataFrame()
df_results = pd.DataFrame(results)
# Sort first by matching_skills desc, then by relevance_score desc, take top 10
df_results = df_results.sort_values(
by=["matching_skills", "relevance_score"],
ascending=[False, False]
).head(10)
return df_results
# ──────────────────────────────────────────────────────────────────────────────
# 8) Clean résumé text (lowercase; keep letters, digits, and whitespace)
# ──────────────────────────────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    # Digits are kept so details like years of experience survive for the LLM
    # extraction step; everything else is replaced by a space.
    return re.sub(r"[^a-z0-9\s]", " ", text.lower())
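# Example (illustrative): preprocess_text("Python/Django, 5 yrs")
# returns "python django  5 yrs" (punctuation becomes spaces, digits survive).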
# ──────────────────────────────────────────────────────────────────────────────
# 9) Streamlit UI
# ──────────────────────────────────────────────────────────────────────────────
def main():
    st.title("📝 Resume Evaluator & Job Recommender")
# 9.1) Initialize session state flags
if "evaluation_running" not in st.session_state:
st.session_state.evaluation_running = False
if "evaluation_complete" not in st.session_state:
st.session_state.evaluation_complete = False
# 9.2) File uploader
uploaded_file = st.file_uploader(
"Upload your resume (PDF)",
type=["pdf"],
        help="After picking a PDF, click 'Generate Recommendations' below."
)
    # 9.3) Always show BOTH "Generate Recommendations" and "Stop Evaluation" in two columns
col1, col2 = st.columns(2)
with col1:
if st.session_state.evaluation_running:
st.button("Generate Recommendations", disabled=True)
else:
if st.button("Generate Recommendations"):
                # 9.4) User clicked "Generate" → begin
st.session_state.evaluation_running = True
st.session_state.evaluation_complete = False
# 9.5) Ensure a file was actually uploaded
if uploaded_file is None:
                    st.error("❗ Please upload a PDF before clicking 'Generate Recommendations'.")
st.session_state.evaluation_running = False
else:
                    # Debug: show what type of object Streamlit handed us
                    st.write(f"▶️ Received file of type: `{type(uploaded_file)}`")
# 9.6) Load job sheet
jobs_df = load_jobs_data()
if jobs_df is None:
st.session_state.evaluation_running = False
return
# 9.7) Extract text from the PDF
raw_text = extract_text_from_pdf(uploaded_file)
if not raw_text.strip():
st.error("⚠️ The uploaded PDF appears to contain no extractable text.")
st.session_state.evaluation_running = False
return
cleaned = preprocess_text(raw_text)
                    st.success("✅ Resume text extracted successfully!")
# 9.8) Run the lengthy eval loop inside a spinner
with st.spinner("Evaluating jobs…"):
recommendations = eval_jobs(jobs_df, cleaned)
# 9.9) Show results (or warning if none)
if not recommendations.empty:
st.header("Recommended Jobs")
st.dataframe(recommendations)
st.session_state.evaluation_complete = True
else:
st.warning("No matching jobs found or evaluation was halted mid‐stream.")
# 9.10) Done (or halted)
st.session_state.evaluation_running = False
with col2:
        # The "Stop Evaluation" button is only enabled while evaluation_running is True:
if st.session_state.evaluation_running:
if st.button("Stop Evaluation"):
st.session_state.evaluation_running = False
st.warning("⏸️ User requested to stop evaluation.")
else:
st.button("Stop Evaluation", disabled=True)
    # 9.11) Once complete, allow "Try Another Resume" to reset
if st.session_state.evaluation_complete:
if st.button("Try Another Resume"):
st.session_state.evaluation_complete = False
            st.rerun()  # st.experimental_rerun() is deprecated; st.rerun() is its replacement
if __name__ == "__main__":
main()