import streamlit as st
import pandas as pd
import PyPDF2
import os
from google.oauth2 import service_account
import gspread
from pydantic import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import time
import re
# ──────────────────────────────────────────────────────────────────────────────
# 1) ENVIRONMENT VARIABLES / SECRETS
#
# On Hugging Face Spaces:
#   - Go to your Space's Settings → Secrets and add:
#       • OPENAI_API_KEY = your-openai-key
#       • GOOGLE_API_KEY = your-google-key (if you use any Google LLM)
#   - If you also need a Google Service Account JSON, either:
#       a) Commit it (careful: the repo is public by default – only do so if it's non-sensitive!),
#       b) Or add it as "Repository Files" via the "Files & versions" tab,
#       c) Or load it from a Secret (a sketch of this appears below, after
#          initialize_google_sheets).
#
# In the code below, we assume the service-account JSON is committed under:
#   └─ synapse-recruitment-34e7b48899b4.json
#
# If you instead want to load it from a single-line environment variable
# (requires `import json`), you can do:
#   service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON"))
#   creds = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
#
# For now, we simply use:
#   SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
#
# and expect that file to be present at the top level of your repo/Space.
# ──────────────────────────────────────────────────────────────────────────────
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not OPENAI_API_KEY:
    st.warning("⚠️ OPENAI_API_KEY is not set. The LLM calls will fail unless you add it under Secrets.")
# ──────────────────────────────────────────────────────────────────────────────
# 2) Pydantic models for structured output
# ──────────────────────────────────────────────────────────────────────────────
class CandidateProfile(BaseModel):
    name: str = Field(description="Name of the candidate")
    location: str = Field(description="The location of the candidate.")
    skills: List[str] = Field(description="List of individual skills of the candidate")
    ideal_jobs: str = Field(description="Description of ideal jobs for the candidate based on past experience.")
    yoe: str = Field(description="Years of experience of the candidate.")
    experience: str = Field(description="A brief summary of the candidate's past experience.")
class Job(BaseModel):
job_title: str = Field(description="The title of the job.")
company: str = Field(description="The company offering the job.")
location: str = Field(description="The location of the job.")
skills: List[str] = Field(description="List of skills required for the job.")
description: str = Field(description="A brief description of the job.")
relevance_score: float = Field(description="Relevance score of the job to the candidate's resume.")
    justification: str = Field(description="Reason for giving this relevance score and which areas the candidate needs to improve.")
# ──────────────────────────────────────────────────────────────────────────────
# 3) Helper: parse a comma-separated "Tech Stack" string into a Python set
# ──────────────────────────────────────────────────────────────────────────────
def parse_tech_stack(stack):
if pd.isna(stack) or stack == "" or stack is None:
return set()
if isinstance(stack, set):
return stack
try:
# If it's literally a Python‐set string like "{'python','django'}"
if isinstance(stack, str) and stack.startswith("{") and stack.endswith("}"):
items = stack.strip("{}").split(",")
return set(item.strip().strip("'\"").lower() for item in items if item.strip())
# Otherwise assume comma‐separated values
return set(s.strip().lower() for s in str(stack).split(",") if s.strip())
except Exception as e:
st.error(f"Error parsing tech stack: {e}")
return set()
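# Example (illustrative): both parse_tech_stack("{'Python', 'Django'}") and
# parse_tech_stack("Python, Django") return {"python", "django"}.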
# ──────────────────────────────────────────────────────────────────────────────
# 4) Google Sheets initialization (Service Account JSON must be present in repo)
# ──────────────────────────────────────────────────────────────────────────────
def initialize_google_sheets():
SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
if not os.path.exists(SERVICE_ACCOUNT_FILE):
st.error(f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'.\n"
"Either commit it into the repo or load from a Secret.")
return None
try:
creds = service_account.Credentials.from_service_account_file(
SERVICE_ACCOUNT_FILE, scopes=SCOPES
)
return gspread.authorize(creds)
except Exception as e:
st.error(f"Failed to load Google Service Account credentials: {e}")
return None
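# Alternative loader, per option (c) in the header notes: build the credentials
# from a single-line JSON secret instead of a committed key file. This is a
# minimal sketch; the GOOGLE_SERVICE_ACCOUNT_JSON secret name is an assumption
# and this function is not called elsewhere in the app.
def initialize_google_sheets_from_env():
    import json  # local import; only needed for this optional path
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
    raw = os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON", "")
    if not raw:
        st.error("GOOGLE_SERVICE_ACCOUNT_JSON is not set.")
        return None
    try:
        info = json.loads(raw)
        creds = service_account.Credentials.from_service_account_info(info, scopes=SCOPES)
        return gspread.authorize(creds)
    except Exception as e:
        st.error(f"Failed to load credentials from the environment: {e}")
        return None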
def load_jobs_data():
gc = initialize_google_sheets()
if gc is None:
return None
try:
# NOTE: Replace this key with your actual spreadsheet key
SPREADSHEET_KEY = "1BZlvbtFyiQ9Pgr_lpepDJua1ZeVEqrCLjssNd6OiG9k"
worksheet = gc.open_by_key(SPREADSHEET_KEY).worksheet("paraform_jobs_formatted")
all_values = worksheet.get_all_values()
if not all_values or len(all_values) < 2:
st.warning("No data found in the Jobs sheet.")
return None
        df = pd.DataFrame(all_values[1:], columns=all_values[0]).fillna("")
        if "Tech Stack" not in df.columns:
            st.error("The Jobs sheet is missing the expected 'Tech Stack' column.")
            return None
        # Add a "parsed_stack" column so we can pre-filter by skill overlap
        df["parsed_stack"] = df["Tech Stack"].apply(parse_tech_stack)
return df
except Exception as e:
st.error(f"Error loading jobs data from Google Sheets: {e}")
return None
# ──────────────────────────────────────────────────────────────────────────────
# 5) PDF → plain text
# ──────────────────────────────────────────────────────────────────────────────
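# Note: upstream development of PyPDF2 has moved to the pypdf package; the
# PdfReader / extract_text API used below is the same in both, so only the
# import would change if you migrate.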
def extract_text_from_pdf(pdf_file):
try:
reader = PyPDF2.PdfReader(pdf_file)
full_text = ""
for page in reader.pages:
text = page.extract_text()
if text:
full_text += text + "\n"
return full_text
except Exception as e:
st.error(f"Failed to read PDF: {e}")
return ""
# ──────────────────────────────────────────────────────────────────────────────
# 6) Call GPT‐4o‐mini to extract structured fields from resume text
# ──────────────────────────────────────────────────────────────────────────────
def structure_resume_data(resume_text: str) -> CandidateProfile:
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.0,
        max_retries=2,
    )
    sum_llm = llm.with_structured_output(CandidateProfile)
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helper that extracts structured data from a resume."),
        ("human", "Extract the following fields from this resume:\n{resume_text}\n"
                  "If any field is missing, return 'Unknown'.")
    ])
    try:
        parsed = (prompt | sum_llm).invoke({"resume_text": resume_text})
        return parsed
    except Exception as e:
        st.error(f"Failed to extract structure from resume: {e}")
        # Return a fallback with 'Unknown' fields
        return CandidateProfile(
            name="Unknown",
            location="Unknown",
            skills=[],
            ideal_jobs="Unknown",
            yoe="Unknown",
            experience="Unknown"
        )
# ──────────────────────────────────────────────────────────────────────────────
# 7) Evaluate jobs: pre-filter by requiring at least two overlapping skills,
#    then run an LLM loop (with a "Stop" check on each iteration)
# ──────────────────────────────────────────────────────────────────────────────
def eval_jobs(jobs_df: pd.DataFrame, resume_text: str) -> pd.DataFrame:
"""
1) Extract candidate info (list of skills, etc.)
2) Build a skill‐set from response.skills
3) Pre‐filter all jobs so that job’s Tech Stack has β‰₯2 skills in common
4) For that filtered subset, run an LLM evaluation loop
– on each iteration, check `st.session_state.evaluation_running`:
if it has become False, break out immediately.
5) Return a DataFrame of top‐10 results (or empty if none).
"""
response = structure_resume_data(resume_text)
candidate_skills = set(skill.lower() for skill in response.skills)
    # Count how many of the candidate's skills appear in each job's tech stack,
    # reusing the skill sets already parsed in load_jobs_data.
    jobs_df["matching_skills"] = jobs_df["parsed_stack"].apply(
        lambda job_skills: len(candidate_skills & job_skills)
    )
filtered = jobs_df[jobs_df["matching_skills"] >= 2].copy()
if filtered.empty:
st.warning("No jobs passed the 2-skill pre-filter.")
return pd.DataFrame()
# Build a candidate_text blob for the LLM to consume
candidate_text = (
f"{response.name} {response.location} "
f"{', '.join(response.skills)} {response.ideal_jobs} "
f"{response.yoe} {response.experience}"
)
# LLM setup for job‐evaluation
llm = ChatOpenAI(
model="gpt-4o-mini",
temperature=0.0,
max_retries=2,
)
eval_llm = llm.with_structured_output(Job)
system_msg = (
"You are an expert recruiter. First, filter by location & experience. "
"Then pick jobs that match the candidate’s skills & background. "
"Finally, assign a relevance score (0–10)."
)
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_msg),
        ("human", "Evaluate Job: {job_text}\nCandidate: {candidate_text}\n"
                  "Return JSON with job_title, company, location, skills, description, "
                  "relevance_score, justification.")
    ])
chain = prompt | eval_llm
jobs_for_eval = filtered[["Company", "Role", "Locations", "parsed_stack", "YOE", "matching_skills"]]
results = []
progress_bar = st.progress(0)
status_text = st.empty()
total = len(jobs_for_eval)
for i, row in enumerate(jobs_for_eval.itertuples(), start=1):
        # If the user clicked "Stop Evaluation" → evaluation_running = False
if not st.session_state.evaluation_running:
status_text.text("⏸️ Evaluation halted by user.")
break
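        # In practice, clicking any widget makes Streamlit interrupt the current
        # script run and rerun from the top, so a mid-run Stop click usually
        # ends this loop via the rerun rather than via the flag check above.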
progress_bar.progress(i / total)
status_text.text(f"Evaluating job {i}/{total}: {row.Role} at {row.Company}")
job_text = " ".join([
row.Role,
row.Company,
row.Locations,
", ".join(row.parsed_stack),
str(row.YOE)
])
try:
eval_job = chain.invoke({
"job_text": job_text,
"candidate_text": candidate_text
})
except Exception as e:
st.error(f"LLM failed on job #{i}: {e}")
# Skip this job and continue
continue
results.append({
"job_title": eval_job.job_title,
"company": eval_job.company,
"location": eval_job.location,
"skills": eval_job.skills,
"description": eval_job.description,
"relevance_score": eval_job.relevance_score,
"matching_skills": row.matching_skills
})
# Simulate a delay so you can see the Stop button in action
time.sleep(0.5)
progress_bar.empty()
status_text.empty()
if not results:
return pd.DataFrame()
df_results = pd.DataFrame(results)
# Sort first by matching_skills desc, then by relevance_score desc, take top 10
df_results = df_results.sort_values(
by=["matching_skills", "relevance_score"],
ascending=[False, False]
).head(10)
return df_results
# ──────────────────────────────────────────────────────────────────────────────
# 8) Clean résumé text (lowercase; keep letters, digits, and whitespace)
# ──────────────────────────────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    # Digits are kept so details like years of experience survive for the LLM
    # extraction step; everything else is replaced by a space.
    return re.sub(r"[^a-z0-9\s]", " ", text.lower())
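# Example (illustrative): preprocess_text("Python/Django, 5 yrs")
# returns "python django  5 yrs" (punctuation becomes spaces, digits survive).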
# ──────────────────────────────────────────────────────────────────────────────
# 9) Streamlit UI
# ──────────────────────────────────────────────────────────────────────────────
def main():
    st.title("📝 Resume Evaluator & Job Recommender")
# 9.1) Initialize session state flags
if "evaluation_running" not in st.session_state:
st.session_state.evaluation_running = False
if "evaluation_complete" not in st.session_state:
st.session_state.evaluation_complete = False
# 9.2) File uploader
uploaded_file = st.file_uploader(
"Upload your resume (PDF)",
type=["pdf"],
        help="After picking a PDF, click 'Generate Recommendations' below."
)
    # 9.3) Always show BOTH "Generate Recommendations" and "Stop Evaluation" in two columns
col1, col2 = st.columns(2)
with col1:
if st.session_state.evaluation_running:
st.button("Generate Recommendations", disabled=True)
else:
if st.button("Generate Recommendations"):
                # 9.4) User clicked "Generate" → begin
st.session_state.evaluation_running = True
st.session_state.evaluation_complete = False
# 9.5) Ensure a file was actually uploaded
if uploaded_file is None:
                    st.error("❗ Please upload a PDF before clicking 'Generate Recommendations'.")
st.session_state.evaluation_running = False
else:
                    # Debug: show what type of object Streamlit handed us
                    st.write(f"▶️ Received file of type: `{type(uploaded_file)}`")
# 9.6) Load job sheet
jobs_df = load_jobs_data()
if jobs_df is None:
st.session_state.evaluation_running = False
return
# 9.7) Extract text from the PDF
raw_text = extract_text_from_pdf(uploaded_file)
if not raw_text.strip():
st.error("⚠️ The uploaded PDF appears to contain no extractable text.")
st.session_state.evaluation_running = False
return
cleaned = preprocess_text(raw_text)
                    st.success("✅ Resume text extracted successfully!")
# 9.8) Run the lengthy eval loop inside a spinner
with st.spinner("Evaluating jobs…"):
recommendations = eval_jobs(jobs_df, cleaned)
# 9.9) Show results (or warning if none)
if not recommendations.empty:
st.header("Recommended Jobs")
st.dataframe(recommendations)
st.session_state.evaluation_complete = True
else:
st.warning("No matching jobs found or evaluation was halted mid‐stream.")
# 9.10) Done (or halted)
st.session_state.evaluation_running = False
with col2:
        # The "Stop Evaluation" button is only enabled while evaluation_running is True:
if st.session_state.evaluation_running:
if st.button("Stop Evaluation"):
st.session_state.evaluation_running = False
st.warning("⏸️ User requested to stop evaluation.")
else:
st.button("Stop Evaluation", disabled=True)
    # 9.11) Once complete, allow "Try Another Resume" to reset
if st.session_state.evaluation_complete:
if st.button("Try Another Resume"):
st.session_state.evaluation_complete = False
            st.rerun()  # st.experimental_rerun() is deprecated; st.rerun() is its replacement
if __name__ == "__main__":
main()