Spaces:
Sleeping
Sleeping
import json
import os
import re
import time
from typing import List

import gspread
import pandas as pd
import PyPDF2
import streamlit as st
from google.oauth2 import service_account
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
# ──────────────────────────────────────────────────────────────────────────────
# 1) ENVIRONMENT VARIABLES / SECRETS
#
# On Hugging Face Spaces:
#   - Go to your Space's Settings -> Secrets and add:
#       * OPENAI_API_KEY = your-openai-key
#       * GOOGLE_API_KEY = your-google-key (if you use any Google LLM)
#   - If you also need a Google Service Account JSON, either:
#       a) Commit it (careful: that is public by default - only do so if it's non-sensitive!),
#       b) Or add it as "Repository Files" via the "Files & versions" tab,
#       c) Or load it from a Secret.
#
# In the code below, we'll assume the service-account JSON is committed under:
#   synapse-recruitment-34e7b48899b4.json
#
# If you instead want to load it from a single-line environment variable, you can do:
#   service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON"))
#   creds = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
#
# For now, we'll simply use:
#   SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
#
# And expect that file to be present in the top level of your repo/Space.
#
# ──────────────────────────────────────────────────────────────────────────────
# API keys are read from the process environment (Secrets on a hosted Space).
# Surrounding whitespace is stripped so that a blank or whitespace-only secret
# is treated the same as a missing one.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "").strip()

# Warn early: the key is not validated here, only checked for presence.
if not OPENAI_API_KEY:
    st.warning("⚠️ OPENAI_API_KEY is not set. The LLM calls will fail unless you add it under Secrets.")
# ──────────────────────────────────────────────────────────────────────────────
# 2) Pydantic models for structured output
# ──────────────────────────────────────────────────────────────────────────────
# Schema for the candidate profile extracted from a resume.
# It is handed to `with_structured_output`, so the field descriptions below are
# part of what the model is asked to fill in. A class docstring is deliberately
# not used here: pydantic would copy it into the generated schema description.
class structure(BaseModel):
    # Who the candidate is and where they are based.
    name: str = Field(
        description="Name of the candidate",
    )
    location: str = Field(
        description="The location of the candidate.",
    )
    # One entry per skill; later lower-cased and intersected with job stacks.
    skills: List[str] = Field(
        description="List of individual skills of the candidate",
    )
    # Free-text fields summarising fit and background.
    ideal_jobs: str = Field(
        description="List of ideal jobs for the candidate based on past experience.",
    )
    yoe: str = Field(
        description="Years of experience of the candidate.",
    )
    experience: str = Field(
        description="A brief summary of the candidate's past experience.",
    )
# Schema for one job evaluation returned by the LLM.
# Used with `with_structured_output`, so the field descriptions are part of the
# request. No class docstring on purpose: pydantic would add it to the schema.
class Job(BaseModel):
    # Identity of the posting.
    job_title: str = Field(
        description="The title of the job.",
    )
    company: str = Field(
        description="The company offering the job.",
    )
    location: str = Field(
        description="The location of the job.",
    )
    # What the role requires and involves.
    skills: List[str] = Field(
        description="List of skills required for the job.",
    )
    description: str = Field(
        description="A brief description of the job.",
    )
    # The model's verdict for this candidate/job pair.
    relevance_score: float = Field(
        description="Relevance score of the job to the candidate's resume.",
    )
    justification: str = Field(
        description="Reason for giving this relevance score and what all areas need to be improved by the candidate",
    )
# ──────────────────────────────────────────────────────────────────────────────
# 3) Helper: parse a comma-separated "Tech Stack" string into a Python set
# ──────────────────────────────────────────────────────────────────────────────
def parse_tech_stack(stack):
    """Normalise a "Tech Stack" cell into a set of lower-cased skill names.

    Accepted inputs:
      * ``None`` / NaN / empty string -> empty set
      * a ``set``                     -> returned unchanged (same object)
      * a ``list``/``tuple``/``frozenset`` -> each item stringified, stripped,
        lower-cased; blank items dropped
      * a string that looks like a Python set literal, e.g. ``"{'a', 'b'}"``
      * any other value -> treated as comma-separated text

    Returns:
        set[str]: the parsed skills. Never raises; if the value cannot be
        converted to text an error is shown via Streamlit and an empty set is
        returned.
    """
    if stack is None:
        return set()
    if isinstance(stack, set):
        return stack
    # Handle other collections before pd.isna(): on a list pd.isna() returns an
    # array, whose truth value is ambiguous and would raise ValueError.
    if isinstance(stack, (list, tuple, frozenset)):
        return {str(item).strip().lower() for item in stack if str(item).strip()}

    try:
        if pd.isna(stack):
            return set()
        text = str(stack).strip()
    except Exception as e:
        st.error(f"Error parsing tech stack: {e}")
        return set()

    # A stringified Python set: drop the braces and the quotes around items.
    is_set_literal = text.startswith("{") and text.endswith("}")
    if is_set_literal:
        text = text.strip("{}")

    skills = set()
    for raw_item in text.split(","):
        item = raw_item.strip()
        if is_set_literal:
            item = item.strip("'\"").strip()
        if item:
            skills.add(item.lower())
    return skills
# ──────────────────────────────────────────────────────────────────────────────
# 4) Google Sheets initialization (Service Account JSON must be present in repo)
# ──────────────────────────────────────────────────────────────────────────────
def initialize_google_sheets():
    """Build an authorised gspread client from a Google service account.

    Credential sources, in order of preference:
      1. The ``GOOGLE_SERVICE_ACCOUNT_JSON`` environment variable, holding the
         whole service-account JSON document as a single string. This keeps
         the key out of the repository.
      2. The file ``synapse-recruitment-34e7b48899b4.json`` in the working
         directory (the original behaviour).

    Returns:
        The client returned by ``gspread.authorize``, or ``None`` if no
        credentials are available or they cannot be loaded. Failures are
        reported through ``st.error`` rather than raised.
    """
    SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json"
    SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]

    inline_credentials = os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON", "").strip()

    if not inline_credentials and not os.path.exists(SERVICE_ACCOUNT_FILE):
        st.error(f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'.\n"
                 "Either commit it into the repo or set the "
                 "GOOGLE_SERVICE_ACCOUNT_JSON Secret.")
        return None

    try:
        if inline_credentials:
            # json.loads failures (malformed secret) are handled below.
            creds = service_account.Credentials.from_service_account_info(
                json.loads(inline_credentials), scopes=SCOPES
            )
        else:
            creds = service_account.Credentials.from_service_account_file(
                SERVICE_ACCOUNT_FILE, scopes=SCOPES
            )
        return gspread.authorize(creds)
    except Exception as e:
        st.error(f"Failed to load Google Service Account credentials: {e}")
        return None
def load_jobs_data(
    spreadsheet_key="1BZlvbtFyiQ9Pgr_lpepDJua1ZeVEqrCLjssNd6OiG9k",
    worksheet_name="paraform_jobs_formatted",
):
    """Load the jobs worksheet into a DataFrame.

    The first row of the sheet is used as the header. A ``parsed_stack``
    column (a set of lower-cased skills per row) is added from the
    ``Tech Stack`` column so jobs can be pre-filtered by skill overlap.

    Args:
        spreadsheet_key: Google Sheets key of the workbook to open.
        worksheet_name: Name of the worksheet holding the jobs.

    Returns:
        pandas.DataFrame with the sheet contents plus ``parsed_stack``, or
        ``None`` when the client cannot be created, the sheet has no data
        rows, a required column is missing, or any other error occurs. Every
        failure is reported through Streamlit instead of being raised.
    """
    # Columns read by this function and by eval_jobs; checking them here turns
    # a later KeyError into a clear message.
    required_columns = ("Company", "Role", "Locations", "Tech Stack", "YOE")

    gc = initialize_google_sheets()
    if gc is None:
        return None

    try:
        worksheet = gc.open_by_key(spreadsheet_key).worksheet(worksheet_name)
        all_values = worksheet.get_all_values()
        if not all_values or len(all_values) < 2:
            st.warning("No data found in the Jobs sheet.")
            return None

        header, *rows = all_values
        df = pd.DataFrame(rows, columns=header).fillna("")

        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            st.error(
                "Error loading jobs data from Google Sheets: "
                f"missing column(s): {', '.join(missing)}"
            )
            return None

        df["parsed_stack"] = df["Tech Stack"].apply(parse_tech_stack)
        return df
    except Exception as e:
        st.error(f"Error loading jobs data from Google Sheets: {e}")
        return None
# ──────────────────────────────────────────────────────────────────────────────
# 5) PDF -> plain text
# ──────────────────────────────────────────────────────────────────────────────
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, one page per line group.

    Each page that yields text contributes that text followed by a newline;
    pages with no extractable text are skipped. If the PDF cannot be read,
    the error is shown via Streamlit and an empty string is returned.
    """
    try:
        pages = PyPDF2.PdfReader(pdf_file).pages
        page_texts = (page.extract_text() for page in pages)
        # The generator is consumed here, inside the try, so per-page
        # extraction errors are handled below as well.
        return "".join(chunk + "\n" for chunk in page_texts if chunk)
    except Exception as e:
        st.error(f"Failed to read PDF: {e}")
        return ""
# ──────────────────────────────────────────────────────────────────────────────
# 6) Call GPT-4o-mini to extract structured fields from resume text
# ──────────────────────────────────────────────────────────────────────────────
def structure_resume_data(resume_text: str) -> structure:
    """Extract a structured candidate profile from resume text using gpt-4o-mini.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        A ``structure`` instance. When the text is blank, or the model client
        cannot be created, or the call fails, a placeholder profile is
        returned whose text fields are "Unknown" and whose skills list is
        empty; call failures are also reported via ``st.error``.
    """
    fallback = structure(
        name="Unknown",
        location="Unknown",
        skills=[],
        ideal_jobs="Unknown",
        yoe="Unknown",
        experience="Unknown",
    )

    # Nothing to extract from: skip the paid LLM round-trip.
    if not resume_text or not resume_text.strip():
        return fallback

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helper that extracts structured data from a resume."),
        ("human", "Extract the following fields from this resume:\n{resume_text}\n"
                  "If any field is missing, return “Unknown”."),
    ])

    try:
        # Client construction is inside the try because it can fail by itself
        # (for example when no API key is configured).
        llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0.0,
            max_retries=2,
        )
        extractor = llm.with_structured_output(structure)
        return (prompt | extractor).invoke({"resume_text": resume_text})
    except Exception as e:
        st.error(f"Failed to extract structure from resume: {e}")
        return fallback
# ──────────────────────────────────────────────────────────────────────────────
# 7) Evaluate jobs: Pre-filter by requiring at least two overlapping skills,
#    then run an LLM loop (with a "Stop" check on each iteration)
# ──────────────────────────────────────────────────────────────────────────────
def eval_jobs(
    jobs_df: pd.DataFrame,
    resume_text: str,
    *,
    min_matching_skills: int = 2,
    top_n: int = 10,
    delay_seconds: float = 0.5,
) -> pd.DataFrame:
    """Score jobs against a resume and return the best matches.

    Steps:
      1. Extract the candidate profile with ``structure_resume_data``.
      2. Count, per job, how many candidate skills appear in the job's stack
         (``parsed_stack`` if present, otherwise parsed from ``Tech Stack``).
      3. Keep jobs with at least ``min_matching_skills`` overlapping skills.
      4. Ask the LLM to evaluate each remaining job. Before each call the
         ``evaluation_running`` session flag is checked; if it is False the
         loop stops and the results gathered so far are used.
      5. Sort by overlap count, then relevance score (both descending) and
         return the first ``top_n`` rows.

    Args:
        jobs_df: Jobs table with ``Company``, ``Role``, ``Locations``, ``YOE``
            and ``parsed_stack`` or ``Tech Stack`` columns. It is not modified.
        resume_text: Resume text to evaluate against.
        min_matching_skills: Minimum skill overlap for a job to be evaluated.
        top_n: Maximum number of rows returned.
        delay_seconds: Pause after each LLM call; 0 disables it.

    Returns:
        DataFrame with columns ``job_title``, ``company``, ``location``,
        ``skills``, ``description``, ``relevance_score``, ``justification``
        and ``matching_skills``; empty if nothing passed the pre-filter or no
        job could be evaluated.
    """
    response = structure_resume_data(resume_text)
    candidate_skills = {
        skill.strip().lower() for skill in response.skills if skill.strip()
    }

    # Reuse the already-parsed stacks when available so both code paths agree
    # on how a "Tech Stack" cell is interpreted.
    stack_column = "parsed_stack" if "parsed_stack" in jobs_df.columns else "Tech Stack"
    stacks = jobs_df[stack_column].apply(parse_tech_stack)

    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    scored = jobs_df.assign(
        parsed_stack=stacks,
        matching_skills=stacks.apply(lambda stack: len(candidate_skills & stack)),
    )
    filtered = scored[scored["matching_skills"] >= min_matching_skills]
    if filtered.empty:
        st.warning(f"No jobs passed the {min_matching_skills}-skill pre-filter.")
        return pd.DataFrame()

    # Single text blob describing the candidate for the evaluation prompt.
    candidate_text = (
        f"{response.name} {response.location} "
        f"{', '.join(response.skills)} {response.ideal_jobs} "
        f"{response.yoe} {response.experience}"
    )

    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.0,
        max_retries=2,
    )
    eval_llm = llm.with_structured_output(Job)
    system_msg = (
        "You are an expert recruiter. First, filter by location & experience. "
        "Then pick jobs that match the candidate’s skills & background. "
        "Finally, assign a relevance score (0–10)."
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_msg),
        ("human", "Evaluate Job: {job_text}\nCandidate: {candidate_text}\n"
                  "Return JSON with job_title, company, location, skills, description, "
                  "relevance_score, justification."),
    ])
    chain = prompt | eval_llm

    jobs_for_eval = filtered[
        ["Company", "Role", "Locations", "parsed_stack", "YOE", "matching_skills"]
    ]
    total = len(jobs_for_eval)
    results = []
    halted = False
    progress_bar = st.progress(0)
    status_text = st.empty()

    for i, row in enumerate(jobs_for_eval.itertuples(index=False), start=1):
        # Defaults to True so the function also works when the flag was never
        # initialised (e.g. when called outside main()).
        if not st.session_state.get("evaluation_running", True):
            halted = True
            break

        progress_bar.progress(i / total)
        status_text.text(f"Evaluating job {i}/{total}: {row.Role} at {row.Company}")

        # sorted(): set iteration order is not stable between processes, and
        # the prompt should be identical for identical input.
        job_text = " ".join(
            str(part)
            for part in (
                row.Role,
                row.Company,
                row.Locations,
                ", ".join(sorted(row.parsed_stack)),
                row.YOE,
            )
        )

        try:
            eval_job = chain.invoke({
                "job_text": job_text,
                "candidate_text": candidate_text,
            })
        except Exception as e:
            # One failing job should not abort the whole evaluation.
            st.error(f"LLM failed on job #{i}: {e}")
            continue

        results.append({
            "job_title": eval_job.job_title,
            "company": eval_job.company,
            "location": eval_job.location,
            "skills": eval_job.skills,
            "description": eval_job.description,
            "relevance_score": eval_job.relevance_score,
            "justification": eval_job.justification,
            "matching_skills": row.matching_skills,
        })

        if delay_seconds > 0:
            time.sleep(delay_seconds)

    progress_bar.empty()
    if halted:
        # Leave the notice on screen instead of clearing it straight away.
        status_text.text("⏸️ Evaluation halted by user.")
    else:
        status_text.empty()

    if not results:
        return pd.DataFrame()

    return (
        pd.DataFrame(results)
        .sort_values(by=["matching_skills", "relevance_score"], ascending=[False, False])
        .head(top_n)
    )
# ──────────────────────────────────────────────────────────────────────────────
# 8) Clean résumé text (lowercase, strip special chars)
# ──────────────────────────────────────────────────────────────────────────────
# Characters removed from resume text: anything that is not a letter, digit,
# underscore, whitespace, or one of + # . , / -
_RESUME_NOISE = re.compile(r"[^\w\s+#.,/-]")


def preprocess_text(text: str) -> str:
    """Lower-case resume text and strip decorative characters.

    Digits and the characters ``+ # . , / -`` are kept because they carry
    meaning in a resume ("5 years", "C++", "C#", "Node.js", "CI/CD"); letters
    are matched Unicode-aware so accented names survive. Whitespace, including
    newlines, is left as is.

    Args:
        text: Raw resume text.

    Returns:
        The cleaned, lower-cased text.
    """
    return _RESUME_NOISE.sub("", text.lower())
# ──────────────────────────────────────────────────────────────────────────────
# 9) Streamlit UI
# ──────────────────────────────────────────────────────────────────────────────
def _evaluate_uploaded_resume(uploaded_file):
    """Run load-jobs -> extract-text -> evaluate for one uploaded PDF.

    Returns the DataFrame produced by ``eval_jobs`` (possibly empty), or
    ``None`` when a step failed; in that case the failure has already been
    reported to the user.
    """
    if uploaded_file is None:
        st.error("❌ Please upload a PDF before clicking “Generate Recommendations”.")
        return None

    # Debug aid: show what kind of object Streamlit handed over.
    st.write(f"▶️ Received file of type: `{type(uploaded_file)}`")

    jobs_df = load_jobs_data()
    if jobs_df is None:
        return None

    raw_text = extract_text_from_pdf(uploaded_file)
    if not raw_text.strip():
        st.error("⚠️ The uploaded PDF appears to contain no extractable text.")
        return None

    cleaned = preprocess_text(raw_text)
    st.success("✅ Resume text extracted successfully!")

    with st.spinner("Evaluating jobs…"):
        return eval_jobs(jobs_df, cleaned)


def main():
    """Render the Streamlit page and drive one evaluation per button click.

    Session flags:
      * ``evaluation_running``  - an evaluation is in progress in this run
      * ``evaluation_complete`` - the last evaluation produced results
      * ``evaluation_halted``   - the user pressed Stop; shown once as a notice

    The buttons change these flags through ``on_click`` callbacks, which
    Streamlit executes before re-running the script. That way the flags are
    already up to date when the buttons are drawn, and the Stop button is
    drawn (enabled) before the long-running evaluation starts.
    """
    # NOTE(review): the leading glyph looks like a mis-encoded emoji; the
    # intended character cannot be determined from here - please restore it.
    st.title("π Resume Evaluator & Job Recommender")

    state = st.session_state
    for flag in ("evaluation_running", "evaluation_complete", "evaluation_halted"):
        if flag not in state:
            state[flag] = False

    def _start_evaluation():
        state.evaluation_running = True
        state.evaluation_complete = False
        state.evaluation_halted = False

    def _stop_evaluation():
        state.evaluation_running = False
        state.evaluation_halted = True

    def _reset_for_next_resume():
        state.evaluation_complete = False

    uploaded_file = st.file_uploader(
        "Upload your resume (PDF)",
        type=["pdf"],
        help="After picking a PDF, click “Generate Recommendations” below.",
    )

    # Each button lives in a placeholder so the "busy" pair can be swapped for
    # the "idle" pair within the same run once the evaluation has finished.
    col1, col2 = st.columns(2)
    generate_slot = col1.empty()
    stop_slot = col2.empty()

    def _show_idle_controls():
        generate_slot.button(
            "Generate Recommendations",
            key="generate_idle",
            on_click=_start_evaluation,
        )
        stop_slot.button("Stop Evaluation", key="stop_idle", disabled=True)

    if state.evaluation_halted:
        state.evaluation_halted = False
        st.warning("⏸️ User requested to stop evaluation.")

    if state.evaluation_running:
        generate_slot.button("Generate Recommendations", key="generate_busy", disabled=True)
        stop_slot.button("Stop Evaluation", key="stop_busy", on_click=_stop_evaluation)

        try:
            recommendations = _evaluate_uploaded_resume(uploaded_file)
        finally:
            # Also runs when the run is interrupted or an error escapes, so
            # the page never stays locked in the "running" state.
            state.evaluation_running = False

        _show_idle_controls()

        if recommendations is not None:
            if not recommendations.empty:
                st.header("Recommended Jobs")
                st.dataframe(recommendations)
                state.evaluation_complete = True
            else:
                st.warning("No matching jobs found or evaluation was halted mid-stream.")
    else:
        _show_idle_controls()

    # Once complete, offer a reset; the callback clears the flag before the
    # automatic rerun, so no explicit rerun call is needed.
    if state.evaluation_complete:
        st.button("Try Another Resume", on_click=_reset_for_next_resume)
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()