import streamlit as st import pandas as pd import PyPDF2 import os from google.oauth2 import service_account import gspread from pydantic import BaseModel, Field from typing import List from langchain_openai import ChatOpenAI from langchain_core.prompts import ChatPromptTemplate import time import re # ────────────────────────────────────────────────────────────────────────────── # 1) ENVIRONMENT VARIABLES / SECRETS # # On Huggingface Spaces: # - Go to your Space’s Settings → Secrets and add: # • OPENAI_API_KEY = your‐openai‐key # • GOOGLE_API_KEY = your‐google‐key (if you use any Google LLM) # - If you also need a Google Service Account JSON, either: # a) Commit it (careful: that is public by default – only do so if it’s non‐sensitive!), # b) Or add it as “Repository Files” via the “Files & versions” tab, # c) Or load it from a Secret. # # In code below, we’ll assume the service‐account JSON is committed under: # └─ synapse-recruitment-34e7b48899b4.json # # If you instead want to load it from a single‐line environment variable, you can do: # service_account_info = json.loads(os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON")) # creds = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES) # # For now, we’ll simply use: # SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json" # # And expect that file to be present in the top‐level of your repo/Space. # # ────────────────────────────────────────────────────────────────────────────── OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "") GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "") if OPENAI_API_KEY == "": st.warning("⚠️ OPENAI_API_KEY is not set. The LLM calls will fail unless you add it under Secrets.") # ────────────────────────────────────────────────────────────────────────────── # 2) Pydantic models for structured output # ────────────────────────────────────────────────────────────────────────────── class structure(BaseModel): name: str = Field(description="Name of the candidate") location: str = Field(description="The location of the candidate.") skills: List[str] = Field(description="List of individual skills of the candidate") ideal_jobs: str = Field(description="List of ideal jobs for the candidate based on past experience.") yoe: str = Field(description="Years of experience of the candidate.") experience: str = Field(description="A brief summary of the candidate's past experience.") class Job(BaseModel): job_title: str = Field(description="The title of the job.") company: str = Field(description="The company offering the job.") location: str = Field(description="The location of the job.") skills: List[str] = Field(description="List of skills required for the job.") description: str = Field(description="A brief description of the job.") relevance_score: float = Field(description="Relevance score of the job to the candidate's resume.") justification: str = Field(description = "Reason for giving this relevance score and what all areas need to be improved by the candidate") # ────────────────────────────────────────────────────────────────────────────── # 3) Helper: parse a comma‐separated “Tech Stack” string into a Python set # ────────────────────────────────────────────────────────────────────────────── def parse_tech_stack(stack): if pd.isna(stack) or stack == "" or stack is None: return set() if isinstance(stack, set): return stack try: # If it's literally a Python‐set string like "{'python','django'}" if isinstance(stack, str) and stack.startswith("{") and stack.endswith("}"): items = stack.strip("{}").split(",") return set(item.strip().strip("'\"").lower() for item in items if item.strip()) # Otherwise assume comma‐separated values return set(s.strip().lower() for s in str(stack).split(",") if s.strip()) except Exception as e: st.error(f"Error parsing tech stack: {e}") return set() # ────────────────────────────────────────────────────────────────────────────── # 4) Google Sheets initialization (Service Account JSON must be present in repo) # ────────────────────────────────────────────────────────────────────────────── def initialize_google_sheets(): SERVICE_ACCOUNT_FILE = "synapse-recruitment-34e7b48899b4.json" SCOPES = ["https://www.googleapis.com/auth/spreadsheets"] if not os.path.exists(SERVICE_ACCOUNT_FILE): st.error(f"Service account file not found at '{SERVICE_ACCOUNT_FILE}'.\n" "Either commit it into the repo or load from a Secret.") return None try: creds = service_account.Credentials.from_service_account_file( SERVICE_ACCOUNT_FILE, scopes=SCOPES ) return gspread.authorize(creds) except Exception as e: st.error(f"Failed to load Google Service Account credentials: {e}") return None def load_jobs_data(): gc = initialize_google_sheets() if gc is None: return None try: # NOTE: Replace this key with your actual spreadsheet key SPREADSHEET_KEY = "1BZlvbtFyiQ9Pgr_lpepDJua1ZeVEqrCLjssNd6OiG9k" worksheet = gc.open_by_key(SPREADSHEET_KEY).worksheet("paraform_jobs_formatted") all_values = worksheet.get_all_values() if not all_values or len(all_values) < 2: st.warning("No data found in the Jobs sheet.") return None df = pd.DataFrame(all_values[1:], columns=all_values[0]).fillna("") # Add a “parsed_stack” column so we can pre‐filter by skill overlap df["parsed_stack"] = df["Tech Stack"].apply(parse_tech_stack) return df except Exception as e: st.error(f"Error loading jobs data from Google Sheets: {e}") return None # ────────────────────────────────────────────────────────────────────────────── # 5) PDF → plain text # ────────────────────────────────────────────────────────────────────────────── def extract_text_from_pdf(pdf_file): try: reader = PyPDF2.PdfReader(pdf_file) full_text = "" for page in reader.pages: text = page.extract_text() if text: full_text += text + "\n" return full_text except Exception as e: st.error(f"Failed to read PDF: {e}") return "" # ────────────────────────────────────────────────────────────────────────────── # 6) Call GPT‐4o‐mini to extract structured fields from resume text # ────────────────────────────────────────────────────────────────────────────── def structure_resume_data(resume_text: str) -> structure: llm = ChatOpenAI( model="gpt-4o-mini", temperature=0.0, max_retries=2, ) sum_llm = llm.with_structured_output(structure) prompt = ChatPromptTemplate.from_messages([ ("system", "You are a helper that extracts structured data from a resume."), ("human", "Extract the following fields from this resume:\n{resume_text}\n" "If any field is missing, return ‘Unknown’.") ]) try: parsed = (prompt | sum_llm).invoke({"resume_text": resume_text}) return parsed except Exception as e: st.error(f"Failed to extract structure from resume: {e}") # Return a fallback with “Unknown” fields return structure( name="Unknown", location="Unknown", skills=[], ideal_jobs="Unknown", yoe="Unknown", experience="Unknown" ) # ────────────────────────────────────────────────────────────────────────────── # 7) Evaluate jobs: Pre‐filter by requiring at least two overlapping skills, # then run an LLM loop (with a “Stop” check on each iteration) # ────────────────────────────────────────────────────────────────────────────── def eval_jobs(jobs_df: pd.DataFrame, resume_text: str) -> pd.DataFrame: """ 1) Extract candidate info (list of skills, etc.) 2) Build a skill‐set from response.skills 3) Pre‐filter all jobs so that job’s Tech Stack has ≥2 skills in common 4) For that filtered subset, run an LLM evaluation loop – on each iteration, check `st.session_state.evaluation_running`: if it has become False, break out immediately. 5) Return a DataFrame of top‐10 results (or empty if none). """ response = structure_resume_data(resume_text) candidate_skills = set(skill.lower() for skill in response.skills) # How many overlapping skills does each job have? def matching_skill_count(tech_stack: str) -> int: job_skills = set(s.strip().lower() for s in tech_stack.split(",") if s.strip()) return len(candidate_skills & job_skills) jobs_df["matching_skills"] = jobs_df["Tech Stack"].apply(matching_skill_count) filtered = jobs_df[jobs_df["matching_skills"] >= 2].copy() if filtered.empty: st.warning("No jobs passed the 2-skill pre-filter.") return pd.DataFrame() # Build a candidate_text blob for the LLM to consume candidate_text = ( f"{response.name} {response.location} " f"{', '.join(response.skills)} {response.ideal_jobs} " f"{response.yoe} {response.experience}" ) # LLM setup for job‐evaluation llm = ChatOpenAI( model="gpt-4o-mini", temperature=0.0, max_retries=2, ) eval_llm = llm.with_structured_output(Job) system_msg = ( "You are an expert recruiter. First, filter by location & experience. " "Then pick jobs that match the candidate’s skills & background. " "Finally, assign a relevance score (0–10)." ) prompt = ChatPromptTemplate.from_messages([ ("system", system_msg), ("human", "Evaluate Job: {job_text}\nCandidate: {candidate_text}\n" "Return JSON with job_title, company, location, skills, description, relevance_score.") ]) chain = prompt | eval_llm jobs_for_eval = filtered[["Company", "Role", "Locations", "parsed_stack", "YOE", "matching_skills"]] results = [] progress_bar = st.progress(0) status_text = st.empty() total = len(jobs_for_eval) for i, row in enumerate(jobs_for_eval.itertuples(), start=1): # If the user clicked “Stop Evaluation” → evaluation_running = False if not st.session_state.evaluation_running: status_text.text("⏸️ Evaluation halted by user.") break progress_bar.progress(i / total) status_text.text(f"Evaluating job {i}/{total}: {row.Role} at {row.Company}") job_text = " ".join([ row.Role, row.Company, row.Locations, ", ".join(row.parsed_stack), str(row.YOE) ]) try: eval_job = chain.invoke({ "job_text": job_text, "candidate_text": candidate_text }) except Exception as e: st.error(f"LLM failed on job #{i}: {e}") # Skip this job and continue continue results.append({ "job_title": eval_job.job_title, "company": eval_job.company, "location": eval_job.location, "skills": eval_job.skills, "description": eval_job.description, "relevance_score": eval_job.relevance_score, "matching_skills": row.matching_skills }) # Simulate a delay so you can see the Stop button in action time.sleep(0.5) progress_bar.empty() status_text.empty() if not results: return pd.DataFrame() df_results = pd.DataFrame(results) # Sort first by matching_skills desc, then by relevance_score desc, take top 10 df_results = df_results.sort_values( by=["matching_skills", "relevance_score"], ascending=[False, False] ).head(10) return df_results # ────────────────────────────────────────────────────────────────────────────── # 8) Clean résumé text (lowercase, strip special chars) # ────────────────────────────────────────────────────────────────────────────── def preprocess_text(text: str) -> str: return re.sub(r"[^a-zA-Z\s]", "", text.lower()) # ────────────────────────────────────────────────────────────────────────────── # 9) Streamlit UI # ────────────────────────────────────────────────────────────────────────────── def main(): st.title("📝 Resume Evaluator & Job Recommender") # 9.1) Initialize session state flags if "evaluation_running" not in st.session_state: st.session_state.evaluation_running = False if "evaluation_complete" not in st.session_state: st.session_state.evaluation_complete = False # 9.2) File uploader uploaded_file = st.file_uploader( "Upload your resume (PDF)", type=["pdf"], help="After picking a PDF, click ‘Generate Recommendations’ below." ) # 9.3) Always show BOTH “Generate Recommendations” and “Stop Evaluation” in two columns col1, col2 = st.columns(2) with col1: if st.session_state.evaluation_running: st.button("Generate Recommendations", disabled=True) else: if st.button("Generate Recommendations"): # 9.4) User clicked “Generate” → begin st.session_state.evaluation_running = True st.session_state.evaluation_complete = False # 9.5) Ensure a file was actually uploaded if uploaded_file is None: st.error("❗ Please upload a PDF before clicking ‘Generate Recommendations’.") st.session_state.evaluation_running = False else: # Debug: print basic type of what streamlit handed us st.write(f"▶️ Received file of type: `{type(uploaded_file)}`") # 9.6) Load job sheet jobs_df = load_jobs_data() if jobs_df is None: st.session_state.evaluation_running = False return # 9.7) Extract text from the PDF raw_text = extract_text_from_pdf(uploaded_file) if not raw_text.strip(): st.error("⚠️ The uploaded PDF appears to contain no extractable text.") st.session_state.evaluation_running = False return cleaned = preprocess_text(raw_text) st.success("✅ Resume text extracted successfully!") # 9.8) Run the lengthy eval loop inside a spinner with st.spinner("Evaluating jobs…"): recommendations = eval_jobs(jobs_df, cleaned) # 9.9) Show results (or warning if none) if not recommendations.empty: st.header("Recommended Jobs") st.dataframe(recommendations) st.session_state.evaluation_complete = True else: st.warning("No matching jobs found or evaluation was halted mid‐stream.") # 9.10) Done (or halted) st.session_state.evaluation_running = False with col2: # The “Stop Evaluation” button is only enabled while evaluation_running is True: if st.session_state.evaluation_running: if st.button("Stop Evaluation"): st.session_state.evaluation_running = False st.warning("⏸️ User requested to stop evaluation.") else: st.button("Stop Evaluation", disabled=True) # 9.11) Once complete, allow “Try Another Resume” to reset if st.session_state.evaluation_complete: if st.button("Try Another Resume"): st.session_state.evaluation_complete = False st.experimental_rerun() if __name__ == "__main__": main()