# Import necessary libraries
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import PyPDF2
import seaborn as sns
import spacy
import streamlit as st
import torch
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Download necessary NLTK data
nltk.download('punkt')

# Define regular expressions for pattern matching
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
# The TLD character class is [A-Za-z]; writing it as [A-Z|a-z] would also
# accept a literal "|" because "|" is not an alternation inside a class.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
float_digit_regex = re.compile(r'^\d{10}$')
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')

# Two alternatives: "<label> [:] <number>" captures the number in group 1,
# "<number> <label>" captures it in group 2.
cgpa_regex = re.compile(
    r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b',
    re.IGNORECASE,
)


# Load Phi-4 model and tokenizer
@st.cache_resource
def load_model():
    """Load the tokenizer and model used for embeddings.

    Decorated with ``st.cache_resource`` so the pretrained weights are not
    reloaded each time Streamlit reruns the script.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model


@st.cache_resource
def load_nlp(model_name="en_Resume_Matching_Keywords"):
    """Load the spaCy pipeline used for entity extraction.

    Cached with ``st.cache_resource`` for the same reason as ``load_model``:
    the script body runs again on every widget interaction.

    Args:
        model_name: Name of the installed spaCy pipeline to load.

    Returns:
        The loaded spaCy pipeline.
    """
    return spacy.load(model_name)


tokenizer, model = load_model()


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined in page order. Pages for which no text
        is extracted contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields None.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Function to generate embeddings using Phi-4
def get_embeddings(text):
    """Embed *text* as the mean of the model's last hidden states.

    The input is truncated to 512 tokens (``truncation=True``,
    ``max_length=512``), so only the beginning of a long document
    contributes to the embedding.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the mean-pooled embedding.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the token dimension, then drop singleton dimensions.
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.numpy()


# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts."""
    emb1 = get_embeddings(text1)
    emb2 = get_embeddings(text2)
    return cosine_similarity([emb1], [emb2])[0][0]


# Function to tokenize text using SpaCy
def tokenize_text(text, nlp_model):
    """Return ``(lowercased text, label)`` pairs for the entities in *text*.

    Args:
        text: The text to process.
        nlp_model: A spaCy pipeline; it is called with the "tagger" and
            "parser" components disabled.

    Returns:
        A list of ``(entity_text_lowercased, entity_label)`` tuples, one per
        entity in ``doc.ents``.
    """
    doc = nlp_model(text, disable=["tagger", "parser"])
    return [(ent.text.lower(), ent.label_) for ent in doc.ents]


# Function to extract CGPA
def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in *resume_text*.

    Both "CGPA: 8.5"-style and "8.5 CGPA"-style mentions are recognised,
    case-insensitively.

    Args:
        resume_text: The resume text to search.

    Returns:
        The value as a ``float``, or ``None`` when no mention is found.
    """
    match = cgpa_regex.search(resume_text)
    if match is None:
        return None
    # Exactly one of the two groups is populated, depending on which
    # alternative matched. (`match.group(1 or 2)` would always read group 1.)
    return float(match.group(1) or match.group(2))


# Streamlit Frontend
st.markdown("# Resume Matching Tool 📃📃")
st.markdown("An application to match resumes with a job description using Phi-4")

# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Load SpaCy model
    nlp = load_nlp()

    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    # NOTE: keyed by file name, so two uploads with the same name collapse
    # into a single entry.
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # A PDF with no extractable text (e.g. scanned images) leaves nothing
    # meaningful to embed, so report it instead of scoring it.
    if not job_description_text.strip():
        st.error("No text could be extracted from the job description PDF.")
        st.stop()
    blank_resumes = [name for name, text in resumes_texts.items() if not text.strip()]
    if blank_resumes:
        st.warning(f"No text could be extracted from: {', '.join(blank_resumes)}")
        resumes_texts = {name: text for name, text in resumes_texts.items() if text.strip()}
    if not resumes_texts:
        st.error("No text could be extracted from any of the uploaded resumes.")
        st.stop()

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details"
        })

    # Show results
    st.dataframe(pd.DataFrame(results))

    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))

    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Display entities
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)

        # Skills matching
        st.write("### Skills Matching")
        skills = {ent.text for ent in doc.ents if ent.label_ == "SKILLS"}
        job_skills = {ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"}
        # Sorted so the displayed order is stable across reruns.
        matched_skills = sorted(skills & job_skills)
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

    # Visualization
    st.write("### Similarity Heatmap")
    raw_skills = st.text_input("Enter skills for heatmap (comma-separated):")
    # "".split(',') yields [''], which is truthy; strip each entry and drop
    # blanks so an empty input box really means "no skills entered".
    skills_keywords = [skill.strip() for skill in raw_skills.split(',') if skill.strip()]

    if skills_keywords:
        heatmap_data = []
        for skill in skills_keywords:
            skill_emb = get_embeddings(skill)
            heatmap_data.append([
                cosine_similarity([emb], [skill_emb])[0][0]
                for emb in resume_embeddings.values()
            ])

        heatmap_df = pd.DataFrame(
            heatmap_data,
            columns=list(resume_embeddings),
            index=skills_keywords,
        )
        # Draw on an explicit Figure rather than pyplot's global state, and
        # close it afterwards so figures do not accumulate across reruns.
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(heatmap_df, annot=True, cmap="YlGnBu", ax=ax)
        st.pyplot(fig)
        plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")