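# Resume Matching Tool: a Streamlit app that scores resumes against a job
# description using embeddings from a small instruct model.
#
# A minimal setup sketch, assuming this file is saved as app.py (the package
# list mirrors the imports below; pin versions as needed):
#
#   pip install streamlit PyPDF2 pandas matplotlib seaborn transformers torch scikit-learn
#   streamlit run app.py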
# Import necessary libraries
import streamlit as st
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the Phi-4-mini model and tokenizer (cached by Streamlit so the
# weights are only loaded once per session)
@st.cache_resource
def load_model():
    model_name = "microsoft/Phi-4-mini-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # AutoModelForCausalLM is required because extract_entities() calls
    # model.generate(); the bare AutoModel has no generation head.
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    model.eval()
    return tokenizer, model

tokenizer, model = load_model()
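
# Note: this is a multi-billion-parameter model, so loading on CPU is slow
# and memory-hungry. With a CUDA GPU, a common variant (an assumption, not
# part of the original app; device_map needs the accelerate package) is:
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, trust_remote_code=True,
#       torch_dtype=torch.float16, device_map="auto")
# which would also require moving tokenized inputs to the model's device.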

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text

# Function to generate embeddings by mean-pooling the model's last hidden layer
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    with torch.no_grad():
        # output_hidden_states=True exposes the hidden layers of the causal LM
        outputs = model(**inputs, output_hidden_states=True)
    embeddings = outputs.hidden_states[-1].mean(dim=1).squeeze()
    return embeddings.numpy()
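
# Illustrative sanity check (hypothetical strings, not part of the app flow):
#   v1 = get_embeddings("Python developer with NLP experience")
#   v2 = get_embeddings("Hiring an NLP engineer")
#   cosine_similarity([v1], [v2])[0][0]  # values near 1.0 mean more similar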

# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    emb1 = get_embeddings(text1)
    emb2 = get_embeddings(text2)
    return cosine_similarity([emb1], [emb2])[0][0]
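
# Note: the main flow below computes each document's embedding once and calls
# cosine_similarity on the cached vectors, so this helper is only a
# convenience for one-off comparisons.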

# Function to extract structured entities from text with a generation prompt
def extract_entities(text):
    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # max_new_tokens bounds the generated output; max_length would count
        # the prompt tokens as well
        outputs = model.generate(**inputs, max_new_tokens=500)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
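
# The prompt requests output shaped like the following, though a generative
# model can deviate and may not emit valid JSON (hence the regex fallback in
# the UI section below):
#   {"skills": ["python", "sql"], "education": ["..."], "experience": ["..."]}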

# Streamlit Frontend
st.markdown("# Resume Matching Tool πŸ“ƒπŸ“ƒ")
st.markdown("An application to match resumes with job descriptions using Phi-3")

# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resume PDFs", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
    
    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}
    
    # Calculate similarities
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%"
        })
    
    # Show results
    st.dataframe(pd.DataFrame(results))
    
    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
    
    if selected_resume:
        resume_text = resumes_texts[selected_resume]
        
        # Entity extraction using Phi-3
        st.write("### Extracted Entities")
        entities = extract_entities(resume_text)
        st.code(entities, language="json")
        
        # Skills matching
        st.write("### Skills Matching")
        job_entities = extract_entities(job_description_text)
        
        # Simple text-based matching on the generated JSON (tolerant of spacing)
        resume_skills = re.findall(r'"skills"\s*:\s*\[(.*?)\]', entities, re.DOTALL)
        job_skills = re.findall(r'"skills"\s*:\s*\[(.*?)\]', job_entities, re.DOTALL)

        if resume_skills and job_skills:
            # Strip surrounding quotes so '"Python"' and "'python'" compare cleanly
            resume_skills_list = [s.strip().strip('"\'').lower() for s in resume_skills[0].split(',')]
            job_skills_list = [s.strip().strip('"\'').lower() for s in job_skills[0].split(',')]
            matched_skills = sorted(set(resume_skills_list) & set(job_skills_list))
            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

        # Visualization
        st.write("### Similarity Heatmap")
        skills_input = st.text_input("Enter skills for heatmap (comma-separated):")
        # Filter out empty entries so a blank input doesn't trigger the plot
        skills_keywords = [s.strip() for s in skills_input.split(',') if s.strip()]

        if skills_keywords:
            heatmap_data = []
            for skill in skills_keywords:
                skill_emb = get_embeddings(skill)
                row = []
                for name, emb in resume_embeddings.items():
                    row.append(cosine_similarity([emb], [skill_emb])[0][0])
                heatmap_data.append(row)

            fig = plt.figure(figsize=(12, 8))
            sns.heatmap(pd.DataFrame(heatmap_data,
                                     columns=list(resumes_texts.keys()),
                                     index=skills_keywords),
                        annot=True, cmap="YlGnBu")
            st.pyplot(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")