File size: 5,413 Bytes
fde7f4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Import necessary libraries
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data (tokenizer models used by word_tokenize)
nltk.download('punkt')

# Define regular expressions for pattern matching.
# NOTE(review): none of these four patterns are referenced elsewhere in this
# file — possibly leftovers from an earlier version; confirm before removing.
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')  # 1-2 digit number, optional 2-dp fraction
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'  # plain string, not pre-compiled
float_digit_regex = re.compile(r'^\d{10}$')  # exactly 10 digits (phone-number shaped)
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')  # NOTE(review): '.' is unescaped (matches any char) — confirm intent

# Load Phi-4 model and tokenizer
@st.cache_resource
def load_model():
    """Load and return the HF tokenizer and model used for embeddings.

    Decorated with st.cache_resource so the expensive download /
    initialisation runs only once per Streamlit server process; later
    reruns of the script reuse the cached pair.

    Returns:
        tuple: (tokenizer, model) from Hugging Face transformers.
    """
    model_name = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Module-level globals used by get_embeddings() below.
tokenizer, model = load_model()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A path or binary file-like object accepted by
            PyPDF2.PdfReader (Streamlit's UploadedFile works directly).

    Returns:
        str: All page texts joined together; "" for an empty PDF.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Iterate pages directly (not range(len(...))) and join once instead of
    # quadratic `+=`.  extract_text() may return None for pages with no
    # extractable text, so coalesce to "" rather than crashing the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to generate embeddings using Phi-4
def get_embeddings(text):
    """Embed *text* as the mean of the model's last hidden states.

    The input is padded/truncated to at most 512 tokens; the resulting
    hidden states are mean-pooled over the sequence axis and returned as
    a 1-D numpy array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only — no gradients needed.
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts."""
    vec_a = get_embeddings(text1)
    vec_b = get_embeddings(text2)
    # cosine_similarity expects 2-D inputs, hence the single-row wrappers.
    score_matrix = cosine_similarity([vec_a], [vec_b])
    return score_matrix[0][0]

# Function to tokenize text using SpaCy
def tokenize_text(text, nlp_model):
    """Run *nlp_model* over *text* and return its named entities.

    NOTE(review): despite the name, this returns (entity_text_lowercased,
    entity_label) pairs from doc.ents, not word tokens — confirm callers
    expect entities.  The tagger and parser pipeline components are
    disabled for speed (spaCy v3 Language.__call__ `disable` keyword).
    """
    doc = nlp_model(text, disable=["tagger", "parser"])
    tokens = [(token.text.lower(), token.label_) for token in doc.ents]
    return tokens

# Function to extract CGPA
def extract_cgpa(resume_text):
    """Extract a CGPA/GPA value from free-form resume text.

    Matches either label-first phrasing ("CGPA: 8.5") or number-first
    phrasing ("8.5 CGPA"), case-insensitively.

    Args:
        resume_text: Raw resume text to search.

    Returns:
        float | None: The first CGPA found, or None when absent.
    """
    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if not match:
        return None
    # BUG FIX: the original used match.group(1 or 2), where `1 or 2`
    # evaluates to 1 — so the number-first alternative left group(1) as
    # None and float(None) raised TypeError.  Use whichever group matched.
    value = match.group(1) or match.group(2)
    return float(value)

# Streamlit Frontend: page header.
st.markdown("# Resume Matching Tool πŸ“ƒπŸ“ƒ")
st.markdown("An application to match resumes with a job description using Phi-4")

# File Upload: multiple resume PDFs plus a single job-description PDF.
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Load SpaCy model.
    # NOTE(review): "en_Resume_Matching_Keywords" is a custom NER package —
    # it must be pip-installed in the environment; confirm availability.
    nlp = spacy.load("en_Resume_Matching_Keywords")
    
    # Extract plain text from the job description and each resume PDF.
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
    
    # Generate embeddings for the job description and every resume.
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}
    
    # Calculate similarities: cosine similarity scaled to a percentage.
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details"
        })
    
    # Show results as a ranked-style table.
    st.dataframe(pd.DataFrame(results))
    
    # Detailed analysis of one user-selected resume.
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
    
    if selected_resume:
        resume_text = resumes_texts[selected_resume]
        
        # Entity extraction with the custom NER pipeline.
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        
        # Display entities in a two-column table.
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)
        
        # Skills matching: intersect entities labelled "SKILLS" in the resume
        # with those in the job description (assumes the custom model emits
        # a "SKILLS" label — TODO confirm).
        st.write("### Skills Matching")
        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
        matched_skills = list(set(skills) & set(job_skills))
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

        # Visualization: per-skill vs per-resume cosine-similarity heatmap.
        st.write("### Similarity Heatmap")
        skills_keywords = st.text_input("Enter skills for heatmap (comma-separated):").split(',')
        
        # NOTE(review): an empty text input splits to [''], which is truthy,
        # so this branch also runs with a single empty "skill" — confirm
        # whether that is intended.
        if skills_keywords:
            heatmap_data = []
            for skill in skills_keywords:
                skill_emb = get_embeddings(skill.strip())
                row = []  # one similarity per resume, in dict-iteration order
                for name, emb in resume_embeddings.items():
                    row.append(cosine_similarity([emb], [skill_emb])[0][0])
                heatmap_data.append(row)
            
            plt.figure(figsize=(12, 8))
            sns.heatmap(pd.DataFrame(heatmap_data, 
                        columns=list(resumes_texts.keys()), 
                        index=skills_keywords),
                        annot=True, cmap="YlGnBu")
            st.pyplot(plt)
else:
    # At least one of the two uploads is missing — prompt the user.
    st.warning("Please upload both resumes and job description to proceed.")