import os
import PyPDF2
from transformers import BertTokenizer, BertModel
from transformers import LongformerModel, LongformerTokenizer
from transformers import BigBirdModel, BigBirdTokenizer
import numpy as np
import gradio as gr
from pathlib import Path
import torch
# Load each tokenizer/model pair under its own name so the embedding helpers
# below use the correct weights (a single shared `model` variable would be
# overwritten by whichever checkpoint was loaded last).

# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Load the BigBird tokenizer and model
bigbird_tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
bigbird_model = BigBirdModel.from_pretrained('google/bigbird-roberta-base')

# Load the Longformer tokenizer and model
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
# Longformer embedding
def get_longformer_embedding(text):
    # Tokenize the text (Longformer handles sequences up to 4096 tokens)
    inputs = longformer_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=4096)
    # Get the embeddings from Longformer
    with torch.no_grad():
        outputs = longformer_model(**inputs)
    # Use the [CLS] token's embedding as the aggregate representation
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding
# BigBird embedding
def get_bigbird_embedding(text):
    # Tokenize the text (BigBird handles sequences up to 4096 tokens)
    inputs = bigbird_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=4096)
    # Get the embeddings from BigBird
    with torch.no_grad():
        outputs = bigbird_model(**inputs)
    # Use the [CLS] token's embedding as the aggregate representation
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding
# BERT embedding
def get_bert_embedding(text):
    # Tokenize the text (BERT is limited to 512 tokens, so longer documents are truncated)
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Get the embeddings from BERT
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Use the [CLS] token's embedding as the aggregate representation
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding
# List the files that sit alongside an uploaded file (helper; not wired into the UI below)
def process_folder(file):
    folder_path = os.path.dirname(file.name)  # Get the directory of the selected file
    files = os.listdir(folder_path)  # List all files in the directory
    file_paths = [os.path.join(folder_path, f) for f in files]  # Get full paths of all files
    return f"Files in folder: {', '.join(files)}"
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    text = ''
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ''
    return text
def calculate_cosine(embedding1, embedding2):
    # Calculate the dot product and magnitudes of the embeddings
    dot_product = np.dot(embedding1, embedding2)
    magnitude1 = np.linalg.norm(embedding1)
    magnitude2 = np.linalg.norm(embedding2)
    # Calculate cosine similarity
    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity
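
# Quick sanity check (hypothetical vectors, not model outputs): vectors pointing the
# same way score 1.0, orthogonal vectors score 0.0, e.g.
#   calculate_cosine(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> 1.0
#   calculate_cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # -> 0.0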
def foo(files, JD):
    # Extract text and compute embeddings for the job description using the different models
    text_jd = extract_text_from_pdf(JD)
    JD_embedding_bert = get_bert_embedding(text_jd).flatten()  # Flatten to match the resume embeddings
    JD_embedding_longformer = get_longformer_embedding(text_jd).flatten()
    JD_embedding_bigbird = get_bigbird_embedding(text_jd).flatten()

    sim = []
    for d in files:
        text = extract_text_from_pdf(d)

        # Compute embeddings for the resume using the different models
        resume_embedding_bert = get_bert_embedding(text).flatten()
        resume_embedding_longformer = get_longformer_embedding(text).flatten()
        resume_embedding_bigbird = get_bigbird_embedding(text).flatten()

        # Calculate cosine similarity for each model
        similarity_bert = calculate_cosine(resume_embedding_bert, JD_embedding_bert)
        similarity_longformer = calculate_cosine(resume_embedding_longformer, JD_embedding_longformer)
        similarity_bigbird = calculate_cosine(resume_embedding_bigbird, JD_embedding_bigbird)

        # Append the results to the list
        sim.append(f"\nFile: {d.name}\n"
                   f"Bert Similarity: {similarity_bert:.4f}\n"
                   f"Longformer Similarity: {similarity_longformer:.4f}\n"
                   f"BigBird Similarity: {similarity_bigbird:.4f}\n")

    return "\n".join(sim)  # Join the list into a single string for the Gradio output
with gr.Blocks() as func:
    inputs = [gr.File(file_count="multiple", label="Upload Resume Files"), gr.File(label="Upload Job Description")]
    outputs = gr.Textbox(label="Similarity Scores")
    show = gr.Button(value="Calculate Similarity")
    show.click(foo, inputs, outputs)

func.launch()
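
# Usage note (assuming the script is saved as app.py): running `python app.py` starts a local
# Gradio server; upload one or more resume PDFs plus a job-description PDF and click
# "Calculate Similarity" to get BERT, Longformer, and BigBird cosine scores for each resume.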