Akshayram1 committed (verified)
Commit 8098013 · 1 Parent(s): 4df9995

Update app.py

Files changed (1): app.py (+26 -36)
app.py CHANGED

@@ -7,11 +7,9 @@ import pandas as pd
 import re
 import matplotlib.pyplot as plt
 import seaborn as sns
-import spacy
 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoModelForCausalLM  # Updated import
 
 # Download necessary NLTK data
 nltk.download('punkt')
@@ -22,16 +20,13 @@ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
 float_digit_regex = re.compile(r'^\d{10}$')
 email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
 
-# Load Phi-4 model and tokenizer
-
-
+# Load Phi-3 model and tokenizer
 @st.cache_resource
 def load_model():
-    model_name = "microsoft/phi-2"  # Verified working alternative
+    model_name = "microsoft/phi-3-mini-4k-instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     return tokenizer, model
-
 
 tokenizer, model = load_model()
 
@@ -43,9 +38,9 @@ def extract_text_from_pdf(pdf_file):
         text += pdf_reader.pages[page_num].extract_text()
     return text
 
-# Function to generate embeddings using Phi-4
+# Function to generate embeddings using Phi-3
 def get_embeddings(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
     with torch.no_grad():
         outputs = model(**inputs)
         embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
@@ -57,30 +52,22 @@ def calculate_similarity(text1, text2):
     emb2 = get_embeddings(text2)
     return cosine_similarity([emb1], [emb2])[0][0]
 
-# Function to tokenize text using SpaCy
-def tokenize_text(text, nlp_model):
-    doc = nlp_model(text, disable=["tagger", "parser"])
-    tokens = [(token.text.lower(), token.label_) for token in doc.ents]
-    return tokens
-
-# Function to extract CGPA
-def extract_cgpa(resume_text):
-    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
-    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
-    return float(match.group(1 or 2)) if match else None
+# Function to extract entities using Phi-3
+def extract_entities(text):
+    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
+    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
+    outputs = model.generate(**inputs, max_length=500)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Streamlit Frontend
 st.markdown("# Resume Matching Tool 📃📃")
-st.markdown("An application to match resumes with a job description using Phi-4")
+st.markdown("An application to match resumes with job descriptions using Phi-3")
 
 # File Upload
 resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
 job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
 
 if resumes_files and job_descriptions_file:
-    # Load SpaCy model
-    nlp = spacy.load("en_Resume_Matching_Keywords")
-
     # Process documents
     job_description_text = extract_text_from_pdf(job_descriptions_file)
     resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
@@ -109,21 +96,24 @@ if resumes_files and job_descriptions_file:
     if selected_resume:
        resume_text = resumes_texts[selected_resume]
 
-        # Entity extraction
-        doc = nlp(resume_text)
-        entities = [(ent.text, ent.label_) for ent in doc.ents]
-
-        # Display entities
+        # Entity extraction using Phi-3
         st.write("### Extracted Entities")
-        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
-        st.dataframe(entity_df)
+        entities = extract_entities(resume_text)
+        st.code(entities, language="json")
 
         # Skills matching
         st.write("### Skills Matching")
-        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
-        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
-        matched_skills = list(set(skills) & set(job_skills))
-        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
+        job_entities = extract_entities(job_description_text)
+
+        # Simple text-based matching
+        resume_skills = re.findall(r'"skills": \[(.*?)\]', entities, re.DOTALL)
+        job_skills = re.findall(r'"skills": \[(.*?)\]', job_entities, re.DOTALL)
+
+        if resume_skills and job_skills:
+            resume_skills_list = [s.strip().lower() for s in resume_skills[0].split(',')]
+            job_skills_list = [s.strip().lower() for s in job_skills[0].split(',')]
+            matched_skills = list(set(resume_skills_list) & set(job_skills_list))
+            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
 
         # Visualization
         st.write("### Similarity Heatmap")
 
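One caveat in the new version: load_model now uses AutoModel, which returns the bare transformer without a language-modeling head, yet extract_entities calls model.generate(...); recent transformers releases refuse to generate from a head-less model. Since get_embeddings only needs hidden states, one way to serve both paths from a single checkpoint is to load the causal-LM class and request hidden states explicitly. A sketch under that assumption (not what the commit does):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/phi-3-mini-4k-instruct"  # same checkpoint as the commit
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The causal-LM class keeps .generate() usable; hidden states stay reachable via a flag
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Mean-pool the last layer, mirroring the app's embedding path
    return outputs.hidden_states[-1].mean(dim=1).squeeze()

def extract_entities(text):
    prompt = f"Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"
    inputs = tokenizer(prompt, return_tensors="pt")
    # max_new_tokens budgets only generated tokens; max_length=500 would also count
    # the (long) prompt and could stop generation immediately
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)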
 
 
 
 
 