Spaces:
Sleeping
Sleeping
Update utils/mistral.py
Browse files- utils/mistral.py +25 -2
utils/mistral.py
CHANGED
@@ -42,7 +42,7 @@ def Data_Cleaner(text):
|
|
42 |
def Model_ProfessionalDetails_Output(resume, client):
|
43 |
system_role = {
|
44 |
"role": "system",
|
45 |
-
"content": "You are a skilled resume parser. Your task is to extract
|
46 |
}
|
47 |
user_prompt = {
|
48 |
"role": "user",
|
@@ -292,7 +292,25 @@ def validate_contact_email(personal_data):
|
|
292 |
|
293 |
return valid_contact, invalid_contact, valid_email, invalid_email
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
def process_resume_data(file_path):
|
297 |
resume_text, hyperlinks = extract_text_based_on_format(file_path)
|
298 |
print("Resume converted to text successfully.")
|
@@ -312,6 +330,11 @@ def process_resume_data(file_path):
|
|
312 |
# Extract professional details using Mistral
|
313 |
pro_data = Model_ProfessionalDetails_Output(resume_text, client)
|
314 |
print(pro_data)
|
|
|
|
|
|
|
|
|
|
|
315 |
# Check if per_data and pro_data have been populated correctly
|
316 |
if not per_data:
|
317 |
logging.warning("Mistral personal data extraction failed.")
|
@@ -360,7 +383,7 @@ def process_resume_data(file_path):
|
|
360 |
|
361 |
#Appending the list if any available as a text
|
362 |
result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
|
363 |
-
|
364 |
#Added the validator for details, Validate contact and email
|
365 |
valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
|
366 |
result['personal']['valid_contact'] = valid_contact
|
|
|
42 |
def Model_ProfessionalDetails_Output(resume, client):
|
43 |
system_role = {
|
44 |
"role": "system",
|
45 |
+
"content": "You are a skilled resume parser. Your task is to extract Professional details as well as Academic details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
|
46 |
}
|
47 |
user_prompt = {
|
48 |
"role": "user",
|
|
|
292 |
|
293 |
return valid_contact, invalid_contact, valid_email, invalid_email
|
294 |
|
295 |
+
# Fallback: extract links with a regex in case the model does not extract the contact details
|
296 |
+
def extract_link_details(text):
    """Return website/profile links found in *text* using regular expressions.

    A link is an optional ``http://``/``https://`` scheme, an optional
    ``www.`` prefix, one host label and one of a fixed list of top-level
    domains. Paths after the domain are not captured.

    Matches shorter than 11 characters are discarded, and so is any match
    that lies inside an e-mail address found in the same text (for example
    the ``protonmail.com`` part of ``john@protonmail.com``).

    Args:
        text: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched link strings, in order of appearance. Duplicates
        are kept.
    """
    if not text:
        return []

    # Email regex
    email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # URL and links regex
    link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')

    # Positions of every e-mail address; used to reject link matches that are
    # really just the domain part of an address.
    email_spans = [match.span() for match in email_regex.finditer(text)]

    links_RE = []
    for match in link_regex.finditer(text):
        link = match.group()
        # Very short matches are dropped (threshold kept from the original code).
        if len(link) < 11:
            continue
        start, end = match.span()
        # A link match never contains '@', so a substring test of the e-mail
        # against the link can never succeed; compare positions instead.
        if any(e_start <= start and end <= e_end for e_start, e_end in email_spans):
            continue
        links_RE.append(link)

    return links_RE
|
313 |
+
|
314 |
def process_resume_data(file_path):
|
315 |
resume_text, hyperlinks = extract_text_based_on_format(file_path)
|
316 |
print("Resume converted to text successfully.")
|
|
|
330 |
# Extract professional details using Mistral
|
331 |
pro_data = Model_ProfessionalDetails_Output(resume_text, client)
|
332 |
print(pro_data)
|
333 |
+
|
334 |
+
# Extract link using Regular Expression
|
335 |
+
links = extract_link_details(resume_text)
|
336 |
+
print(links)
|
337 |
+
|
338 |
# Check if per_data and pro_data have been populated correctly
|
339 |
if not per_data:
|
340 |
logging.warning("Mistral personal data extraction failed.")
|
|
|
383 |
|
384 |
#Appending the list if any available as a text
|
385 |
result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
|
386 |
+
result['personal']['other_links'] += links
|
387 |
#Added the validator for details, Validate contact and email
|
388 |
valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
|
389 |
result['personal']['valid_contact'] = valid_contact
|