WebashalarForML committed on
Commit
d581df7
·
verified ·
1 Parent(s): f08d659

Update utils/mistral.py

Browse files
Files changed (1) hide show
  1. utils/mistral.py +25 -2
utils/mistral.py CHANGED
@@ -42,7 +42,7 @@ def Data_Cleaner(text):
42
  def Model_ProfessionalDetails_Output(resume, client):
43
  system_role = {
44
  "role": "system",
45
- "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
46
  }
47
  user_prompt = {
48
  "role": "user",
@@ -292,7 +292,25 @@ def validate_contact_email(personal_data):
292
 
293
  return valid_contact, invalid_contact, valid_email, invalid_email
294
 
 
 
 
 
 
 
 
 
 
 
 
295
 
 
 
 
 
 
 
 
296
  def process_resume_data(file_path):
297
  resume_text, hyperlinks = extract_text_based_on_format(file_path)
298
  print("Resume converted to text successfully.")
@@ -312,6 +330,11 @@ def process_resume_data(file_path):
312
  # Extract professional details using Mistral
313
  pro_data = Model_ProfessionalDetails_Output(resume_text, client)
314
  print(pro_data)
 
 
 
 
 
315
  # Check if per_data and pro_data have been populated correctly
316
  if not per_data:
317
  logging.warning("Mistral personal data extraction failed.")
@@ -360,7 +383,7 @@ def process_resume_data(file_path):
360
 
361
  #Appending the list if any available as a text
362
  result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
363
-
364
  #Added the validator for details, Validate contact and email
365
  valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
366
  result['personal']['valid_contact'] = valid_contact
 
42
  def Model_ProfessionalDetails_Output(resume, client):
43
  system_role = {
44
  "role": "system",
45
+ "content": "You are a skilled resume parser. Your task is to extract Professional details as well as Academic details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
46
  }
47
  user_prompt = {
48
  "role": "user",
 
292
 
293
  return valid_contact, invalid_contact, valid_email, invalid_email
294
 
295
+ # Extract links with a regex as a fallback when the model doesn't extract contact details
296
def extract_link_details(text):
    """Return link-like domains found in *text*, excluding email domains.

    The text is scanned for emails and for link candidates (an optional
    ``http(s)://`` scheme and ``www.`` prefix, one hostname label, and one
    of a fixed list of TLDs). Only the scheme/host part is matched; any
    path after the TLD is not included in the returned string.

    Args:
        text: Plain text to scan (e.g. the text of a resume).

    Returns:
        A list of matched link strings in order of appearance. Matches
        shorter than 11 characters are dropped, as are matches that
        overlap an email address found in the same text. Duplicates are
        kept.
    """
    # Email regex
    email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # URL and links regex; the TLD list is a fixed allow-list
    link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')

    # Character ranges occupied by emails. A link match can never contain
    # '@', so checking ``email in link`` never filters anything; comparing
    # positions is what actually removes email domains such as the
    # "protonmail.com" in "john@protonmail.com".
    email_spans = [match.span() for match in email_regex.finditer(text)]

    links_RE = []
    for match in link_regex.finditer(text):
        link = match.group()

        # Drop very short matches (same threshold as before).
        if len(link) < 11:
            continue

        # Skip candidates that are part of an email address.
        start, end = match.span()
        if any(start < e_end and end > e_start for e_start, e_end in email_spans):
            continue

        links_RE.append(link)

    return links_RE
313
+
314
  def process_resume_data(file_path):
315
  resume_text, hyperlinks = extract_text_based_on_format(file_path)
316
  print("Resume converted to text successfully.")
 
330
  # Extract professional details using Mistral
331
  pro_data = Model_ProfessionalDetails_Output(resume_text, client)
332
  print(pro_data)
333
+
334
+ # Extract link using Regular Expression
335
+ links = extract_link_details(resume_text)
336
+ print(links)
337
+
338
  # Check if per_data and pro_data have been populated correctly
339
  if not per_data:
340
  logging.warning("Mistral personal data extraction failed.")
 
383
 
384
  #Appending the list if any available as a text
385
  result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
386
+ result['personal']['other_links'] += links
387
  #Added the validator for details, Validate contact and email
388
  valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
389
  result['personal']['valid_contact'] = valid_contact