WebashalarForML committed on
Commit
be55e80
·
verified ·
1 Parent(s): 3c92024

Update utils/mistral.py

Browse files
Files changed (1) hide show
  1. utils/mistral.py +378 -377
utils/mistral.py CHANGED
@@ -1,377 +1,378 @@
1
- # mistral.py
2
- import os
3
- import json
4
- import logging
5
- from huggingface_hub import InferenceClient
6
- from huggingface_hub.utils._errors import BadRequestError
7
- from dotenv import load_dotenv
8
- from utils.fileTotext import extract_text_based_on_format
9
- import re
10
- from utils.spacy import Parser_from_model
11
-
12
- # Load environment variables from .env file
13
- load_dotenv()
14
-
15
- # Authenticate with Hugging Face
16
- HFT = os.getenv('HF_TOKEN')
17
- if not HFT:
18
- raise ValueError("Hugging Face token is not set in environment variables.")
19
- client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
20
-
21
- # Function to clean model output
22
- def Data_Cleaner(text):
23
- pattern = r".*?format:"
24
- result = re.split(pattern, text, maxsplit=1)
25
- if len(result) > 1:
26
- text_after_format = result[1].strip().strip('`').strip('json')
27
- else:
28
- text_after_format = text.strip().strip('`').strip('json')
29
-
30
- return text_after_format
31
-
32
- # Function to call Mistral and process output
33
- def Model_ProfessionalDetails_Output(resume, client):
34
- system_role = {
35
- "role": "system",
36
- "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
37
- }
38
- user_prompt = {
39
- "role": "user",
40
- "content": f'''Act as a resume parser for the following text given in text: {resume}
41
- Extract the text in the following output JSON string as:
42
- {{
43
- "professional": {{
44
- "technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",
45
- "non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",
46
- "tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",
47
- "projects": "Extract the names or titles of all projects mentioned in the resume.",
48
- "projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",
49
- "experience": "Calculate total professional work experience in years and months based on the resume.",
50
- "companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",
51
- "certifications": "Extract and list all certifications obtained as stated in the resume.",
52
- "roles": "Include the names of all job titles or roles held as indicated in the resume.",
53
- "qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",
54
- "courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",
55
- "university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",
56
- "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
57
- }}
58
- }}
59
- Json Output:
60
- '''
61
- }
62
-
63
-
64
- response = ""
65
- for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=3000, stream=True, temperature=0.35):
66
- response += message.choices[0].delta.content
67
-
68
- try:
69
- clean_response = Data_Cleaner(response)
70
- parsed_response = json.loads(clean_response)
71
- except json.JSONDecodeError as e:
72
- logging.error(f"JSON Decode Error: {e}")
73
- return {}
74
-
75
- return parsed_response
76
-
77
- def Model_PersonalDetails_Output(resume, client):
78
- system_role = {
79
- "role": "system",
80
- "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
81
- }
82
- user_prompt = {
83
- "role": "user",
84
- "content": f'''Act as a resume parser for the following text given in text: {resume}
85
- Extract the text in the following output JSON string as:
86
- {{
87
- "personal": {{
88
- "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
89
- "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
90
- "email": "Extract the email address from the resume. If not found, return 'No email listed'.",
91
- "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
92
- "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
93
- }}
94
- }}
95
- output:
96
- '''
97
- }
98
-
99
- # Response
100
- response = ""
101
- for message in client.chat_completion(
102
- messages=[system_role, user_prompt],
103
- max_tokens=3000,
104
- stream=True,
105
- temperature=0.35,
106
- ):
107
- response += message.choices[0].delta.content
108
-
109
- # Handle cases where the response might have formatting issues
110
- try:
111
- #print('The Og response:-->',response)
112
- clean_response=Data_Cleaner(response)
113
- #print("After data cleaning",clean_response)
114
- parsed_response = json.loads(clean_response)
115
-
116
- except json.JSONDecodeError as e:
117
- print("JSON Decode Error:", e)
118
- print("Raw Response:", response)
119
- return {}
120
-
121
- return parsed_response
122
-
123
-
124
- # # Fallback to SpaCy if Mistral fails
125
-
126
- # Add regex pattern for LinkedIn and GitHub links
127
- linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
128
- github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
129
- email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
130
- contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
131
-
132
- def extract_links(hyperlinks):
133
- linkedin_links = []
134
- github_links = []
135
-
136
- # Iterate through the hyperlinks and apply regex to find LinkedIn and GitHub links
137
- for link in hyperlinks:
138
- if re.match(linkedin_pattern, link):
139
- linkedin_links.append(link)
140
- elif re.match(github_pattern, link):
141
- github_links.append(link)
142
-
143
- return linkedin_links, github_links
144
-
145
- def is_valid_email(email):
146
- email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
147
- return re.match(email_regex, email) is not None
148
-
149
- def is_valid_contact(contact):
150
- patterns = [
151
- r'^\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}$', # +91 with optional 0 and separators
152
- r'^\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}$', # +91 with 10 digits separated
153
- r'^\d{5}[\s\-\.\/]?\d{5}$', # Local format without country code
154
- r'^\+91[\s\.\-\/]?\d{10}$', # +91 with 10 digits together
155
- r'^\d{10}$', # 10 digits together
156
- r'^\+91[\s\.\-\/]?\(?\d{5}\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}$' # +91 with varying separators
157
- r'\+1\s\(\d{3}\)\s\d{3}-\d{4} ', # USA/Canada Intl +1 (XXX) XXX-XXXX
158
- r'\(\d{3}\)\s\d{3}-\d{4} ', # USA/Canada STD (XXX) XXX-XXXX
159
- r'\(\d{3}\)\s\d{3}\s\d{4} ', # USA/Canada (XXX) XXX XXXX
160
- r'\(\d{3}\)\s\d{3}\s\d{3} ', # USA/Canada (XXX) XXX XXX
161
- r'\+1\d{10} ', # +1 XXXXXXXXXX
162
- r'\d{10} ', # XXXXXXXXXX
163
- r'\+44\s\d{4}\s\d{6} ', # UK Intl +44 XXXX XXXXXX
164
- r'\+44\s\d{3}\s\d{3}\s\d{4} ', # UK Intl +44 XXX XXX XXXX
165
- r'0\d{4}\s\d{6} ', # UK STD 0XXXX XXXXXX
166
- r'0\d{3}\s\d{3}\s\d{4} ', # UK STD 0XXX XXX XXXX
167
- r'\+44\d{10} ', # +44 XXXXXXXXXX
168
- r'0\d{10} ', # 0XXXXXXXXXX
169
- r'\+61\s\d\s\d{4}\s\d{4} ', # Australia Intl +61 X XXXX XXXX
170
- r'0\d\s\d{4}\s\d{4} ', # Australia STD 0X XXXX XXXX
171
- r'\+61\d{9} ', # +61 XXXXXXXXX
172
- r'0\d{9} ', # 0XXXXXXXXX
173
- r'\+91\s\d{5}-\d{5} ', # India Intl +91 XXXXX-XXXXX
174
- r'\+91\s\d{4}-\d{6} ', # India Intl +91 XXXX-XXXXXX
175
- r'\+91\s\d{10} ', # India Intl +91 XXXXXXXXXX
176
- r'0\d{2}-\d{7} ', # India STD 0XX-XXXXXXX
177
- r'\+91\d{10} ', # +91 XXXXXXXXXX
178
- r'\+49\s\d{4}\s\d{8} ', # Germany Intl +49 XXXX XXXXXXXX
179
- r'\+49\s\d{3}\s\d{7} ', # Germany Intl +49 XXX XXXXXXX
180
- r'0\d{3}\s\d{8} ', # Germany STD 0XXX XXXXXXXX
181
- r'\+49\d{12} ', # +49 XXXXXXXXXXXX
182
- r'\+49\d{10} ', # +49 XXXXXXXXXX
183
- r'0\d{11} ', # 0XXXXXXXXXXX
184
- r'\+86\s\d{3}\s\d{4}\s\d{4} ', # China Intl +86 XXX XXXX XXXX
185
- r'0\d{3}\s\d{4}\s\d{4} ', # China STD 0XXX XXXX XXXX
186
- r'\+86\d{11} ', # +86 XXXXXXXXXXX
187
- r'\+81\s\d\s\d{4}\s\d{4} ', # Japan Intl +81 X XXXX XXXX
188
- r'\+81\s\d{2}\s\d{4}\s\d{4} ', # Japan Intl +81 XX XXXX XXXX
189
- r'0\d\s\d{4}\s\d{4} ', # Japan STD 0X XXXX XXXX
190
- r'\+81\d{10} ', # +81 XXXXXXXXXX
191
- r'\+81\d{9} ', # +81 XXXXXXXXX
192
- r'0\d{9} ', # 0XXXXXXXXX
193
- r'\+55\s\d{2}\s\d{5}-\d{4} ', # Brazil Intl +55 XX XXXXX-XXXX
194
- r'\+55\s\d{2}\s\d{4}-\d{4} ', # Brazil Intl +55 XX XXXX-XXXX
195
- r'0\d{2}\s\d{4}\s\d{4} ', # Brazil STD 0XX XXXX XXXX
196
- r'\+55\d{11} ', # +55 XXXXXXXXXXX
197
- r'\+55\d{10} ', # +55 XXXXXXXXXX
198
- r'0\d{10} ', # 0XXXXXXXXXX
199
- r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} ', # France Intl +33 X XX XX XX XX
200
- r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} ', # France STD 0X XX XX XX XX
201
- r'\+33\d{9} ', # +33 XXXXXXXXX
202
- r'0\d{9} ', # 0XXXXXXXXX
203
- r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2} ', # Russia Intl +7 XXX XXX-XX-XX
204
- r'8\s\d{3}\s\d{3}-\d{2}-\d{2} ', # Russia STD 8 XXX XXX-XX-XX
205
- r'\+7\d{10} ', # +7 XXXXXXXXXX
206
- r'8\d{10} ', # 8 XXXXXXXXXX
207
- r'\+27\s\d{2}\s\d{3}\s\d{4} ', # South Africa Intl +27 XX XXX XXXX
208
- r'0\d{2}\s\d{3}\s\d{4} ', # South Africa STD 0XX XXX XXXX
209
- r'\+27\d{9} ', # +27 XXXXXXXXX
210
- r'0\d{9} ', # 0XXXXXXXXX
211
- r'\+52\s\d{3}\s\d{3}\s\d{4} ', # Mexico Intl +52 XXX XXX XXXX
212
- r'\+52\s\d{2}\s\d{4}\s\d{4} ', # Mexico Intl +52 XX XXXX XXXX
213
- r'01\s\d{3}\s\d{4} ', # Mexico STD 01 XXX XXXX
214
- r'\+52\d{10} ', # +52 XXXXXXXXXX
215
- r'01\d{7} ', # 01 XXXXXXX
216
- r'\+234\s\d{3}\s\d{3}\s\d{4} ', # Nigeria Intl +234 XXX XXX XXXX
217
- r'0\d{3}\s\d{3}\s\d{4} ', # Nigeria STD 0XXX XXX XXXX
218
- r'\+234\d{10} ', # +234 XXXXXXXXXX
219
- r'0\d{10} ', # 0XXXXXXXXXX
220
- r'\+971\s\d\s\d{3}\s\d{4} ', # UAE Intl +971 X XXX XXXX
221
- r'0\d\s\d{3}\s\d{4} ', # UAE STD 0X XXX XXXX
222
- r'\+971\d{8} ', # +971 XXXXXXXX
223
- r'0\d{8} ', # 0XXXXXXXX
224
- r'\+54\s9\s\d{3}\s\d{3}\s\d{4} ', # Argentina Intl +54 9 XXX XXX XXXX
225
- r'\+54\s\d{1}\s\d{4}\s\d{4} ', # Argentina Intl +54 X XXXX XXXX
226
- r'0\d{3}\s\d{4} ', # Argentina STD 0XXX XXXX
227
- r'\+54\d{10} ', # +54 9 XXXXXXXXXX
228
- r'\+54\d{9} ', # +54 XXXXXXXXX
229
- r'0\d{7} ', # 0XXXXXXX
230
- r'\+966\s\d\s\d{3}\s\d{4} ', # Saudi Intl +966 X XXX XXXX
231
- r'0\d\s\d{3}\s\d{4} ', # Saudi STD 0X XXX XXXX
232
- r'\+966\d{8} ', # +966 XXXXXXXX
233
- r'0\d{8} ', # 0XXXXXXXX
234
- r'\+1\d{10} ', # +1 XXXXXXXXXX
235
- r'\+1\s\d{3}\s\d{3}\s\d{4} ', # +1 XXX XXX XXXX
236
- r'\d{5}\s\d{5} ', # XXXXX XXXXX
237
- r'\d{10} ', # XXXXXXXXXX
238
- r'\+44\d{10} ', # +44 XXXXXXXXXX
239
- r'0\d{10} ', # 0XXXXXXXXXX
240
- r'\+61\d{9} ', # +61 XXXXXXXXX
241
- r'0\d{9} ', # 0XXXXXXXXX
242
- r'\+91\d{10} ', # +91 XXXXXXXXXX
243
- r'\+49\d{12} ', # +49 XXXXXXXXXXXX
244
- r'\+49\d{10} ', # +49 XXXXXXXXXX
245
- r'0\d{11} ', # 0XXXXXXXXXXX
246
- r'\+86\d{11} ', # +86 XXXXXXXXXXX
247
- r'\+81\d{10} ', # +81 XXXXXXXXXX
248
- r'\+81\d{9} ', # +81 XXXXXXXXX
249
- r'0\d{9} ', # 0XXXXXXXXX
250
- r'\+55\d{11} ', # +55 XXXXXXXXXXX
251
- r'\+55\d{10} ', # +55 XXXXXXXXXX
252
- r'0\d{10} ', # 0XXXXXXXXXX
253
- r'\+33\d{9} ', # +33 XXXXXXXXX
254
- r'0\d{9} ', # 0XXXXXXXXX
255
- r'\+7\d{10} ', # +7 XXXXXXXXXX
256
- r'8\d{10} ', # 8 XXXXXXXXXX
257
- r'\+27\d{9} ', # +27 XXXXXXXXX
258
- r'0\d{9} ', # 0XXXXXXXXX (South Africa STD)
259
- r'\+52\d{10} ', # +52 XXXXXXXXXX
260
- r'01\d{7} ', # 01 XXXXXXX
261
- r'\+234\d{10} ', # +234 XXXXXXXXXX
262
- r'0\d{10} ', # 0XXXXXXXXXX
263
- r'\+971\d{8} ', # +971 XXXXXXXX
264
- r'0\d{8} ', # 0XXXXXXXX
265
- r'\+54\s9\s\d{10} ', # +54 9 XXXXXXXXXX
266
- r'\+54\d{9} ', # +54 XXXXXXXXX
267
- r'0\d{7} ', # 0XXXXXXX
268
- r'\+966\d{8} ', # +966 XXXXXXXX
269
- r'0\d{8}' # 0XXXXXXXX
270
- ]
271
-
272
- # Check if the contact matches any of the patterns
273
- return any(re.match(pattern, contact) for pattern in patterns) is not None
274
-
275
-
276
- def validate_contact_email(personal_data):
277
- contact = personal_data.get('contact', 'Not found')
278
- email = personal_data.get('email', 'Not found')
279
-
280
- valid_contact = is_valid_contact(contact) if contact != 'Not found' else False
281
- valid_email = is_valid_email(email) if email != 'Not found' else False
282
-
283
- invalid_contact = 'Invalid contact' if not valid_contact else 'Valid contact'
284
- invalid_email = 'Invalid email' if not valid_email else 'Valid email'
285
-
286
- return valid_contact, invalid_contact, valid_email, invalid_email
287
-
288
-
289
- def process_resume_data(file_path):
290
- resume_text, hyperlinks = extract_text_based_on_format(file_path)
291
- print("Resume converted to text successfully.")
292
-
293
- if not resume_text:
294
- return {"error": "Text extraction failed"}
295
-
296
- # Extract LinkedIn and GitHub links
297
- linkedin_links, github_links = extract_links(hyperlinks)
298
-
299
- # Attempt to use Mistral model for parsing
300
- try:
301
- # Extract personal details using Mistral
302
- per_data = Model_PersonalDetails_Output(resume_text, client)
303
-
304
- # Extract professional details using Mistral
305
- pro_data = Model_ProfessionalDetails_Output(resume_text, client)
306
-
307
- # Check if per_data and pro_data have been populated correctly
308
- if not per_data:
309
- logging.warning("Mistral personal data extraction failed.")
310
- per_data = {}
311
-
312
- if not pro_data:
313
- logging.warning("Mistral professional data extraction failed.")
314
- pro_data = {}
315
-
316
- # Combine both personal and professional details into a structured output
317
- result = {
318
- "personal": {
319
- "name": per_data.get('personal', {}).get('name', 'Not found'),
320
- "contact": per_data.get('personal', {}).get('contact_number', 'Not found'),
321
- "email": per_data.get('personal', {}).get('email', 'Not found'),
322
- "location": per_data.get('personal', {}).get('Address', 'Not found'),
323
- "linkedin": linkedin_links,
324
- "github": github_links,
325
- "other_links": hyperlinks # Store remaining links if needed
326
- },
327
- "professional": {
328
- "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
329
- "non_technical_skills": pro_data.get('professional', {}).get('non_technical_skills', 'Not found'),
330
- "tools": pro_data.get('professional', {}).get('tools', 'Not found'),
331
- "experience": [
332
- {
333
- "company": pro_data.get('professional', {}).get('companies_worked_at', 'Not found'),
334
- "projects": pro_data.get('professional', {}).get('projects', 'Not found'),
335
- "role": pro_data.get('professional', {}).get('worked_as', 'Not found'),
336
- "years": pro_data.get('professional', {}).get('experience', 'Not found'),
337
- "project_experience": pro_data.get('professional', {}).get('projects_experience', 'Not found')
338
- }
339
- ],
340
- "education": [
341
- {
342
- "qualification": pro_data.get('professional', {}).get('qualification', 'Not found'),
343
- "university": pro_data.get('professional', {}).get('university', 'Not found'),
344
- "course": pro_data.get('professional', {}).get('course', 'Not found'),
345
- "certificate": pro_data.get('professional', {}).get('certification', 'Not found')
346
- }
347
- ]
348
- }
349
- }
350
-
351
- # Validate contact and email
352
- valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
353
- result['personal']['valid_contact'] = valid_contact
354
- result['personal']['invalid_contact'] = invalid_contact
355
- result['personal']['valid_email'] = valid_email
356
- result['personal']['invalid_email'] = invalid_email
357
-
358
- # If Mistral produces valid output, return it
359
- if per_data or pro_data:
360
- logging.info("Successfully extracted data using Mistral.")
361
- print("---------Mistral-------")
362
- return result
363
- else:
364
- raise ValueError("Mistral returned no output")
365
-
366
- # Handle HuggingFace API or Mistral model errors
367
- except BadRequestError as e:
368
- logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
369
- print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
370
- except Exception as e:
371
- logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
372
- print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
373
-
374
- # Fallback to SpaCy if Mistral fails
375
- logging.warning("Mistral failed, switching to SpaCy.")
376
- print("---------SpaCy-------")
377
- return Parser_from_model(file_path)
 
 
1
+ # mistral.py
2
+ import os
3
+ import json
4
+ import logging
5
+ from huggingface_hub import InferenceClient
6
+ #from huggingface_hub.utils._errors import BadRequestError
7
+ from huggingface_hub import BadRequestError
8
+ from dotenv import load_dotenv
9
+ from utils.fileTotext import extract_text_based_on_format
10
+ import re
11
+ from utils.spacy import Parser_from_model
12
+
13
# Load environment variables from a .env file so HF_TOKEN can be supplied
# without exporting it in the shell.
load_dotenv()

# Authenticate with Hugging Face. The token is mandatory: this module raises
# ValueError at import time when HF_TOKEN is unset or empty.
HFT = os.getenv('HF_TOKEN')
if not HFT:
    raise ValueError("Hugging Face token is not set in environment variables.")
# Module-level inference client bound to the Mistral-Nemo instruct model;
# process_resume_data passes it to both Model_*_Output helpers.
client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
21
+
22
+ # Function to clean model output
23
def Data_Cleaner(text):
    """Strip chat-model wrapping from *text* and return the bare JSON payload.

    The model frequently wraps its answer in a preamble ending in
    ``format:``, in a Markdown code fence (optionally tagged ``json``), or
    in both, sometimes followed by a closing remark. This helper removes
    that wrapping so the result can be handed to ``json.loads``.

    Args:
        text: Raw text returned by the model.

    Returns:
        The cleaned string. It is not guaranteed to be valid JSON; callers
        are expected to handle ``json.JSONDecodeError``.
    """
    marker = "format:"
    marker_at = text.find(marker)
    # Only treat "format:" as a preamble when it occurs before the payload
    # starts; otherwise a value such as "date format: dd/mm" inside the JSON
    # would cut the payload in half.
    payload_starts = [pos for pos in (text.find("{"), text.find("`")) if pos != -1]
    if marker_at != -1 and (not payload_starts or marker_at < min(payload_starts)):
        text = text[marker_at + len(marker):]

    payload = text.strip()

    # Prefer the contents of a complete fenced block; this also discards any
    # prose the model added before or after the fence.
    fenced = re.search(r"```(?:json)?(.*?)```", payload, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        return fenced.group(1).strip()

    # No complete fence (e.g. output truncated by max_tokens): drop stray
    # backticks and a literal leading "json" tag. Slicing is used instead of
    # str.strip('json'), which removes any of the characters j/s/o/n and
    # would corrupt payloads such as "null".
    payload = payload.strip('`').strip()
    if payload[:4].lower() == 'json':
        payload = payload[4:]
    return payload.strip()
32
+
33
+ # Function to call Mistral and process output
34
def Model_ProfessionalDetails_Output(resume, client):
    """Ask the chat model for the professional section of a resume.

    Sends a system message and a user prompt containing *resume* to
    ``client.chat_completion`` in streaming mode, concatenates the streamed
    text, cleans it with ``Data_Cleaner`` and parses it as JSON.

    Args:
        resume: Plain text of the resume; interpolated into the prompt.
        client: Object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=..., temperature=...)`` that yields chunks with
            ``choices[0].delta.content``.

    Returns:
        The parsed JSON value (the prompt asks for a dict with a
        ``"professional"`` key), or ``{}`` when the reply is not valid JSON.

    Raises:
        Whatever ``client.chat_completion`` raises; it is not caught here.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    # The template lines are kept flush-left so that no source indentation
    # ends up inside the prompt text.
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
Extract the text in the following output JSON string as:
{{
"professional": {{
"technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",
"non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",
"tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",
"projects": "Extract the names or titles of all projects mentioned in the resume.",
"projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",
"experience": "Calculate total professional work experience in years and months based on the resume.",
"companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",
"certifications": "Extract and list all certifications obtained as stated in the resume.",
"roles": "Include the names of all job titles or roles held as indicated in the resume.",
"qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",
"courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",
"university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",
"year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
}}
}}
Json Output:
'''
    }

    chunks = []
    for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=3000, stream=True, temperature=0.35):
        piece = message.choices[0].delta.content
        # A streamed delta may carry no text (content is optional in the
        # stream schema); concatenating None would raise TypeError.
        if piece:
            chunks.append(piece)
    response = "".join(chunks)

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        logging.error(f"JSON Decode Error: {e}")
        return {}

    return parsed_response
77
+
78
def Model_PersonalDetails_Output(resume, client):
    """Ask the chat model for the personal section of a resume.

    Sends a system message and a user prompt containing *resume* to
    ``client.chat_completion`` in streaming mode, concatenates the streamed
    text, cleans it with ``Data_Cleaner`` and parses it as JSON.

    Args:
        resume: Plain text of the resume; interpolated into the prompt.
        client: Object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=..., temperature=...)`` that yields chunks with
            ``choices[0].delta.content``.

    Returns:
        The parsed JSON value (the prompt asks for a dict with a
        ``"personal"`` key), or ``{}`` when the reply is not valid JSON.

    Raises:
        Whatever ``client.chat_completion`` raises; it is not caught here.
    """
    # NOTE(review): this system message speaks of "professional details"
    # although the user prompt below asks for personal details; it looks
    # copied from the professional helper. Left unchanged - confirm intent.
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    # The template lines are kept flush-left so that no source indentation
    # ends up inside the prompt text.
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
Extract the text in the following output JSON string as:
{{
"personal": {{
"name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
"contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
"email": "Extract the email address from the resume. If not found, return 'No email listed'.",
"Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
"link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
}}
}}
output:
'''
    }

    # Collect the streamed reply.
    chunks = []
    for message in client.chat_completion(
        messages=[system_role, user_prompt],
        max_tokens=3000,
        stream=True,
        temperature=0.35,
    ):
        piece = message.choices[0].delta.content
        # A streamed delta may carry no text (content is optional in the
        # stream schema); concatenating None would raise TypeError.
        if piece:
            chunks.append(piece)
    response = "".join(chunks)

    # The reply may still be malformed after cleaning; report and return {}.
    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        print("JSON Decode Error:", e)
        print("Raw Response:", response)
        return {}

    return parsed_response
123
+
124
+
125
+ # # Fallback to SpaCy if Mistral fails
126
+
127
# Regex patterns for LinkedIn and GitHub links: http or https, an optional
# "www.", the site host, then a path of word characters, hyphens and slashes.
# extract_links applies them with re.match, i.e. anchored at the start only.
linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
# Whole-string email shape. is_valid_email defines the same expression
# locally rather than reading this constant.
email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
# Optional leading "+", then 7-15 digits, whitespace, hyphens or parentheses.
# NOTE(review): not referenced by the functions visible in this file - confirm
# whether anything else imports it before removing.
contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
132
+
133
def extract_links(hyperlinks):
    """Pick the LinkedIn and GitHub URLs out of *hyperlinks*.

    Each link is tested against ``linkedin_pattern`` first and
    ``github_pattern`` second, using ``re.match`` (anchored at the start of
    the link). A link lands in at most one list; links matching neither
    pattern are ignored.

    Args:
        hyperlinks: Iterable of URL strings.

    Returns:
        A ``(linkedin_links, github_links)`` tuple of lists, each in the
        order the links were encountered.
    """
    # Insertion order of the dict fixes the precedence: LinkedIn, then GitHub.
    matches_by_pattern = {linkedin_pattern: [], github_pattern: []}

    for url in hyperlinks:
        for pattern, bucket in matches_by_pattern.items():
            if re.match(pattern, url):
                bucket.append(url)
                break

    return matches_by_pattern[linkedin_pattern], matches_by_pattern[github_pattern]
145
+
146
def is_valid_email(email):
    """Return True when *email* looks like a single well-formed address.

    The value comes from model output, so it is not guaranteed to be a
    string. Anything that is not a ``str`` (``None``, a list of addresses,
    ...) is reported as invalid instead of raising ``TypeError``.

    Args:
        email: Candidate email address.

    Returns:
        ``True`` if the value, ignoring surrounding whitespace, consists of
        a local part, ``@``, and a dotted domain; ``False`` otherwise.
    """
    if not isinstance(email, str):
        return False
    email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    # fullmatch makes the whole-string requirement explicit instead of
    # relying on ^...$ anchors.
    return re.fullmatch(email_regex, email.strip()) is not None
149
+
150
# Accepted phone-number layouts, one regex fragment per layout. Fragments are
# written without anchors and without surrounding whitespace; the compiled
# expression below is applied with fullmatch, so each one must cover the
# entire number. Layouts shared by several countries are listed once.
_CONTACT_PATTERNS = (
    # India (flexible separators: whitespace, '.', '-', '/')
    r'\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',          # +91 with optional (0) and separators
    r'\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}',                              # +91 with 10 digits separated
    r'\d{5}[\s\-\.\/]?\d{5}',                                             # local format without country code
    r'\+91[\s\.\-\/]?\d{10}',                                             # +91 with 10 digits together
    r'\d{10}',                                                            # 10 digits together
    r'\+91[\s\.\-\/]?\(?\d{5}\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',        # +91 followed by three 5-digit groups
    r'\+91\s\d{5}-\d{5}',                                                 # +91 XXXXX-XXXXX
    r'\+91\s\d{4}-\d{6}',                                                 # +91 XXXX-XXXXXX
    r'\+91\s\d{10}',                                                      # +91 XXXXXXXXXX
    r'0\d{2}-\d{7}',                                                      # STD 0XX-XXXXXXX
    r'\+91\d{10}',                                                        # +91XXXXXXXXXX
    # USA / Canada
    r'\+1\s\(\d{3}\)\s\d{3}-\d{4}',                                       # +1 (XXX) XXX-XXXX
    r'\(\d{3}\)\s\d{3}-\d{4}',                                            # (XXX) XXX-XXXX
    r'\(\d{3}\)\s\d{3}\s\d{4}',                                           # (XXX) XXX XXXX
    r'\(\d{3}\)\s\d{3}\s\d{3}',                                           # (XXX) XXX XXX
    r'\+1\d{10}',                                                         # +1XXXXXXXXXX
    r'\+1\s\d{3}\s\d{3}\s\d{4}',                                          # +1 XXX XXX XXXX
    # United Kingdom
    r'\+44\s\d{4}\s\d{6}',                                                # +44 XXXX XXXXXX
    r'\+44\s\d{3}\s\d{3}\s\d{4}',                                         # +44 XXX XXX XXXX
    r'0\d{4}\s\d{6}',                                                     # 0XXXX XXXXXX
    r'0\d{3}\s\d{3}\s\d{4}',                                              # 0XXX XXX XXXX (also Nigeria STD)
    r'\+44\d{10}',                                                        # +44XXXXXXXXXX
    r'0\d{10}',                                                           # 0XXXXXXXXXX
    # Australia
    r'\+61\s\d\s\d{4}\s\d{4}',                                            # +61 X XXXX XXXX
    r'0\d\s\d{4}\s\d{4}',                                                 # 0X XXXX XXXX (also Japan STD)
    r'\+61\d{9}',                                                         # +61XXXXXXXXX
    r'0\d{9}',                                                            # 0XXXXXXXXX
    # Germany
    r'\+49\s\d{4}\s\d{8}',                                                # +49 XXXX XXXXXXXX
    r'\+49\s\d{3}\s\d{7}',                                                # +49 XXX XXXXXXX
    r'0\d{3}\s\d{8}',                                                     # 0XXX XXXXXXXX
    r'\+49\d{12}',                                                        # +49XXXXXXXXXXXX
    r'\+49\d{10}',                                                        # +49XXXXXXXXXX
    r'0\d{11}',                                                           # 0XXXXXXXXXXX
    # China
    r'\+86\s\d{3}\s\d{4}\s\d{4}',                                         # +86 XXX XXXX XXXX
    r'0\d{3}\s\d{4}\s\d{4}',                                              # 0XXX XXXX XXXX
    r'\+86\d{11}',                                                        # +86XXXXXXXXXXX
    # Japan
    r'\+81\s\d\s\d{4}\s\d{4}',                                            # +81 X XXXX XXXX
    r'\+81\s\d{2}\s\d{4}\s\d{4}',                                         # +81 XX XXXX XXXX
    r'\+81\d{10}',                                                        # +81XXXXXXXXXX
    r'\+81\d{9}',                                                         # +81XXXXXXXXX
    # Brazil
    r'\+55\s\d{2}\s\d{5}-\d{4}',                                          # +55 XX XXXXX-XXXX
    r'\+55\s\d{2}\s\d{4}-\d{4}',                                          # +55 XX XXXX-XXXX
    r'0\d{2}\s\d{4}\s\d{4}',                                              # 0XX XXXX XXXX
    r'\+55\d{11}',                                                        # +55XXXXXXXXXXX
    r'\+55\d{10}',                                                        # +55XXXXXXXXXX
    # France
    r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',                              # +33 X XX XX XX XX
    r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',                                   # 0X XX XX XX XX
    r'\+33\d{9}',                                                         # +33XXXXXXXXX
    # Russia
    r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2}',                                     # +7 XXX XXX-XX-XX
    r'8\s\d{3}\s\d{3}-\d{2}-\d{2}',                                       # 8 XXX XXX-XX-XX
    r'\+7\d{10}',                                                         # +7XXXXXXXXXX
    r'8\d{10}',                                                           # 8XXXXXXXXXX
    # South Africa
    r'\+27\s\d{2}\s\d{3}\s\d{4}',                                         # +27 XX XXX XXXX
    r'0\d{2}\s\d{3}\s\d{4}',                                              # 0XX XXX XXXX
    r'\+27\d{9}',                                                         # +27XXXXXXXXX
    # Mexico
    r'\+52\s\d{3}\s\d{3}\s\d{4}',                                         # +52 XXX XXX XXXX
    r'\+52\s\d{2}\s\d{4}\s\d{4}',                                         # +52 XX XXXX XXXX
    r'01\s\d{3}\s\d{4}',                                                  # 01 XXX XXXX
    r'\+52\d{10}',                                                        # +52XXXXXXXXXX
    r'01\d{7}',                                                           # 01XXXXXXX
    # Nigeria
    r'\+234\s\d{3}\s\d{3}\s\d{4}',                                        # +234 XXX XXX XXXX
    r'\+234\d{10}',                                                       # +234XXXXXXXXXX
    # United Arab Emirates
    r'\+971\s\d\s\d{3}\s\d{4}',                                           # +971 X XXX XXXX
    r'0\d\s\d{3}\s\d{4}',                                                 # 0X XXX XXXX (also Saudi STD)
    r'\+971\d{8}',                                                        # +971XXXXXXXX
    r'0\d{8}',                                                            # 0XXXXXXXX
    # Argentina
    r'\+54\s9\s\d{3}\s\d{3}\s\d{4}',                                      # +54 9 XXX XXX XXXX
    r'\+54\s\d{1}\s\d{4}\s\d{4}',                                         # +54 X XXXX XXXX
    r'\+54\s9\s\d{10}',                                                   # +54 9 XXXXXXXXXX
    r'0\d{3}\s\d{4}',                                                     # 0XXX XXXX
    r'\+54\d{10}',                                                        # +54XXXXXXXXXX
    r'\+54\d{9}',                                                         # +54XXXXXXXXX
    r'0\d{7}',                                                            # 0XXXXXXX
    # Saudi Arabia
    r'\+966\s\d\s\d{3}\s\d{4}',                                           # +966 X XXX XXXX
    r'\+966\d{8}',                                                        # +966XXXXXXXX
)

# One alternation compiled once at import instead of ~80 separate matches
# per call.
_CONTACT_RE = re.compile('|'.join(f'(?:{fragment})' for fragment in _CONTACT_PATTERNS))


def is_valid_contact(contact):
    """Return True when *contact* matches one of the known phone layouts.

    The value comes from model output, so it is not guaranteed to be a
    string; non-string values are reported as invalid rather than raising.
    Surrounding whitespace is ignored and the whole remaining string must
    match a layout.

    Args:
        contact: Candidate phone number.

    Returns:
        ``True`` if the number matches any layout in ``_CONTACT_PATTERNS``,
        ``False`` otherwise.
    """
    if not isinstance(contact, str):
        return False
    # Compare the match itself with None; comparing the result of any()
    # with None is always True because any() returns a bool.
    return _CONTACT_RE.fullmatch(contact.strip()) is not None
275
+
276
+
277
def validate_contact_email(personal_data):
    """Check the contact number and email stored in *personal_data*.

    Reads the ``'contact'`` and ``'email'`` entries (defaulting to
    ``'Not found'``). A value equal to ``'Not found'`` is treated as
    invalid without being passed to a validator; any other value is checked
    with ``is_valid_contact`` / ``is_valid_email``.

    Args:
        personal_data: Mapping holding the extracted personal details.

    Returns:
        A 4-tuple ``(contact_ok, contact_status, email_ok, email_status)``
        where the status strings are ``'Valid contact'``/``'Invalid contact'``
        and ``'Valid email'``/``'Invalid email'``.
    """
    missing = 'Not found'
    contact_value = personal_data.get('contact', missing)
    email_value = personal_data.get('email', missing)

    if contact_value == missing:
        contact_ok = False
    else:
        contact_ok = is_valid_contact(contact_value)

    if email_value == missing:
        email_ok = False
    else:
        email_ok = is_valid_email(email_value)

    contact_status = 'Valid contact' if contact_ok else 'Invalid contact'
    email_status = 'Valid email' if email_ok else 'Invalid email'

    return contact_ok, contact_status, email_ok, email_status
288
+
289
+
290
def _first_present(section, *keys):
    """Return the value of the first of *keys* found in *section*, else 'Not found'."""
    for key in keys:
        if key in section:
            return section[key]
    return 'Not found'


def process_resume_data(file_path):
    """Parse the resume at *file_path* into a structured dictionary.

    Text and hyperlinks are extracted with ``extract_text_based_on_format``.
    The text is then sent to the Mistral model twice (personal and
    professional details). If either call yields data, the combined result
    is returned; if both are empty or any exception occurs, parsing falls
    back to ``Parser_from_model``.

    Args:
        file_path: Path of the resume file to process.

    Returns:
        ``{"error": "Text extraction failed"}`` when no text could be
        extracted; otherwise a dict with ``"personal"`` and
        ``"professional"`` sections built from the model output, or
        whatever ``Parser_from_model(file_path)`` returns on fallback.
    """
    resume_text, hyperlinks = extract_text_based_on_format(file_path)

    if not resume_text:
        return {"error": "Text extraction failed"}

    # Report success only once the extracted text is known to be non-empty.
    print("Resume converted to text successfully.")

    # Extract LinkedIn and GitHub links
    linkedin_links, github_links = extract_links(hyperlinks)

    # Attempt to use Mistral model for parsing
    try:
        per_data = Model_PersonalDetails_Output(resume_text, client)
        pro_data = Model_ProfessionalDetails_Output(resume_text, client)

        if not per_data:
            logging.warning("Mistral personal data extraction failed.")
            per_data = {}

        if not pro_data:
            logging.warning("Mistral professional data extraction failed.")
            pro_data = {}

        # Nothing usable from either call: raise so the handlers below log
        # the failure and the SpaCy fallback runs.
        if not per_data and not pro_data:
            raise ValueError("Mistral returned no output")

        # A section the model returned as null is treated like a missing one.
        personal = per_data.get('personal') or {}
        professional = pro_data.get('professional') or {}

        # Combine both personal and professional details into a structured output.
        # Where two keys are given, the first is the name the prompt defines
        # (plural / "roles"); the second is the name this function used to
        # read, kept as a fallback in case the model emits it.
        result = {
            "personal": {
                "name": _first_present(personal, 'name'),
                "contact": _first_present(personal, 'contact_number'),
                "email": _first_present(personal, 'email'),
                "location": _first_present(personal, 'Address'),
                "linkedin": linkedin_links,
                "github": github_links,
                "other_links": hyperlinks  # Store remaining links if needed
            },
            "professional": {
                "technical_skills": _first_present(professional, 'technical_skills'),
                "non_technical_skills": _first_present(professional, 'non_technical_skills'),
                "tools": _first_present(professional, 'tools'),
                "experience": [
                    {
                        "company": _first_present(professional, 'companies_worked_at'),
                        "projects": _first_present(professional, 'projects'),
                        "role": _first_present(professional, 'roles', 'worked_as'),
                        "years": _first_present(professional, 'experience'),
                        "project_experience": _first_present(professional, 'projects_experience')
                    }
                ],
                "education": [
                    {
                        "qualification": _first_present(professional, 'qualifications', 'qualification'),
                        "university": _first_present(professional, 'university'),
                        "course": _first_present(professional, 'courses', 'course'),
                        "certificate": _first_present(professional, 'certifications', 'certification')
                    }
                ]
            }
        }

        # Validate contact and email
        valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
        result['personal']['valid_contact'] = valid_contact
        result['personal']['invalid_contact'] = invalid_contact
        result['personal']['valid_email'] = valid_email
        result['personal']['invalid_email'] = invalid_email

        logging.info("Successfully extracted data using Mistral.")
        print("---------Mistral-------")
        return result

    # Handle HuggingFace API or Mistral model errors
    except BadRequestError as e:
        logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
        print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
    except Exception as e:
        # Deliberately broad: any failure on the Mistral path must still
        # produce a result through the SpaCy parser.
        logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
        print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")

    # Fallback to SpaCy if Mistral fails
    logging.warning("Mistral failed, switching to SpaCy.")
    print("---------SpaCy-------")
    return Parser_from_model(file_path)