WebashalarForML committed on
Commit
71c6bbf
·
verified ·
1 Parent(s): 99e545b

Upload 5 files

Browse files
Files changed (5) hide show
  1. utils/beckup.py +298 -0
  2. utils/error.py +45 -0
  3. utils/fileTotext.py +127 -0
  4. utils/mistral.py +377 -0
  5. utils/spacy.py +246 -0
utils/beckup.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mistral.py
2
+
3
+ import os
4
+ import json
5
+ import logging
6
+ from huggingface_hub import InferenceClient
7
+ from huggingface_hub.utils._errors import BadRequestError
8
+ from dotenv import load_dotenv
9
+ from utils.fileTotext import extract_text_based_on_format
10
+ import re
11
+ from utils.spacy import Parser_from_model
12
+
13
# Load environment variables from .env file
load_dotenv()

# Authenticate with Hugging Face
HFT = os.getenv('HF_TOKEN')
if not HFT:
    # Fail fast at import time: every model call below requires the token.
    raise ValueError("Hugging Face token is not set in environment variables.")
# Shared inference client used by all Mistral calls in this module.
client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
21
+
22
+ # Function to clean model output
23
+ def Data_Cleaner(text):
24
+ pattern = r".*?format:"
25
+ result = re.split(pattern, text, maxsplit=1)
26
+ if len(result) > 1:
27
+ text_after_format = result[1].strip().strip('`').strip('json')
28
+ else:
29
+ text_after_format = text.strip().strip('`').strip('json')
30
+
31
+ return text_after_format
32
+
33
+ # Function to call Mistral and process output
34
def Model_ProfessionalDetails_Output(resume, client):
    """Extract professional details from *resume* text via the Mistral model.

    Streams a chat completion asking for a fixed JSON schema, cleans the raw
    output with Data_Cleaner and parses it.

    Returns:
        dict: parsed {"professional": {...}} data, or {} when the model
        output is not valid JSON.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    # NOTE: the schema below is prompt text.  The original template was
    # missing the comma after "soft_skills" and had a stray '[' in
    # "projects_experience", nudging the model toward invalid JSON.
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
        "professional": {{
        "skills": "Extract and list all technical skills, non-technical skills, programming languages, frameworks, domains, and technologies based on the resume.",
        "soft_skills": "Extract non-technical skills, Communication skills, and soft skills based on the resume.",
        "projects": "Include only the project names, titles, or headers mentioned in the resume.",
        "projects_experience": "Include overall project Experiences and about project in short mentioned in the resume.",
        "experience": "Include the total experience in months or years as mentioned in the resume.",
        "companies_worked_at": "Include the names of all companies worked at according to the resume.",
        "certification": "Include any certifications obtained based on the resume.",
        "worked_as": "Include the names of roles worked as according to the resume.",
        "qualification": "Extract and list the qualifications based on the resume, (qualifications likes B.Tech). If none are found, return 'No education listed'.",
        "course": "Extract the name of the Learning Course completed based on the resume. If not found, return 'No Course listed'.",
        "university": "Extract the name of the university or College or Institute attended based on the resume. If not found, return 'No university listed'.",
        "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
        }}
        }}
        Json Output:
        '''
    }

    response = ""
    for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=3000, stream=True, temperature=0.35):
        # Streamed deltas can carry None content (e.g. the final chunk).
        response += message.choices[0].delta.content or ""

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        logging.error(f"JSON Decode Error: {e}")
        return {}

    return parsed_response
76
+
77
def Model_PersonalDetails_Output(resume, client):
    """Extract personal details (name, contact, email, address, link) from
    *resume* text via the Mistral model.

    Returns:
        dict: parsed {"personal": {...}} data, or {} when the model output
        is not valid JSON.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
        "personal": {{
        "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
        "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
        "email": "Extract the email address from the resume. If not found, return 'No email listed'.",
        "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
        "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
        }}
        }}
        output:
        '''
    }

    response = ""
    for message in client.chat_completion(
        messages=[system_role, user_prompt],
        max_tokens=3000,
        stream=True,
        temperature=0.35,
    ):
        # Streamed deltas can carry None content (e.g. the final chunk).
        response += message.choices[0].delta.content or ""

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        # Log instead of print, for consistency with the professional parser.
        logging.error(f"JSON Decode Error: {e}; raw response: {response}")
        return {}

    return parsed_response
122
+
123
+
124
+ # # Fallback to SpaCy if Mistral fails
125
+
126
def process_resume_data(file_path):
    """Parse a resume file into structured personal/professional data.

    Tries the Mistral model first; on any failure (API error, empty/invalid
    model output) falls back to the SpaCy-based parser.
    """
    resume_text, hyperlinks = extract_text_based_on_format(file_path)
    print("Resume converted to text successfully.")

    if not resume_text:
        return {"error": "Text extraction failed"}

    try:
        per_data = Model_PersonalDetails_Output(resume_text, client)
        pro_data = Model_ProfessionalDetails_Output(resume_text, client)

        if not per_data:
            logging.warning("Mistral personal data extraction failed.")
            per_data = {}
        if not pro_data:
            logging.warning("Mistral professional data extraction failed.")
            pro_data = {}

        personal = per_data.get('personal', {})
        professional = pro_data.get('professional', {})

        def field(section, key):
            # All missing fields collapse to the same placeholder.
            return section.get(key, 'Not found')

        result = {
            "personal": {
                "name": field(personal, 'name'),
                "contact": field(personal, 'contact_number'),
                "email": field(personal, 'email'),
                "location": field(personal, 'Address'),
                "link": hyperlinks,
            },
            "professional": {
                "skills": field(professional, 'skills'),
                "soft_skills": field(professional, 'soft_skills'),
                "experience": [
                    {
                        "company": field(professional, 'companies_worked_at'),
                        "projects": field(professional, 'projects'),
                        "role": field(professional, 'worked_as'),
                        "years": field(professional, 'experience'),
                        "project_experience": field(professional, 'projects_experience'),
                    }
                ],
                "education": [
                    {
                        "qualification": field(professional, 'qualification'),
                        "university": field(professional, 'university'),
                        "course": field(professional, 'course'),
                        "certificate": field(professional, 'certification'),
                    }
                ],
            },
        }

        if per_data or pro_data:
            print("------Mistral-----")
            return result
        raise ValueError("Mistral returned no output")

    except BadRequestError as e:
        logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
        print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
    except Exception as e:
        logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
        print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")

    logging.warning("Mistral failed, switching to SpaCy.")
    print("------Spacy-----")
    return Parser_from_model(file_path)
203
+
204
+
205
+ # /////////////////////////////////////////////
206
+ # ////////////////Spacy.py/////////////////////
207
+ # /////////////////////////////////////////////
208
+
209
+
210
+ import spacy
211
+ from spacy.training import Example
212
+ from spacy.util import minibatch, compounding
213
+ from pathlib import Path
214
+ from spacy.tokens import DocBin
215
+ import random
216
+
217
+ # Load the training data from the .spacy file
218
def load_data_from_spacy_file(file_path):
    """Load training Docs from a serialized DocBin (.spacy) file.

    Returns a list of spaCy Doc objects, or [] if the file cannot be read.
    """
    # A blank English pipeline supplies the vocab needed to rehydrate docs.
    nlp = spacy.blank("en")
    try:
        docs = DocBin().from_disk(file_path).get_docs(nlp.vocab)
        return list(docs)
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []
230
+
231
+
232
+ # Train model function
233
def train_model(epochs, model_path):
    """Train a blank-English NER model on ./data/Spacy_data.spacy.

    Saves the best (lowest-loss) model to *model_path* during training and
    the final model afterwards.

    Args:
        epochs: number of passes over the training data.
        model_path: directory to which the trained pipeline is serialized.

    Returns:
        list: per-epoch NER loss values.
    """
    nlp = spacy.blank("en")

    # Bug fix: the original only bound `ner` inside the `if`, raising
    # NameError at `ner.add_label` whenever the pipe already existed.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    nlp.add_pipe("sentencizer")

    # All entity labels the parser is expected to recognize.
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
    ]
    for label in labels:
        ner.add_label(label)

    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    optimizer = nlp.begin_training()

    epoch_losses = []
    best_loss = float('inf')

    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # reshuffle each epoch for better training

        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*[
                (doc.text, {"entities": [(ent.start_char, ent.end_char, ent.label_)
                                         for ent in doc.ents]})
                for doc in batch
            ])
            examples = [Example.from_dict(nlp.make_doc(text), annotation)
                        for text, annotation in zip(texts, annotations)]
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        current_loss = losses.get("ner", float('inf'))
        epoch_losses.append(current_loss)
        print(f"Losses at epoch {epoch + 1}: {losses}")

        # Stop training if the loss reaches zero.
        if current_loss == 0:
            break

        # Keep a copy of the best model seen so far.
        if current_loss < best_loss:
            best_loss = current_loss
            nlp.to_disk(model_path)

    # Save the final model.
    nlp.to_disk(model_path)
    return epoch_losses
utils/error.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from flask import render_template, request
3
+
4
# Error logging: one logger writing both to app_error.log and the terminal.
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

_formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
# File handler (app_error.log) and console handler share level and format.
for _handler in (logging.FileHandler('app_error.log'), logging.StreamHandler()):
    _handler.setLevel(logging.ERROR)
    _handler.setFormatter(_formatter)
    logger.addHandler(_handler)
21
+
22
+ # 404 Error Handler
23
def page_not_found(e):
    """Log a 404 for the requested URL and render the not-found page."""
    message = f"404 Error: {request.url}"
    logger.error(message)
    body = render_template('404.html')
    return body, 404
26
+
27
+ # 500 Error Handler
28
def internal_server_error(e):
    """Log an unhandled server error (with URL) and render the 500 page."""
    message = f"500 Error: {e}, URL: {request.url}"
    logger.error(message)
    body = render_template('500.html')
    return body, 500
31
+
32
+ # File Not Found Error Handler
33
def handle_file_not_found():
    """Render the generic error page for a missing file (HTTP 404)."""
    logger.error("File not found.")
    body = render_template('error.html', message="The file you are looking for does not exist.")
    return body, 404
36
+
37
+ # Invalid File Type Error Handler
38
def handle_invalid_file_type():
    """Render the generic error page for an unsupported upload type (HTTP 400)."""
    logger.error("Invalid file type.")
    body = render_template('error.html', message="Invalid file type. Allowed types: pdf, docx, rsf, odt, png, jpg, jpeg.")
    return body, 400
41
+
42
+ # File Processing Error Handler
43
def handle_file_processing_error():
    """Render the generic error page when file processing fails (HTTP 500)."""
    logger.error("File processing failed.")
    body = render_template('error.html', message="Failed to process the file.")
    return body, 500
utils/fileTotext.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import fitz
4
+ import logging
5
+ from PIL import Image
6
+ from pdf2image import convert_from_path
7
+ import platform
8
+ import pytesseract
9
+ import docx
10
+ from odf.opendocument import load as load_odt
11
+ from odf.text import P
12
+
13
# Point pytesseract at the platform-appropriate Tesseract binary
# (tesseract.exe on Windows; system binary on Hugging Face Spaces / Linux).
pytesseract.pytesseract.tesseract_cmd = (
    r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    if platform.system() == "Windows"
    else r'/usr/bin/tesseract'
)
19
+
20
+ # # Set up logging
21
+ # logging.basicConfig(
22
+ # level=logging.DEBUG,
23
+ # format='%(asctime)s - %(levelname)s - %(message)s',
24
+ # handlers=[logging.StreamHandler()]
25
+ # )
26
+
27
+ # # Path to Tesseract executable
28
+ # tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
29
+ # pytesseract.pytesseract.tesseract_cmd = tesseract_path
30
+
31
+ # Function to extract text from PDF using PyMuPDF
32
def extract_text_from_pdf(file_path):
    """Extract text and hyperlinks from a PDF.

    Uses PyMuPDF for text; any page with no extractable text is rendered and
    OCR'd via pdf2image + Tesseract.

    Returns:
        tuple: (text, unique_hyperlinks); ("", []) on failure (error logged).
    """
    text = ""
    hyperlinks = []
    try:
        doc = fitz.open(file_path)
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")

                if page_text.strip():
                    text += page_text
                else:
                    # OCR only THIS page.  The original converted the whole
                    # document for every empty page, duplicating OCR output
                    # and doing O(pages^2) rendering work.
                    images = convert_from_path(
                        file_path, dpi=300,
                        first_page=page_num + 1, last_page=page_num + 1,
                    )
                    for image in images:
                        text += pytesseract.image_to_string(image)

                for link in page.get_links():
                    if link.get("uri"):
                        hyperlinks.append(link["uri"])
        finally:
            doc.close()  # release the file handle even on per-page errors
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    return text, list(set(hyperlinks))
57
+
58
+ # Function to extract text from DOCX
59
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, newline-joined ("" on failure)."""
    try:
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""
67
+
68
+ # Function to extract text from RSF (assuming text-based format)
69
def extract_text_from_rsf(file_path):
    """Read an RSF resume file as UTF-8 text.

    Returns the file contents, or "" if the file cannot be read
    (the error is logged).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            content = handle.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        content = ""
    return content
76
+
77
+ # Function to extract text from ODT
78
def extract_text_from_odt(file_path):
    """Return the text of an ODT file's paragraph elements ("" on failure)."""
    try:
        document = load_odt(file_path)
        paragraphs = document.getElementsByType(P)
        # Only paragraphs with a text node contribute a line.
        return "\n".join(p.firstChild.data for p in paragraphs if p.firstChild)
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
87
+
88
+ # Function to extract text from images using Tesseract
89
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the text ("" on failure)."""
    try:
        image = Image.open(file_path)
        return pytesseract.image_to_string(image)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
98
+
99
+ # Function to clean and preprocess the extracted text
100
def preprocess_text(text):
    """Normalize whitespace and pad phone-number-like tokens with spaces.

    Collapses every run of whitespace (including newlines) to a single
    space, then surrounds XXX-XXX-XXXX style numbers with spaces so they
    survive downstream tokenization as separate tokens.
    """
    text = re.sub(r'\s+', ' ', text)
    # The original also ran re.sub(r'\n', ' ', text) here, but \s+ above
    # already consumed every newline, so that pass was a no-op (removed).
    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
    return text.strip()
105
+
106
+ # Function to automatically detect file format and extract text
107
def extract_text_based_on_format(file_path):
    """Dispatch text extraction on the file's extension.

    Returns:
        tuple: (text, hyperlinks); hyperlinks are only collected for PDFs,
        every other format returns an empty list.

    Raises:
        ValueError: for an unsupported extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        return extract_text_from_pdf(file_path)
    if extension == '.docx':
        return extract_text_from_docx(file_path), []
    if extension == '.rsf':
        return extract_text_from_rsf(file_path), []
    if extension == '.odt':
        return extract_text_from_odt(file_path), []
    if extension in ('.png', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path), []
    raise ValueError("Unsupported file format")
utils/mistral.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mistral.py
2
+ import os
3
+ import json
4
+ import logging
5
+ from huggingface_hub import InferenceClient
6
+ from huggingface_hub.utils._errors import BadRequestError
7
+ from dotenv import load_dotenv
8
+ from utils.fileTotext import extract_text_based_on_format
9
+ import re
10
+ from utils.spacy import Parser_from_model
11
+
12
# Load environment variables from .env file
load_dotenv()

# Authenticate with Hugging Face
HFT = os.getenv('HF_TOKEN')
if not HFT:
    # Fail fast at import time: every model call below requires the token.
    raise ValueError("Hugging Face token is not set in environment variables.")
# Shared inference client used by all Mistral calls in this module.
client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
20
+
21
+ # Function to clean model output
22
def Data_Cleaner(text):
    """Extract the JSON payload from raw model output.

    Drops any leading prose up to a "format:" marker, then removes a
    surrounding Markdown code fence (``` or ```json) and whitespace.

    The original used str.strip('json'), which strips the CHARACTERS
    j/s/o/n from both ends (corrupting payloads that start or end with
    those letters) rather than removing the fence language tag.
    """
    parts = re.split(r".*?format:", text, maxsplit=1)
    payload = parts[1] if len(parts) > 1 else text
    payload = payload.strip()
    # Remove a fenced code block wrapper, e.g. ```json ... ```
    payload = re.sub(r"^```(?:json)?\s*", "", payload)
    payload = re.sub(r"\s*```$", "", payload)
    return payload.strip()
31
+
32
+ # Function to call Mistral and process output
33
def Model_ProfessionalDetails_Output(resume, client):
    """Extract professional details from *resume* text via the Mistral model.

    Streams a chat completion asking for a fixed JSON schema, cleans the raw
    output with Data_Cleaner and parses it.

    Returns:
        dict: parsed {"professional": {...}} data, or {} when the model
        output is not valid JSON.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
        "professional": {{
        "technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",
        "non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",
        "tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",
        "projects": "Extract the names or titles of all projects mentioned in the resume.",
        "projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",
        "experience": "Calculate total professional work experience in years and months based on the resume.",
        "companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",
        "certifications": "Extract and list all certifications obtained as stated in the resume.",
        "roles": "Include the names of all job titles or roles held as indicated in the resume.",
        "qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",
        "courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",
        "university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",
        "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
        }}
        }}
        Json Output:
        '''
    }

    response = ""
    for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=3000, stream=True, temperature=0.35):
        # Streamed deltas can carry None content (e.g. the final chunk).
        response += message.choices[0].delta.content or ""

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        logging.error(f"JSON Decode Error: {e}")
        return {}

    return parsed_response
76
+
77
def Model_PersonalDetails_Output(resume, client):
    """Extract personal details (name, contact, email, address, link) from
    *resume* text via the Mistral model.

    Returns:
        dict: parsed {"personal": {...}} data, or {} when the model output
        is not valid JSON.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
        "personal": {{
        "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
        "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
        "email": "Extract the email address from the resume. If not found, return 'No email listed'.",
        "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
        "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
        }}
        }}
        output:
        '''
    }

    response = ""
    for message in client.chat_completion(
        messages=[system_role, user_prompt],
        max_tokens=3000,
        stream=True,
        temperature=0.35,
    ):
        # Streamed deltas can carry None content (e.g. the final chunk).
        response += message.choices[0].delta.content or ""

    try:
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except json.JSONDecodeError as e:
        # Log instead of print, for consistency with the professional parser.
        logging.error(f"JSON Decode Error: {e}; raw response: {response}")
        return {}

    return parsed_response
122
+
123
+
124
+ # # Fallback to SpaCy if Mistral fails
125
+
126
# Add regex pattern for LinkedIn and GitHub links.
# Used by extract_links() below to bucket resume hyperlinks.
linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
# NOTE(review): the two patterns below are not referenced anywhere in this
# module — is_valid_email()/is_valid_contact() define their own patterns.
# They appear to be unused duplicates; confirm before removing.
email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
131
+
132
def extract_links(hyperlinks):
    """Bucket resume hyperlinks into (linkedin_links, github_links).

    A URL matching the LinkedIn pattern is never also counted as GitHub;
    links matching neither pattern are dropped.
    """
    linkedin_links = [url for url in hyperlinks if re.match(linkedin_pattern, url)]
    github_links = [
        url for url in hyperlinks
        if not re.match(linkedin_pattern, url) and re.match(github_pattern, url)
    ]
    return linkedin_links, github_links
144
+
145
def is_valid_email(email):
    """Return True when *email* looks like a plausible address."""
    pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    return bool(re.match(pattern, email))
148
+
149
def is_valid_contact(contact):
    """Validate a phone number in common local/international formats.

    Accepts an optional leading '+<country code>' and digit groups separated
    by spaces, dots, dashes, slashes or parentheses, with 7-15 digits total
    (covers local numbers up to full E.164 international length).

    Bug fixes over the original:
      * it returned `any(...) is not None` — `any()` yields a bool, which is
        never None, so the function ALWAYS returned True;
      * one pattern in its list lacked a trailing comma, silently
        concatenating two adjacent pattern strings into one broken regex;
      * many patterns required a trailing space before the closing quote.
    The ~140 per-country patterns are replaced by one structural check that
    accepts the same legitimate formats.
    """
    candidate = contact.strip()
    # Overall shape: optional +CC, then digits with common separators only.
    if not re.fullmatch(r'\+?[\d\s\-\.\/()]{7,20}', candidate):
        return False
    digits = re.sub(r'\D', '', candidate)
    return 7 <= len(digits) <= 15
274
+
275
+
276
def validate_contact_email(personal_data):
    """Validate the contact number and email fields of a personal-data dict.

    Missing values (the sentinel 'Not found') are treated as invalid.

    Args:
        personal_data: dict that may contain 'contact' and 'email' keys.

    Returns:
        A 4-tuple ``(valid_contact, contact_label, valid_email, email_label)``
        where the booleans report validity and the labels are
        'Valid contact'/'Invalid contact' and 'Valid email'/'Invalid email'.
    """
    contact_value = personal_data.get('contact', 'Not found')
    email_value = personal_data.get('email', 'Not found')

    # The sentinel short-circuits validation: 'Not found' is never valid.
    contact_ok = contact_value != 'Not found' and is_valid_contact(contact_value)
    email_ok = email_value != 'Not found' and is_valid_email(email_value)

    contact_label = 'Valid contact' if contact_ok else 'Invalid contact'
    email_label = 'Valid email' if email_ok else 'Invalid email'

    return contact_ok, contact_label, email_ok, email_label
287
+
288
+
289
def process_resume_data(file_path):
    """Parse a resume file into a structured personal/professional dict.

    Pipeline: extract raw text and hyperlinks from the file, query the
    Mistral model (via the module-level ``client``) for personal and
    professional details, validate contact/email, and fall back to the
    local SpaCy parser if the model path fails for any reason.

    Args:
        file_path: path to the resume file; format handling is delegated
            to ``extract_text_based_on_format``.

    Returns:
        dict with 'personal' and 'professional' sections on success,
        ``{"error": "Text extraction failed"}`` when no text could be
        extracted, or the SpaCy parser's result on fallback.
    """
    resume_text, hyperlinks = extract_text_based_on_format(file_path)
    print("Resume converted to text successfully.")

    if not resume_text:
        return {"error": "Text extraction failed"}

    # Split hyperlinks into LinkedIn and GitHub buckets.
    # NOTE(review): extract_links is presumably defined elsewhere in this
    # module — confirm it is in scope.
    linkedin_links, github_links = extract_links(hyperlinks)

    # Attempt to use Mistral model for parsing
    try:
        # Extract personal details using Mistral
        per_data = Model_PersonalDetails_Output(resume_text, client)

        # Extract professional details using Mistral
        pro_data = Model_ProfessionalDetails_Output(resume_text, client)

        # Check if per_data and pro_data have been populated correctly;
        # normalize falsy results to {} so the .get() chains below are safe.
        if not per_data:
            logging.warning("Mistral personal data extraction failed.")
            per_data = {}

        if not pro_data:
            logging.warning("Mistral professional data extraction failed.")
            pro_data = {}

        # Combine both personal and professional details into a structured output.
        # Every field defaults to 'Not found' when absent from the model output.
        result = {
            "personal": {
                "name": per_data.get('personal', {}).get('name', 'Not found'),
                "contact": per_data.get('personal', {}).get('contact_number', 'Not found'),
                "email": per_data.get('personal', {}).get('email', 'Not found'),
                "location": per_data.get('personal', {}).get('Address', 'Not found'),
                "linkedin": linkedin_links,
                "github": github_links,
                "other_links": hyperlinks  # Store remaining links if needed
            },
            "professional": {
                "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
                "non_technical_skills": pro_data.get('professional', {}).get('non_technical_skills', 'Not found'),
                "tools": pro_data.get('professional', {}).get('tools', 'Not found'),
                "experience": [
                    {
                        "company": pro_data.get('professional', {}).get('companies_worked_at', 'Not found'),
                        "projects": pro_data.get('professional', {}).get('projects', 'Not found'),
                        "role": pro_data.get('professional', {}).get('worked_as', 'Not found'),
                        "years": pro_data.get('professional', {}).get('experience', 'Not found'),
                        "project_experience": pro_data.get('professional', {}).get('projects_experience', 'Not found')
                    }
                ],
                "education": [
                    {
                        "qualification": pro_data.get('professional', {}).get('qualification', 'Not found'),
                        "university": pro_data.get('professional', {}).get('university', 'Not found'),
                        "course": pro_data.get('professional', {}).get('course', 'Not found'),
                        "certificate": pro_data.get('professional', {}).get('certification', 'Not found')
                    }
                ]
            }
        }

        # Validate contact and email, and record both the booleans and labels.
        valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
        result['personal']['valid_contact'] = valid_contact
        result['personal']['invalid_contact'] = invalid_contact
        result['personal']['valid_email'] = valid_email
        result['personal']['invalid_email'] = invalid_email

        # If Mistral produced any output at all, return it; if both sections
        # were empty, raise so the except handler routes us to the fallback.
        if per_data or pro_data:
            logging.info("Successfully extracted data using Mistral.")
            print("---------Mistral-------")
            return result
        else:
            raise ValueError("Mistral returned no output")

    # Handle HuggingFace API or Mistral model errors
    except BadRequestError as e:
        logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
        print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
    except Exception as e:
        logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
        print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")

    # Fallback to SpaCy if Mistral fails
    logging.warning("Mistral failed, switching to SpaCy.")
    print("---------SpaCy-------")
    return Parser_from_model(file_path)
utils/spacy.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import logging
3
+ import json
4
+ from utils.fileTotext import extract_text_based_on_format
5
+ import re
6
+
7
def is_valid_email(email):
    """Return True when *email* is syntactically a plausible address.

    Checks local-part @ domain with at least one dot in the domain;
    this is a lightweight format check, not a deliverability check.
    """
    pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    return bool(re.match(pattern, email))
10
+
11
def is_valid_contact(contact):
    """Return True if *contact* matches a known phone-number format.

    Covers Indian formats plus common national/international formats for
    US/CA, UK, AU, DE, CN, JP, BR, FR, RU, ZA, MX, NG, AE, AR and SA.

    Fixes over the previous version:
      * ``any(...) is not None`` always evaluated to True (``any`` returns a
        bool, never None) — every input was accepted. Now returns the bool.
      * A missing comma between two patterns caused implicit string
        concatenation, silently corrupting both patterns.
      * Patterns carried literal trailing spaces and were unanchored under
        ``re.match``; ``re.fullmatch`` on trimmed patterns is used instead so
        the whole string must be a phone number.
    """
    patterns = [
        # India
        r'\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
        r'\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}',
        r'\+91[\s\.\-\/]?\d{10}',
        r'\+91\s\d{5}-\d{5}',
        r'\+91\s\d{4}-\d{6}',
        r'0\d{2}-\d{7}',
        r'\d{5}[\s\-\.\/]?\d{5}',
        r'\d{10}',
        # USA / Canada
        r'\+1\s\(\d{3}\)\s\d{3}-\d{4}',
        r'\(\d{3}\)\s\d{3}-\d{4}',
        r'\(\d{3}\)\s\d{3}\s\d{4}',
        r'\(\d{3}\)\s\d{3}\s\d{3}',
        r'\+1\d{10}',
        r'\+1\s\d{3}\s\d{3}\s\d{4}',
        # UK
        r'\+44\s\d{4}\s\d{6}',
        r'\+44\s\d{3}\s\d{3}\s\d{4}',
        r'0\d{4}\s\d{6}',
        r'0\d{3}\s\d{3}\s\d{4}',
        r'\+44\d{10}',
        r'0\d{10}',
        # Australia
        r'\+61\s\d\s\d{4}\s\d{4}',
        r'0\d\s\d{4}\s\d{4}',
        r'\+61\d{9}',
        r'0\d{9}',
        # Germany
        r'\+49\s\d{4}\s\d{8}',
        r'\+49\s\d{3}\s\d{7}',
        r'0\d{3}\s\d{8}',
        r'\+49\d{12}',
        r'\+49\d{10}',
        r'0\d{11}',
        # China
        r'\+86\s\d{3}\s\d{4}\s\d{4}',
        r'0\d{3}\s\d{4}\s\d{4}',
        r'\+86\d{11}',
        # Japan
        r'\+81\s\d\s\d{4}\s\d{4}',
        r'\+81\s\d{2}\s\d{4}\s\d{4}',
        r'0\d\s\d{4}\s\d{4}',
        r'\+81\d{10}',
        r'\+81\d{9}',
        # Brazil
        r'\+55\s\d{2}\s\d{5}-\d{4}',
        r'\+55\s\d{2}\s\d{4}-\d{4}',
        r'0\d{2}\s\d{4}\s\d{4}',
        r'\+55\d{11}',
        r'\+55\d{10}',
        # France
        r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
        r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
        r'\+33\d{9}',
        # Russia
        r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2}',
        r'8\s\d{3}\s\d{3}-\d{2}-\d{2}',
        r'\+7\d{10}',
        r'8\d{10}',
        # South Africa
        r'\+27\s\d{2}\s\d{3}\s\d{4}',
        r'0\d{2}\s\d{3}\s\d{4}',
        r'\+27\d{9}',
        # Mexico
        r'\+52\s\d{3}\s\d{3}\s\d{4}',
        r'\+52\s\d{2}\s\d{4}\s\d{4}',
        r'01\s\d{3}\s\d{4}',
        r'\+52\d{10}',
        r'01\d{7}',
        # Nigeria
        r'\+234\s\d{3}\s\d{3}\s\d{4}',
        r'0\d{3}\s\d{3}\s\d{4}',
        r'\+234\d{10}',
        # UAE
        r'\+971\s\d\s\d{3}\s\d{4}',
        r'0\d\s\d{3}\s\d{4}',
        r'\+971\d{8}',
        r'0\d{8}',
        # Argentina
        r'\+54\s9\s\d{3}\s\d{3}\s\d{4}',
        r'\+54\s\d\s\d{4}\s\d{4}',
        r'0\d{3}\s\d{4}',
        r'\+54\s9\s\d{10}',
        r'\+54\d{10}',
        r'\+54\d{9}',
        r'0\d{7}',
        # Saudi Arabia
        r'\+966\s\d\s\d{3}\s\d{4}',
        r'\+966\d{8}',
    ]

    # fullmatch: the entire string must be a phone number, not just a prefix.
    return any(re.fullmatch(pattern, contact) for pattern in patterns)
136
+
137
# Function to parse resume with SpaCy
139
def _first_entity(entities, label):
    """Return the first extracted value for *label*, or '' when none found."""
    values = entities.get(label, [])
    return values[0] if values else ''


def Parser_from_model(file_path):
    """Parse a resume with the locally trained SpaCy NER model.

    Fallback parser used when the Mistral pipeline is unavailable. Loads the
    model from ``Spacy_Models/ner_model_05_3``, extracts text and hyperlinks
    from the file, runs NER, and maps recognized entities into the same
    personal/professional result shape the Mistral path produces.

    Args:
        file_path: path to the resume file.

    Returns:
        The structured result dict, or ``{"error": ...}`` when model loading,
        text extraction, or NER processing fails.
    """
    # Result template — mirrors the structure produced by the Mistral path.
    result = {
        "personal": {
            "name": '',
            "contact": '',
            "email": '',
            "location": '',
            "link": '',
            "invalid_email": '',
            "invalid_contact": ''
        },
        "professional": {
            "technical_skills": [],
            "non_technical_skills": [],
            "tools": [],
            "experience": [
                {
                    "company": '',
                    "projects": '',
                    "role": '',
                    "years": '',
                    "project_experience": []
                }
            ],
            "education": [
                {
                    "qualification": '',
                    "university": '',
                    "course": '',
                    "certificate": ''
                }
            ]
        }
    }

    try:
        nlp = spacy.load("Spacy_Models/ner_model_05_3")
        logging.debug("Model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading model: {e}")
        return {"error": "Model loading failed"}

    try:
        cleaned_text, hyperlinks = extract_text_based_on_format(file_path)
        if not cleaned_text.strip():
            logging.error("No text extracted from the file.")
            return {"error": "Text extraction failed"}
    except Exception as e:
        logging.error(f"Error extracting text from file: {e}")
        return {"error": "Text extraction failed"}

    try:
        doc = nlp(cleaned_text)
    except Exception as e:
        logging.error(f"Error processing text with SpaCy: {e}")
        return {"error": "Text processing failed"}

    # Collect recognized entities per label, preserving order, skipping duplicates.
    entities = {label: [] for label in ['PERSON', 'EMAIL', 'CONTACT', 'LOCATION', 'SKILL', 'SOFT_SKILL', 'COMPANY', 'PROJECTS', 'JOB_TITLE', 'YEARS_EXPERIENCE', 'EXPERIENCE', 'QUALIFICATION', 'UNIVERSITY', 'COURSE', 'CERTIFICATE']}
    for ent in doc.ents:
        if ent.label_ in entities and ent.text not in entities[ent.label_]:
            entities[ent.label_].append(ent.text)

    # Map entities to the result JSON.
    result['personal']['name'] = _first_entity(entities, 'PERSON')

    # Validate email
    extracted_email = _first_entity(entities, 'EMAIL')
    if is_valid_email(extracted_email):
        result['personal']['email'] = extracted_email
    else:
        logging.warning(f"Invalid email detected: {extracted_email}")
        result['personal']['email'] = "Invalid email"
        result['personal']['invalid_email'] = extracted_email

    # Validate contact
    extracted_contact = _first_entity(entities, 'CONTACT')
    if is_valid_contact(extracted_contact):
        result['personal']['contact'] = extracted_contact
    else:
        logging.warning(f"Invalid contact detected: {extracted_contact}")
        result['personal']['contact'] = "Invalid contact"
        result['personal']['invalid_contact'] = extracted_contact

    result['personal']['location'] = _first_entity(entities, 'LOCATION')
    result['personal']['link'] = hyperlinks  # Hyperlinks from extracted text

    result['professional']['technical_skills'] = entities.get('SKILL', [])
    result['professional']['non_technical_skills'] = entities.get('SOFT_SKILL', [])
    result['professional']['tools'] = []  # Add logic if tools extraction is needed

    experience = result['professional']['experience'][0]
    experience['company'] = _first_entity(entities, 'COMPANY')
    experience['projects'] = _first_entity(entities, 'PROJECTS')
    experience['role'] = _first_entity(entities, 'JOB_TITLE')
    experience['years'] = _first_entity(entities, 'YEARS_EXPERIENCE')
    experience['project_experience'] = entities.get('EXPERIENCE', [])

    education = result['professional']['education'][0]
    education['qualification'] = _first_entity(entities, 'QUALIFICATION')
    education['university'] = _first_entity(entities, 'UNIVERSITY')
    education['course'] = _first_entity(entities, 'COURSE')
    education['certificate'] = _first_entity(entities, 'CERTIFICATE')

    print(result)
    return result
246
+