from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import fitz # PyMuPDF for PDF text extraction
import spacy
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import os
app = Flask(__name__)
CORS(app)
# ===== Load Custom NER Model =====
try:
    nlp = spacy.load("custom_ner_model")  # Load your custom-trained NER model
    print("Custom NER model loaded successfully.")
except Exception as e:
    print(f"Error loading custom NER model: {e}")
    exit()
# ===== Load T5 Model for Job Title Prediction =====
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# Load the fine-tuned weights and move the model to the available device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load("best.pth", map_location=device))
model.eval()
model.to(device)
print("T5 model for job title prediction loaded successfully.")
# ===== Helper Functions =====
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text
# Extract entities using Custom NER
def extract_entities(text):
    text = text.replace("\\n", "\n")
    doc = nlp(text)  # Process text with custom NER
    extracted_data = {}
    for ent in doc.ents:
        # Use only relevant labels
        if ent.label_ in ["SKILL", "ROLE", "LOCATION", "AREA", "INDUSTRY"]:
            if ent.label_ not in extracted_data:
                extracted_data[ent.label_] = []
            if ent.text not in extracted_data[ent.label_]:
                extracted_data[ent.label_].append(ent.text)
    # Format results as comma-separated strings
    for key in extracted_data:
        extracted_data[key] = ", ".join(extracted_data[key])
    return extracted_data
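
# Illustrative example of the extract_entities() return value (hypothetical resume,
# shown only to document the data shape consumed below):
#   {"SKILL": "Python, SQL, Tableau", "ROLE": "Data Analyst", "LOCATION": "Pune"}
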
# Predict job title using T5 model
def predict_job_title(skills, area, roles, location, industry):
    input_text = f"Skills: {skills}; \nRole: {roles}; \nLocation: {location}; \nArea: {area}; \nIndustry: {industry}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=4, early_stopping=True)
    predicted_job_title = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_job_title
# ===== Flask Routes =====
@app.route('/')
def home():
    return render_template('index.html')  # Default home page
@app.route('/predict', methods=['POST'])
def predict():
    if 'resume' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400
    file = request.files['resume']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Please upload a PDF file'}), 400
    try:
        # Step 1: Extract text from PDF
        resume_text = extract_text_from_pdf(file)
        # Step 2: Extract entities using Custom NER
        extracted_data = extract_entities(resume_text)
        # Step 3: Prepare input for T5 prediction
        skills = extracted_data.get("SKILL", "")
        area = extracted_data.get("AREA", "")
        roles = extracted_data.get("ROLE", "")
        location = extracted_data.get("LOCATION", "")
        industry = extracted_data.get("INDUSTRY", "")
        # Step 4: Predict job title
        predicted_title = predict_job_title(skills, area, roles, location, industry)
        # Step 5: Return response
        response = {
            'success': True,
            'predicted_title': predicted_title.split(";")[0],
            'extracted_skills': extracted_data.get("SKILL", ""),
            'roles': extracted_data.get("ROLE", ""),
            'locations': extracted_data.get("LOCATION", ""),
            'area': extracted_data.get("AREA", ""),
            'industry': extracted_data.get("INDUSTRY", "")
        }
        return jsonify(response)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    from waitress import serve
    print("Starting Flask app...")
    serve(app, host="0.0.0.0", port=7860)
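
# Example client call (illustrative only; assumes the server is running locally on
# port 7860 and that a file named resume.pdf exists in the working directory):
#
#   curl -X POST -F "resume=@resume.pdf" http://localhost:7860/predict
#
# The JSON response contains the predicted title plus the extracted
# SKILL / ROLE / LOCATION / AREA / INDUSTRY strings.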