Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files
- BACKUP.PY +335 -0
- main.py +350 -0
- readme +150 -0
- requirements.txt +71 -0
BACKUP.PY
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
|
| 4 |
+
from werkzeug.utils import secure_filename
|
| 5 |
+
from utils.file_to_text import extract_text_based_on_format, preprocess_text
|
| 6 |
+
from utils.anoter_to_json import process_uploaded_json
|
| 7 |
+
from utils.json_to_spacy import convert_json_to_spacy
|
| 8 |
+
from utils.model import train_model
|
| 9 |
+
import zipfile
|
| 10 |
+
|
| 11 |
+
app = Flask(__name__)
# Key used by Flask to sign the session cookie. Prefer the SECRET_KEY
# environment variable so the real key is not committed with the source; the
# literal is only a fallback that keeps existing setups working.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # resume_text.txt and converted data
app.config['MODELS_FOLDER'] = 'Models'   # trained models and their zip archives

# Allowed file extensions
# NOTE(review): 'rsf' may be a typo for 'rtf' — confirm against utils.file_to_text.
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}
|
| 22 |
+
|
| 23 |
+
# Function to check file extensions
|
| 24 |
+
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
|
| 26 |
+
|
| 27 |
+
@app.route('/')
def index():
    """Serve the landing page rendered from the 'upload.html' template."""
    landing_page = render_template('upload.html')
    return landing_page
|
| 30 |
+
|
| 31 |
+
# API for uploading Resume files
|
| 32 |
+
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Accept a resume upload and hand it to ``process_other_files``.

    The file must arrive in the ``file`` form field and have an extension
    accepted by ``allowed_file``. JSON files are rejected here (they have
    their own ``/upload_json`` route) and are no longer written to disk
    before being rejected.

    Returns:
        The response of ``process_other_files`` on success, otherwise a
        redirect to the index page with an error message flashed.
    """
    # A plain GET carries no file. Redirecting back to request.url (this very
    # route) would loop forever, so every failure path goes to the index page.
    if request.method != 'POST':
        return redirect(url_for('index'))

    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(url_for('index'))

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(url_for('index'))

        is_json = file.filename.lower().endswith('.json')
        if allowed_file(file.filename) and not is_json:
            filename = secure_filename(file.filename)
            # The upload folder may not exist on a fresh checkout.
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            return process_other_files(file_path, filename)

        flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 58 |
+
|
| 59 |
+
# Process non-JSON files, extract text and save to 'resume_text.txt'
|
| 60 |
+
def process_other_files(file_path, filename):
    """Extract text from an uploaded file and show it in the editor page.

    The extracted text is cleaned with ``preprocess_text``, written as UTF-8
    to ``resume_text.txt`` in the data folder, and rendered through
    'text.html'. The uploaded file name is remembered in the session.

    Args:
        file_path: Path of the saved upload to read.
        filename: Name of the upload, stored in the session and used in the
            error message.

    Returns:
        The rendered 'text.html' page, or a redirect with a flashed error
        when anything fails.
    """
    try:
        # Only the first element of the returned pair is used here.
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None when the client sends no Referer header;
        # fall back to the index page instead of failing inside redirect().
        return redirect(request.referrer or url_for('index'))
|
| 76 |
+
|
| 77 |
+
# API to handle the text editing and saving
|
| 78 |
+
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Save the text posted in ``edited_text`` and re-render the editor.

    The text overwrites ``resume_text.txt`` in the data folder (UTF-8).

    Returns:
        The rendered 'text.html' page with the saved text, or a redirect
        with a flashed error when saving fails.
    """
    try:
        edited_text = request.form['edited_text']

        # Create the data folder on demand so saving works even when no
        # upload has created it yet.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
| 95 |
+
|
| 96 |
+
# API for downloading the 'resume_text.txt' file
|
| 97 |
+
@app.route('/download', methods=['GET'])
def download_file():
    """Send ``resume_text.txt`` from the data folder as an attachment.

    Returns:
        The file download, or a redirect with a flashed error when the file
        cannot be sent.
    """
    try:
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header (e.g. when the
        # URL is opened directly), so fall back to the index page.
        return redirect(request.referrer or url_for('index'))
|
| 104 |
+
|
| 105 |
+
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the posted ``edited_text`` and return it as a file download.

    The text overwrites ``resume_text.txt`` in the data folder (UTF-8) and
    the same file is then sent as an attachment.

    Returns:
        The file download, or a redirect with a flashed error on failure.
    """
    try:
        edited_text = request.form['edited_text']

        # Create the data folder on demand so saving cannot fail on a
        # missing directory.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')

        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# API for uploading and processing JSON files
|
| 127 |
+
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Store an uploaded ``.json`` file in the JSON folder.

    The file must arrive in the ``file`` form field. Its sanitized name is
    remembered in the session under ``uploaded_json``.

    Returns:
        A redirect back to the submitting page (or the index page when no
        referrer is available) with a status message flashed.
    """
    # This route only accepts POST, so redirecting to request.url would issue
    # a GET that ends in "405 Method Not Allowed". request.referrer can be
    # None, hence the index fallback.
    back_url = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back_url)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back_url)

        if file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back_url)
|
| 152 |
+
|
| 153 |
+
# Process uploaded JSON file and save formatted data
|
| 154 |
+
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run ``process_uploaded_json`` on one file from the JSON folder.

    Only ``.json`` entries are considered, and the alphabetically first one
    is processed so the choice is deterministic.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    # request.referrer is None when no Referer header is sent.
    back_url = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # os.listdir() order is arbitrary and may include non-JSON entries.
        json_files = sorted(
            name for name in os.listdir(json_folder)
            if name.lower().endswith('.json')
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back_url)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        if not os.path.exists(json_path):
            flash(f'JSON file {filename} not found', 'error')
            return redirect(back_url)

        # Ensure the data folder exists before processing — presumably
        # process_uploaded_json writes its output there; confirm in
        # utils.anoter_to_json.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back_url)
|
| 180 |
+
|
| 181 |
+
# API for removing uploaded JSON files
|
| 182 |
+
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in the JSON folder.

    Sub-directories are left untouched. The ``uploaded_json`` session entry
    is cleared afterwards.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        for filename in os.listdir(json_folder):
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
| 197 |
+
|
| 198 |
+
# API for removing non-JSON files
|
| 199 |
+
@app.route('/remove', methods=['POST'])
def remove_file():
    """Delete every regular file in the upload folder, then go to the index.

    When the folder exists, its files are removed, the ``uploaded_file``
    session entry is cleared and a success message is flashed; otherwise an
    error is flashed.
    """
    try:
        folder = app.config['UPLOAD_FOLDER']
        if not os.path.exists(folder):
            flash("Upload folder does not exist", 'error')
        else:
            # Build full paths first, then delete only the regular files.
            entries = [os.path.join(folder, name) for name in os.listdir(folder)]
            for path in entries:
                if os.path.isfile(path):
                    os.remove(path)

            session.pop('uploaded_file', None)
            flash('All files removed successfully')
    except Exception as err:
        flash(f"Error removing files: {str(err)}", 'error')

    return redirect(url_for('index'))
|
| 224 |
+
|
| 225 |
+
# HTML render routes (modify to fit your structure)
|
| 226 |
+
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Serve the page rendered from the 'anoter.html' template."""
    page = render_template('anoter.html')
    return page
|
| 229 |
+
|
| 230 |
+
@app.route('/json', methods=['GET'])
def json_file():
    """Serve the page rendered from the 'savejson.html' template."""
    page = render_template('savejson.html')
    return page
|
| 233 |
+
|
| 234 |
+
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Serve the page rendered from the 'saveSpacy.html' template."""
    page = render_template('saveSpacy.html')
    return page
|
| 237 |
+
|
| 238 |
+
# @app.route('/text', methods=['GET'])
|
| 239 |
+
# def spacy_file():
|
| 240 |
+
# return render_template('text.html')
|
| 241 |
+
|
| 242 |
+
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert ``Json_Data.json`` in the data folder to ``Spacy_data.spacy``.

    The route and function names keep their original spelling because
    callers (presumably the templates) address the route by that URL.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    try:
        # Use the configured data folder instead of a hard-coded 'data/'
        # prefix so this route follows the same setting as the others.
        data_folder = app.config['DATA_FOLDER']
        json_file_path = os.path.join(data_folder, 'Json_Data.json')
        spacy_file_path = os.path.join(data_folder, 'Spacy_data.spacy')

        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
| 258 |
+
|
| 259 |
+
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train a model using the posted ``epochs`` and ``model_version``.

    ``epochs`` defaults to 10 and ``model_version`` to 'v1'. The model path
    passed to ``train_model`` is ``./Models/ner_model_<version>``.

    Returns:
        A redirect to the index page with a status message flashed.
    """
    try:
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        requested_version = request.form.get('model_version', 'v1')

        # The version is user input that ends up in a filesystem path, so
        # strip path separators and '..' to keep the model inside ./Models.
        version = secure_filename(requested_version) or 'v1'

        model_path = f"./Models/ner_model_{version}"
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 275 |
+
|
| 276 |
+
# API for removing all files from specific folders
|
| 277 |
+
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete every regular file in the upload and JSON folders.

    Sub-directories are left untouched. The ``uploaded_file`` and
    ``uploaded_json`` session entries are cleared afterwards.

    Returns:
        A redirect to the index page with a status message flashed.
    """
    try:
        folders_to_clear = [app.config['UPLOAD_FOLDER'], app.config['JSON_FOLDER']]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to remove; skipping
            # it keeps one missing folder from aborting the rest.
            if not os.path.isdir(folder_path):
                continue
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 299 |
+
|
| 300 |
+
# API for downloading the latest trained model
|
| 301 |
+
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the latest model in the models folder and send it as a download.

    "Latest" is the entry whose name sorts last. The archive is written next
    to the model as ``<name>.zip``.

    Returns:
        The zip download, or a redirect with a flashed error on failure.
    """
    # request.referrer is None when no Referer header is sent.
    back_url = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Ignore archives left by earlier downloads; otherwise
        # "<model>.zip" sorts after "<model>" and would itself be zipped.
        model_files = [
            name for name in os.listdir(models_dir)
            if not name.lower().endswith('.zip')
        ]

        if not model_files:
            flash('No model files found', 'error')
            return redirect(back_url)

        latest_model_file = sorted(model_files, reverse=True)[0]
        model_path = os.path.join(models_dir, latest_model_file)

        if not os.path.exists(model_path):
            flash('Model file not found on the server', 'error')
            return redirect(back_url)

        zip_filename = os.path.join(models_dir, f"{latest_model_file}.zip")

        with zipfile.ZipFile(zip_filename, 'w') as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write() on a directory stores only the directory
                # entry, so add the contained files one by one.
                for root, _dirs, files in os.walk(model_path):
                    for file_name in files:
                        full_path = os.path.join(root, file_name)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))

        # Pass an absolute path so the archive just written is the one sent,
        # however relative paths are resolved.
        return send_file(os.path.abspath(zip_filename), as_attachment=True)

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back_url)
|
| 333 |
+
|
| 334 |
+
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach the server execute
    # code, so enable it only on explicit opt-in via FLASK_DEBUG.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in {'1', 'true', 'yes'}
    app.run(debug=debug_enabled)
|
main.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
|
| 4 |
+
from werkzeug.utils import secure_filename
|
| 5 |
+
from utils.file_to_text import extract_text_based_on_format, preprocess_text
|
| 6 |
+
from utils.anoter_to_json import process_uploaded_json
|
| 7 |
+
from utils.json_to_spacy import convert_json_to_spacy
|
| 8 |
+
from utils.model import train_model
|
| 9 |
+
import zipfile
|
| 10 |
+
|
| 11 |
+
app = Flask(__name__)
# Key used by Flask to sign the session cookie. Prefer the SECRET_KEY
# environment variable so the real key is not committed with the source; the
# literal is only a fallback that keeps existing setups working.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # resume_text.txt and converted data
app.config['MODELS_FOLDER'] = 'Models'   # trained models and their zip archives

# Allowed file extensions
# NOTE(review): 'rsf' may be a typo for 'rtf' — confirm against utils.file_to_text.
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}
|
| 22 |
+
|
| 23 |
+
# Function to check file extensions
|
| 24 |
+
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
|
| 26 |
+
|
| 27 |
+
# HTML render routes (modify to fit your structure)
|
| 28 |
+
@app.route('/')
def index():
    """Serve the landing page rendered from the 'upload.html' template."""
    landing_page = render_template('upload.html')
    return landing_page
|
| 31 |
+
@app.route('/guide')
def guide():
    """Serve the page rendered from the 'guide.html' template."""
    page = render_template('guide.html')
    return page
|
| 34 |
+
|
| 35 |
+
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Serve the page rendered from the 'anoter.html' template."""
    page = render_template('anoter.html')
    return page
|
| 38 |
+
|
| 39 |
+
@app.route('/json', methods=['GET'])
def json_file():
    """Serve the page rendered from the 'savejson.html' template."""
    page = render_template('savejson.html')
    return page
|
| 42 |
+
|
| 43 |
+
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Serve the page rendered from the 'saveSpacy.html' template."""
    page = render_template('saveSpacy.html')
    return page
|
| 46 |
+
|
| 47 |
+
@app.route('/text_preview', methods=['GET'])
def text_preview():
    """Show the saved ``resume_text.txt`` in the 'text.html' editor page.

    Returns:
        The rendered page, or a redirect to the index page with a flashed
        error when the file is missing or cannot be read.
    """
    try:
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        if not os.path.exists(resume_file_path):
            flash('Resume text not found', 'error')
            return redirect(url_for('index'))

        # The file is always written as UTF-8 by the save routes, so read it
        # with the same encoding instead of the platform default.
        with open(resume_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return render_template('text.html', text=text)
    except Exception as e:
        flash(f"Error loading text preview: {str(e)}", 'error')
        return redirect(url_for('index'))
|
| 61 |
+
|
| 62 |
+
# API for uploading Resume files
|
| 63 |
+
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Accept a resume upload and hand it to ``process_other_files``.

    The file must arrive in the ``file`` form field and have an extension
    accepted by ``allowed_file``. JSON files are rejected here (they have
    their own ``/upload_json`` route) and are no longer written to disk
    before being rejected.

    Returns:
        The response of ``process_other_files`` on success, otherwise a
        redirect to the index page with an error message flashed.
    """
    # A plain GET carries no file. Redirecting back to request.url (this very
    # route) would loop forever, so every failure path goes to the index page.
    if request.method != 'POST':
        return redirect(url_for('index'))

    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(url_for('index'))

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(url_for('index'))

        is_json = file.filename.lower().endswith('.json')
        if allowed_file(file.filename) and not is_json:
            filename = secure_filename(file.filename)
            # The upload folder may not exist on a fresh checkout.
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            return process_other_files(file_path, filename)

        flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 89 |
+
|
| 90 |
+
# Process non-JSON files, extract text and save to 'resume_text.txt'
|
| 91 |
+
def process_other_files(file_path, filename):
    """Extract text from an uploaded file and show it in the editor page.

    The extracted text is cleaned with ``preprocess_text``, written as UTF-8
    to ``resume_text.txt`` in the data folder, and rendered through
    'text.html'. The uploaded file name is remembered in the session.

    Args:
        file_path: Path of the saved upload to read.
        filename: Name of the upload, stored in the session and used in the
            error message.

    Returns:
        The rendered 'text.html' page, or a redirect with a flashed error
        when anything fails.
    """
    try:
        # Only the first element of the returned pair is used here.
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None when the client sends no Referer header;
        # fall back to the index page instead of failing inside redirect().
        return redirect(request.referrer or url_for('index'))
|
| 107 |
+
|
| 108 |
+
# API to handle the text editing and saving
|
| 109 |
+
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Save the text posted in ``edited_text`` and re-render the editor.

    The text overwrites ``resume_text.txt`` in the data folder (UTF-8).

    Returns:
        The rendered 'text.html' page with the saved text, or a redirect
        with a flashed error when saving fails.
    """
    try:
        edited_text = request.form['edited_text']

        # Create the data folder on demand so saving works even when no
        # upload has created it yet.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
| 126 |
+
|
| 127 |
+
# API for downloading the 'resume_text.txt' file
|
| 128 |
+
@app.route('/download', methods=['GET'])
def download_file():
    """Send ``resume_text.txt`` from the data folder as an attachment.

    Returns:
        The file download, or a redirect with a flashed error when the file
        cannot be sent.
    """
    try:
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header (e.g. when the
        # URL is opened directly), so fall back to the index page.
        return redirect(request.referrer or url_for('index'))
|
| 135 |
+
|
| 136 |
+
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the posted ``edited_text`` and return it as a file download.

    The text overwrites ``resume_text.txt`` in the data folder (UTF-8) and
    the same file is then sent as an attachment.

    Returns:
        The file download, or a redirect with a flashed error on failure.
    """
    try:
        edited_text = request.form['edited_text']

        # Create the data folder on demand so saving cannot fail on a
        # missing directory.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# API for uploading and processing JSON files
|
| 158 |
+
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Store an uploaded ``.json`` file in the JSON folder.

    The file must arrive in the ``file`` form field. Its sanitized name is
    remembered in the session under ``uploaded_json``.

    Returns:
        A redirect back to the submitting page (or the index page when no
        referrer is available) with a status message flashed.
    """
    # This route only accepts POST, so redirecting to request.url would issue
    # a GET that ends in "405 Method Not Allowed". request.referrer can be
    # None, hence the index fallback.
    back_url = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back_url)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back_url)

        if file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back_url)
|
| 183 |
+
|
| 184 |
+
# Process uploaded JSON file and save formatted data
|
| 185 |
+
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run ``process_uploaded_json`` on one file from the JSON folder.

    Only ``.json`` entries are considered, and the alphabetically first one
    is processed so the choice is deterministic.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    # request.referrer is None when no Referer header is sent.
    back_url = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # os.listdir() order is arbitrary and may include non-JSON entries.
        json_files = sorted(
            name for name in os.listdir(json_folder)
            if name.lower().endswith('.json')
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back_url)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        if not os.path.exists(json_path):
            flash(f'JSON file {filename} not found', 'error')
            return redirect(back_url)

        # Ensure the data folder exists before processing — presumably
        # process_uploaded_json writes its output there; confirm in
        # utils.anoter_to_json.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back_url)
|
| 211 |
+
|
| 212 |
+
# API for removing uploaded JSON files
|
| 213 |
+
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in the JSON folder.

    Sub-directories are left untouched. The ``uploaded_json`` session entry
    is cleared afterwards.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        for filename in os.listdir(json_folder):
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
| 228 |
+
|
| 229 |
+
# API for removing non-JSON files
|
| 230 |
+
@app.route('/remove', methods=['POST'])
def remove_file():
    """Delete every regular file in the upload folder, then go to the index.

    When the folder exists, its files are removed, the ``uploaded_file``
    session entry is cleared and a success message is flashed; otherwise an
    error is flashed.
    """
    try:
        folder = app.config['UPLOAD_FOLDER']
        if not os.path.exists(folder):
            flash("Upload folder does not exist", 'error')
        else:
            # Build full paths first, then delete only the regular files.
            entries = [os.path.join(folder, name) for name in os.listdir(folder)]
            for path in entries:
                if os.path.isfile(path):
                    os.remove(path)

            session.pop('uploaded_file', None)
            flash('All files removed successfully')
    except Exception as err:
        flash(f"Error removing files: {str(err)}", 'error')

    return redirect(url_for('index'))
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert ``Json_Data.json`` in the data folder to ``Spacy_data.spacy``.

    The route and function names keep their original spelling because
    callers (presumably the templates) address the route by that URL.

    Returns:
        A redirect back to the calling page (or the index page when no
        referrer is available) with a status message flashed.
    """
    try:
        # Use the configured data folder instead of a hard-coded 'data/'
        # prefix so this route follows the same setting as the others.
        data_folder = app.config['DATA_FOLDER']
        json_file_path = os.path.join(data_folder, 'Json_Data.json')
        spacy_file_path = os.path.join(data_folder, 'Spacy_data.spacy')

        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
| 273 |
+
|
| 274 |
+
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train a model using the posted ``epochs`` and ``model_version``.

    ``epochs`` defaults to 10 and ``model_version`` to 'v1'. The model path
    passed to ``train_model`` is ``./Models/ner_model_<version>``.

    Returns:
        A redirect to the index page with a status message flashed.
    """
    try:
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        requested_version = request.form.get('model_version', 'v1')

        # The version is user input that ends up in a filesystem path, so
        # strip path separators and '..' to keep the model inside ./Models.
        version = secure_filename(requested_version) or 'v1'

        model_path = f"./Models/ner_model_{version}"
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 290 |
+
|
| 291 |
+
# API for removing all files from specific folders
|
| 292 |
+
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete every regular file in the upload, JSON and models folders.

    Sub-directories are left untouched. The ``uploaded_file`` and
    ``uploaded_json`` session entries are cleared afterwards.

    Returns:
        A redirect to the index page with a status message flashed.
    """
    try:
        folders_to_clear = [
            app.config['UPLOAD_FOLDER'],
            app.config['JSON_FOLDER'],
            app.config['MODELS_FOLDER'],
        ]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to remove; skipping
            # it keeps one missing folder from aborting the rest.
            if not os.path.isdir(folder_path):
                continue
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
| 314 |
+
|
| 315 |
+
# API for downloading the latest trained model
|
| 316 |
+
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the latest model in the models folder and send it as a download.

    "Latest" is the entry whose name sorts last. The archive is written next
    to the model as ``<name>.zip``.

    Returns:
        The zip download, or a redirect with a flashed error on failure.
    """
    # request.referrer is None when no Referer header is sent.
    back_url = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Ignore archives left by earlier downloads; otherwise
        # "<model>.zip" sorts after "<model>" and would itself be zipped.
        model_files = [
            name for name in os.listdir(models_dir)
            if not name.lower().endswith('.zip')
        ]

        if not model_files:
            flash('No model files found', 'error')
            return redirect(back_url)

        latest_model_file = sorted(model_files, reverse=True)[0]
        model_path = os.path.join(models_dir, latest_model_file)

        if not os.path.exists(model_path):
            flash('Model file not found on the server', 'error')
            return redirect(back_url)

        zip_filename = os.path.join(models_dir, f"{latest_model_file}.zip")

        with zipfile.ZipFile(zip_filename, 'w') as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write() on a directory stores only the directory
                # entry, so add the contained files one by one.
                for root, _dirs, files in os.walk(model_path):
                    for file_name in files:
                        full_path = os.path.join(root, file_name)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))

        # Pass an absolute path so the archive just written is the one sent,
        # however relative paths are resolved.
        return send_file(os.path.abspath(zip_filename), as_attachment=True)

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back_url)
|
| 348 |
+
|
| 349 |
+
if __name__ == '__main__':
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server run arbitrary code. Keep it off unless it is
    # explicitly requested with FLASK_DEBUG=1 (or "true"/"yes").
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes'))
|
readme
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
|
| 2 |
+
\\----------- **Resume Parser** ----------\\
|
| 3 |
+
\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
|
| 4 |
+
|
| 5 |
+
# Overview:
|
| 6 |
+
This project is a comprehensive Resume Parsing tool built using Python,
|
| 7 |
+
integrating the Mistral-Nemo-Instruct-2407 model for primary parsing.
|
| 8 |
+
If Mistral fails or encounters issues,
|
| 9 |
+
the system falls back to a custom-trained spaCy model to ensure continued functionality.
|
| 10 |
+
The tool is wrapped with a Flask API and has a user interface built using HTML and CSS.
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Installation Guide:
|
| 14 |
+
|
| 15 |
+
1. Create and Activate a Virtual Environment
|
| 16 |
+
python -m venv venv
|
| 17 |
+
source venv/bin/activate # For Linux/Mac
|
| 18 |
+
# or
|
| 19 |
+
venv\Scripts\activate # For Windows
|
| 20 |
+
|
| 21 |
+
# NOTE: If the virtual environment (venv) is already created, you can skip the creation step and just activate.
|
| 22 |
+
- For Linux/Mac:
|
| 23 |
+
source venv/bin/activate
|
| 24 |
+
- For Windows:
|
| 25 |
+
venv\Scripts\activate
|
| 26 |
+
|
| 27 |
+
2. Install Required Libraries
|
| 28 |
+
pip install -r requirements.txt
|
| 29 |
+
|
| 30 |
+
# Ensure the following dependencies are included:
|
| 31 |
+
- Flask
|
| 32 |
+
- spaCy
|
| 33 |
+
- huggingface_hub
|
| 34 |
+
- PyMuPDF
|
| 35 |
+
- python-docx
|
| 36 |
+
- Tesseract-OCR (for image-based parsing)
|
| 37 |
+
|
| 38 |
+
# NOTE: If any model or library is not installed, you can install it using:
|
| 39 |
+
pip install <model_name>
|
| 40 |
+
_Replace <model_name> with the specific model or library you need to install_
|
| 41 |
+
|
| 42 |
+
3. Set up Hugging Face Token
|
| 43 |
+
- Add your Hugging Face token to the .env file as:
|
| 44 |
+
HF_TOKEN=<your_huggingface_token>
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# File Structure Overview:
|
| 48 |
+
Mistral_With_Spacy/
│
├── Spacy_Models/
│   └── ner_model_05_3       # Pretrained spaCy model directory for resume parsing
│
├── templates/
│   ├── index.html           # UI for file upload
│   └── result.html          # Display parsed results in structured JSON
│
├── uploads/                 # Directory for uploaded resume files
│
├── utils/
│   ├── mistral.py           # Code for calling Mistral API and handling responses
│   ├── spacy.py             # spaCy fallback model for parsing resumes
│   ├── error.py             # Error handling utilities
│   └── fileTotext.py        # Functions to extract text from different file formats (PDF, DOCX, etc.)
│
├── venv/                    # Virtual environment
│
├── .env                     # Environment variables file (contains Hugging Face token)
│
├── main.py                  # Flask app handling API routes for uploading and processing resumes
│
└── requirements.txt         # Dependencies required for the project
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Program Overview:
|
| 75 |
+
|
| 76 |
+
# Mistral Integration (utils/mistral.py)
|
| 77 |
+
- Mistral API Calls: Uses Hugging Face's Mistral-Nemo-Instruct-2407 model to parse resumes.
|
| 78 |
+
- Personal and Professional Extraction: Two functions extract personal and professional information in structured JSON format.
|
| 79 |
+
- Fallback Mechanism: If Mistral fails, spaCy's NER model is used as a fallback.
|
| 80 |
+
|
| 81 |
+
# SpaCy Integration (utils/spacy.py)
|
| 82 |
+
- Custom Trained Model: Uses a spaCy model (ner_model_05_3) trained specifically for resume parsing.
|
| 83 |
+
- Named Entity Recognition: Extracts key information like Name, Email, Contact, Location, Skills, Experience, etc., from resumes.
|
| 84 |
+
- Validation: Includes validation for extracted emails and contacts.
|
| 85 |
+
|
| 86 |
+
# File Conversion (utils/fileTotext.py)
|
| 87 |
+
- Text Extraction: Handles different resume formats (PDF, DOCX, ODT, RTF, and images like PNG, JPG, JPEG) and extracts text for further processing.
|
| 88 |
+
- PDF Files: Uses PyMuPDF to extract text and, if necessary, Tesseract-OCR for image-based PDF content.
|
| 89 |
+
- DOCX Files: Uses `python-docx` to extract structured text from Word documents.
|
| 90 |
+
- ODT Files: Uses `odfpy` to extract text from ODT (OpenDocument) files.
|
| 91 |
+
- RTF Files: Reads plain text from RTF files.
|
| 92 |
+
- Images (PNG, JPG, JPEG): Uses Tesseract-OCR to extract text from image-based resumes.
|
| 93 |
+
Note: For Tesseract-OCR, install it locally by following the [installation guide](https://github.com/UB-Mannheim/tesseract/wiki).
|
| 94 |
+
- Hyperlink Extraction: Extracts hyperlinks from PDF files, capturing any embedded URLs during the parsing process.
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# Error Handling (utils/error.py)
|
| 98 |
+
- Manages API response errors, file format issues, and ensures smooth fallbacks without crashing the app.
|
| 99 |
+
|
| 100 |
+
# Flask API (main.py)
|
| 101 |
+
Endpoints:
|
| 102 |
+
- /upload for uploading resumes.
|
| 103 |
+
- Displays parsed results in JSON format on the results page.
|
| 104 |
+
- UI: Simple interface for uploading resumes and viewing the parsing results.
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# Tree map of your program:
|
| 108 |
+
|
| 109 |
+
main.py
├── Handles API side
├── File upload/remove
├── Process resumes
└── Show result
utils
├── fileTotext.py
│   └── Converts files to text
│       ├── PDF
│       ├── DOCX
│       ├── RTF
│       ├── ODT
│       ├── PNG
│       ├── JPG
│       └── JPEG
├── mistral.py
│   ├── Mistral API Calls
│   │   └── Uses Mistral-Nemo-Instruct-2407 model
│   ├── Personal and Professional Extraction
│   │   ├── Extracts personal information
│   │   └── Extracts professional information
│   └── Fallback Mechanism
│       └── Uses spaCy NER model if Mistral fails
└── spacy.py
    ├── Custom Trained Model
    │   └── Uses spaCy model (ner_model_05_3)
    ├── Named Entity Recognition
    │   └── Extracts key information (Name, Email, Contact, etc.)
    └── Validation
        └── Validates emails and contacts
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# References:
|
| 142 |
+
|
| 143 |
+
- [Flask Documentation](https://flask.palletsprojects.com/)
|
| 144 |
+
- [spaCy Documentation](https://spacy.io/usage)
|
| 145 |
+
- [Mistral Documentation](https://docs.mistral.ai/)
|
| 146 |
+
- [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/index)
|
| 147 |
+
- [PyMuPDF (MuPDF) Documentation](https://pymupdf.readthedocs.io/en/latest/)
|
| 148 |
+
- [python-docx Documentation](https://python-docx.readthedocs.io/en/latest/)
|
| 149 |
+
- [Tesseract OCR Documentation](https://github.com/UB-Mannheim/tesseract/wiki)
|
| 150 |
+
- [Virtual Environments in Python](https://docs.python.org/3/tutorial/venv.html)
|
requirements.txt
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amqp==5.2.0
|
| 2 |
+
annotated-types==0.7.0
|
| 3 |
+
billiard==4.2.0
|
| 4 |
+
blinker==1.8.2
|
| 5 |
+
blis==0.7.11
|
| 6 |
+
catalogue==2.0.10
|
| 7 |
+
celery==5.4.0
|
| 8 |
+
certifi==2024.8.30
|
| 9 |
+
charset-normalizer==3.3.2
|
| 10 |
+
click==8.1.7
|
| 11 |
+
click-didyoumean==0.3.1
|
| 12 |
+
click-plugins==1.1.1
|
| 13 |
+
click-repl==0.3.0
|
| 14 |
+
cloudpathlib==0.19.0
|
| 15 |
+
colorama==0.4.6
|
| 16 |
+
confection==0.1.5
|
| 17 |
+
cymem==2.0.8
|
| 18 |
+
defusedxml==0.7.1
|
| 19 |
+
Flask==3.0.3
|
| 20 |
+
Flask-SQLAlchemy==3.1.1
|
| 21 |
+
greenlet==3.1.0
|
| 22 |
+
idna==3.10
|
| 23 |
+
itsdangerous==2.2.0
|
| 24 |
+
Jinja2==3.1.4
|
| 25 |
+
kombu==5.4.1
|
| 26 |
+
langcodes==3.4.0
|
| 27 |
+
language_data==1.2.0
|
| 28 |
+
lxml==5.3.0
|
| 29 |
+
marisa-trie==1.2.0
|
| 30 |
+
markdown-it-py==3.0.0
|
| 31 |
+
MarkupSafe==2.1.5
|
| 32 |
+
mdurl==0.1.2
|
| 33 |
+
murmurhash==1.0.10
|
| 34 |
+
numpy==1.26.4
|
| 35 |
+
odfpy==1.4.1
|
| 36 |
+
packaging==24.1
|
| 37 |
+
pdf2image==1.17.0
|
| 38 |
+
pillow==10.4.0
|
| 39 |
+
preshed==3.0.9
|
| 40 |
+
prompt_toolkit==3.0.47
|
| 41 |
+
pydantic==2.9.1
|
| 42 |
+
pydantic_core==2.23.3
|
| 43 |
+
Pygments==2.18.0
|
| 44 |
+
PyMuPDF==1.24.10
|
| 45 |
+
PyMuPDFb==1.24.10
|
| 46 |
+
pytesseract==0.3.13
|
| 47 |
+
python-dateutil==2.9.0.post0
|
| 48 |
+
python-docx==1.1.2
|
| 49 |
+
requests==2.32.3
|
| 50 |
+
rich==13.8.1
|
| 51 |
+
setuptools==75.0.0
|
| 52 |
+
shellingham==1.5.4
|
| 53 |
+
six==1.16.0
|
| 54 |
+
smart-open==7.0.4
|
| 55 |
+
spacy==3.7.6
|
| 56 |
+
spacy-legacy==3.0.12
|
| 57 |
+
spacy-loggers==1.0.5
|
| 58 |
+
SQLAlchemy==2.0.34
|
| 59 |
+
srsly==2.4.8
|
| 60 |
+
thinc==8.2.5
|
| 61 |
+
tqdm==4.66.5
|
| 62 |
+
typer==0.12.5
|
| 63 |
+
typing_extensions==4.12.2
|
| 64 |
+
tzdata==2024.1
|
| 65 |
+
urllib3==2.2.3
|
| 66 |
+
vine==5.1.0
|
| 67 |
+
wasabi==1.1.3
|
| 68 |
+
wcwidth==0.2.13
|
| 69 |
+
weasel==0.4.1
|
| 70 |
+
Werkzeug==3.0.4
|
| 71 |
+
wrapt==1.16.0
|