import os
import random
import shutil

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding


def load_data_from_spacy_file(file_path):
    """Load training data from a serialized .spacy (DocBin) file."""
    nlp = spacy.blank("en")
    try:
        doc_bin = DocBin().from_disk(file_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
        print(f"Loaded {len(docs)} documents from {file_path}.")
        return docs
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []
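

# For reference, a minimal sketch of how a .spacy file such as
# ./data/Spacy_data.spacy can be produced. The function name, example text,
# entity span, and output path are illustrative assumptions, not part of the
# original pipeline.
def build_example_spacy_file(output_path="./data/Spacy_data.spacy"):
    """Serialize a single annotated document into a DocBin on disk."""
    nlp = spacy.blank("en")
    doc = nlp.make_doc("Jane Doe studied at Oxford University.")
    # char_span maps character offsets to a token span; it returns None if
    # the offsets do not align with token boundaries.
    span = doc.char_span(0, 8, label="PERSON")
    if span is not None:
        doc.ents = [span]
    DocBin(docs=[doc]).to_disk(output_path)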


def train_model(epochs, model_path):
    """Train an NER model and checkpoint the best epoch to model_path."""
    nlp = spacy.blank("en")

    # Add the NER pipeline component (reuse it if already present)
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    nlp.add_pipe("sentencizer")  # Optional component to split sentences

    # Define entity labels
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
    ]

    # Add the labels to the NER pipeline
    for label in labels:
        ner.add_label(label)

    # Load training data
    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # Verify that data was loaded correctly
    if not train_data:
        print("No training data found. Exiting training.")
        return
    # Initialize trainable pipes and obtain an optimizer (spaCy v3 API)
    optimizer = nlp.initialize()
    epoch_losses = []
    best_loss = float("inf")

    # Start training loop
    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data each epoch

        # Create batches of gradually increasing size
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Extract texts and gold annotations from the loaded Docs
            try:
                texts, annotations = zip(
                    *[
                        (
                            doc.text,
                            {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]},
                        )
                        for doc in batch
                    ]
                )
            except ValueError as e:
                print(f"Error processing batch: {e}")
                continue

            # Create Example objects pairing a fresh Doc with its annotations
            examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in zip(texts, annotations)
            ]

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        # Record the NER loss for this epoch
        current_loss = losses.get("ner", float("inf"))
        epoch_losses.append(current_loss)
        print(f"Losses at epoch {epoch + 1}: {losses}")
        # Checkpoint the best model seen so far
        if current_loss < best_loss:
            best_loss = current_loss
            temp_model_path = model_path + "_temp"
            nlp.to_disk(temp_model_path)

            # Safely replace the model at the final path
            if os.path.exists(model_path):
                shutil.rmtree(model_path)
            shutil.copytree(temp_model_path, model_path)
            shutil.rmtree(temp_model_path)
    # Save the last-epoch weights separately so they do not overwrite the
    # best checkpoint already written to model_path above
    nlp.to_disk(model_path + "_final")
    print(f"Training completed. Best model saved at: {model_path}")
    return epoch_losses
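

# Example invocation; the epoch count and output path below are illustrative
# assumptions rather than values specified by the original script.
if __name__ == "__main__":
    train_model(epochs=20, model_path="./models/ner_model")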