import os
import random
import shutil

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding


def load_data_from_spacy_file(file_path):
    """Load training data from a serialized .spacy (DocBin) file."""
    nlp = spacy.blank("en")
    try:
        doc_bin = DocBin().from_disk(file_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
        print(f"Loaded {len(docs)} documents from {file_path}.")
        return docs
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []
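

# For reference, a minimal sketch of how a .spacy file such as
# ./data/Spacy_data.spacy can be produced. The function name, example text,
# entity span, and output path are illustrative assumptions, not part of the
# original pipeline.
def build_example_spacy_file(output_path="./data/Spacy_data.spacy"):
    """Serialize a single annotated document into a DocBin on disk."""
    nlp = spacy.blank("en")
    doc = nlp.make_doc("Jane Doe studied at Oxford University.")
    # char_span maps character offsets to a token span; it returns None if
    # the offsets do not align with token boundaries.
    span = doc.char_span(0, 8, label="PERSON")
    if span is not None:
        doc.ents = [span]
    DocBin(docs=[doc]).to_disk(output_path)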


def train_model(epochs, model_path):
    """Train an NER model and checkpoint the best epoch to model_path."""
    nlp = spacy.blank("en")

    # Add the NER pipeline component (reuse it if already present)
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")
    nlp.add_pipe("sentencizer")  # Optional component to split sentences

    # Define entity labels
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
    ]

    # Add the labels to the NER pipeline
    for label in labels:
        ner.add_label(label)

    # Load training data
    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # Verify that data was loaded correctly
    if not train_data:
        print("No training data found. Exiting training.")
        return
    # Initialize trainable pipes and obtain an optimizer (spaCy v3 API)
    optimizer = nlp.initialize()
    epoch_losses = []
    best_loss = float("inf")

    # Start training loop
    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data each epoch

        # Create batches of gradually increasing size
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Extract texts and gold annotations from the loaded Docs
            try:
                texts, annotations = zip(
                    *[
                        (
                            doc.text,
                            {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]},
                        )
                        for doc in batch
                    ]
                )
            except ValueError as e:
                print(f"Error processing batch: {e}")
                continue

            # Create Example objects pairing a fresh Doc with its annotations
            examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in zip(texts, annotations)
            ]

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        # Record the NER loss for this epoch
        current_loss = losses.get("ner", float("inf"))
        epoch_losses.append(current_loss)
        print(f"Losses at epoch {epoch + 1}: {losses}")
        # Checkpoint the best model seen so far
        if current_loss < best_loss:
            best_loss = current_loss
            temp_model_path = model_path + "_temp"
            nlp.to_disk(temp_model_path)

            # Safely replace the model at the final path
            if os.path.exists(model_path):
                shutil.rmtree(model_path)
            shutil.copytree(temp_model_path, model_path)
            shutil.rmtree(temp_model_path)
    # Save the last-epoch weights separately so they do not overwrite the
    # best checkpoint already written to model_path above
    nlp.to_disk(model_path + "_final")
    print(f"Training completed. Best model saved at: {model_path}")
    return epoch_losses
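

# Example invocation; the epoch count and output path below are illustrative
# assumptions rather than values specified by the original script.
if __name__ == "__main__":
    train_model(epochs=20, model_path="./models/ner_model")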