Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Update utils/model.py
Browse files- utils/model.py +98 -89
    	
        utils/model.py
    CHANGED
    
    | @@ -1,89 +1,98 @@ | |
| 1 | 
            -
            import spacy
         | 
| 2 | 
            -
            from spacy.training import Example
         | 
| 3 | 
            -
            from spacy.util import minibatch, compounding
         | 
| 4 | 
            -
            from pathlib import Path
         | 
| 5 | 
            -
            from spacy.tokens import DocBin
         | 
| 6 | 
            -
            import random
         | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
                 | 
| 12 | 
            -
                
         | 
| 13 | 
            -
                 | 
| 14 | 
            -
                 | 
| 15 | 
            -
             | 
| 16 | 
            -
                     | 
| 17 | 
            -
                     | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
                     | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
                 | 
| 27 | 
            -
                
         | 
| 28 | 
            -
                 | 
| 29 | 
            -
                 | 
| 30 | 
            -
             | 
| 31 | 
            -
                    
         | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
                 | 
| 36 | 
            -
             | 
| 37 | 
            -
                    " | 
| 38 | 
            -
                    " | 
| 39 | 
            -
                    " | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
                 | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
                 | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
                 | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
                 | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
                 | 
| 57 | 
            -
             | 
| 58 | 
            -
                     | 
| 59 | 
            -
                    
         | 
| 60 | 
            -
                     | 
| 61 | 
            -
                     | 
| 62 | 
            -
                    
         | 
| 63 | 
            -
                     | 
| 64 | 
            -
             | 
| 65 | 
            -
                        
         | 
| 66 | 
            -
                         | 
| 67 | 
            -
                         | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
                         | 
| 71 | 
            -
             | 
| 72 | 
            -
                     | 
| 73 | 
            -
                     | 
| 74 | 
            -
                    
         | 
| 75 | 
            -
                     | 
| 76 | 
            -
                    
         | 
| 77 | 
            -
                     | 
| 78 | 
            -
                    if  | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
                     | 
| 82 | 
            -
                     | 
| 83 | 
            -
             | 
| 84 | 
            -
                         | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import spacy
         | 
| 2 | 
            +
            from spacy.training import Example
         | 
| 3 | 
            +
            from spacy.util import minibatch, compounding
         | 
| 4 | 
            +
            from pathlib import Path
         | 
| 5 | 
            +
            from spacy.tokens import DocBin
         | 
| 6 | 
            +
            import random
         | 
| 7 | 
            +
            import shutil
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Load the training data from the .spacy file
         | 
| 10 | 
            +
            def load_data_from_spacy_file(file_path):
                """Read all documents stored in a serialized DocBin (.spacy) file.

                Args:
                    file_path: Location of the .spacy file on disk.

                Returns:
                    A list of the documents held in the file. If reading or
                    decoding fails, the error is printed and an empty list is
                    returned instead of raising.
                """
                # A blank English pipeline supplies the vocab the docs are rebuilt against.
                vocab = spacy.blank("en").vocab

                try:
                    stored = DocBin().from_disk(file_path)
                    return list(stored.get_docs(vocab))
                except Exception as e:
                    print(f"Error loading data from .spacy file: {e}")
                    return []
         | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
| 24 | 
            +
            # Train model function
         | 
| 25 | 
            +
            def train_model(epochs, model_path, data_path="./data/Spacy_data.spacy"):
                """Train a blank English NER pipeline and keep the lowest-loss checkpoint.

                The pipeline is saved to ``model_path`` every time an epoch's "ner"
                loss improves on the best loss seen so far, so the directory holds
                the best-scoring weights when the function returns. Per-epoch losses
                are printed as training progresses.

                Args:
                    epochs: Maximum number of passes over the training data.
                    model_path: Directory the trained pipeline is written to. A
                        sibling directory named ``<model_path>_temp`` is used as
                        scratch space while saving.
                    data_path: Serialized DocBin (.spacy) file with the annotated
                        training documents. Defaults to the path that used to be
                        hard-coded here.

                Returns:
                    A list with the "ner" loss of each completed epoch
                    (``float('inf')`` for an epoch that reported no "ner" loss).
                """
                nlp = spacy.blank("en")

                # Bind `ner` on both branches so the label loop never sees an unbound name.
                if "ner" not in nlp.pipe_names:
                    ner = nlp.add_pipe("ner")
                else:
                    ner = nlp.get_pipe("ner")
                nlp.add_pipe("sentencizer")

                # Every entity label the recogniser should be able to predict.
                labels = [
                    "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
                    "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
                    "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
                    "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
                ]
                for label in labels:
                    ner.add_label(label)

                train_data = load_data_from_spacy_file(data_path)
                optimizer = nlp.begin_training()

                epoch_losses = []
                best_loss = float("inf")
                checkpoint_saved = False
                # str() keeps this working when model_path is a pathlib.Path.
                temp_model_path = str(model_path) + "_temp"

                for epoch in range(epochs):
                    losses = {}
                    random.shuffle(train_data)  # New example order every epoch

                    for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
                        # Rebuild each example from raw text plus character-offset entity spans.
                        examples = [
                            Example.from_dict(
                                nlp.make_doc(doc.text),
                                {
                                    "entities": [
                                        (ent.start_char, ent.end_char, ent.label_)
                                        for ent in doc.ents
                                    ]
                                },
                            )
                            for doc in batch
                        ]
                        nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

                    current_loss = losses.get("ner", float("inf"))
                    epoch_losses.append(current_loss)

                    print(f"Losses at epoch {epoch + 1}: {losses}")

                    # Checkpoint before the zero-loss exit so a perfect epoch is saved too.
                    if current_loss < best_loss:
                        best_loss = current_loss
                        # Serialise to a scratch directory first, then swap it in, so a
                        # failed save cannot leave a half-written model at model_path.
                        nlp.to_disk(temp_model_path)
                        if Path(model_path).exists():
                            shutil.rmtree(model_path)  # Drop the previous best model
                        shutil.copytree(temp_model_path, model_path)
                        shutil.rmtree(temp_model_path)
                        checkpoint_saved = True

                    # Nothing left to learn once the loss reaches zero.
                    if current_loss == 0:
                        break

                # Fallback only: saving unconditionally here would overwrite the best
                # checkpoint with the last epoch's weights.
                if not checkpoint_saved:
                    nlp.to_disk(model_path)

                return epoch_losses
         |