WebashalarForML committed on
Commit
9b0cfbe
·
verified ·
1 Parent(s): 05dbda1

Update utils/model.py

Browse files
Files changed (1) hide show
  1. utils/model.py +98 -89
utils/model.py CHANGED
@@ -1,89 +1,98 @@
1
- import spacy
2
- from spacy.training import Example
3
- from spacy.util import minibatch, compounding
4
- from pathlib import Path
5
- from spacy.tokens import DocBin
6
- import random
7
-
8
- # Load the training data from the .spacy file
9
- def load_data_from_spacy_file(file_path):
10
- # Initialize a blank English model to ensure compatibility
11
- nlp = spacy.blank("en")
12
-
13
- # Load the DocBin object and get documents
14
- try:
15
- doc_bin = DocBin().from_disk(file_path)
16
- docs = list(doc_bin.get_docs(nlp.vocab))
17
- return docs
18
- except Exception as e:
19
- print(f"Error loading data from .spacy file: {e}")
20
- return []
21
-
22
-
23
- # Train model function
24
- def train_model(epochs, model_path):
25
- # Initialize a blank English model
26
- nlp = spacy.blank("en")
27
-
28
- # Create an NER component and add it to the pipeline
29
- if "ner" not in nlp.pipe_names:
30
- ner = nlp.add_pipe("ner")
31
-
32
- nlp.add_pipe("sentencizer")
33
-
34
- # Define all possible entity labels
35
- labels = [
36
- "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
37
- "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
38
- "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
39
- "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE"
40
- ]
41
-
42
- # Add labels to the NER component
43
- for label in labels:
44
- ner.add_label(label)
45
-
46
- # Load the training data
47
- train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")
48
-
49
- # Start the training
50
- optimizer = nlp.begin_training()
51
-
52
- epoch_losses = []
53
- best_loss = float('inf')
54
-
55
- # Training loop
56
- for epoch in range(epochs):
57
- losses = {}
58
- random.shuffle(train_data) # Shuffle data for better training
59
-
60
- # Create minibatches
61
- batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
62
-
63
- for batch in batches:
64
- texts, annotations = zip(*[(doc.text, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in batch])
65
-
66
- # Convert to Example objects
67
- examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]
68
-
69
- # Update the model
70
- nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)
71
-
72
- current_loss = losses.get("ner", float('inf'))
73
- epoch_losses.append(current_loss)
74
-
75
- print(f"Losses at epoch {epoch + 1}: {losses}")
76
-
77
- # Stop training if the loss is zero
78
- if current_loss == 0:
79
- break
80
-
81
- # Save the best model
82
- if current_loss < best_loss:
83
- best_loss = current_loss
84
- nlp.to_disk(model_path)
85
-
86
- # Save the final model
87
- nlp.to_disk(model_path)
88
-
89
- return epoch_losses
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from spacy.training import Example
3
+ from spacy.util import minibatch, compounding
4
+ from pathlib import Path
5
+ from spacy.tokens import DocBin
6
+ import random
7
+ import shutil
8
+
9
+ # Load the training data from the .spacy file
10
def load_data_from_spacy_file(file_path):
    """Read the docs stored in a serialized ``DocBin`` (``.spacy``) file.

    Args:
        file_path: Location of the ``.spacy`` file to read.

    Returns:
        A list of the ``Doc`` objects stored in the file. If loading fails
        for any reason, the error is printed and an empty list is returned
        instead of raising.
    """
    # The stored docs are rebuilt against the vocab of a blank English pipeline.
    vocab = spacy.blank("en").vocab

    try:
        stored_docs = DocBin().from_disk(file_path)
        return list(stored_docs.get_docs(vocab))
    except Exception as e:
        # Best-effort loader: report the problem and hand back no data.
        print(f"Error loading data from .spacy file: {e}")
        return []
22
+
23
+
24
+ # Train model function
25
def train_model(epochs, model_path):
    """Train a blank English NER pipeline and write it to ``model_path``.

    Training docs are read from ``./data/Spacy_data.spacy`` through
    ``load_data_from_spacy_file``. Whenever an epoch produces a lower NER
    loss than any earlier epoch, the pipeline is written to a sibling
    ``<model_path>_temp`` directory and then swapped into ``model_path``.

    NOTE(review): the unconditional save after the loop overwrites that
    best-loss checkpoint with the weights of the last epoch that ran. This
    mirrors the existing behaviour; confirm whether it is intended.

    Args:
        epochs: Maximum number of passes over the training data.
        model_path: Directory (``str`` or ``pathlib.Path``) that receives
            the saved pipeline.

    Returns:
        A list with one NER loss per epoch that ran. An epoch for which no
        ``"ner"`` loss was recorded contributes ``float('inf')``.
    """
    nlp = spacy.blank("en")

    # Always bind `ner`, whether the component is created here or already
    # present, so the label loop below cannot hit an unbound name.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.add_pipe("ner")

    nlp.add_pipe("sentencizer")

    # Every entity label the NER component should be able to predict.
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE",
    ]
    for label in labels:
        ner.add_label(label)

    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # `initialize` is the spaCy v3 replacement for the deprecated
    # `begin_training`; it returns the optimizer used for the updates.
    optimizer = nlp.initialize()

    # Normalise to Path so both str and Path arguments work, and so a
    # trailing slash cannot place the staging directory inside the model
    # directory that is about to be removed.
    final_dir = Path(model_path)
    staging_dir = final_dir.parent / (final_dir.name + "_temp")

    epoch_losses = []
    best_loss = float('inf')

    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data for better training

        # The batch-size schedule is recreated each epoch, so it restarts at 4.
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            # Rebuild each stored doc as an Example on this pipeline's own
            # tokenizer/vocab, carrying over only its entity spans.
            examples = [
                Example.from_dict(
                    nlp.make_doc(doc.text),
                    {
                        "entities": [
                            (ent.start_char, ent.end_char, ent.label_)
                            for ent in doc.ents
                        ]
                    },
                )
                for doc in batch
            ]
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        current_loss = losses.get("ner", float('inf'))
        epoch_losses.append(current_loss)

        print(f"Losses at epoch {epoch + 1}: {losses}")

        # Stop training if the loss is zero
        if current_loss == 0:
            break

        # New best epoch: write to the staging directory first, then swap
        # it into place.
        if current_loss < best_loss:
            best_loss = current_loss
            nlp.to_disk(staging_dir)

            if final_dir.exists():
                shutil.rmtree(final_dir)  # Remove the old model if it exists
            shutil.copytree(staging_dir, final_dir)
            shutil.rmtree(staging_dir)  # Remove the temporary model directory

    # Final save after training
    nlp.to_disk(final_dir)

    return epoch_losses