Delicalib committed on
Commit
ebc4b9d
·
verified ·
1 Parent(s): 226825c

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -13,19 +13,19 @@ model-index:
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
- value: 0.5982281487
17
  - name: NER Recall
18
  type: recall
19
- value: 0.6152545557
20
  - name: NER F Score
21
  type: f_score
22
- value: 0.6066219032
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `ru_patents_ner` |
27
- | **Version** | `1.0.1` |
28
- | **spaCy** | `>=3.8.4,<3.9.0` |
29
  | **Default Pipeline** | `transformer`, `ner` |
30
  | **Components** | `transformer`, `ner` |
31
  | **Vectors** | 500002 keys, 500002 unique vectors (300 dimensions) |
@@ -49,13 +49,14 @@ model-index:
49
 
50
  | Type | Score |
51
  | --- | --- |
52
- | `F1_MICRO` | 59.95 |
53
- | `F1_MACRO` | 55.92 |
54
- | `F1_WEIGHTED` | 59.51 |
55
- | `F1_COMPONENT` | 66.29 |
56
- | `F1_SYSTEM` | 66.62 |
57
- | `F1_ATTRIBUTE` | 34.84 |
58
- | `ENTS_P` | 60.95 |
59
- | `ENTS_R` | 58.99 |
60
- | `TRANSFORMER_LOSS` | 907006.78 |
61
- | `NER_LOSS` | 1524129.39 |
 
 
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
+ value: 0.6187035922
17
  - name: NER Recall
18
  type: recall
19
+ value: 0.6062930187
20
  - name: NER F Score
21
  type: f_score
22
+ value: 0.612435439
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `ru_patents_ner` |
27
+ | **Version** | `1.0.0` |
28
+ | **spaCy** | `>=3.8.5,<3.9.0` |
29
  | **Default Pipeline** | `transformer`, `ner` |
30
  | **Components** | `transformer`, `ner` |
31
  | **Vectors** | 500002 keys, 500002 unique vectors (300 dimensions) |
 
49
 
50
  | Type | Score |
51
  | --- | --- |
52
+ | `F1_MICRO` | 61.24 |
53
+ | `F1_MACRO` | 54.82 |
54
+ | `F1_WEIGHTED` | 60.09 |
55
+ | `F1_COMPONENT` | 67.20 |
56
+ | `F1_SYSTEM` | 64.79 |
57
+ | `F1_ATTRIBUTE` | 32.48 |
58
+ | `ENTS_P` | 61.87 |
59
+ | `ENTS_R` | 60.63 |
60
+ | `ENTS_F` | 61.24 |
61
+ | `TRANSFORMER_LOSS` | 144452.32 |
62
+ | `NER_LOSS` | 222665.13 |
config.cfg CHANGED
@@ -1,8 +1,9 @@
1
  [paths]
2
- train = "./Diplom/dataset/all_ner_train.spacy"
3
- dev = "./Diplom/dataset/all_ner_test.spacy"
4
  vectors = "ru_core_news_lg"
5
  init_tok2vec = null
 
6
 
7
  [system]
8
  gpu_allocator = "pytorch"
@@ -23,6 +24,7 @@ vectors = {"@vectors":"spacy.Vectors.v1"}
23
 
24
  [components.ner]
25
  factory = "ner_all_metrics"
 
26
  incorrect_spans_key = null
27
  moves = null
28
  scorer = {"@scorers":"spacy.ner_scorer.v1"}
@@ -50,7 +52,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
50
 
51
  [components.transformer.model]
52
  @architectures = "spacy-transformers.TransformerModel.v3"
53
- name = "ai-forever/ruRoberta-large"
54
  mixed_precision = false
55
 
56
  [components.transformer.model.get_spans]
@@ -84,16 +86,16 @@ limit = 0
84
  augmenter = null
85
 
86
  [training]
87
- accumulate_gradient = 1
88
  dev_corpus = "corpora.dev"
89
  train_corpus = "corpora.train"
90
  seed = ${system.seed}
91
  gpu_allocator = ${system.gpu_allocator}
92
- dropout = 0.2
93
  patience = 1600
94
  max_epochs = 0
95
- max_steps = 20000
96
- eval_frequency = 200
97
  frozen_components = []
98
  annotating_components = []
99
  before_to_disk = null
@@ -122,19 +124,20 @@ eps = 0.00000001
122
 
123
  [training.optimizer.learn_rate]
124
  @schedules = "warmup_linear.v1"
125
- warmup_steps = 300
126
- total_steps = 20000
127
- initial_rate = 0.000005
128
 
129
  [training.score_weights]
130
- f1_COMPONENT = 0.17
131
- f1_SYSTEM = 0.17
132
- f1_ATTRIBUTE = 0.17
 
 
 
133
  ents_p = 0.0
134
  ents_r = 0.0
135
- f1_macro = 0.17
136
- f1_weighted = 0.17
137
- ents_f = 0.17
138
  ents_per_type = null
139
 
140
  [pretraining]
 
1
  [paths]
2
+ train = "./data/all_ner_train.spacy"
3
+ dev = "./data/all_ner_test.spacy"
4
  vectors = "ru_core_news_lg"
5
  init_tok2vec = null
6
+ model = "./model-best"
7
 
8
  [system]
9
  gpu_allocator = "pytorch"
 
24
 
25
  [components.ner]
26
  factory = "ner_all_metrics"
27
+ eval_frequency = ${training.eval_frequency}
28
  incorrect_spans_key = null
29
  moves = null
30
  scorer = {"@scorers":"spacy.ner_scorer.v1"}
 
52
 
53
  [components.transformer.model]
54
  @architectures = "spacy-transformers.TransformerModel.v3"
55
+ name = "DeepPavlov/rubert-base-cased"
56
  mixed_precision = false
57
 
58
  [components.transformer.model.get_spans]
 
86
  augmenter = null
87
 
88
  [training]
89
+ accumulate_gradient = 3
90
  dev_corpus = "corpora.dev"
91
  train_corpus = "corpora.train"
92
  seed = ${system.seed}
93
  gpu_allocator = ${system.gpu_allocator}
94
+ dropout = 0.1
95
  patience = 1600
96
  max_epochs = 0
97
+ max_steps = 3000
98
+ eval_frequency = 50
99
  frozen_components = []
100
  annotating_components = []
101
  before_to_disk = null
 
124
 
125
  [training.optimizer.learn_rate]
126
  @schedules = "warmup_linear.v1"
127
+ warmup_steps = 250
128
+ total_steps = 10000
129
+ initial_rate = 0.00005
130
 
131
  [training.score_weights]
132
+ f1_micro = 0.14
133
+ f1_macro = 0.14
134
+ f1_weighted = 0.14
135
+ f1_COMPONENT = 0.14
136
+ f1_SYSTEM = 0.14
137
+ f1_ATTRIBUTE = 0.14
138
  ents_p = 0.0
139
  ents_r = 0.0
140
+ ents_f = 0.14
 
 
141
  ents_per_type = null
142
 
143
  [pretraining]
custom_factory.py CHANGED
@@ -1,7 +1,13 @@
1
  from spacy.pipeline.ner import EntityRecognizer
2
- from spacy.language import DEFAULT_CONFIG, Language
3
  from thinc.api import Config
4
  from sklearn.metrics import f1_score, precision_recall_fscore_support
 
 
 
 
 
 
5
 
6
 
7
  default_model_config = """
@@ -22,35 +28,59 @@ upstream = "*"
22
  """
23
  DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
24
 
 
25
  @Language.factory("ner_all_metrics",
26
- default_config={
27
- "model": DEFAULT_MODEL,
28
- "moves": None,
29
- "scorer": {"@scorers": "spacy.ner_scorer.v1"},
30
- "incorrect_spans_key": None,
31
- "update_with_oracle_cut_size": 100
 
32
  },
33
  default_score_weights={
 
 
 
34
  "f1_COMPONENT": 1.0,
35
  "f1_SYSTEM": 1.0,
36
  "f1_ATTRIBUTE": 1.0,
37
  "ents_p": 0.0,
38
  "ents_r": 0.0,
39
- "f1_macro": 1.0,
40
- "f1_weighted": 1.0,
41
- "ents_f": 1.0,
42
  })
43
- def create_ner_all_metrics(nlp, name, model, moves, scorer, incorrect_spans_key, update_with_oracle_cut_size):
44
- return NERWithAllMetrics(nlp.vocab, model, name=name, moves=moves, scorer=scorer,
45
- incorrect_spans_key=incorrect_spans_key,
46
- update_with_oracle_cut_size=update_with_oracle_cut_size)
 
 
 
 
 
 
 
 
 
47
 
48
  class NERWithAllMetrics(EntityRecognizer):
 
 
 
 
 
 
 
 
 
49
  def score(self, examples, **kwargs):
50
  scores = super().score(examples, **kwargs)
51
  scores = dict(list(scores.items()) + list(self.custom_scorer(examples).items()))
52
- scores["ents_f"] = scores["f1_micro"]
53
- del scores["f1_micro"]
 
 
 
 
54
  return scores
55
 
56
  def custom_scorer(self, examples):
@@ -85,3 +115,75 @@ class NERWithAllMetrics(EntityRecognizer):
85
  result["f1_weighted"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
86
 
87
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from spacy.pipeline.ner import EntityRecognizer
2
+ from spacy.language import Language
3
  from thinc.api import Config
4
  from sklearn.metrics import f1_score, precision_recall_fscore_support
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ import time
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
 
12
 
13
  default_model_config = """
 
28
  """
29
  DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
30
 
31
+
32
  @Language.factory("ner_all_metrics",
33
+ default_config={
34
+ "model": DEFAULT_MODEL,
35
+ "moves": None,
36
+ "scorer": {"@scorers": "spacy.ner_scorer.v1"},
37
+ "incorrect_spans_key": None,
38
+ "update_with_oracle_cut_size": 100,
39
+ "eval_frequency": 100,
40
  },
41
  default_score_weights={
42
+ "f1_micro": 1.0,
43
+ "f1_macro": 1.0,
44
+ "f1_weighted": 1.0,
45
  "f1_COMPONENT": 1.0,
46
  "f1_SYSTEM": 1.0,
47
  "f1_ATTRIBUTE": 1.0,
48
  "ents_p": 0.0,
49
  "ents_r": 0.0,
 
 
 
50
  })
51
+ def create_ner_all_metrics(
52
+ nlp, name,
53
+ model, moves,
54
+ scorer, incorrect_spans_key,
55
+ update_with_oracle_cut_size, eval_frequency
56
+ ):
57
+ return NERWithAllMetrics(
58
+ nlp.vocab, model,
59
+ name=name, moves=moves,
60
+ scorer=scorer, incorrect_spans_key=incorrect_spans_key,
61
+ update_with_oracle_cut_size=update_with_oracle_cut_size, eval_frequency=eval_frequency
62
+ )
63
+
64
 
65
  class NERWithAllMetrics(EntityRecognizer):
66
+
67
+ def __init__(self, *args, eval_frequency=100, **kwargs):
68
+ super().__init__(*args, **kwargs)
69
+ self.metric_history = []
70
+ self.max_f1 = 0
71
+ self.max_f1_step = 0
72
+ self.eval_frequency = eval_frequency
73
+ self.start_learning_time = None
74
+
75
  def score(self, examples, **kwargs):
76
  scores = super().score(examples, **kwargs)
77
  scores = dict(list(scores.items()) + list(self.custom_scorer(examples).items()))
78
+ tmp_scores = scores.copy()
79
+ tmp_scores["step"] = len(self.metric_history) * self.eval_frequency
80
+ if tmp_scores["f1_macro"] > self.max_f1:
81
+ self.max_f1 = tmp_scores["f1_macro"]
82
+ self.max_f1_step = tmp_scores["step"]
83
+ self.metric_history.append(tmp_scores)
84
  return scores
85
 
86
  def custom_scorer(self, examples):
 
115
  result["f1_weighted"] = f1_score(y_true, y_pred, average="weighted", labels=labels, zero_division=0)
116
 
117
  return result
118
+
119
+ def preprocess_metric_history(self):
120
+ result = {
121
+ "metric_name": [],
122
+ "metric_value": [],
123
+ "step": []
124
+ }
125
+ for cur_metrics in self.metric_history:
126
+ cur_step = cur_metrics["step"]
127
+ for key, value in cur_metrics.items():
128
+ if key != "step" and isinstance(value, float):
129
+ result["metric_name"].append(key)
130
+ result["metric_value"].append(value)
131
+ result["step"].append(cur_step)
132
+ return result
133
+
134
+ def save_metrics_history(self, path):
135
+ if self.start_learning_time is None:
136
+ self.start_learning_time = time.monotonic()
137
+
138
+ if self.metric_history:
139
+
140
+ metrics_history_to_save = self.preprocess_metric_history()
141
+ fig = px.line(metrics_history_to_save, x="step", y="metric_value", color="metric_name")
142
+ for trace in fig.data:
143
+ if trace.name in ["f1_micro", "f1_macro", "f1_weighted"]:
144
+ trace.line.width = 6
145
+ else:
146
+ trace.line.width = 1
147
+
148
+ idx = list(trace.x).index(self.max_f1_step)
149
+ highlight_y = list(trace.y)[idx]
150
+ line_color = trace.line.color
151
+ line_name = trace.name
152
+ fig.add_trace(go.Scatter(
153
+ x=[self.max_f1_step], y=[highlight_y],
154
+ mode='markers+text',
155
+ marker=dict(
156
+ color=line_color, size=10),
157
+ text=[f"{round(highlight_y, 2)}"],
158
+ textposition="top center",
159
+ name=f"{line_name} best"
160
+ ))
161
+
162
+ current_time = time.monotonic()
163
+ current_time_of_training = current_time - self.start_learning_time
164
+ current_time_of_training_text = f"{int(current_time_of_training // 3600)} hrs {int(current_time_of_training % 3600) // 60} min {round(current_time_of_training % 60)} sec"
165
+
166
+ fig.update_layout(title = dict(
167
+ text="Training statistics",
168
+ subtitle=dict(
169
+ text=f"Training time amounted to {current_time_of_training_text}",
170
+ font=dict(color="gray", size=13),
171
+ )
172
+ ))
173
+
174
+ output_dir = os.path.join(str(path), "logs")
175
+ os.makedirs(output_dir, exist_ok=True)
176
+ fig_path = os.path.join(output_dir, "training_metrics.html")
177
+ json_path = os.path.join(output_dir, "training_metrics.json")
178
+ fig.write_html(fig_path)
179
+ with open(json_path, "w", encoding="utf-8") as f:
180
+ json.dump({
181
+ "data": metrics_history_to_save,
182
+ "train_time_s": current_time_of_training
183
+ }, f, indent=2, ensure_ascii=False)
184
+
185
+ def to_disk(self, path, *args, **kwargs):
186
+ super().to_disk(path, *args, **kwargs)
187
+ output_dir = Path(path)
188
+ output_dir_metrics = output_dir.parent.parent
189
+ self.save_metrics_history(output_dir_metrics)
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"ru",
3
  "name":"patents_ner",
4
- "version":"1.0.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
- "spacy_version":">=3.8.4,<3.9.0",
11
- "spacy_git_version":"85cc763",
12
  "vectors":{
13
  "width":300,
14
  "vectors":500002,
@@ -37,36 +37,37 @@
37
 
38
  ],
39
  "performance":{
40
- "f1_COMPONENT":0.6689663984,
41
- "f1_SYSTEM":0.6772068511,
42
- "f1_ATTRIBUTE":0.3617454217,
43
- "ents_p":0.5982281487,
44
- "ents_r":0.6152545557,
45
- "f1_macro":0.5693062237,
46
- "f1_weighted":0.6031160932,
47
- "ents_f":0.6066219032,
 
48
  "ents_per_type":{
49
  "SYSTEM":{
50
- "p":0.6692708333,
51
- "r":0.6853333333,
52
- "f":0.6772068511
53
  },
54
  "ATTRIBUTE":{
55
- "p":0.3771805752,
56
- "r":0.3475238923,
57
- "f":0.3617454217
58
  },
59
  "COMPONENT":{
60
- "p":0.6496277916,
61
- "r":0.689491704,
62
- "f":0.6689663984
63
  }
64
  },
65
- "transformer_loss":9419.470294231,
66
- "ner_loss":14866.390625
67
  },
68
  "requirements":[
69
  "spacy-transformers>=1.3.8,<1.4.0",
70
- "spacy>=3.8.4,<3.9.0"
71
  ]
72
  }
 
1
  {
2
  "lang":"ru",
3
  "name":"patents_ner",
4
+ "version":"1.0.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
+ "spacy_version":">=3.8.5,<3.9.0",
11
+ "spacy_git_version":"d0c705c",
12
  "vectors":{
13
  "width":300,
14
  "vectors":500002,
 
37
 
38
  ],
39
  "performance":{
40
+ "f1_micro":0.612435439,
41
+ "f1_macro":0.5482288349,
42
+ "f1_weighted":0.6009293658,
43
+ "f1_COMPONENT":0.6719853518,
44
+ "f1_SYSTEM":0.6479250335,
45
+ "f1_ATTRIBUTE":0.3247761194,
46
+ "ents_p":0.6187035922,
47
+ "ents_r":0.6062930187,
48
+ "ents_f":0.612435439,
49
  "ents_per_type":{
50
  "SYSTEM":{
51
+ "p":0.6419098143,
52
+ "r":0.6540540541,
53
+ "f":0.6479250335
54
  },
55
  "ATTRIBUTE":{
56
+ "p":0.4121212121,
57
+ "r":0.2679802956,
58
+ "f":0.3247761194
59
  },
60
  "COMPONENT":{
61
+ "p":0.6510390269,
62
+ "r":0.6943243243,
63
+ "f":0.6719853518
64
  }
65
  },
66
+ "transformer_loss":1444.5232329108,
67
+ "ner_loss":2226.6513070879
68
  },
69
  "requirements":[
70
  "spacy-transformers>=1.3.8,<1.4.0",
71
+ "spacy>=3.8.5,<3.9.0"
72
  ]
73
  }
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef3ac16ec06d58f22ee3f23b71b37b706697b3c5099cfdbd27f42ec5fcc4155a
3
- size 285226
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e5cb4fb854f36288d0807225cf54aa7f980104e14b948225622b496282217d6
3
+ size 219690
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves��{"0":{},"1":{"COMPONENT":60434,"ATTRIBUTE":22497,"SYSTEM":8460},"2":{"COMPONENT":60434,"ATTRIBUTE":22497,"SYSTEM":8460},"3":{"COMPONENT":60434,"ATTRIBUTE":22497,"SYSTEM":8460},"4":{"COMPONENT":60434,"ATTRIBUTE":22497,"SYSTEM":8460,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"COMPONENT":63715,"ATTRIBUTE":22585,"SYSTEM":8551},"2":{"COMPONENT":63715,"ATTRIBUTE":22585,"SYSTEM":8551},"3":{"COMPONENT":63715,"ATTRIBUTE":22585,"SYSTEM":8551},"4":{"COMPONENT":63715,"ATTRIBUTE":22585,"SYSTEM":8551,"":1},"5":{"":1}}�cfg��neg_key�
ru_patents_ner-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f78862070ad285cfd7c0bce13a2457d8c70eeb8e5ece21d00215d83f2c22caba
3
- size 1822037615
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c30bb3920ae29fc3d514a4b955afddd4511e6bbe75d6c43adf6c505a9256c263
3
+ size 1161805382
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:993ccc8d74fab373bc75f00ef70e63c9acd93ed7006a5d49c3de375694aa808f
3
- size 1430062590
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59df9f2f2fd2593383e86d5e08a2f1583c6b2364caf5b20fb9734ead15fc8884
3
+ size 716719271
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48b62483d2ae9ba9e53ee3819ce1ad644df11b4c3a89256e9a1ccca552435c16
3
- size 49634993
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af09429a518e34158d1f63f2eeb6326a59efbd36602a82fb0bec19c7ebe60ef
3
+ size 49607747