Delicalib committed on
Commit
226825c
·
verified ·
1 Parent(s): 40c2018

Update spaCy pipeline

Browse files
Files changed (7) hide show
  1. README.md +15 -10
  2. config.cfg +12 -7
  3. custom_factory.py +87 -0
  4. meta.json +22 -16
  5. ner/model +1 -1
  6. ru_patents_ner-any-py3-none-any.whl +2 -2
  7. transformer/model +1 -1
README.md CHANGED
@@ -13,25 +13,25 @@ model-index:
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
- value: 0.61849542
17
  - name: NER Recall
18
  type: recall
19
- value: 0.5961863611
20
  - name: NER F Score
21
  type: f_score
22
- value: 0.6071360245
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `ru_patents_ner` |
27
- | **Version** | `1.0.0` |
28
  | **spaCy** | `>=3.8.4,<3.9.0` |
29
  | **Default Pipeline** | `transformer`, `ner` |
30
  | **Components** | `transformer`, `ner` |
31
  | **Vectors** | 500002 keys, 500002 unique vectors (300 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
34
- | **Author** | [Delicalib](https://huggingface.co/Delicalib) |
35
 
36
  ### Label Scheme
37
 
@@ -49,8 +49,13 @@ model-index:
49
 
50
  | Type | Score |
51
  | --- | --- |
52
- | `ENTS_F` | 60.71 |
53
- | `ENTS_P` | 61.85 |
54
- | `ENTS_R` | 59.62 |
55
- | `TRANSFORMER_LOSS` | 21357700.30 |
56
- | `NER_LOSS` | 883914.55 |
 
 
 
 
 
 
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
+ value: 0.5982281487
17
  - name: NER Recall
18
  type: recall
19
+ value: 0.6152545557
20
  - name: NER F Score
21
  type: f_score
22
+ value: 0.6066219032
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `ru_patents_ner` |
27
+ | **Version** | `1.0.1` |
28
  | **spaCy** | `>=3.8.4,<3.9.0` |
29
  | **Default Pipeline** | `transformer`, `ner` |
30
  | **Components** | `transformer`, `ner` |
31
  | **Vectors** | 500002 keys, 500002 unique vectors (300 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
34
+ | **Author** | [n/a]() |
35
 
36
  ### Label Scheme
37
 
 
49
 
50
  | Type | Score |
51
  | --- | --- |
52
+ | `F1_MICRO` | 59.95 |
53
+ | `F1_MACRO` | 55.92 |
54
+ | `F1_WEIGHTED` | 59.51 |
55
+ | `F1_COMPONENT` | 66.29 |
56
+ | `F1_SYSTEM` | 66.62 |
57
+ | `F1_ATTRIBUTE` | 34.84 |
58
+ | `ENTS_P` | 60.95 |
59
+ | `ENTS_R` | 58.99 |
60
+ | `TRANSFORMER_LOSS` | 907006.78 |
61
+ | `NER_LOSS` | 1524129.39 |
config.cfg CHANGED
@@ -22,7 +22,7 @@ vectors = {"@vectors":"spacy.Vectors.v1"}
22
  [components]
23
 
24
  [components.ner]
25
- factory = "ner"
26
  incorrect_spans_key = null
27
  moves = null
28
  scorer = {"@scorers":"spacy.ner_scorer.v1"}
@@ -84,12 +84,12 @@ limit = 0
84
  augmenter = null
85
 
86
  [training]
87
- accumulate_gradient = 3
88
  dev_corpus = "corpora.dev"
89
  train_corpus = "corpora.train"
90
  seed = ${system.seed}
91
  gpu_allocator = ${system.gpu_allocator}
92
- dropout = 0.1
93
  patience = 1600
94
  max_epochs = 0
95
  max_steps = 20000
@@ -102,7 +102,7 @@ before_update = null
102
  [training.batcher]
103
  @batchers = "spacy.batch_by_padded.v1"
104
  discard_oversize = true
105
- size = 2000
106
  buffer = 256
107
  get_length = null
108
 
@@ -122,14 +122,19 @@ eps = 0.00000001
122
 
123
  [training.optimizer.learn_rate]
124
  @schedules = "warmup_linear.v1"
125
- warmup_steps = 250
126
  total_steps = 20000
127
- initial_rate = 0.00005
128
 
129
  [training.score_weights]
130
- ents_f = 1.0
 
 
131
  ents_p = 0.0
132
  ents_r = 0.0
 
 
 
133
  ents_per_type = null
134
 
135
  [pretraining]
 
22
  [components]
23
 
24
  [components.ner]
25
+ factory = "ner_all_metrics"
26
  incorrect_spans_key = null
27
  moves = null
28
  scorer = {"@scorers":"spacy.ner_scorer.v1"}
 
84
  augmenter = null
85
 
86
  [training]
87
+ accumulate_gradient = 1
88
  dev_corpus = "corpora.dev"
89
  train_corpus = "corpora.train"
90
  seed = ${system.seed}
91
  gpu_allocator = ${system.gpu_allocator}
92
+ dropout = 0.2
93
  patience = 1600
94
  max_epochs = 0
95
  max_steps = 20000
 
102
  [training.batcher]
103
  @batchers = "spacy.batch_by_padded.v1"
104
  discard_oversize = true
105
+ size = 1000
106
  buffer = 256
107
  get_length = null
108
 
 
122
 
123
  [training.optimizer.learn_rate]
124
  @schedules = "warmup_linear.v1"
125
+ warmup_steps = 300
126
  total_steps = 20000
127
+ initial_rate = 0.000005
128
 
129
  [training.score_weights]
130
+ f1_COMPONENT = 0.17
131
+ f1_SYSTEM = 0.17
132
+ f1_ATTRIBUTE = 0.17
133
  ents_p = 0.0
134
  ents_r = 0.0
135
+ f1_macro = 0.17
136
+ f1_weighted = 0.17
137
+ ents_f = 0.17
138
  ents_per_type = null
139
 
140
  [pretraining]
custom_factory.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from spacy.pipeline.ner import EntityRecognizer
2
+ from spacy.language import DEFAULT_CONFIG, Language
3
+ from thinc.api import Config
4
+ from sklearn.metrics import f1_score, precision_recall_fscore_support
5
+
6
+
7
# Default model settings for the "ner_all_metrics" factory, written in
# spaCy/thinc config syntax: a transition-based parser in NER mode whose
# tok2vec layer is a listener on the upstream transformer component.
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
"""
# Parse the text once at import time and keep only the [model] section, which
# is what the factory's default_config expects under its "model" key.
DEFAULT_MODEL = Config().from_str(default_model_config)["model"]
24
+
25
@Language.factory(
    "ner_all_metrics",
    default_config={
        "model": DEFAULT_MODEL,
        "moves": None,
        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
        "incorrect_spans_key": None,
        "update_with_oracle_cut_size": 100,
    },
    default_score_weights={
        "f1_COMPONENT": 1.0,
        "f1_SYSTEM": 1.0,
        "f1_ATTRIBUTE": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "f1_macro": 1.0,
        "f1_weighted": 1.0,
        "ents_f": 1.0,
    },
)
def create_ner_all_metrics(
    nlp,
    name,
    model,
    moves,
    scorer,
    incorrect_spans_key,
    update_with_oracle_cut_size,
):
    """Build the ``ner_all_metrics`` pipeline component.

    Registered as a spaCy factory so that ``factory = "ner_all_metrics"`` in a
    config resolves to a ``NERWithAllMetrics`` instance. Every argument is
    forwarded unchanged to the component constructor; the vocabulary is taken
    from ``nlp``.
    """
    component_options = {
        "name": name,
        "moves": moves,
        "scorer": scorer,
        "incorrect_spans_key": incorrect_spans_key,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
    }
    return NERWithAllMetrics(nlp.vocab, model, **component_options)
47
+
48
class NERWithAllMetrics(EntityRecognizer):
    """Entity recognizer that also reports span-level F1 metrics.

    On top of the scores produced by the parent ``score`` method, this
    component adds one ``f1_<LABEL>`` entry per entity label plus ``f1_macro``
    and ``f1_weighted``. The micro-averaged F1 computed here replaces the
    parent's ``ents_f`` value.
    """

    def score(self, examples, **kwargs):
        """Return the parent scores merged with the custom F1 metrics.

        Args:
            examples: Iterable of spaCy ``Example`` objects to evaluate.
            **kwargs: Forwarded unchanged to the parent ``score`` method.

        Returns:
            dict: Parent scores updated with the output of ``custom_scorer``,
            where ``f1_micro`` is stored under ``ents_f`` and removed.
        """
        # The examples are traversed twice (parent scorer, then custom_scorer);
        # materialise them so a generator input is not exhausted after the
        # first pass.
        examples = list(examples)
        scores = dict(super().score(examples, **kwargs))
        scores.update(self.custom_scorer(examples))
        # Publish the micro average under the "ents_f" key and drop the
        # temporary "f1_micro" entry.
        scores["ents_f"] = scores.pop("f1_micro")
        return scores

    def custom_scorer(self, examples):
        """Compute per-label, micro, macro and weighted F1 over entity spans.

        A span is identified by ``(start_char, end_char, label)``. Each span in
        the union of gold and predicted spans contributes one (true, predicted)
        pair; the side that lacks the span gets the placeholder label ``"O"``.

        Args:
            examples: Iterable of spaCy ``Example`` objects.

        Returns:
            dict: ``f1_<LABEL>`` for every label seen in gold or predictions,
            plus ``f1_micro``, ``f1_macro`` and ``f1_weighted``. All values are
            plain Python floats.
        """
        y_true = []
        y_pred = []
        for example in examples:
            gold = {
                (ent.start_char, ent.end_char, ent.label_)
                for ent in example.reference.ents
            }
            pred = {
                (ent.start_char, ent.end_char, ent.label_)
                for ent in example.predicted.ents
            }
            for span in gold | pred:
                label = span[2]
                # Every span in the union is in gold, in pred, or in both, so
                # at most one side of the pair becomes "O".
                y_true.append(label if span in gold else "O")
                y_pred.append(label if span in pred else "O")

        # Score every real label seen on either side. Using gold labels only
        # would silently drop false positives for labels absent from gold.
        labels = sorted((set(y_true) | set(y_pred)) - {"O"})

        if not labels:
            # No entities in gold or predictions: there is nothing to score,
            # so report zeros instead of calling scikit-learn on empty input.
            return {"f1_micro": 0.0, "f1_macro": 0.0, "f1_weighted": 0.0}

        _, _, per_label_f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, zero_division=0, average=None
        )
        # float() turns NumPy scalars into plain Python floats.
        result = {
            f"f1_{label}": float(value)
            for label, value in zip(labels, per_label_f1)
        }
        for average in ("micro", "macro", "weighted"):
            result[f"f1_{average}"] = float(
                f1_score(
                    y_true,
                    y_pred,
                    average=average,
                    labels=labels,
                    zero_division=0,
                )
            )

        return result
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"ru",
3
  "name":"patents_ner",
4
- "version":"1.0.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
@@ -37,30 +37,36 @@
37
 
38
  ],
39
  "performance":{
40
- "ents_f":0.6071360245,
41
- "ents_p":0.61849542,
42
- "ents_r":0.5961863611,
 
 
 
 
 
43
  "ents_per_type":{
44
  "SYSTEM":{
45
- "p":0.7098765432,
46
- "r":0.6133333333,
47
- "f":0.6580829757
48
  },
49
  "ATTRIBUTE":{
50
- "p":0.4098671727,
51
- "r":0.2814943527,
52
- "f":0.3337625547
53
  },
54
  "COMPONENT":{
55
- "p":0.652184738,
56
- "r":0.6898867527,
57
- "f":0.6705061752
58
  }
59
  },
60
- "transformer_loss":213577.0030247931,
61
- "ner_loss":8839.1455078125
62
  },
63
  "requirements":[
64
- "spacy-transformers>=1.3.8,<1.4.0"
 
65
  ]
66
  }
 
1
  {
2
  "lang":"ru",
3
  "name":"patents_ner",
4
+ "version":"1.0.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
37
 
38
  ],
39
  "performance":{
40
+ "f1_COMPONENT":0.6689663984,
41
+ "f1_SYSTEM":0.6772068511,
42
+ "f1_ATTRIBUTE":0.3617454217,
43
+ "ents_p":0.5982281487,
44
+ "ents_r":0.6152545557,
45
+ "f1_macro":0.5693062237,
46
+ "f1_weighted":0.6031160932,
47
+ "ents_f":0.6066219032,
48
  "ents_per_type":{
49
  "SYSTEM":{
50
+ "p":0.6692708333,
51
+ "r":0.6853333333,
52
+ "f":0.6772068511
53
  },
54
  "ATTRIBUTE":{
55
+ "p":0.3771805752,
56
+ "r":0.3475238923,
57
+ "f":0.3617454217
58
  },
59
  "COMPONENT":{
60
+ "p":0.6496277916,
61
+ "r":0.689491704,
62
+ "f":0.6689663984
63
  }
64
  },
65
+ "transformer_loss":9419.470294231,
66
+ "ner_loss":14866.390625
67
  },
68
  "requirements":[
69
+ "spacy-transformers>=1.3.8,<1.4.0",
70
+ "spacy>=3.8.4,<3.9.0"
71
  ]
72
  }
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5da257b2d82f31304f6c22db4498523c379293b02170d815f86e06716f641632
3
  size 285226
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef3ac16ec06d58f22ee3f23b71b37b706697b3c5099cfdbd27f42ec5fcc4155a
3
  size 285226
ru_patents_ner-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c04a8a1e0082c81fb457ac785059f6c2ebf63305224e3be65996a85ad6b5cf2
3
- size 1822022001
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f78862070ad285cfd7c0bce13a2457d8c70eeb8e5ece21d00215d83f2c22caba
3
+ size 1822037615
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4949f5d3d8e08a828297066f3c69d91a090b7584e37c93a67cc83649e9c1ed1
3
  size 1430062590
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:993ccc8d74fab373bc75f00ef70e63c9acd93ed7006a5d49c3de375694aa808f
3
  size 1430062590