noahjax committed on
Commit
723fe48
·
verified ·
1 Parent(s): fb5cdfe

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -43,3 +43,4 @@ parser/model filter=lfs diff=lfs merge=lfs -text
43
  textcat_classify/model filter=lfs diff=lfs merge=lfs -text
44
  tok2vec_small/model filter=lfs diff=lfs merge=lfs -text
45
  vocab/strings.json filter=lfs diff=lfs merge=lfs -text
 
 
43
  textcat_classify/model filter=lfs diff=lfs merge=lfs -text
44
  tok2vec_small/model filter=lfs diff=lfs merge=lfs -text
45
  vocab/strings.json filter=lfs diff=lfs merge=lfs -text
46
+ senter/model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -24,10 +24,10 @@ model-index:
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `en_tako_query_analyzer` |
27
- | **Version** | `0.0.2` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
- | **Default Pipeline** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `tok2vec`, `ner`, `textcat_classify` |
30
- | **Components** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `tok2vec`, `ner`, `textcat_classify` |
31
  | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
 
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `en_tako_query_analyzer` |
27
+ | **Version** | `0.0.3` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
+ | **Default Pipeline** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `senter`, `lemmatizer`, `tok2vec`, `ner`, `textcat_classify` |
30
+ | **Components** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `senter`, `lemmatizer`, `tok2vec`, `ner`, `textcat_classify` |
31
  | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
config.cfg CHANGED
@@ -15,7 +15,7 @@ seed = 0
15
 
16
  [nlp]
17
  lang = "en"
18
- pipeline = ["tok2vec_small","tagger","parser","attribute_ruler","lemmatizer","tok2vec","ner","textcat_classify"]
19
  batch_size = 1000
20
  disabled = []
21
  before_creation = null
@@ -81,6 +81,33 @@ nO = null
81
  width = 96
82
  upstream = "tok2vec"
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  [components.tagger]
85
  factory = "tagger"
86
  label_smoothing = 0.0
@@ -260,22 +287,22 @@ eps = 0.00000001
260
  learn_rate = 0.001
261
 
262
  [training.score_weights]
263
- tag_acc = 0.25
264
- dep_uas = 0.12
265
- dep_las = 0.12
266
  dep_las_per_type = null
267
- sents_p = null
268
- sents_r = null
269
- sents_f = 0.0
270
- lemma_acc = 0.25
271
- ents_f = 0.12
272
  ents_p = 0.0
273
  ents_r = 0.0
274
  ents_per_type = null
275
- cats_score = 0.06
276
  cats_score_desc = null
277
  cats_micro_p = null
278
- cats_micro_r = 0.06
279
  cats_micro_f = null
280
  cats_macro_p = null
281
  cats_macro_r = null
 
15
 
16
  [nlp]
17
  lang = "en"
18
+ pipeline = ["tok2vec_small","tagger","parser","attribute_ruler","senter","lemmatizer","tok2vec","ner","textcat_classify"]
19
  batch_size = 1000
20
  disabled = []
21
  before_creation = null
 
81
  width = 96
82
  upstream = "tok2vec"
83
 
84
+ [components.senter]
85
+ factory = "senter"
86
+ overwrite = false
87
+ scorer = {"@scorers":"spacy.senter_scorer.v1"}
88
+
89
+ [components.senter.model]
90
+ @architectures = "spacy.Tagger.v2"
91
+ nO = null
92
+ normalize = false
93
+
94
+ [components.senter.model.tok2vec]
95
+ @architectures = "spacy.Tok2Vec.v2"
96
+
97
+ [components.senter.model.tok2vec.embed]
98
+ @architectures = "spacy.MultiHashEmbed.v2"
99
+ width = 16
100
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
101
+ rows = [1000,500,500,500,50]
102
+ include_static_vectors = false
103
+
104
+ [components.senter.model.tok2vec.encode]
105
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
106
+ width = 16
107
+ depth = 2
108
+ window_size = 1
109
+ maxout_pieces = 2
110
+
111
  [components.tagger]
112
  factory = "tagger"
113
  label_smoothing = 0.0
 
287
  learn_rate = 0.001
288
 
289
  [training.score_weights]
290
+ tag_acc = 0.2
291
+ dep_uas = 0.1
292
+ dep_las = 0.1
293
  dep_las_per_type = null
294
+ sents_p = 0.0
295
+ sents_r = 0.0
296
+ sents_f = 0.2
297
+ lemma_acc = 0.2
298
+ ents_f = 0.1
299
  ents_p = 0.0
300
  ents_r = 0.0
301
  ents_per_type = null
302
+ cats_score = 0.05
303
  cats_score_desc = null
304
  cats_micro_p = null
305
+ cats_micro_r = 0.05
306
  cats_micro_f = null
307
  cats_macro_p = null
308
  cats_macro_r = null
custom_textcat.py CHANGED
@@ -111,7 +111,6 @@ class CustomTextcat(TextCategorizer):
111
  self.cfg = dict(cfg)
112
  self.scorer = scorer
113
  if weights is not None:
114
- print(f"Using weights: {weights}")
115
  self.weights = numpy.array(weights)
116
 
117
  def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
 
111
  self.cfg = dict(cfg)
112
  self.scorer = scorer
113
  if weights is not None:
 
114
  self.weights = numpy.array(weights)
115
 
116
  def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
en_tako_query_analyzer-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d276080166d535d192f52fef9138c4678a32445a14b0c27f9a43c974be5c3aca
3
- size 619963181
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e1d83f7eaa00d4dfb3414bed716f22e27a59e12006528fa8d240ef1ac36e02
3
+ size 620143055
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"en",
3
  "name":"tako_query_analyzer",
4
- "version":"0.0.2",
5
  "description":"",
6
  "author":"",
7
  "email":"",
@@ -158,6 +158,7 @@
158
  "tagger",
159
  "parser",
160
  "attribute_ruler",
 
161
  "lemmatizer",
162
  "tok2vec",
163
  "ner",
@@ -168,6 +169,7 @@
168
  "tagger",
169
  "parser",
170
  "attribute_ruler",
 
171
  "lemmatizer",
172
  "tok2vec",
173
  "ner",
 
1
  {
2
  "lang":"en",
3
  "name":"tako_query_analyzer",
4
+ "version":"0.0.3",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
158
  "tagger",
159
  "parser",
160
  "attribute_ruler",
161
+ "senter",
162
  "lemmatizer",
163
  "tok2vec",
164
  "ner",
 
169
  "tagger",
170
  "parser",
171
  "attribute_ruler",
172
+ "senter",
173
  "lemmatizer",
174
  "tok2vec",
175
  "ner",
senter/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "overwrite":false
3
+ }
senter/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e62c2504dcdc66144b30c048100af05f12207f933f6d669982b947ae71ffdeef
3
+ size 197089