noahjax committed on
Commit
16849dc
·
verified ·
1 Parent(s): 41a8235

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -38,3 +38,8 @@ textcat/model filter=lfs diff=lfs merge=lfs -text
38
  tok2vec/model filter=lfs diff=lfs merge=lfs -text
39
  vocab/key2row filter=lfs diff=lfs merge=lfs -text
40
  vocab/vectors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
38
  tok2vec/model filter=lfs diff=lfs merge=lfs -text
39
  vocab/key2row filter=lfs diff=lfs merge=lfs -text
40
  vocab/vectors filter=lfs diff=lfs merge=lfs -text
41
+ ner/model filter=lfs diff=lfs merge=lfs -text
42
+ parser/model filter=lfs diff=lfs merge=lfs -text
43
+ textcat_classify/model filter=lfs diff=lfs merge=lfs -text
44
+ tok2vec_small/model filter=lfs diff=lfs merge=lfs -text
45
+ vocab/strings.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -2,7 +2,6 @@
2
  tags:
3
  - spacy
4
  - token-classification
5
- - text-classification
6
  language:
7
  - en
8
  model-index:
@@ -14,21 +13,21 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.7170177384
18
  - name: NER Recall
19
  type: recall
20
- value: 0.7172165234
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.7171171171
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_tako_query_analyzer` |
28
- | **Version** | `0.0.1` |
29
  | **spaCy** | `>=3.7.5,<3.8.0` |
30
- | **Default Pipeline** | `tok2vec`, `ner`, `textcat` |
31
- | **Components** | `tok2vec`, `ner`, `textcat` |
32
  | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
33
  | **Sources** | n/a |
34
  | **License** | n/a |
@@ -38,12 +37,14 @@ model-index:
38
 
39
  <details>
40
 
41
- <summary>View label scheme (33 labels for 2 components)</summary>
42
 
43
  | Component | Labels |
44
  | --- | --- |
45
- | **`ner`** | `CARDINAL`, `CUSTOM_ATTRIBUTE`, `CUSTOM_SEMANTIC_FUNCTION`, `CUSTOM_SPORTS_CONFERENCE`, `CUSTOM_SPORTS_LEAGUE`, `CUSTOM_SPORTS_ROLE`, `CUSTOM_STOCK_TICKER`, `CUSTOM_TEAM`, `DATE`, `EVENT`, `FAC`, `GPE`, `LANGUAGE`, `LAW`, `LOC`, `MONEY`, `NORP`, `ORDINAL`, `ORG`, `PERCENT`, `PERSON`, `PRODUCT`, `QUANTITY`, `TIME`, `WORK_OF_ART` |
46
- | **`textcat`** | `Business and Finance`, `Arts, Culture, and Entertainment`, `Crime`, `Sports`, `Politics`, `Science and Technology`, `Health and Wellness`, `Lifestyle and Fashion` |
 
 
47
 
48
  </details>
49
 
@@ -51,17 +52,16 @@ model-index:
51
 
52
  | Type | Score |
53
  | --- | --- |
54
- | `ENTS_F` | 71.71 |
55
- | `ENTS_P` | 71.70 |
56
- | `ENTS_R` | 71.72 |
57
- | `CATS_SCORE` | 70.53 |
58
- | `CATS_MICRO_P` | 85.89 |
59
- | `CATS_MICRO_R` | 85.89 |
60
- | `CATS_MICRO_F` | 85.89 |
61
- | `CATS_MACRO_P` | 74.89 |
62
- | `CATS_MACRO_R` | 67.56 |
63
- | `CATS_MACRO_F` | 70.53 |
64
- | `CATS_MACRO_AUC` | 93.04 |
65
- | `TOK2VEC_LOSS` | 61786.52 |
66
- | `NER_LOSS` | 46852.50 |
67
- | `TEXTCAT_LOSS` | 1.09 |
 
2
  tags:
3
  - spacy
4
  - token-classification
 
5
  language:
6
  - en
7
  model-index:
 
13
  metrics:
14
  - name: NER Precision
15
  type: precision
16
+ value: 0.0
17
  - name: NER Recall
18
  type: recall
19
+ value: 0.0
20
  - name: NER F Score
21
  type: f_score
22
+ value: 0.0
23
  ---
24
  | Feature | Description |
25
  | --- | --- |
26
  | **Name** | `en_tako_query_analyzer` |
27
+ | **Version** | `0.0.2` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
+ | **Default Pipeline** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `tok2vec`, `ner`, `textcat_classify` |
30
+ | **Components** | `tok2vec_small`, `tagger`, `parser`, `attribute_ruler`, `tok2vec`, `ner`, `textcat_classify` |
31
  | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
 
37
 
38
  <details>
39
 
40
+ <summary>View label scheme (116 labels for 4 components)</summary>
41
 
42
  | Component | Labels |
43
  | --- | --- |
44
+ | **`tagger`** | `$`, `''`, `,`, `-LRB-`, `-RRB-`, `.`, `:`, `ADD`, `AFX`, `CC`, `CD`, `DT`, `EX`, `FW`, `HYPH`, `IN`, `JJ`, `JJR`, `JJS`, `LS`, `MD`, `NFP`, `NN`, `NNP`, `NNPS`, `NNS`, `PDT`, `POS`, `PRP`, `PRP$`, `RB`, `RBR`, `RBS`, `RP`, `SYM`, `TO`, `UH`, `VB`, `VBD`, `VBG`, `VBN`, `VBP`, `VBZ`, `WDT`, `WP`, `WP$`, `WRB`, `XX`, `_SP`, ``` `` ``` |
45
+ | **`parser`** | `ROOT`, `acl`, `acomp`, `advcl`, `advmod`, `agent`, `amod`, `appos`, `attr`, `aux`, `auxpass`, `case`, `cc`, `ccomp`, `compound`, `conj`, `csubj`, `csubjpass`, `dative`, `dep`, `det`, `dobj`, `expl`, `intj`, `mark`, `meta`, `neg`, `nmod`, `npadvmod`, `nsubj`, `nsubjpass`, `nummod`, `oprd`, `parataxis`, `pcomp`, `pobj`, `poss`, `preconj`, `predet`, `prep`, `prt`, `punct`, `quantmod`, `relcl`, `xcomp` |
46
+ | **`ner`** | `CARDINAL`, `DATE`, `EVENT`, `FAC`, `GPE`, `LANGUAGE`, `LAW`, `LOC`, `MONEY`, `NORP`, `ORDINAL`, `ORG`, `PERCENT`, `PERSON`, `PRODUCT`, `QUANTITY`, `STOCK_TICKER`, `TIME`, `WORK_OF_ART` |
47
+ | **`textcat_classify`** | `ACCEPT`, `REJECT` |
48
 
49
  </details>
50
 
 
52
 
53
  | Type | Score |
54
  | --- | --- |
55
+ | `ENTS_F` | 0.00 |
56
+ | `ENTS_P` | 0.00 |
57
+ | `ENTS_R` | 0.00 |
58
+ | `ENTS_PER_TYPE` | 0.00 |
59
+ | `CATS_SCORE` | 85.07 |
60
+ | `CATS_MICRO_P` | 85.31 |
61
+ | `CATS_MICRO_R` | 85.31 |
62
+ | `CATS_MICRO_F` | 85.31 |
63
+ | `CATS_MACRO_P` | 85.35 |
64
+ | `CATS_MACRO_R` | 85.31 |
65
+ | `CATS_MACRO_F` | 85.31 |
66
+ | `CATS_MACRO_AUC` | 91.67 |
67
+ | `TEXTCAT_CLASSIFY_LOSS` | 94.04 |
 
attribute_ruler/patterns ADDED
Binary file (14.7 kB). View file
 
config.cfg CHANGED
@@ -1,12 +1,13 @@
1
  [paths]
2
- train = "corpus/ner-train.spacy"
3
- dev = "corpus/ner-test.spacy"
4
  vectors = "en_core_web_lg"
5
  init_tok2vec = null
6
 
7
  [variables]
8
- wandb_project_name = "tako-entity-extractor"
9
  wandb_team_name = "tako-team"
 
10
 
11
  [system]
12
  gpu_allocator = "pytorch"
@@ -14,7 +15,7 @@ seed = 0
14
 
15
  [nlp]
16
  lang = "en"
17
- pipeline = ["tok2vec","ner","textcat"]
18
  batch_size = 1000
19
  disabled = []
20
  before_creation = null
@@ -43,19 +44,20 @@ nO = null
43
 
44
  [components.ner.model.tok2vec]
45
  @architectures = "spacy.Tok2VecListener.v1"
46
- width = ${components.tok2vec.model.encode.width}
47
  upstream = "*"
48
 
49
- [components.textcat]
50
- factory = "textcat"
 
51
  scorer = {"@scorers":"spacy.textcat_scorer.v2"}
52
  threshold = 0.0
53
 
54
- [components.textcat.model]
55
  @architectures = "spacy.TextCatEnsemble.v2"
56
  nO = null
57
 
58
- [components.textcat.model.linear_model]
59
  @architectures = "spacy.TextCatBOW.v3"
60
  exclusive_classes = false
61
  length = 262144
@@ -63,10 +65,22 @@ ngram_size = 1
63
  no_output_layer = false
64
  nO = null
65
 
66
- [components.textcat.model.tok2vec]
67
- @architectures = "spacy.Tok2VecListener.v1"
68
- width = ${components.tok2vec.model.encode.width}
69
- upstream = "*"
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  [components.tok2vec]
72
  factory = "tok2vec"
@@ -76,7 +90,7 @@ factory = "tok2vec"
76
 
77
  [components.tok2vec.model.embed]
78
  @architectures = "spacy.MultiHashEmbed.v2"
79
- width = ${components.tok2vec.model.encode.width}
80
  attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
81
  rows = [5000,1000,2500,2500]
82
  include_static_vectors = true
@@ -119,8 +133,8 @@ accumulate_gradient = 1
119
  patience = 1000
120
  max_epochs = 0
121
  max_steps = 20000
122
- eval_frequency = 25
123
- frozen_components = []
124
  annotating_components = ["ner"]
125
  before_to_disk = null
126
  before_update = null
@@ -132,7 +146,7 @@ get_length = null
132
  [training.batcher.size]
133
  @schedules = "compounding.v1"
134
  start = 100
135
- stop = 1000
136
  compound = 1.001
137
  t = 0.0
138
 
@@ -175,14 +189,14 @@ eps = 0.00000001
175
  learn_rate = 0.001
176
 
177
  [training.score_weights]
178
- ents_f = 0.2
179
  ents_p = 0.0
180
- ents_r = 0.3
181
  ents_per_type = null
182
- cats_score = 0.5
183
  cats_score_desc = null
184
  cats_micro_p = null
185
- cats_micro_r = null
186
  cats_micro_f = null
187
  cats_macro_p = null
188
  cats_macro_r = null
@@ -202,4 +216,12 @@ after_init = null
202
 
203
  [initialize.components]
204
 
 
 
 
 
 
 
 
 
205
  [initialize.tokenizer]
 
1
  [paths]
2
+ train = "corpus/filter-train.spacy"
3
+ dev = "corpus/filter-test.spacy"
4
  vectors = "en_core_web_lg"
5
  init_tok2vec = null
6
 
7
  [variables]
8
+ wandb_project_name = "tako-query-filter"
9
  wandb_team_name = "tako-team"
10
+ base_model = "ner/dashing-wind"
11
 
12
  [system]
13
  gpu_allocator = "pytorch"
 
15
 
16
  [nlp]
17
  lang = "en"
18
+ pipeline = ["tok2vec","ner","textcat_classify"]
19
  batch_size = 1000
20
  disabled = []
21
  before_creation = null
 
44
 
45
  [components.ner.model.tok2vec]
46
  @architectures = "spacy.Tok2VecListener.v1"
47
+ width = 256
48
  upstream = "*"
49
 
50
+ [components.textcat_classify]
51
+ factory = "weighted_textcat"
52
+ class_weights = [0.67,0.33]
53
  scorer = {"@scorers":"spacy.textcat_scorer.v2"}
54
  threshold = 0.0
55
 
56
+ [components.textcat_classify.model]
57
  @architectures = "spacy.TextCatEnsemble.v2"
58
  nO = null
59
 
60
+ [components.textcat_classify.model.linear_model]
61
  @architectures = "spacy.TextCatBOW.v3"
62
  exclusive_classes = false
63
  length = 262144
 
65
  no_output_layer = false
66
  nO = null
67
 
68
+ [components.textcat_classify.model.tok2vec]
69
+ @architectures = "spacy.Tok2Vec.v2"
70
+
71
+ [components.textcat_classify.model.tok2vec.embed]
72
+ @architectures = "spacy.MultiHashEmbed.v2"
73
+ width = 128
74
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE","ENT_TYPE"]
75
+ rows = [2000,500,1000,500,500]
76
+ include_static_vectors = true
77
+
78
+ [components.textcat_classify.model.tok2vec.encode]
79
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
80
+ width = 128
81
+ window_size = 1
82
+ maxout_pieces = 3
83
+ depth = 4
84
 
85
  [components.tok2vec]
86
  factory = "tok2vec"
 
90
 
91
  [components.tok2vec.model.embed]
92
  @architectures = "spacy.MultiHashEmbed.v2"
93
+ width = 256
94
  attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
95
  rows = [5000,1000,2500,2500]
96
  include_static_vectors = true
 
133
  patience = 1000
134
  max_epochs = 0
135
  max_steps = 20000
136
+ eval_frequency = 100
137
+ frozen_components = ["tagger","attribute_ruler","parser","tok2vec","ner"]
138
  annotating_components = ["ner"]
139
  before_to_disk = null
140
  before_update = null
 
146
  [training.batcher.size]
147
  @schedules = "compounding.v1"
148
  start = 100
149
+ stop = 2000
150
  compound = 1.001
151
  t = 0.0
152
 
 
189
  learn_rate = 0.001
190
 
191
  [training.score_weights]
192
+ ents_f = 0.5
193
  ents_p = 0.0
194
+ ents_r = 0.0
195
  ents_per_type = null
196
+ cats_score = 0.25
197
  cats_score_desc = null
198
  cats_micro_p = null
199
+ cats_micro_r = 0.25
200
  cats_micro_f = null
201
  cats_macro_p = null
202
  cats_macro_r = null
 
216
 
217
  [initialize.components]
218
 
219
+ [initialize.components.textcat_classify]
220
+ positive_label = "ACCEPT"
221
+
222
+ [initialize.components.textcat_classify.labels]
223
+ @readers = "spacy.read_labels.v1"
224
+ path = "corpus/labels/filter-labels/textcat_classify.json"
225
+ require = false
226
+
227
  [initialize.tokenizer]
custom_textcat.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
2
+ from spacy.util import registry
3
+ from thinc.types import Floats2d
4
+ from spacy.tokens import Doc
5
+ from spacy.pipeline import TextCategorizer
6
+ from spacy.training import Example, validate_examples
7
+ from spacy.pipeline.textcat import textcat_score
8
+ from spacy.vocab import Vocab
9
+ from spacy.scorer import Scorer
10
+ from spacy.language import Language
11
+ from thinc.api import Model
12
+ import numpy
13
+
14
+
15
+ @Language.factory(
16
+ "weighted_textcat",
17
+ assigns=["doc.cats"],
18
+ default_config={
19
+ "threshold": 0.0,
20
+ "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
21
+ },
22
+ default_score_weights={
23
+ "cats_score": 1.0,
24
+ "cats_score_desc": None,
25
+ "cats_micro_p": None,
26
+ "cats_micro_r": None,
27
+ "cats_micro_f": None,
28
+ "cats_macro_p": None,
29
+ "cats_macro_r": None,
30
+ "cats_macro_f": None,
31
+ "cats_macro_auc": None,
32
+ "cats_f_per_type": None,
33
+ },
34
+ )
35
+ def make_textcat(
36
+ nlp: Language,
37
+ name: str,
38
+ model: Model[List[Doc], List[Floats2d]],
39
+ threshold: float,
40
+ scorer: Optional[Callable],
41
+ class_weights: Optional[List],
42
+ ) -> "TextCategorizer":
43
+ """Create a TextCategorizer component. The text categorizer predicts categories
44
+ over a whole document. It can learn one or more labels, and the labels are considered
45
+ to be mutually exclusive (i.e. one true label per doc).
46
+
47
+ model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
48
+ scores for each category.
49
+ threshold (float): Cutoff to consider a prediction "positive".
50
+ scorer (Optional[Callable]): The scoring method.
51
+ """
52
+ if class_weights == "null":
53
+ class_weights = None
54
+ return CustomTextcat(
55
+ nlp.vocab,
56
+ model,
57
+ name,
58
+ threshold=threshold,
59
+ scorer=scorer,
60
+ weights=class_weights,
61
+ )
62
+
63
+
64
+ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
65
+ return Scorer.score_cats(
66
+ examples,
67
+ "cats",
68
+ multi_label=False,
69
+ **kwargs,
70
+ )
71
+
72
+
73
+ @registry.scorers("spacy.textcat_scorer.v2")
74
+ def make_textcat_scorer():
75
+ return textcat_score
76
+
77
+
78
+ class CustomTextcat(TextCategorizer):
79
+ def __init__(
80
+ self,
81
+ vocab: Vocab,
82
+ model: Model,
83
+ name: str = "textcat",
84
+ *,
85
+ threshold: float,
86
+ scorer: Optional[Callable] = textcat_score,
87
+ weights: Optional[List[float]] = None,
88
+ ) -> None:
89
+ """Initialize a text categorizer for single-label classification.
90
+
91
+ vocab (Vocab): The shared vocabulary.
92
+ model (thinc.api.Model): The Thinc Model powering the pipeline component.
93
+ name (str): The component instance name, used to add entries to the
94
+ losses during training.
95
+ threshold (float): Unused, not needed for single-label (exclusive
96
+ classes) classification.
97
+ scorer (Optional[Callable]): The scoring method. Defaults to
98
+ Scorer.score_cats for the attribute "cats".
99
+
100
+ DOCS: https://spacy.io/api/textcategorizer#init
101
+ """
102
+ self.vocab = vocab
103
+ self.model = model
104
+ self.name = name
105
+ self._rehearsal_model = None
106
+ cfg: Dict[str, Any] = {
107
+ "labels": [],
108
+ "threshold": threshold,
109
+ "positive_label": None,
110
+ }
111
+ self.cfg = dict(cfg)
112
+ self.scorer = scorer
113
+ if weights is not None:
114
+ print(f"Using weights: {weights}")
115
+ self.weights = numpy.array(weights)
116
+
117
+ def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
118
+ """Find the loss and gradient of loss for the batch of documents and
119
+ their predicted scores.
120
+
121
+ examples (Iterable[Examples]): The batch of examples.
122
+ scores: Scores representing the model's predictions.
123
+ RETURNS (Tuple[float, float]): The loss and the gradient.
124
+
125
+ DOCS: https://spacy.io/api/textcategorizer#get_loss
126
+ """
127
+ validate_examples(examples, "TextCategorizer.get_loss")
128
+ self._validate_categories(examples)
129
+ truths, not_missing = self._examples_to_truth(examples)
130
+ not_missing = self.model.ops.asarray(not_missing) # type: ignore
131
+ d_scores = scores - truths
132
+ d_scores *= not_missing
133
+ weights = self.model.ops.asarray(self.weights) # type: ignore
134
+ if weights is not None:
135
+ squared = d_scores**2
136
+ mean_square_error = numpy.sum(squared * weights) / (
137
+ numpy.sum(weights) * len(squared)
138
+ )
139
+ d_scores *= weights
140
+ else:
141
+ mean_square_error = (d_scores**2).mean()
142
+ return float(mean_square_error), d_scores
en_tako_query_analyzer-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9176c11237026aad73a7bfa404223c1173e986b0cbd3679485c931bbb1399ddd
3
- size 608751851
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758f2f483a1f44bf0ff426f5c5e2abf5867e859261672a51f8759e97ca667a31
3
+ size 619535137
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"en",
3
  "name":"tako_query_analyzer",
4
- "version":"0.0.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
@@ -16,18 +16,116 @@
16
  "name":"en_vectors"
17
  },
18
  "labels":{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "tok2vec":[
20
 
21
  ],
22
  "ner":[
23
  "CARDINAL",
24
- "CUSTOM_ATTRIBUTE",
25
- "CUSTOM_SEMANTIC_FUNCTION",
26
- "CUSTOM_SPORTS_CONFERENCE",
27
- "CUSTOM_SPORTS_LEAGUE",
28
- "CUSTOM_SPORTS_ROLE",
29
- "CUSTOM_STOCK_TICKER",
30
- "CUSTOM_TEAM",
31
  "DATE",
32
  "EVENT",
33
  "FAC",
@@ -43,208 +141,63 @@
43
  "PERSON",
44
  "PRODUCT",
45
  "QUANTITY",
 
46
  "TIME",
47
  "WORK_OF_ART"
48
  ],
49
- "textcat":[
50
- "Business and Finance",
51
- "Arts, Culture, and Entertainment",
52
- "Crime",
53
- "Sports",
54
- "Politics",
55
- "Science and Technology",
56
- "Health and Wellness",
57
- "Lifestyle and Fashion"
58
  ]
59
  },
60
  "pipeline":[
 
 
 
 
61
  "tok2vec",
62
  "ner",
63
- "textcat"
64
  ],
65
  "components":[
 
 
 
 
66
  "tok2vec",
67
  "ner",
68
- "textcat"
69
  ],
70
  "disabled":[
71
 
72
  ],
73
  "performance":{
74
- "ents_f":0.7171171171,
75
- "ents_p":0.7170177384,
76
- "ents_r":0.7172165234,
77
- "ents_per_type":{
78
- "CUSTOM_STOCK_TICKER":{
79
- "p":0.5652173913,
80
- "r":0.6842105263,
81
- "f":0.619047619
82
- },
83
- "EVENT":{
84
- "p":0.5733333333,
85
- "r":0.4257425743,
86
- "f":0.4886363636
87
- },
88
- "CUSTOM_TEAM":{
89
- "p":0.9032258065,
90
- "r":0.8358208955,
91
- "f":0.8682170543
92
- },
93
- "CUSTOM_ATTRIBUTE":{
94
- "p":0.6078998073,
95
- "r":0.5998098859,
96
- "f":0.6038277512
97
- },
98
- "GPE":{
99
- "p":0.8605898123,
100
- "r":0.9119318182,
101
- "f":0.8855172414
102
- },
103
- "ORG":{
104
- "p":0.649321267,
105
- "r":0.7377892031,
106
- "f":0.6907340554
107
- },
108
- "DATE":{
109
- "p":0.8639618138,
110
- "r":0.8894348894,
111
- "f":0.8765133172
112
- },
113
- "CUSTOM_SEMANTIC_FUNCTION":{
114
- "p":0.7647058824,
115
- "r":0.6713615023,
116
- "f":0.715
117
- },
118
- "LOC":{
119
- "p":0.6785714286,
120
- "r":0.4418604651,
121
- "f":0.5352112676
122
- },
123
- "MONEY":{
124
- "p":0.7894736842,
125
- "r":0.8181818182,
126
- "f":0.8035714286
127
- },
128
- "PRODUCT":{
129
- "p":0.5107526882,
130
- "r":0.5053191489,
131
- "f":0.5080213904
132
- },
133
- "LANGUAGE":{
134
- "p":0.0,
135
- "r":0.0,
136
- "f":0.0
137
- },
138
- "PERSON":{
139
- "p":0.8941176471,
140
- "r":0.8260869565,
141
- "f":0.8587570621
142
- },
143
- "QUANTITY":{
144
- "p":0.6111111111,
145
- "r":0.6111111111,
146
- "f":0.6111111111
147
- },
148
- "NORP":{
149
- "p":0.5588235294,
150
- "r":0.5,
151
- "f":0.5277777778
152
- },
153
- "LAW":{
154
- "p":0.8571428571,
155
- "r":0.3529411765,
156
- "f":0.5
157
- },
158
- "CUSTOM_SPORTS_LEAGUE":{
159
- "p":0.9642857143,
160
- "r":0.9473684211,
161
- "f":0.9557522124
162
- },
163
- "CUSTOM_SPORTS_ROLE":{
164
- "p":0.9444444444,
165
- "r":0.7391304348,
166
- "f":0.8292682927
167
- },
168
- "CARDINAL":{
169
- "p":0.5,
170
- "r":0.1111111111,
171
- "f":0.1818181818
172
- },
173
- "PERCENT":{
174
- "p":1.0,
175
- "r":0.1428571429,
176
- "f":0.25
177
- },
178
- "FAC":{
179
- "p":0.1428571429,
180
- "r":0.1,
181
- "f":0.1176470588
182
- },
183
- "ORDINAL":{
184
- "p":0.0,
185
- "r":0.0,
186
- "f":0.0
187
- },
188
- "WORK_OF_ART":{
189
- "p":0.0,
190
- "r":0.0,
191
- "f":0.0
192
- }
193
- },
194
- "cats_score":0.7053147985,
195
- "cats_score_desc":"macro F",
196
- "cats_micro_p":0.8588957055,
197
- "cats_micro_r":0.8588957055,
198
- "cats_micro_f":0.8588957055,
199
- "cats_macro_p":0.7489478628,
200
- "cats_macro_r":0.6756235592,
201
- "cats_macro_f":0.7053147985,
202
- "cats_macro_auc":0.9304171084,
203
  "cats_f_per_type":{
204
- "Business and Finance":{
205
- "p":0.8983516484,
206
- "r":0.9450867052,
207
- "f":0.9211267606
208
- },
209
- "Arts, Culture, and Entertainment":{
210
- "p":0.5,
211
- "r":0.4772727273,
212
- "f":0.488372093
213
- },
214
- "Crime":{
215
- "p":0.9090909091,
216
- "r":0.5555555556,
217
- "f":0.6896551724
218
- },
219
- "Sports":{
220
- "p":0.965034965,
221
- "r":0.8846153846,
222
- "f":0.9230769231
223
- },
224
- "Politics":{
225
- "p":0.9125,
226
- "r":0.8538011696,
227
- "f":0.8821752266
228
- },
229
- "Science and Technology":{
230
- "p":0.6853932584,
231
- "r":0.6931818182,
232
- "f":0.6892655367
233
- },
234
- "Health and Wellness":{
235
- "p":0.7878787879,
236
- "r":0.7647058824,
237
- "f":0.776119403
238
- },
239
- "Lifestyle and Fashion":{
240
- "p":0.3333333333,
241
- "r":0.2307692308,
242
- "f":0.2727272727
243
  }
244
  },
245
- "tok2vec_loss":617.8652423046,
246
- "ner_loss":468.5249541622,
247
- "textcat_loss":0.0108695821
248
  },
249
  "requirements":[
250
 
 
1
  {
2
  "lang":"en",
3
  "name":"tako_query_analyzer",
4
+ "version":"0.0.2",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
16
  "name":"en_vectors"
17
  },
18
  "labels":{
19
+ "tok2vec_small":[
20
+
21
+ ],
22
+ "tagger":[
23
+ "$",
24
+ "''",
25
+ ",",
26
+ "-LRB-",
27
+ "-RRB-",
28
+ ".",
29
+ ":",
30
+ "ADD",
31
+ "AFX",
32
+ "CC",
33
+ "CD",
34
+ "DT",
35
+ "EX",
36
+ "FW",
37
+ "HYPH",
38
+ "IN",
39
+ "JJ",
40
+ "JJR",
41
+ "JJS",
42
+ "LS",
43
+ "MD",
44
+ "NFP",
45
+ "NN",
46
+ "NNP",
47
+ "NNPS",
48
+ "NNS",
49
+ "PDT",
50
+ "POS",
51
+ "PRP",
52
+ "PRP$",
53
+ "RB",
54
+ "RBR",
55
+ "RBS",
56
+ "RP",
57
+ "SYM",
58
+ "TO",
59
+ "UH",
60
+ "VB",
61
+ "VBD",
62
+ "VBG",
63
+ "VBN",
64
+ "VBP",
65
+ "VBZ",
66
+ "WDT",
67
+ "WP",
68
+ "WP$",
69
+ "WRB",
70
+ "XX",
71
+ "_SP",
72
+ "``"
73
+ ],
74
+ "parser":[
75
+ "ROOT",
76
+ "acl",
77
+ "acomp",
78
+ "advcl",
79
+ "advmod",
80
+ "agent",
81
+ "amod",
82
+ "appos",
83
+ "attr",
84
+ "aux",
85
+ "auxpass",
86
+ "case",
87
+ "cc",
88
+ "ccomp",
89
+ "compound",
90
+ "conj",
91
+ "csubj",
92
+ "csubjpass",
93
+ "dative",
94
+ "dep",
95
+ "det",
96
+ "dobj",
97
+ "expl",
98
+ "intj",
99
+ "mark",
100
+ "meta",
101
+ "neg",
102
+ "nmod",
103
+ "npadvmod",
104
+ "nsubj",
105
+ "nsubjpass",
106
+ "nummod",
107
+ "oprd",
108
+ "parataxis",
109
+ "pcomp",
110
+ "pobj",
111
+ "poss",
112
+ "preconj",
113
+ "predet",
114
+ "prep",
115
+ "prt",
116
+ "punct",
117
+ "quantmod",
118
+ "relcl",
119
+ "xcomp"
120
+ ],
121
+ "attribute_ruler":[
122
+
123
+ ],
124
  "tok2vec":[
125
 
126
  ],
127
  "ner":[
128
  "CARDINAL",
 
 
 
 
 
 
 
129
  "DATE",
130
  "EVENT",
131
  "FAC",
 
141
  "PERSON",
142
  "PRODUCT",
143
  "QUANTITY",
144
+ "STOCK_TICKER",
145
  "TIME",
146
  "WORK_OF_ART"
147
  ],
148
+ "textcat_classify":[
149
+ "ACCEPT",
150
+ "REJECT"
 
 
 
 
 
 
151
  ]
152
  },
153
  "pipeline":[
154
+ "tok2vec_small",
155
+ "tagger",
156
+ "parser",
157
+ "attribute_ruler",
158
  "tok2vec",
159
  "ner",
160
+ "textcat_classify"
161
  ],
162
  "components":[
163
+ "tok2vec_small",
164
+ "tagger",
165
+ "parser",
166
+ "attribute_ruler",
167
  "tok2vec",
168
  "ner",
169
+ "textcat_classify"
170
  ],
171
  "disabled":[
172
 
173
  ],
174
  "performance":{
175
+ "ents_f":0.0,
176
+ "ents_p":0.0,
177
+ "ents_r":0.0,
178
+ "ents_per_type":0.0,
179
+ "cats_score":0.8507157464,
180
+ "cats_score_desc":"F (ACCEPT)",
181
+ "cats_micro_p":0.8531187123,
182
+ "cats_micro_r":0.8531187123,
183
+ "cats_micro_f":0.8531187123,
184
+ "cats_macro_p":0.853485064,
185
+ "cats_macro_r":0.8531187123,
186
+ "cats_macro_f":0.8530806455,
187
+ "cats_macro_auc":0.9167497439,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  "cats_f_per_type":{
189
+ "ACCEPT":{
190
+ "p":0.8648648649,
191
+ "r":0.8370221328,
192
+ "f":0.8507157464
193
+ },
194
+ "REJECT":{
195
+ "p":0.8421052632,
196
+ "r":0.8692152918,
197
+ "f":0.8554455446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  }
199
  },
200
+ "textcat_classify_loss":0.9403656576
 
 
201
  },
202
  "requirements":[
203
 
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves�l{"0":{},"1":{"CUSTOM_ATTRIBUTE":8802,"GPE":3351,"DATE":2617,"ORG":2570,"PRODUCT":1459,"CUSTOM_SEMANTIC_FUNCTION":995,"PERSON":760,"EVENT":594,"CUSTOM_TEAM":518,"CUSTOM_STOCK_TICKER":394,"CUSTOM_SPORTS_LEAGUE":322,"NORP":260,"LOC":233,"MONEY":199,"CUSTOM_SPORTS_ROLE":125,"FAC":111,"LAW":96,"QUANTITY":90,"WORK_OF_ART":68,"PERCENT":33,"CARDINAL":19,"LANGUAGE":13,"TIME":5,"CUSTOM_SPORTS_CONFERENCE":5,"ORDINAL":2},"2":{"CUSTOM_ATTRIBUTE":8802,"GPE":3351,"DATE":2617,"ORG":2570,"PRODUCT":1459,"CUSTOM_SEMANTIC_FUNCTION":995,"PERSON":760,"EVENT":594,"CUSTOM_TEAM":518,"CUSTOM_STOCK_TICKER":394,"CUSTOM_SPORTS_LEAGUE":322,"NORP":260,"LOC":233,"MONEY":199,"CUSTOM_SPORTS_ROLE":125,"FAC":111,"LAW":96,"QUANTITY":90,"WORK_OF_ART":68,"PERCENT":33,"CARDINAL":19,"LANGUAGE":13,"TIME":5,"CUSTOM_SPORTS_CONFERENCE":5,"ORDINAL":2},"3":{"CUSTOM_ATTRIBUTE":8802,"GPE":3351,"DATE":2617,"ORG":2570,"PRODUCT":1459,"CUSTOM_SEMANTIC_FUNCTION":995,"PERSON":760,"EVENT":594,"CUSTOM_TEAM":518,"CUSTOM_STOCK_TICKER":394,"CUSTOM_SPORTS_LEAGUE":322,"NORP":260,"LOC":233,"MONEY":199,"CUSTOM_SPORTS_ROLE":125,"FAC":111,"LAW":96,"QUANTITY":90,"WORK_OF_ART":68,"PERCENT":33,"CARDINAL":19,"LANGUAGE":13,"TIME":5,"CUSTOM_SPORTS_CONFERENCE":5,"ORDINAL":2},"4":{"CUSTOM_ATTRIBUTE":8802,"GPE":3351,"DATE":2617,"ORG":2570,"PRODUCT":1459,"CUSTOM_SEMANTIC_FUNCTION":995,"PERSON":760,"EVENT":594,"CUSTOM_TEAM":518,"CUSTOM_STOCK_TICKER":394,"CUSTOM_SPORTS_LEAGUE":322,"NORP":260,"LOC":233,"MONEY":199,"CUSTOM_SPORTS_ROLE":125,"FAC":111,"LAW":96,"QUANTITY":90,"WORK_OF_ART":68,"PERCENT":33,"CARDINAL":19,"LANGUAGE":13,"TIME":5,"CUSTOM_SPORTS_CONFERENCE":5,"ORDINAL":2,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"2":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"3":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25},"4":{"ORG":32008,"GPE":3728,"PERSON":1105,"DATE":850,"WORK_OF_ART":686,"PRODUCT":585,"EVENT":283,"MONEY":214,"NORP":179,"STOCK_TICKER":156,"LAW":129,"LOC":111,"PERCENT":88,"FAC":75,"QUANTITY":60,"CARDINAL":57,"ORDINAL":42,"TIME":27,"LANGUAGE":25,"":1},"5":{"":1}}�cfg��neg_key�
parser/cfg ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "moves":null,
3
+ "update_with_oracle_cut_size":100,
4
+ "multitasks":[
5
+
6
+ ],
7
+ "min_action_freq":30,
8
+ "learn_tokens":false,
9
+ "beam_width":1,
10
+ "beam_density":0.0,
11
+ "beam_update_prob":0.0,
12
+ "incorrect_spans_key":null
13
+ }
parser/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1836fbc02b3924b2fd5f65325c58ae852ff112db1090ca724e5a801e68b85fd
3
+ size 319909
parser/moves ADDED
@@ -0,0 +1 @@
 
 
1
+ ��moves� {"0":{"":994332},"1":{"":999432},"2":{"det":172595,"nsubj":165748,"compound":116623,"amod":105184,"aux":86667,"punct":65478,"advmod":62763,"poss":36443,"mark":27941,"nummod":22598,"auxpass":15594,"prep":14001,"nsubjpass":13856,"neg":12357,"cc":10739,"nmod":9562,"advcl":9062,"npadvmod":8168,"quantmod":7101,"intj":6464,"ccomp":5896,"dobj":3427,"expl":3360,"dep":2871,"predet":1944,"parataxis":1837,"csubj":1428,"preconj":621,"pobj||prep":616,"attr":578,"meta":376,"advmod||conj":368,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183790,"pobj":182191,"prep":174008,"dobj":89615,"conj":59687,"cc":51930,"ccomp":30385,"advmod":22861,"xcomp":21021,"relcl":20969,"advcl":19828,"attr":17741,"acomp":16922,"appos":15265,"case":13388,"acl":12085,"pcomp":10324,"dep":10116,"npadvmod":9796,"prt":8179,"agent":3903,"dative":3866,"nsubj":3470,"neg":2906,"amod":2839,"intj":2819,"nummod":2732,"oprd":2301,"parataxis":1261,"quantmod":319,"nmod":294,"acl||dobj":200,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":92,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"relcl||pobj":42,"acl||nsubjpass":42,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":111664}}�cfg��neg_key�
tagger/cfg ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "label_smoothing":0.0,
3
+ "labels":[
4
+ "$",
5
+ "''",
6
+ ",",
7
+ "-LRB-",
8
+ "-RRB-",
9
+ ".",
10
+ ":",
11
+ "ADD",
12
+ "AFX",
13
+ "CC",
14
+ "CD",
15
+ "DT",
16
+ "EX",
17
+ "FW",
18
+ "HYPH",
19
+ "IN",
20
+ "JJ",
21
+ "JJR",
22
+ "JJS",
23
+ "LS",
24
+ "MD",
25
+ "NFP",
26
+ "NN",
27
+ "NNP",
28
+ "NNPS",
29
+ "NNS",
30
+ "PDT",
31
+ "POS",
32
+ "PRP",
33
+ "PRP$",
34
+ "RB",
35
+ "RBR",
36
+ "RBS",
37
+ "RP",
38
+ "SYM",
39
+ "TO",
40
+ "UH",
41
+ "VB",
42
+ "VBD",
43
+ "VBG",
44
+ "VBN",
45
+ "VBP",
46
+ "VBZ",
47
+ "WDT",
48
+ "WP",
49
+ "WP$",
50
+ "WRB",
51
+ "XX",
52
+ "_SP",
53
+ "``"
54
+ ],
55
+ "neg_prefix":"!",
56
+ "overwrite":false
57
+ }
tagger/model ADDED
Binary file (19.8 kB). View file
 
textcat_classify/cfg ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels":[
3
+ "ACCEPT",
4
+ "REJECT"
5
+ ],
6
+ "threshold":0.0,
7
+ "positive_label":"ACCEPT"
8
+ }
textcat_classify/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c611aa01b463b7f99116d0b1a53cd75effb9d0bac5febef70bf3b85f0b075
3
+ size 8319359
tok2vec/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b26ab00bd800730dbd5328c6603e549fd33426b18a10ca6c4efd1bf2e68c7e84
3
  size 34434008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8db1e5a93c4f955f990b7f6005b11c65ac6b9efa20f2c02291ac2013d06a203
3
  size 34434008
tok2vec_small/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+
3
+ }
tok2vec_small/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d8414521eaf75f817bd1b351b26039a22a912bb2617f95ead305420f2ebffd
3
+ size 6269370
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff