kyleluoma committed on
Commit
d922b58
·
verified ·
1 Parent(s): caf6281

Upload 2 files

Browse files
snails_naturalness_classifier.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2024 Kyle Luoma
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from transformers import AutoTokenizer, CanineForSequenceClassification, pipeline
18
+ import torch
19
+ import pandas as pd
20
+ import tokenprocessing as tp
21
+
22
+
23
class CanineIdentifierClassifier:
    """
    Classify the naturalness of identifiers with a pre-trained model.

    Each identifier is labelled Regular (N1), Low (N2), or Least (N3) natural.

    Attributes:
        model_name (str): Hugging Face model id loaded into the pipeline.
        checkpoint (int): Checkpoint number; only used to name the Excel output.
        id2label (dict): Maps label ids (0, 1, 2) to label names (N1, N2, N3).
        label2id (dict): Inverse of ``id2label``.
        classifier (pipeline): The transformers pipeline used for inference.
            It is created with the "sentiment-analysis" task name, which
            transformers documents as an alias of "text-classification".
        identifiers (pd.DataFrame): Default batch of identifiers to classify.
    """

    def __init__(self, identifiers=None):
        """
        Load the classification pipeline and store the default identifiers.

        Args:
            identifiers (pd.DataFrame, optional): Identifiers used by
                ``do_batch_job`` when no DataFrame is passed to it. Defaults
                to a new, empty DataFrame per instance.
        """
        self.model_name = "kyleluoma/SNAILS-word-naturalness-classifier"
        self.checkpoint = 5590
        self.id2label = {0: "N1", 1: "N2", 2: "N3"}
        self.label2id = {"N1": 0, "N2": 1, "N3": 2}
        self.classifier = pipeline(
            "sentiment-analysis",
            model=self.model_name,
            # Use the first GPU when one is available, otherwise run on CPU
            # (-1) instead of failing on machines without CUDA.
            device=0 if torch.cuda.is_available() else -1,
        )
        # A fresh DataFrame per instance: do_batch_job adds a column to this
        # object in place, so a shared default argument would leak state
        # between instances.
        self.identifiers = pd.DataFrame() if identifiers is None else identifiers

    def do_batch_job(self, ident_df: pd.DataFrame = None, save_as_excel: bool = False, make_tag: bool = True):
        """
        Classify every row of a DataFrame and store the labels in place.

        The predicted label of each row's ``text`` value is written to a new
        ``prediction`` column on ``ident_df`` itself (the frame is mutated).

        Args:
            ident_df (pd.DataFrame, optional): DataFrame with a ``text``
                column. If None, ``self.identifiers`` is used.
            save_as_excel (bool, optional): If True, write the ``text``,
                ``prediction`` and ``category`` columns to an Excel file under
                ``./classifier-inference-results/``. The frame must then also
                contain a ``category`` column. Defaults to False.
            make_tag (bool, optional): If True, append a token tag to each
                text before classification. Defaults to True.

        Returns:
            None
        """
        # ``ident_df == None`` compares element-wise on a DataFrame and raises
        # "truth value of a DataFrame is ambiguous"; identity is the right test.
        if ident_df is None:
            ident_df = self.identifiers

        auto_scores = []
        for row in ident_df.itertuples():
            # Go through classify_identifier so batch and single-item
            # classification build exactly the same model input.
            pred = self.classify_identifier(row.text, make_tag=make_tag)
            print(pred)
            auto_scores.append(pred[0]["label"])

        ident_df["prediction"] = auto_scores

        if save_as_excel:
            from pathlib import Path

            output_path = Path(
                f"./classifier-inference-results/{self.model_name}-cp-{self.checkpoint}.xlsx"
            )
            # model_name contains a "/", so the target sits in a nested
            # directory that has to exist before pandas can write to it.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            ident_df[["text", "prediction", "category"]].to_excel(
                output_path,
                index=False,
            )

    def classify_identifier(self, identifier: str, make_tag: bool = True):
        """
        Classify a single identifier.

        Args:
            identifier (str): The identifier to classify; it is converted
                with ``str()`` first.
            make_tag (bool, optional): If True, append a space followed by
                the token tag of the identifier before classification.
                Defaults to True.

        Returns:
            The raw result returned by the pipeline for the identifier.
        """
        identifier = str(identifier)
        if make_tag:
            identifier += " " + tp.make_token_tag(identifier)
        return self.classifier(identifier)
115
+
116
+
117
if __name__ == "__main__":
    # Smoke test: classify one natural, one abbreviated, and one heavily
    # abbreviated spelling of the same identifier.
    classifier = CanineIdentifierClassifier()
    for sample in ("WinterWeather", "WntrWthr", "WWth"):
        print(classifier.classify_identifier(sample))
tokenprocessing.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def make_token_tag(identifier):
    """
    Build a character-class tag string for an identifier.

    The identifier is lower-cased and every resulting character is mapped to
    one tag character:

        ^  vowel (a, e, i, o, u)
        $  special character (-, _, @)
        #  digit 1-9
        +  consonant (any other ASCII letter, including y)
        *  anything else

    Args:
        identifier (str): The identifier to tag.

    Returns:
        str: One tag character per character of the lower-cased identifier.
    """
    # NOTE(review): "0" is not in the digit class, so it is tagged "*".
    # Kept as is because this function produces model input features;
    # confirm against the training data before changing it.
    char_classes = (
        ("aeiou", "^"),
        ("-_@", "$"),
        ("123456789", "#"),
        ("bcdfghjklmnpqrstvwxyz", "+"),
    )
    tag_of = {ch: tag for chars, tag in char_classes for ch in chars}
    return "".join(tag_of.get(ch, "*") for ch in identifier.lower())