Upload 2 files
- snails_naturalness_classifier.py +121 -0
- tokenprocessing.py +27 -0
snails_naturalness_classifier.py
ADDED
@@ -0,0 +1,121 @@
"""
Copyright 2024 Kyle Luoma

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from transformers import AutoTokenizer, CanineForSequenceClassification, pipeline
import torch
import pandas as pd
import tokenprocessing as tp


class CanineIdentifierClassifier:
    """
    A classifier for identifying word naturalness using a pre-trained text analysis model.
    Classifies words as Regular (label N1), Low (label N2), or Least (label N3) natural.

    Attributes:
        model_name (str): The name of the model used for classification.
        checkpoint (int): The checkpoint number of the model.
        id2label (dict): A dictionary mapping label IDs to label names.
        label2id (dict): A dictionary mapping label names to label IDs.
        classifier (pipeline): The sentiment analysis pipeline used for classification.
        identifiers (pd.DataFrame): A DataFrame containing identifiers to classify.

    Methods:
        __init__(identifiers=pd.DataFrame()):
            Initializes the classifier with the given identifiers DataFrame.
        do_batch_job(ident_df=None, save_as_excel=False, make_tag=True):
            Performs batch classification on the given DataFrame of identifiers.
            Args:
                ident_df (pd.DataFrame, optional): The DataFrame of identifiers to classify. Defaults to None.
                save_as_excel (bool, optional): Whether to save the results as an Excel file. Defaults to False.
                make_tag (bool, optional): Whether to add a token tag to the text before classification. Defaults to True.
        classify_identifier(identifier, make_tag=True):
            Classifies a single identifier.
            Args:
                identifier (str): The identifier to classify.
                make_tag (bool, optional): Whether to add a token tag to the identifier before classification. Defaults to True.
            Returns:
                list: The classification result.
    """

    def __init__(self, identifiers=pd.DataFrame()):
        self.model_name = "kyleluoma/SNAILS-word-naturalness-classifier"
        self.checkpoint = 5590
        self.id2label = {0: "N1", 1: "N2", 2: "N3"}
        self.label2id = {"N1": 0, "N2": 1, "N3": 2}
        self.classifier = pipeline(
            "sentiment-analysis",
            model="kyleluoma/SNAILS-word-naturalness-classifier",
            device=0
        )
        self.identifiers = identifiers

    def do_batch_job(self, ident_df: pd.DataFrame = None, save_as_excel: bool = False, make_tag: bool = True):
        """
        Processes a batch of text data through the classifier and optionally saves the results to an Excel file.

        Args:
            ident_df (pd.DataFrame, optional): DataFrame containing the text data to be classified.
                If None, uses self.identifiers. Defaults to None.
            save_as_excel (bool, optional): If True, saves the results to an Excel file. Defaults to False.
            make_tag (bool, optional): If True, appends a token tag to the text before classification. Defaults to True.

        Returns:
            None
        """
        auto_scores = []

        if ident_df is None:
            ident_df = self.identifiers

        for row in ident_df.itertuples():
            if make_tag:
                pred = self.classifier(row.text + tp.make_token_tag(row.text))
            else:
                pred = self.classifier(row.text)
            print(pred)
            auto_scores.append(pred[0]['label'])

        ident_df["prediction"] = auto_scores

        if save_as_excel:
            ident_df[['text', 'prediction', 'category']].to_excel(
                f"./classifier-inference-results/{self.model_name}-cp-{self.checkpoint}.xlsx",
                index=False
            )

    def classify_identifier(self, identifier: str, make_tag: bool = True):
        """
        Classifies the given identifier using the classifier.

        Args:
            identifier (str): The identifier to classify.
            make_tag (bool, optional): If True, appends a token tag to the identifier before classification. Defaults to True.

        Returns:
            The classification result of the identifier.
        """
        identifier = str(identifier)
        if make_tag:
            identifier += (" " + tp.make_token_tag(identifier))
        pred = self.classifier(identifier)
        # print("Classifying", identifier, "as", pred)
        return pred


if __name__ == "__main__":
    classifier = CanineIdentifierClassifier()
    print(classifier.classify_identifier("WinterWeather"))
    print(classifier.classify_identifier("WntrWthr"))
    print(classifier.classify_identifier("WWth"))
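
For context, a minimal usage sketch of the class above. It assumes a CUDA-capable GPU (the pipeline is built with device=0) and, for the batch path, a caller-supplied DataFrame with a "text" column (plus a "category" column if save_as_excel=True); the identifiers and reference labels in this sketch are made up for illustration.

    import pandas as pd
    from snails_naturalness_classifier import CanineIdentifierClassifier

    clf = CanineIdentifierClassifier()

    # Single identifiers: each call returns the pipeline output,
    # a list like [{'label': 'N1', 'score': ...}]
    print(clf.classify_identifier("WinterWeather"))
    print(clf.classify_identifier("WWth"))

    # Batch classification: do_batch_job adds a "prediction" column in place
    idents = pd.DataFrame({
        "text": ["CustomerName", "CustNm", "CN"],   # hypothetical identifiers
        "category": ["N1", "N2", "N3"],             # hypothetical reference labels
    })
    clf.do_batch_job(ident_df=idents, save_as_excel=False)
    print(idents[["text", "prediction"]])
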
tokenprocessing.py
ADDED
@@ -0,0 +1,27 @@
def make_token_tag(identifier):
    """
    Feature engineering for identifiers: tags each character as a vowel, consonant, number, special character, or other.

    Args:
        identifier (str): The identifier to tag.

    Returns:
        str: A string of tag characters the same length as the input string.
    """
    vowels = ["a", "e", "i", "o", "u"]
    special = ["-", "_", "@"]
    numbers = ["1", "2", "3", "4", "5", "6", "7", "8", "9"]
    consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"]
    tags = ""
    for c in identifier.lower():
        if c in vowels:
            tags += "^"
        elif c in special:
            tags += "$"
        elif c in numbers:
            tags += "#"
        elif c in consonants:
            tags += "+"
        else:
            tags += "*"  # any character not covered above (e.g. "0" or other punctuation)
    return tags
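
As a quick illustration of the tagging scheme, two hand-worked examples (the identifiers are made up; the expected strings follow directly from the rules above):

    from tokenprocessing import make_token_tag

    print(make_token_tag("WinterWeather"))  # "+^++^++^^++^+"  -> consonants (+) and vowels (^)
    print(make_token_tag("Emp_ID2"))        # "^++$^+#"        -> includes a special character ($) and a digit (#)

When make_tag=True, CanineIdentifierClassifier appends this tag string to the identifier before inference, so the character-level CANINE model sees an explicit vowel/consonant/digit/special-character signal alongside the raw text.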