Update app.py
app.py CHANGED
@@ -6,7 +6,11 @@ from transformers import AutoTokenizer,AutoModel
 import numpy as np
 import torch.nn as nn
 import torch.nn.functional as F
-
+import pandas as pd
+import re
+teencode_df = pd.read_csv('toxic_classification_model/teencode.txt',names=['teencode','map'],sep='\t',)
+teencode_list = teencode_df['teencode'].to_list()
+map_list = teencode_df['map'].to_list()
 class BCNN(nn.Module):
     def __init__(self, embedding_dim, output_dim,
                  dropout,bidirectional_units,conv_filters):
@@ -55,12 +59,14 @@ class TextClassificationApp:
             model_name (str): Hugging Face model name for tokenization
         """
         # Set up Streamlit page
+        # Custom CSS for justice-themed design
+        # Streamlit page configuration
         st.set_page_config(
-            page_title="Text
-            page_icon="
+            page_title="⚖️ Text Justice Classifier",
+            page_icon="⚖️",
             layout="wide"
         )
-
+
         # Device configuration
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
@@ -83,6 +89,48 @@ class TextClassificationApp:
         # Maximum sequence length
         self.max_length = 128
 
+    def remove_dub_char(self, sentence):
+        sentence = str(sentence)
+        words = []
+        for word in sentence.strip().split():
+            if word in teencode_list:
+                words.append(word)
+                continue
+            words.append(re.sub(r'([A-Z])\1+', lambda m: m.group(1), word, flags = re.IGNORECASE))
+        return ' '.join(words)
+
+    def searchTeencode(self,word):
+        try:
+            global teencode_count
+            index = teencode_list.index(word)
+            map_word = map_list[index]
+            teencode_count += 1
+            return map_word
+        except:
+            pass
+
+    def deTeencode(self, sentence):
+        lenn = 0
+        sentence = str(sentence)
+        # Tokenize
+        nestList_tokens = sentence.split()
+        for tokens_idx, text_tokens in enumerate(nestList_tokens):
+            # Teencode
+            lenn += len(text_tokens)
+            for idx, word in enumerate(text_tokens):
+                deteencoded = self.searchTeencode(word)
+                if deteencoded is not None:
+                    text_tokens[idx] = deteencoded
+            nestList_tokens[tokens_idx] = text_tokens
+
+        flat_list = [item for sublist in nestList_tokens for item in sublist]
+
+        # Detokenize
+        detokens = MosesDetokenizer().detokenize(flat_list, return_str=True)
+
+        return detokens
+
+
     def preprocess_text(self, text):
         """
         Preprocess input text for model prediction
@@ -94,6 +142,8 @@ class TextClassificationApp:
             torch.Tensor: Tokenized and encoded input
         """
         # Tokenize and encode the text
+        text = self.remove_dub_char(text)
+        text = self.deTeencode(text)
         input_ids = []
         attention_masks = []
         encoded = self.tokenizer.encode_plus(
@@ -180,7 +230,7 @@ class TextClassificationApp:
 
 def main():
     # Replace these with your actual model path and class names
-    MODEL_PATH = '/
+    MODEL_PATH = 'toxic_classification_model/toxic.pt'
     CLASS_NAMES = [
         'Non-toxic',
         'Toxic'
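
A few reference notes on the preprocessing this commit adds. The new remove_dub_char collapses runs of a repeated ASCII letter inside each word, case-insensitively, unless the word appears in teencode_list. A minimal standalone sketch of that regex behaviour, with hypothetical example strings:

import re

def collapse_repeats(word):
    # Mirrors the r'([A-Z])\1+' pattern used with re.IGNORECASE in the diff:
    # any run of the same ASCII letter is reduced to a single occurrence.
    return re.sub(r'([A-Z])\1+', lambda m: m.group(1), word, flags=re.IGNORECASE)

print(collapse_repeats("heyyyy"))   # hey
print(collapse_repeats("đẹpppp"))   # đẹp  (only the ASCII 'p' run is collapsed)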
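As committed, deTeencode walks the characters of each whitespace token (text_tokens is a string, so enumerate(text_tokens) yields characters) and assigns back into that string, and searchTeencode bumps a global teencode_count that these hunks never initialise, so unless it is defined elsewhere in app.py the bare except: turns every lookup into None. If the intent is simply to replace whole teencode tokens with their mapped words, a dictionary-based sketch (an alternative sketch, not the committed code) could look like this, assuming teencode.txt holds one tab-separated teencode/replacement pair per line:

import pandas as pd

teencode_df = pd.read_csv('toxic_classification_model/teencode.txt',
                          names=['teencode', 'map'], sep='\t')
# A dict gives O(1) lookups per token and avoids repeated list.index() scans.
teencode_map = dict(zip(teencode_df['teencode'], teencode_df['map']))

def de_teencode(sentence):
    # Replace each whitespace-delimited token that has a teencode mapping;
    # leave all other tokens unchanged.
    words = str(sentence).split()
    return ' '.join(str(teencode_map.get(w, w)) for w in words)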
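deTeencode also calls MosesDetokenizer(), and neither hunk shown here adds an import for it. If it is not already imported elsewhere in app.py, the sacremoses package provides a detokenizer with a matching call signature (this dependency is an assumption, not part of the commit):

# Assumed dependency: sacremoses (pip install sacremoses).
from sacremoses import MosesDetokenizer

detok = MosesDetokenizer()
# Joins a token list back into a sentence, reattaching punctuation.
print(detok.detokenize(['hello', ',', 'world', '!'], return_str=True))  # hello, world!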
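Finally, MODEL_PATH now points at a concrete checkpoint, toxic_classification_model/toxic.pt. The loading code in main() is outside this diff; purely as an illustration of reading such a .pt file onto the device configured above (not necessarily how app.py actually loads it):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location keeps a GPU-saved checkpoint loadable on a CPU-only machine.
checkpoint = torch.load('toxic_classification_model/toxic.pt', map_location=device)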