zerostratos committed on
Commit
a1474f1
·
verified ·
1 Parent(s): b8314eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -5
app.py CHANGED
@@ -6,7 +6,11 @@ from transformers import AutoTokenizer,AutoModel
6
  import numpy as np
7
  import torch.nn as nn
8
  import torch.nn.functional as F
9
-
 
 
 
 
10
  class BCNN(nn.Module):
11
  def __init__(self, embedding_dim, output_dim,
12
  dropout,bidirectional_units,conv_filters):
@@ -55,12 +59,14 @@ class TextClassificationApp:
55
  model_name (str): Hugging Face model name for tokenization
56
  """
57
  # Set up Streamlit page
 
 
58
  st.set_page_config(
59
- page_title="Text Classification",
60
- page_icon="📝",
61
  layout="wide"
62
  )
63
-
64
  # Device configuration
65
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
66
 
@@ -83,6 +89,48 @@ class TextClassificationApp:
83
  # Maximum sequence length
84
  self.max_length = 128
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def preprocess_text(self, text):
87
  """
88
  Preprocess input text for model prediction
@@ -94,6 +142,8 @@ class TextClassificationApp:
94
  torch.Tensor: Tokenized and encoded input
95
  """
96
  # Tokenize and encode the text
 
 
97
  input_ids = []
98
  attention_masks = []
99
  encoded = self.tokenizer.encode_plus(
@@ -180,7 +230,7 @@ class TextClassificationApp:
180
 
181
  def main():
182
  # Replace these with your actual model path and class names
183
- MODEL_PATH = '/workspaces/final-project-dl/toxic.pt'
184
  CLASS_NAMES = [
185
  'Non-toxic',
186
  'Toxic'
 
6
  import numpy as np
7
  import torch.nn as nn
8
  import torch.nn.functional as F
9
+ import pandas as pd
10
+ import re
11
# Teencode lookup table, loaded once at import time from a tab-separated file
# with no header row: first column is the teencode, second is its replacement.
teencode_df = pd.read_csv(
    'toxic_classification_model/teencode.txt',
    sep='\t',
    names=['teencode', 'map'],
)
# Parallel lists: teencode_list[i] is replaced by map_list[i].
teencode_list = teencode_df['teencode'].to_list()
map_list = teencode_df['map'].to_list()
14
  class BCNN(nn.Module):
15
  def __init__(self, embedding_dim, output_dim,
16
  dropout,bidirectional_units,conv_filters):
 
59
  model_name (str): Hugging Face model name for tokenization
60
  """
61
  # Set up Streamlit page
62
+ # Custom CSS for justice-themed design
63
+ # Streamlit page configuration
64
  st.set_page_config(
65
+ page_title="⚖️ Text Justice Classifier",
66
+ page_icon="⚖️",
67
  layout="wide"
68
  )
69
+
70
  # Device configuration
71
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
72
 
 
89
  # Maximum sequence length
90
  self.max_length = 128
91
 
92
def remove_dub_char(self, sentence):
    """Collapse runs of a repeated ASCII letter inside each word.

    The input is coerced to ``str``, stripped and split on whitespace.
    A word found verbatim in the module-level ``teencode_list`` is kept
    unchanged. In every other word, any run of the same letter A-Z
    (matched case-insensitively) is reduced to the first character of
    that run.

    Args:
        sentence: Text to clean; any object is accepted and stringified.

    Returns:
        The processed words joined by single spaces.
    """
    repeated_letter = re.compile(r'([A-Z])\1+', flags=re.IGNORECASE)

    def squeeze(token):
        # Teencode entries are passed through untouched.
        if token in teencode_list:
            return token
        return repeated_letter.sub(lambda match: match.group(1), token)

    tokens = str(sentence).strip().split()
    return ' '.join(squeeze(token) for token in tokens)
101
+
102
def searchTeencode(self, word):
    """Return the replacement for a teencode ``word``, or ``None``.

    ``word`` is looked up in the module-level ``teencode_list``; on a hit
    the entry at the same position of ``map_list`` is returned and the
    module-level counter ``teencode_count`` is incremented.

    Args:
        word: A single token to look up.

    Returns:
        The mapped value from ``map_list``, or ``None`` when ``word`` is
        not a known teencode.
    """
    try:
        index = teencode_list.index(word)
        map_word = map_list[index]
    except (ValueError, IndexError):
        # Unknown word (or a table without a matching entry): callers
        # treat None as "leave the token unchanged".
        return None

    # The previous version did ``teencode_count += 1`` under a bare
    # ``except: pass``. If the counter was never initialised at module
    # level, the resulting NameError was swallowed and every successful
    # lookup was discarded. Create the counter on first use instead.
    module_state = globals()
    module_state['teencode_count'] = module_state.get('teencode_count', 0) + 1
    return map_word
111
+
112
def deTeencode(self, sentence):
    """Replace every teencode word in ``sentence`` with its mapped form.

    The input is coerced to ``str`` and split on whitespace. Each word is
    passed to ``self.searchTeencode``; when that returns a string, the
    word is replaced by it, otherwise the word is kept as is.

    Args:
        sentence: Text to normalise; any object is accepted and
            stringified.

    Returns:
        The words joined by single spaces.
    """
    # The previous version iterated over the *characters* of each word,
    # tried to assign into an immutable str, flattened the words into
    # single characters and relied on MosesDetokenizer (not among the
    # imports visible in this file view). Working at word level and
    # joining on a space needs none of that, because the tokens come from
    # a plain whitespace split.
    words = str(sentence).split()
    for position, word in enumerate(words):
        replacement = self.searchTeencode(word)
        # Only accept real strings: a blank cell in the mapping table
        # would otherwise reach str.join as a non-string value.
        if isinstance(replacement, str):
            words[position] = replacement
    return ' '.join(words)
132
+
133
+
134
  def preprocess_text(self, text):
135
  """
136
  Preprocess input text for model prediction
 
142
  torch.Tensor: Tokenized and encoded input
143
  """
144
  # Tokenize and encode the text
145
+ text = self.remove_dub_char(text)
146
+ text = self.deTeencode(text)
147
  input_ids = []
148
  attention_masks = []
149
  encoded = self.tokenizer.encode_plus(
 
230
 
231
  def main():
232
  # Replace these with your actual model path and class names
233
+ MODEL_PATH = 'toxic_classification_model/toxic.pt'
234
  CLASS_NAMES = [
235
  'Non-toxic',
236
  'Toxic'