Update app.py
app.py CHANGED

@@ -6,7 +6,11 @@ from transformers import AutoTokenizer,AutoModel
 import numpy as np
 import torch.nn as nn
 import torch.nn.functional as F
-
+import pandas as pd
+import re
+teencode_df = pd.read_csv('toxic_classification_model/teencode.txt',names=['teencode','map'],sep='\t',)
+teencode_list = teencode_df['teencode'].to_list()
+map_list = teencode_df['map'].to_list()
 class BCNN(nn.Module):
     def __init__(self, embedding_dim, output_dim,
                  dropout,bidirectional_units,conv_filters):
@@ -55,12 +59,14 @@ class TextClassificationApp:
             model_name (str): Hugging Face model name for tokenization
         """
         # Set up Streamlit page
+        # Custom CSS for justice-themed design
+        # Streamlit page configuration
         st.set_page_config(
-            page_title="Text
-            page_icon="
+            page_title="⚖️ Text Justice Classifier",
+            page_icon="⚖️",
             layout="wide"
         )
-
+
         # Device configuration
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
@@ -83,6 +89,48 @@ class TextClassificationApp:
         # Maximum sequence length
         self.max_length = 128
 
+    def remove_dub_char(self, sentence):
+        sentence = str(sentence)
+        words = []
+        for word in sentence.strip().split():
+            if word in teencode_list:
+                words.append(word)
+                continue
+            words.append(re.sub(r'([A-Z])\1+', lambda m: m.group(1), word, flags = re.IGNORECASE))
+        return ' '.join(words)
+
+    def searchTeencode(self,word):
+        try:
+            global teencode_count
+            index = teencode_list.index(word)
+            map_word = map_list[index]
+            teencode_count += 1
+            return map_word
+        except:
+            pass
+
+    def deTeencode(self, sentence):
+        lenn = 0
+        sentence = str(sentence)
+        # Tokenize
+        nestList_tokens = sentence.split()
+        for tokens_idx, text_tokens in enumerate(nestList_tokens):
+            # Teencode
+            lenn += len(text_tokens)
+            for idx, word in enumerate(text_tokens):
+                deteencoded = self.searchTeencode(word)
+                if deteencoded is not None:
+                    text_tokens[idx] = deteencoded
+            nestList_tokens[tokens_idx] = text_tokens
+
+        flat_list = [item for sublist in nestList_tokens for item in sublist]
+
+        # Detokenize
+        detokens = MosesDetokenizer().detokenize(flat_list, return_str=True)
+
+        return detokens
+
+
     def preprocess_text(self, text):
         """
         Preprocess input text for model prediction
@@ -94,6 +142,8 @@ class TextClassificationApp:
             torch.Tensor: Tokenized and encoded input
         """
         # Tokenize and encode the text
+        text = self.remove_dub_char(text)
+        text = self.deTeencode(text)
         input_ids = []
         attention_masks = []
         encoded = self.tokenizer.encode_plus(
@@ -180,7 +230,7 @@ class TextClassificationApp:
 
 def main():
     # Replace these with your actual model path and class names
-    MODEL_PATH = '/
+    MODEL_PATH = 'toxic_classification_model/toxic.pt'
     CLASS_NAMES = [
         'Non-toxic',
         'Toxic'