Update app.py
app.py CHANGED

@@ -6,7 +6,11 @@ from transformers import AutoTokenizer,AutoModel
 import numpy as np
 import torch.nn as nn
 import torch.nn.functional as F
-
+import pandas as pd
+import re
+teencode_df = pd.read_csv('toxic_classification_model/teencode.txt',names=['teencode','map'],sep='\t',)
+teencode_list = teencode_df['teencode'].to_list()
+map_list = teencode_df['map'].to_list()
 class BCNN(nn.Module):
     def __init__(self, embedding_dim, output_dim,
                  dropout,bidirectional_units,conv_filters):
@@ -55,12 +59,14 @@ class TextClassificationApp:
             model_name (str): Hugging Face model name for tokenization
         """
         # Set up Streamlit page
+        # Custom CSS for justice-themed design
+        # Streamlit page configuration
         st.set_page_config(
-            page_title="Text
-            page_icon="
+            page_title="⚖️ Text Justice Classifier",
+            page_icon="⚖️",
             layout="wide"
         )
-
+
         # Device configuration
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
@@ -83,6 +89,48 @@ class TextClassificationApp:
         # Maximum sequence length
         self.max_length = 128
 
+    def remove_dub_char(self, sentence):
+        sentence = str(sentence)
+        words = []
+        for word in sentence.strip().split():
+            if word in teencode_list:
+                words.append(word)
+                continue
+            words.append(re.sub(r'([A-Z])\1+', lambda m: m.group(1), word, flags = re.IGNORECASE))
+        return ' '.join(words)
+
+    def searchTeencode(self,word):
+        try:
+            global teencode_count
+            index = teencode_list.index(word)
+            map_word = map_list[index]
+            teencode_count += 1
+            return map_word
+        except:
+            pass
+
+    def deTeencode(self, sentence):
+        lenn = 0
+        sentence = str(sentence)
+        # Tokenize
+        nestList_tokens = sentence.split()
+        for tokens_idx, text_tokens in enumerate(nestList_tokens):
+            # Teencode
+            lenn += len(text_tokens)
+            for idx, word in enumerate(text_tokens):
+                deteencoded = self.searchTeencode(word)
+                if deteencoded is not None:
+                    text_tokens[idx] = deteencoded
+            nestList_tokens[tokens_idx] = text_tokens
+
+        flat_list = [item for sublist in nestList_tokens for item in sublist]
+
+        # Detokenize
+        detokens = MosesDetokenizer().detokenize(flat_list, return_str=True)
+
+        return detokens
+
+
     def preprocess_text(self, text):
         """
         Preprocess input text for model prediction
@@ -94,6 +142,8 @@ class TextClassificationApp:
             torch.Tensor: Tokenized and encoded input
         """
         # Tokenize and encode the text
+        text = self.remove_dub_char(text)
+        text = self.deTeencode(text)
         input_ids = []
         attention_masks = []
         encoded = self.tokenizer.encode_plus(
@@ -180,7 +230,7 @@ class TextClassificationApp:
 
 def main():
     # Replace these with your actual model path and class names
-    MODEL_PATH = '/
+    MODEL_PATH = 'toxic_classification_model/toxic.pt'
     CLASS_NAMES = [
         'Non-toxic',
         'Toxic'