File size: 7,401 Bytes
5f49389
 
 
 
 
 
 
 
a1474f1
 
8fc3e0c
 
 
5f49389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1474f1
 
5f49389
a1474f1
 
5f49389
 
a1474f1
5f49389
 
 
 
 
 
 
 
 
 
 
 
 
7dbb487
5f49389
 
 
 
 
 
 
 
a1474f1
 
 
 
 
 
 
 
 
 
5f49389
 
 
 
 
 
 
 
 
 
 
a1474f1
5f49389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64a741a
5f49389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1a52ff
5f49389
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import streamlit as st
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer,AutoModel
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import re
teencode_df = pd.read_csv('teencode.txt',names=['teencode','map'],sep='\t',)
teencode_list = teencode_df['teencode'].to_list()
map_list = teencode_df['map'].to_list()
class BCNN(nn.Module):
    def __init__(self, embedding_dim, output_dim,
                 dropout,bidirectional_units,conv_filters):

        super().__init__()
        self.bert = AutoModel.from_pretrained('vinai/phobert-base-v2')
        #.fc_input = nn.Linear(embedding_dim,embedding_dim)
        self.bidirectional_lstm = nn.LSTM(
            embedding_dim, bidirectional_units, bidirectional=True, batch_first=True
        )
        self.conv1 = nn.Conv1d(in_channels=2*bidirectional_units, out_channels=conv_filters[0], kernel_size=4)
        self.conv2 = nn.Conv1d(in_channels=2*bidirectional_units, out_channels=conv_filters[1], kernel_size=5)

        self.fc = nn.Linear(64, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self,b_input_ids,b_input_mask):
        encoded = self.bert(b_input_ids,b_input_mask)[0]
        embedded, _ = self.bidirectional_lstm(encoded)
        embedded = embedded.permute(0, 2, 1)
        conved_1 = F.relu(self.conv1(embedded))
        conved_2 = F.relu(self.conv2(embedded))
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]

        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        #pooled_n = [batch size, n_fibatlters]

        cat = self.dropout(torch.cat((pooled_1, pooled_2), dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]

        result =  self.fc(cat)

        return result

class TextClassificationApp:
    def __init__(self, model_path, class_names, model_name='vinai/phobert-base-v2'):
        """
        Initialize Streamlit Text Classification App
        
        Args:
            model_path (str): Path to the pre-trained .pt model file
            class_names (list): List of classification labels
            model_name (str): Hugging Face model name for tokenization
        """
        # Set up Streamlit page
         # Custom CSS for justice-themed design
         # Streamlit page configuration
        st.set_page_config(
            page_title="⚖️ Text Justice Classifier", 
            page_icon="⚖️", 
            layout="wide"
        )

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Load the model
        EMBEDDING_DIM = 768
        OUTPUT_DIM = 2
        DROPOUT = 0.1
        CONV_FILTERS = [32, 32]  # Number of filters for each kernel size (4 and 5)
        BIDIRECTIONAL_UNITS = 128
        self.model = BCNN(EMBEDDING_DIM, OUTPUT_DIM, DROPOUT, BIDIRECTIONAL_UNITS, CONV_FILTERS)
        self.model = torch.load(r'toxic.pt',map_location=torch.device('cpu'),weights_only = False)
        self.model.eval()  # Set to evaluation mode
        
        # Store class names
        self.class_names = class_names
        
        # Maximum sequence length
        self.max_length = 128

    def remove_dub_char(self, sentence):
        sentence = str(sentence)
        words = []
        for word in sentence.strip().split():
            if word in teencode_list:
                words.append(word)
                continue
            words.append(re.sub(r'([A-Z])\1+', lambda m: m.group(1), word, flags = re.IGNORECASE))
        return ' '.join(words)

    def preprocess_text(self, text):
        """
        Preprocess input text for model prediction
        
        Args:
            text (str): Input text to classify
        
        Returns:
            torch.Tensor: Tokenized and encoded input
        """
        # Tokenize and encode the text
        text = self.remove_dub_char(text)
        input_ids = []
        attention_masks = []
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'].to(self.device))
        attention_masks.append(encoded['attention_mask'].to(self.device))
        input_ids = torch.cat(input_ids, dim=0).to(self.device)
        attention_masks = torch.cat(attention_masks, dim=0).to(self.device)
        return input_ids, attention_masks

    def predict(self, text):
        """
        Make prediction on the input text
        
        Args:
            text (str): Input text to classify
        
        Returns:
            tuple: (predicted class, probabilities)
        """
        # Preprocess the text
        inputs,mask = self.preprocess_text(text)
        
        # Disable gradient calculation
        with torch.no_grad():
            # Get model outputs
            outputs = self.model(inputs,mask)
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(outputs, dim=1)
            
            # Get top predictions
            top_probs, top_classes = torch.topk(probabilities, k=1)
            
            return top_classes[0].cpu().numpy(), top_probs[0].cpu().numpy()

    def run(self):
        """
        Main Streamlit app runner
        """
        # Title and description
        st.title("📄 Toxic Classification")
        st.write("Enter text to classify")
        
        # Text input
        text_input = st.text_area(
            "Paste your text here", 
            height=250,
            placeholder="Enter the text you want to classify..."
        )
        
        # Prediction button
        if st.button("Classify Text"):
            if text_input.strip():
                # Make prediction
                top_classes, top_probs = self.predict(text_input)
                
                # Display results
                st.subheader("Classification Results")
                
                # Create columns for results
                cols = st.columns(3)
                
                for i, (cls, prob) in enumerate(zip(top_classes, top_probs)):
                    with cols[i]:
                        st.metric(
                            label=f"Top {i+1} Prediction", 
                            value=f"{self.class_names[cls]}", 
                            delta=f"{prob:.2%}"
                        )
                # Show input text details
                with st.expander("Input Text Details"):
                    st.write("**Original Text:**")
                    st.write(text_input)
                    st.write(f"**Text Length:** {len(text_input)} characters")
            else:
                st.warning("Please enter some text to classify")

def main():
    # Replace these with your actual model path and class names
    MODEL_PATH = 'toxic.pt'
    CLASS_NAMES = [ 
        'Non-toxic', 
        'Toxic'
    ]
    
    # Initialize and run the app
    app = TextClassificationApp(MODEL_PATH, CLASS_NAMES)
    app.run()

if __name__ == "__main__":
    main()