Spaces:

zerostratos
/

toxic_classification_model

Sleeping

App Files Files Community

zerostratos committed on Dec 15, 2024

Commit

5f49389

verified ·

1 Parent(s): 1373cd8

Update streamlitapp.py

Browse files

Files changed (1) hide show

streamlitapp.py +195 -0

streamlitapp.py CHANGED Viewed

	@@ -0,0 +1,195 @@

+import streamlit as st
+import torch
+import torch.nn as nn
+import transformers
+from transformers import AutoTokenizer,AutoModel
+import numpy as np
+import torch.nn as nn
+import matplotlib.pyplot as plt
+import torch.nn.functional as F
+class BCNN(nn.Module):
+    """PhoBERT encoder followed by a BiLSTM and two parallel 1-D convolutions.
+    The token representations from the pretrained encoder go through a
+    bidirectional LSTM, then through two convolutions (kernel sizes 4 and 5).
+    Each convolution output is max-pooled over the sequence axis, the pooled
+    vectors are concatenated, passed through dropout and projected to
+    ``output_dim`` logits.
+    """
+    def __init__(self, embedding_dim, output_dim,
+                 dropout, bidirectional_units, conv_filters):
+        """
+        Args:
+            embedding_dim (int): Size of the encoder's hidden states (LSTM input size).
+            output_dim (int): Number of output logits.
+            dropout (float): Dropout probability applied before the final layer.
+            bidirectional_units (int): Hidden size of each LSTM direction.
+            conv_filters (sequence of int): Output channels of the kernel-size-4
+                and kernel-size-5 convolutions, in that order.
+        """
+        super().__init__()
+        self.bert = AutoModel.from_pretrained('vinai/phobert-base-v2')
+        self.bidirectional_lstm = nn.LSTM(
+            embedding_dim, bidirectional_units, bidirectional=True, batch_first=True
+        )
+        # A bidirectional LSTM emits forward and backward states side by side.
+        lstm_channels = 2 * bidirectional_units
+        self.conv1 = nn.Conv1d(in_channels=lstm_channels, out_channels=conv_filters[0], kernel_size=4)
+        self.conv2 = nn.Conv1d(in_channels=lstm_channels, out_channels=conv_filters[1], kernel_size=5)
+        # The classifier consumes both pooled conv outputs concatenated, so its
+        # input width must follow conv_filters (previously hard-coded to 64).
+        self.fc = nn.Linear(conv_filters[0] + conv_filters[1], output_dim)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, b_input_ids, b_input_mask):
+        """Return logits for a batch of token ids and their attention mask.
+        Args:
+            b_input_ids (torch.Tensor): Token ids passed to the encoder.
+            b_input_mask (torch.Tensor): Attention mask passed to the encoder.
+        Returns:
+            torch.Tensor: Output of the final linear layer (unnormalised logits).
+        """
+        # Index 0 of the encoder output is the per-token hidden states.
+        encoded = self.bert(b_input_ids, attention_mask=b_input_mask)[0]
+        embedded, _ = self.bidirectional_lstm(encoded)
+        # Conv1d expects channels before the sequence axis.
+        embedded = embedded.permute(0, 2, 1)
+        conved_1 = F.relu(self.conv1(embedded))
+        conved_2 = F.relu(self.conv2(embedded))
+        # conved_n = [batch size, n_filters, sent len - kernel_size + 1]
+        # Max-pool over the whole remaining sequence axis, then drop it.
+        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
+        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
+        # pooled_n = [batch size, n_filters]
+        cat = self.dropout(torch.cat((pooled_1, pooled_2), dim=1))
+        # cat = [batch size, conv_filters[0] + conv_filters[1]]
+        return self.fc(cat)
+class TextClassificationApp:
+    """Streamlit front end that classifies user-entered text with a saved model."""
+    def __init__(self, model_path, class_names, model_name='vinai/phobert-base-v2'):
+        """
+        Initialize Streamlit Text Classification App
+        Args:
+            model_path (str): Path to the pre-trained .pt model file. If it does
+                not exist, ``toxic.pt`` in the working directory is used instead.
+            class_names (list): List of classification labels
+            model_name (str): Hugging Face model name for tokenization
+        """
+        # Set up Streamlit page
+        st.set_page_config(
+            page_title="Text Classification",
+            page_icon="📝",
+            layout="wide"
+        )
+        # Device configuration
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Load the model onto the same device the inputs are moved to
+        self.model = self._load_model(model_path)
+        # Store class names
+        self.class_names = class_names
+        # Maximum sequence length
+        self.max_length = 128
+    def _load_model(self, model_path):
+        """Load the checkpoint onto ``self.device`` and return it in eval mode.
+        Accepts either a fully pickled module or a state dict for ``BCNN``.
+        """
+        # SECURITY: torch.load unpickles the file, which can execute arbitrary
+        # code. Only load checkpoints that come from a trusted source.
+        # weights_only=False is required for a fully pickled module; recent
+        # PyTorch releases default to weights_only=True and would reject it.
+        load_kwargs = {'map_location': self.device, 'weights_only': False}
+        try:
+            checkpoint = torch.load(model_path, **load_kwargs)
+        except FileNotFoundError:
+            # Fall back to the checkpoint shipped in the working directory.
+            checkpoint = torch.load(r'toxic.pt', **load_kwargs)
+        if isinstance(checkpoint, nn.Module):
+            model = checkpoint
+        else:
+            # Only build the architecture (which downloads the encoder) when
+            # the file holds weights rather than a whole module.
+            EMBEDDING_DIM = 768
+            OUTPUT_DIM = 2
+            DROPOUT = 0.1
+            CONV_FILTERS = [32, 32]  # Number of filters for each kernel size (4 and 5)
+            BIDIRECTIONAL_UNITS = 128
+            model = BCNN(EMBEDDING_DIM, OUTPUT_DIM, DROPOUT, BIDIRECTIONAL_UNITS, CONV_FILTERS)
+            model.load_state_dict(checkpoint)
+        model.to(self.device)
+        model.eval()  # Set to evaluation mode
+        # NOTE(review): Streamlit re-executes the script on every interaction,
+        # so this load presumably repeats per click; consider caching it.
+        return model
+    def preprocess_text(self, text):
+        """
+        Preprocess input text for model prediction
+        Args:
+            text (str): Input text to classify
+        Returns:
+            tuple: (input_ids, attention_mask) tensors on ``self.device``, each
+                with a leading batch dimension of 1
+        """
+        # Tokenize, pad/truncate to max_length and return PyTorch tensors.
+        encoded = self.tokenizer(
+            text,
+            add_special_tokens=True,
+            max_length=self.max_length,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        )
+        input_ids = encoded['input_ids'].to(self.device)
+        attention_masks = encoded['attention_mask'].to(self.device)
+        return input_ids, attention_masks
+    def predict(self, text):
+        """
+        Make prediction on the input text
+        Args:
+            text (str): Input text to classify
+        Returns:
+            tuple: (predicted class indices, their probabilities) as NumPy
+                arrays of length 1
+        """
+        # Preprocess the text
+        inputs, mask = self.preprocess_text(text)
+        # Disable gradient calculation
+        with torch.no_grad():
+            # Get model outputs
+            outputs = self.model(inputs, mask)
+            # Apply softmax to get probabilities
+            probabilities = torch.softmax(outputs, dim=1)
+            # Keep only the single most likely class
+            top_probs, top_classes = torch.topk(probabilities, k=1)
+        return top_classes[0].cpu().numpy(), top_probs[0].cpu().numpy()
+    def run(self):
+        """
+        Main Streamlit app runner
+        """
+        # Title and description
+        st.title("📄 Text Classification")
+        st.write("Enter text to classify")
+        # Text input
+        text_input = st.text_area(
+            "Paste your text here",
+            height=250,
+            placeholder="Enter the text you want to classify..."
+        )
+        # Nothing to do until the button is pressed
+        if not st.button("Classify Text"):
+            return
+        if not text_input.strip():
+            st.warning("Please enter some text to classify")
+            return
+        # Make prediction
+        top_classes, top_probs = self.predict(text_input)
+        # Display results
+        st.subheader("Classification Results")
+        # Create columns for results
+        cols = st.columns(3)
+        for i, (cls, prob) in enumerate(zip(top_classes, top_probs)):
+            with cols[i]:
+                st.metric(
+                    label=f"Top {i+1} Prediction",
+                    value=f"{self.class_names[cls]}",
+                    delta=f"{prob:.2%}"
+                )
+        # Show input text details
+        with st.expander("Input Text Details"):
+            st.write("**Original Text:**")
+            st.write(text_input)
+            st.write(f"**Text Length:** {len(text_input)} characters")
+def main():
+    """Build the text classification app and start the Streamlit UI."""
+    # Local import keeps this function self-contained.
+    from pathlib import Path
+    # Resolve the checkpoint next to this script instead of using an absolute
+    # path from a development machine, which is unlikely to exist elsewhere.
+    MODEL_PATH = str(Path(__file__).resolve().parent / 'toxic.pt')
+    # Order matters: index i is the label shown for predicted class i.
+    CLASS_NAMES = [
+        'Non-toxic',
+        'Toxic'
+    ]
+    # Initialize and run the app
+    app = TextClassificationApp(MODEL_PATH, CLASS_NAMES)
+    app.run()
+# Start the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()