import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
class BCNN(nn.Module):
    def __init__(self, embedding_dim, output_dim,
                 dropout, bidirectional_units, conv_filters):
        super().__init__()
        self.bert = AutoModel.from_pretrained('vinai/phobert-base-v2')
        self.bidirectional_lstm = nn.LSTM(
            embedding_dim, bidirectional_units, bidirectional=True, batch_first=True
        )
        self.conv1 = nn.Conv1d(in_channels=2 * bidirectional_units, out_channels=conv_filters[0], kernel_size=4)
        self.conv2 = nn.Conv1d(in_channels=2 * bidirectional_units, out_channels=conv_filters[1], kernel_size=5)
        # Each conv branch is max-pooled to one feature vector, so the classifier
        # sees sum(conv_filters) features (32 + 32 = 64 with the defaults used below).
        self.fc = nn.Linear(sum(conv_filters), output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, b_input_ids, b_input_mask):
        # encoded = [batch size, sent len, emb dim] (PhoBERT last hidden state)
        encoded = self.bert(b_input_ids, b_input_mask)[0]
        embedded, _ = self.bidirectional_lstm(encoded)
        embedded = embedded.permute(0, 2, 1)
        conved_1 = F.relu(self.conv1(embedded))
        conved_2 = F.relu(self.conv2(embedded))
        # conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        # pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat((pooled_1, pooled_2), dim=1))
        # cat = [batch size, n_filters * len(filter_sizes)]
        result = self.fc(cat)
        return result
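
# Shape walkthrough for BCNN.forward (a sketch, assuming max_length=128 and the
# hyperparameters used in TextClassificationApp below: 128 LSTM units, [32, 32] filters):
#   input_ids / attention_mask : [batch, 128]
#   PhoBERT output             : [batch, 128, 768]
#   BiLSTM output              : [batch, 128, 256]
#   after permute              : [batch, 256, 128]
#   conv1 + max-pool           : [batch, 32]  (kernel size 4)
#   conv2 + max-pool           : [batch, 32]  (kernel size 5)
#   concatenated features      : [batch, 64] -> fc -> logits [batch, output_dim]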
class TextClassificationApp:
    def __init__(self, model_path, class_names, model_name='vinai/phobert-base-v2'):
        """
        Initialize Streamlit Text Classification App
        Args:
            model_path (str): Path to the pre-trained .pt model file
            class_names (list): List of classification labels
            model_name (str): Hugging Face model name for tokenization
        """
        # Set up Streamlit page
        st.set_page_config(
            page_title="Text Classification",
            page_icon="📝",
            layout="wide"
        )
        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Model hyperparameters (must match the architecture the checkpoint was trained with)
        EMBEDDING_DIM = 768
        OUTPUT_DIM = 2
        DROPOUT = 0.1
        CONV_FILTERS = [32, 32]  # Number of filters for each kernel size (4 and 5)
        BIDIRECTIONAL_UNITS = 128
        self.model = BCNN(EMBEDDING_DIM, OUTPUT_DIM, DROPOUT, BIDIRECTIONAL_UNITS, CONV_FILTERS)
        # The checkpoint stores the whole pickled model, so torch.load returns a
        # ready-to-use module that replaces the freshly built BCNN above.
        self.model = torch.load(model_path, map_location=self.device)
        self.model.to(self.device)
        self.model.eval()  # Set to evaluation mode
        # Store class names
        self.class_names = class_names
        # Maximum sequence length
        self.max_length = 128
    def preprocess_text(self, text):
        """
        Preprocess input text for model prediction
        Args:
            text (str): Input text to classify
        Returns:
            tuple: (input_ids, attention_mask) tensors of shape [1, max_length]
        """
        # Tokenize and encode the text
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)
        return input_ids, attention_mask
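
    # Example (a sketch, hypothetical call): for app = TextClassificationApp(...),
    #   ids, mask = app.preprocess_text("Xin chào")
    #   ids.shape, mask.shape  ->  torch.Size([1, 128]), torch.Size([1, 128])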
    def predict(self, text):
        """
        Make prediction on the input text
        Args:
            text (str): Input text to classify
        Returns:
            tuple: (predicted class indices, probabilities)
        """
        # Preprocess the text
        inputs, mask = self.preprocess_text(text)
        # Disable gradient calculation
        with torch.no_grad():
            # Get model outputs
            outputs = self.model(inputs, mask)
        # Apply softmax to get probabilities
        probabilities = torch.softmax(outputs, dim=1)
        # Get top predictions
        top_probs, top_classes = torch.topk(probabilities, k=1)
        return top_classes[0].cpu().numpy(), top_probs[0].cpu().numpy()
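
    # Example (a sketch, hypothetical call): with CLASS_NAMES = ['Non-toxic', 'Toxic'],
    #   classes, probs = app.predict("some comment text")
    #   classes -> array with one class index (0 or 1); probs -> its softmax probability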
    def run(self):
        """
        Main Streamlit app runner
        """
        # Title and description
        st.title("📄 Text Classification")
        st.write("Enter text to classify")
        # Text input
        text_input = st.text_area(
            "Paste your text here",
            height=250,
            placeholder="Enter the text you want to classify..."
        )
        # Prediction button
        if st.button("Classify Text"):
            if text_input.strip():
                # Make prediction
                top_classes, top_probs = self.predict(text_input)
                # Display results
                st.subheader("Classification Results")
                # Create columns for results
                cols = st.columns(3)
                for i, (cls, prob) in enumerate(zip(top_classes, top_probs)):
                    with cols[i]:
                        st.metric(
                            label=f"Top {i+1} Prediction",
                            value=f"{self.class_names[cls]}",
                            delta=f"{prob:.2%}"
                        )
                # Show input text details
                with st.expander("Input Text Details"):
                    st.write("**Original Text:**")
                    st.write(text_input)
                    st.write(f"**Text Length:** {len(text_input)} characters")
            else:
                st.warning("Please enter some text to classify")
def main():
    # Replace these with your actual model path and class names
    MODEL_PATH = '/workspaces/final-project-dl/toxic.pt'
    CLASS_NAMES = [
        'Non-toxic',
        'Toxic'
    ]
    # Initialize and run the app
    app = TextClassificationApp(MODEL_PATH, CLASS_NAMES)
    app.run()

if __name__ == "__main__":
    main()
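
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py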