import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Reference:
# https://www.kaggle.com/code/vidhikishorwaghela/emonlp-decoding-human-feelings-with-deep-learning
df = pd.read_csv('Emotion_classify_Data.csv')
def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace('\t', ' ').str.replace(' +', ' ', regex=True).str.strip()
    return df
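
# Quick illustrative check (toy rows, not from the dataset): preprocess_data renames the
# columns and collapses tabs and repeated spaces.
_demo = pd.DataFrame({'Comment': ['I am\tso   happy '], 'Emotion': [' joy ']})
print(preprocess_data(_demo))  # expected: text='I am so happy', label='joy'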
df = preprocess_data(df)
indep = df['text']
dep = df['label']
labelEncoder = LabelEncoder()
dep = labelEncoder.fit_transform(dep)
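# Optional sanity check (not in the reference notebook): see how LabelEncoder
# mapped the emotion strings to integer class ids.
print(labelEncoder.classes_)  # e.g. ['anger' 'fear' 'joy']; position in this array == encoded label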
# First split: Separate out a training set and a temporary set
X_train, X_temp, y_train, y_temp = train_test_split(indep, dep, test_size=0.4, random_state=42)
# Second split: Divide the temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
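# Optional sanity check: a 60/20/20 split into training, validation, and test sets.
print(len(X_train), len(X_val), len(X_test))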
import torch
import torch.nn as nn
class LSTMModel(nn.Module):
    def __init__(self, max_words, max_len):
        super(LSTMModel, self).__init__()
        # max_words is the vocabulary size; max_len is only needed when padding sequences,
        # so it is not passed to the embedding layer.
        self.embedding = nn.Embedding(num_embeddings=max_words, embedding_dim=16)
        # Note: LSTM dropout only applies between stacked layers, so it is omitted here
        # because num_layers=1.
        self.lstm = nn.LSTM(input_size=16, hidden_size=64, num_layers=1, batch_first=True)
        self.fc = nn.Linear(in_features=64, out_features=3)
        # Softmax yields class probabilities; drop it (as the model further below does)
        # if the outputs are fed to nn.CrossEntropyLoss, which expects raw logits.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.lstm(x)
        x = x[:, -1, :]  # Take the output at the last timestep of the sequence
        x = self.fc(x)
        x = self.softmax(x)
        return x
# Usage
max_words = 10000  # Adjust as per your vocabulary size
max_len = 100  # Adjust as per your sequence length
model = LSTMModel(max_words, max_len)

# Tokenization with Keras utilities (requires TensorFlow); a pure-PyTorch
# equivalent is defined below and is what the rest of the script uses.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)
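
# Optional shape check (illustrative, not in the reference notebook): a dummy batch of
# 2 padded sequences should produce one score per class for each sample.
_dummy = torch.randint(0, max_words, (2, max_len))
print(model(_dummy).shape)  # expected: torch.Size([2, 3])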
import torch
from collections import Counter
from itertools import chain
# Create a vocabulary from the training set
def create_vocab(texts, max_words, oov_token='<OOV>'):
    # Count word frequencies across the training texts
    word_counts = Counter(chain.from_iterable(text.split() for text in texts))
    # Keep the most frequent words; reserve index 0 for padding and index 1 for the OOV token
    most_common = word_counts.most_common(max_words - 2)
    # Map words to indices starting at 2 so they never collide with padding or OOV
    vocab = {word: idx + 2 for idx, (word, count) in enumerate(most_common)}
    vocab[oov_token] = 1
    return vocab
# Convert texts to sequences of indices
def texts_to_sequences(texts, vocab):
    sequences = []
    for text in texts:
        sequence = [vocab.get(word, vocab['<OOV>']) for word in text.split()]
        sequences.append(sequence)
    return sequences
# Pad sequences to a fixed length
def pad_sequences(sequences, maxlen):
    # Index 0 is the padding value. Sequences are left-padded so that the last
    # timestep the LSTM sees is a real token rather than padding.
    padded_sequences = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for idx, sequence in enumerate(sequences):
        if len(sequence) > maxlen:
            sequence = sequence[:maxlen]
        if sequence:
            padded_sequences[idx, maxlen - len(sequence):] = torch.tensor(sequence)
    return padded_sequences
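
# Illustrative check of the left-padding behaviour (toy indices, not real tokens):
# a 3-token sequence padded to length 6 becomes [0, 0, 0, t1, t2, t3].
print(pad_sequences([[5, 8, 2]], maxlen=6))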
# Create the vocabulary
vocab = create_vocab(X_train, max_words)
# Convert texts to sequences
X_train_seq = pad_sequences(texts_to_sequences(X_train, vocab), maxlen=max_len)
X_test_seq = pad_sequences(texts_to_sequences(X_test, vocab), maxlen=max_len)
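# Optional sanity check: both splits are LongTensors of shape (num_samples, max_len).
print(X_train_seq.shape, X_train_seq.dtype)
print(X_test_seq.shape, X_test_seq.dtype)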
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Convert labels to tensors
y_train_tensor = torch.tensor(y_train)
y_test_tensor = torch.tensor(y_test)
num_epochs = 10
# Create a custom dataset
class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]
# Create datasets
train_dataset = TextDataset(X_train_seq, y_train_tensor)
test_dataset = TextDataset(X_test_seq, y_test_tensor)
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
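# Optional sanity check (not in the reference notebook): pull one batch to confirm
# the shapes the model will receive.
_inputs, _labels = next(iter(train_loader))
print(_inputs.shape, _labels.shape)  # expected: torch.Size([32, 100]) torch.Size([32])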
# Define the model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Single-layer LSTM (dropout omitted: it only applies between stacked layers).
        # No Softmax here because nn.CrossEntropyLoss expects raw logits.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Classify from the output at the last timestep
        return x
# Instantiate the model
model = LSTMModel(max_words, 16, 64, 3)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Training loop
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
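
# Evaluation sketch (not part of the original script): accuracy on the held-out test set.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_loader:
        predicted = model(inputs).argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
print(f'Test accuracy: {correct / total:.4f}')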