import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from itertools import chain
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df = pd.read_csv('Emotion_classify_Data.csv')

"""
https://www.kaggle.com/code/vidhikishorwaghela/emonlp-decoding-human-feelings-with-deep-learning
"""

def preprocess_data(df):
    """
    Preprocess the data by renaming columns, removing rows with missing values, and removing extra spaces.
    """
    df = df.rename(columns={'Comment': 'text', 'Emotion': 'label'})
    df = df.dropna()
    df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
    df['label'] = df['label'].str.replace(r'\s+', ' ', regex=True).str.strip()
    return df

df = preprocess_data(df)

indep = df['text']
dep = df['label']

labelEncoder = LabelEncoder()
dep = labelEncoder.fit_transform(dep)
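
# The encoder maps each emotion string to an integer class id; printing the
# mapping makes the size of the output layer below easy to verify (this
# dataset is commonly distributed with three classes, e.g. anger/fear/joy)
print(dict(enumerate(labelEncoder.classes_)))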

# First split: hold out 60% of the data for training and 40% as a temporary pool
X_train, X_temp, y_train, y_temp = train_test_split(indep, dep, test_size=0.4, random_state=42)

# Second split: divide the temporary pool evenly into validation and test sets (20% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
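
# Sanity check on the resulting 60/20/20 split
print(len(X_train), len(X_val), len(X_test))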

# Hyperparameters
max_words = 10000  # Vocabulary size; adjust to your corpus
max_len = 100      # Maximum sequence length; longer texts are truncated

# Tokenization in plain Python/PyTorch: build a vocabulary from the training
# split, map texts to index sequences, and pad them to a fixed length

# Create a vocabulary from the training set.
# Index 0 is reserved for padding and index 1 for out-of-vocabulary (OOV)
# words, so the most frequent words are numbered from 2 upward.
def create_vocab(texts, max_words, oov_token='<OOV>'):
    # Count word frequencies across all training texts
    word_counts = Counter(chain.from_iterable(text.split() for text in texts))
    # Keep the most common words, reserving two ids for padding and OOV
    most_common = word_counts.most_common(max_words - 2)
    vocab = {word: idx + 2 for idx, (word, count) in enumerate(most_common)}
    vocab[oov_token] = 1
    return vocab
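
# Toy demonstration of the index layout (illustrative input, not dataset text):
# 0 = padding, 1 = OOV, frequent words from 2 upward
assert create_vocab(['a a a b b c'], 5) == {'a': 2, 'b': 3, 'c': 4, '<OOV>': 1}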

# Convert texts to sequences of indices
def texts_to_sequences(texts, vocab):
    sequences = []
    for text in texts:
        sequence = [vocab.get(word, vocab['<OOV>']) for word in text.split()]
        sequences.append(sequence)
    return sequences

# Pad sequences to a fixed length. Padding zeros go on the left (pre-padding)
# so that the last timestep, which the model reads, always corresponds to a
# real token rather than padding.
def pad_sequences(sequences, maxlen):
    padded_sequences = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for idx, sequence in enumerate(sequences):
        if len(sequence) > maxlen:
            sequence = sequence[:maxlen]
        padded_sequences[idx, maxlen - len(sequence):] = torch.tensor(sequence)
    return padded_sequences
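
# Toy demonstration of pre-padding (illustrative only)
assert pad_sequences([[5, 6]], maxlen=4).tolist() == [[0, 0, 5, 6]]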

# Create the vocabulary
vocab = create_vocab(X_train, max_words)

# Convert texts to sequences
X_train_seq = pad_sequences(texts_to_sequences(X_train, vocab), maxlen=max_len)
X_test_seq = pad_sequences(texts_to_sequences(X_test, vocab), maxlen=max_len)
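
# Tokenize the validation split the same way so generalisation can be
# monitored on held-out data (a minimal addition mirroring the test pipeline)
X_val_seq = pad_sequences(texts_to_sequences(X_val, vocab), maxlen=max_len)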

# Convert labels to tensors (LabelEncoder yields integer class ids)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

num_epochs = 10

# Create a custom dataset
class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# Create datasets
train_dataset = TextDataset(X_train_seq, y_train_tensor)
test_dataset = TextDataset(X_test_seq, y_test_tensor)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
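
# Validation loader, mirroring the test loader (a minimal addition)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
val_dataset = TextDataset(X_val_seq, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)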

# Define the model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        # padding_idx=0 keeps the padding embedding fixed at the zero vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # LSTM dropout only applies between stacked layers, so it is omitted
        # here: passing dropout with a single layer has no effect and
        # triggers a PyTorch warning
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.lstm(x)
        # Classify from the last timestep; with pre-padded inputs this is the
        # hidden state after the final real token. Raw logits are returned
        # because CrossEntropyLoss applies log-softmax internally.
        x = self.fc(x[:, -1, :])
        return x

# Instantiate the model: 16-dim embeddings, 64 hidden units, one output per class
model = LSTMModel(max_words, 16, 64, len(labelEncoder.classes_))

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the average loss over the epoch rather than the last batch only
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}')
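
# Evaluation and inference sketch: accuracy on the held-out splits and a
# prediction for a new sentence. The helper names `evaluate` and
# `predict_emotion` are illustrative, not part of the original script.
def evaluate(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            preds = model(inputs).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

print(f'Validation accuracy: {evaluate(val_loader):.4f}')
print(f'Test accuracy: {evaluate(test_loader):.4f}')

def predict_emotion(text):
    # Apply the same tokenization/padding as training, then decode the label
    seq = pad_sequences(texts_to_sequences([text], vocab), maxlen=max_len)
    model.eval()
    with torch.no_grad():
        pred = model(seq).argmax(dim=1).item()
    return labelEncoder.inverse_transform([pred])[0]

print(predict_emotion('i am so happy today'))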