Lin0He committed on
Commit
c5d0bfd
·
1 Parent(s): 3daf0b8

Delete text_summary.py

Browse files
Files changed (1) hide show
  1. text_summary.py +0 -248
text_summary.py DELETED
@@ -1,248 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- import torch
4
- import torch.optim as optim
5
- from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
6
- from torch import cuda
7
- from torch.utils.data import Dataset, DataLoader
8
- from transformers import GPT2LMHeadModel,GPT2Tokenizer, GPT2Config
9
- import argparse
10
-
11
-
12
- #from google.colab import drive
13
- #drive.mount('/content/drive')
14
-
15
# Compute device used by the training and inference helpers: the Apple Metal
# backend ('mps') when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
16
-
17
- #!pip install datasets
18
-
19
- '''
20
- from datasets import load_dataset
21
- dataset1 = load_dataset("dair-ai/emotion")
22
- for split, data in dataset1.items():
23
- data.to_csv(f"emotion_{split}.csv", index = None)
24
- '''
25
-
26
def read_reviews(data_path,
                 base_dir="/content/drive/MyDrive/Text_summary_datasets/",
                 sample_size=1250,
                 random_state=42):
    """Load several review CSVs and merge them into one training frame.

    Each file is read, rows containing nulls are dropped, its columns are
    normalised to ``Text`` / ``Summary``, and a ``training`` column is built as
    ``Text + 'TL;DR' + Summary``. A fixed-seed random sample of every file is
    then stacked into a single frame.

    Args:
        data_path: Iterable of CSV file names, resolved relative to
            ``base_dir``. Four names get dataset-specific column handling:
            ``emotion_train.csv``, ``amazon_review.csv``,
            ``kindle_review.csv`` and ``tweet_train.csv``. Any other file must
            already provide ``Text`` and ``Summary`` columns.
        base_dir: Directory containing the CSV files.
        sample_size: Maximum number of rows sampled from each file. Files with
            fewer rows contribute all of their rows instead of raising.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A ``pandas.DataFrame`` with exactly the columns ``Summary``, ``Text``
        and ``training`` and a fresh 0..n-1 index. It is empty (but has those
        columns) when ``data_path`` is empty.
    """
    import os  # local import: only needed to build the CSV paths

    # Integer class ids of the emotion dataset mapped to one-word "summaries".
    class_mapping = {0: 'sad', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    columns = ['Summary', 'Text', 'training']

    frames = []
    for path in data_path:
        df = pd.read_csv(os.path.join(base_dir, path))
        df.dropna(inplace=True)

        # Normalise every dataset to the shared Text / Summary schema.
        if path == "emotion_train.csv":
            df['Summary'] = df['label'].replace(class_mapping)
            df['Text'] = df['text']
        elif path == "kindle_review.csv":
            df['Text'] = df['reviewText']
            df['Summary'] = df['summary']
        elif path == "tweet_train.csv":
            df['Text'] = df['content']
            df['Summary'] = df['c_summary']
        # "amazon_review.csv" already uses the Text / Summary column names.

        df['training'] = df['Text'] + 'TL;DR' + df['Summary']

        # Cap the sample size so that small files do not raise in sample().
        frames.append(df.sample(n=min(sample_size, len(df)), random_state=random_state))

    if not frames:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(frames, ignore_index=True)[columns]
57
-
58
-
59
-
60
- #reviews.head(1800)
61
-
62
class GPT2ReviewDataset(Dataset):
    """Pre-tokenised, fixed-length review sequences for language-model training.

    Every review is encoded once at construction time and stored as a tensor,
    so indexing only returns the cached tensor.
    """

    def __init__(self, tokenizer, reviews, max_len):
        """Tokenise, pad/truncate and cache every review.

        Args:
            tokenizer: Object exposing ``encode``, ``eos_token`` and
                ``eos_token_id``.
            reviews: Iterable of review strings.
            max_len: Base sequence length; stored sequences are
                ``max_len + 4`` tokens long (see ``pad_truncate``).
        """
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews

        # The EOS marker is appended to the text before encoding; the tokenizer
        # itself caps the encoded sequence at 512 tokens.
        self.result = [
            torch.tensor(
                self.pad_truncate(
                    self.tokenizer.encode(text + self.eos, max_length = 512, truncation = True)
                )
            )
            for text in self.reviews
        ]

    def __len__(self):
        """Return the number of cached sequences."""
        return len(self.result)

    def __getitem__(self, item):
        """Return the cached token tensor at position ``item``."""
        return self.result[item]

    def pad_truncate(self, name):
        """Bring a list of token ids to exactly ``max_len + 4`` entries.

        Shorter lists are right-padded with the EOS id. Longer lists are cut
        to ``max_len + 3`` tokens and closed with one EOS id. A list that
        already has the target length is returned unchanged (same object).

        NOTE(review): the 4 extra slots presumably reserve room for the
        " TL;DR " separator tokens - confirm.
        """
        target_len = self.max_len + 4
        current_len = len(name)

        if current_len == target_len:
            return name
        if current_len > target_len:
            return name[:target_len - 1] + [self.eos_id]
        return name + [self.eos_id] * (target_len - current_len)
101
-
102
def train(model, optimizer, dl, epochs,
          checkpoint_path='/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth',
          checkpoint_every=50):
    """Fine-tune ``model`` as a causal language model on the batches in ``dl``.

    Each batch is used as both input and label (``model(batch, labels=batch)``)
    and the first element of the model output is taken as the loss.

    Args:
        model: Module called as ``model(batch, labels=batch)`` whose output
            exposes the loss at index 0.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dl: Iterable yielding token-id tensors.
        epochs: Number of passes over ``dl``.
        checkpoint_path: Where the whole model is written with ``torch.save``.
            Pass ``None`` to disable checkpointing.
        checkpoint_every: Log the loss and write a checkpoint every this many
            batches. A falsy value disables the periodic log/checkpoint. A
            checkpoint is also written at the end of every epoch.
    """
    # Run on whatever device the model already lives on, so that batches and
    # weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Hugging Face `from_pretrained` returns models in eval mode; training
    # needs dropout and friends switched back on.
    model.train()

    for _ in range(epochs):
        for idx, batch in enumerate(dl):
            batch = batch.to(target_device)
            with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()

            # Pickling the full model is expensive, so do it periodically
            # instead of after every single batch.
            if checkpoint_every and idx % checkpoint_every == 0:
                print("loss: %f, %d" % (loss.item(), idx))
                if checkpoint_path is not None:
                    torch.save(model, checkpoint_path)

        # Always persist the weights reached at the end of the epoch.
        if checkpoint_path is not None:
            torch.save(model, checkpoint_path)
116
-
117
def main():
    """Fine-tune pretrained GPT-2 on the four review datasets and save it.

    Loads and samples the CSV datasets, wraps the ``training`` column in a
    ``GPT2ReviewDataset``, trains for 3 epochs with Adam (lr 3e-4, batch size
    8) and finally pickles the whole model to Google Drive.
    """
    data_path = ["emotion_train.csv", "kindle_review.csv", "amazon_review.csv", "tweet_train.csv"]
    reviews = read_reviews(data_path)

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    config = GPT2Config.from_pretrained("gpt2")
    model.config = config

    # Move the weights to the training device *before* building the optimizer,
    # and leave the eval mode that `from_pretrained` puts the model in.
    model.to(device)
    model.train()

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    max_length = 250
    optimizer = optim.Adam(params=model.parameters(), lr=3e-4)

    dataset = GPT2ReviewDataset(tokenizer, reviews['training'], max_len=max_length)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    train(model=model, optimizer=optimizer, dl=dataloader, epochs=3)

    torch.save(model, '/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth')
137
-
138
-
139
def topk(probs, n=9):
    """Sample one index from the ``n`` highest-scoring entries of ``probs``.

    Args:
        probs: 1-D tensor of raw scores (logits); softmax is applied here.
        n: Size of the candidate pool.

    Returns:
        The sampled index into ``probs`` as a Python ``int``. Sampling uses
        NumPy's global random state.
    """
    # Raw scores -> probabilities, then keep only the n most likely entries.
    distribution = probs.softmax(dim=-1)
    pool_probs, pool_ids = distribution.topk(k=n)

    # Renormalise the pool so it is a valid distribution again, and hand it to
    # NumPy (which needs a CPU array) for the weighted draw.
    weights = (pool_probs / pool_probs.sum()).detach().cpu().numpy()
    pick = np.random.choice(n, 1, p=weights)

    return int(pool_ids[int(pick[0])])
157
-
158
def model_infer(model, tokenizer, review, max_length=30):
    """Autoregressively extend ``review`` with up to ``max_length`` tokens.

    At every step the whole sequence so far is fed to the model, the logits of
    the last position are passed to ``topk`` to sample the next token, and
    generation stops as soon as the EOS token is sampled.

    Args:
        model: Causal LM whose output exposes ``.logits``. It is switched to
            eval mode as a side effect.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        review: Prompt text (callers append the ``TL;DR`` marker themselves).
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded prompt followed by the generated tokens. The EOS token
        itself is never included.

    Raises:
        ValueError: If ``review`` encodes to no tokens at all.
    """
    result = tokenizer.encode(review)
    if not result:
        raise ValueError("review must encode to at least one token")

    eos_id = tokenizer.eos_token_id

    # Feed inputs to wherever the weights live; fall back to the module-level
    # device for a parameter-less model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # A checkpoint pickled mid-training is still in training mode; disable
    # dropout so that sampling is the only source of randomness.
    model.eval()

    with torch.no_grad():
        for _ in range(max_length):
            input_ids = torch.tensor(result).unsqueeze(0).to(target_device)
            logits = model(input_ids).logits[0, -1]
            next_id = topk(logits)

            # Check EOS on every sampled token, including the very first one.
            if next_id == eos_id:
                break
            result.append(next_id)

    return tokenizer.decode(result)
192
-
193
def interface(input):
    """Summarise ``input`` with the fine-tuned checkpoint and print the result.

    Six candidate summaries are sampled for the prompt ``input + "TL;DR"``;
    they are sorted by length and the fourth-shortest one is chosen.

    Args:
        input: Text to summarise.

    Returns:
        The chosen summary string (it is also printed, prefixed with
        ``summary:``).
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # SECURITY NOTE: the checkpoint is a full pickled model, and unpickling
    # executes arbitrary code - only load checkpoint files you trust.
    # `weights_only=False` is required for full-model pickles on recent
    # PyTorch releases. The weights are mapped to the device selected at
    # module level rather than unconditionally to 'mps'.
    model = torch.load('text_summary_4sets_2_550.pth',
                       map_location=torch.device(device),
                       weights_only=False)

    prompt = input + "TL;DR"
    candidates = []
    for _ in range(6):
        generated = model_infer(model, tokenizer, prompt).strip()
        # Drop the echoed prompt (input plus the 5-character "TL;DR" marker).
        candidates.append(generated[len(input) + 5:])

    chosen = sorted(candidates, key=len)[3]
    print("summary:", chosen)
    return chosen
208
-
209
- '''
210
-
211
-
212
- sample = 'Today was a hard day. I woke up feeling anxious and stressed about a meeting I had at work. The meeting did not go as I had hoped and I left disappointed. I tried to focus on other things and stay positive, but it was hard. I spent most of the evening starving and eating junk food. Not the best way to deal with my emotions, but it’s something I’m working on. Hope tomorrow will be a better day.TL;DR'
213
-
214
- summary = model_infer(model, tokenizer, sample).strip()
215
-
216
- sample
217
-
218
- summary[len(sample):]
219
-
220
- sample = 'Today was much better than yesterday. I wake up feeling more rested and ready to tackle the day. I had a productive day at work and even managed to finish a project I was struggling with. After work, I met some friends for a yoga class and it was just what I needed to relax and unwind. We went out for dinner afterwards and had a really nice time. Overall, it was a much better day than yesterday and I feel more positive about things.TL;DR'
221
-
222
- summary = model_infer(model, tokenizer, sample).strip()
223
-
224
- summary[len(sample):]
225
-
226
- sample = 'Today was a beautiful day. I had a good night’s sleep and was ready to start the day. I went to work and had a productive morning. I even managed to finish a project I’d been working on for weeks. After work, I ran to clear my head. It was a beautiful day and the weather was perfect for it. I came home and cooked dinner with my partner. We had a nice conversation over dinner and then spent the evening watching a movie. Overall, it was a pretty relaxing and enjoyable day.'
227
-
228
- summary = model_infer(model, tokenizer, sample + 'TL;DR').strip()
229
- summary[len(sample)+5:]
230
- '''
231
-
232
if __name__ == '__main__':
    # Command-line front end: `--train` runs fine-tuning, `--infer TEXT`
    # summarises TEXT. `--train` wins when both are given.
    cli = argparse.ArgumentParser(description= "parser")
    cli.add_argument("--train", action="store_true", help="Train the model")
    cli.add_argument("--infer", type=str, help="Interact with the model")
    options = cli.parse_args()

    wants_training = options.train
    text_to_summarise = options.infer

    if not (wants_training or text_to_summarise):
        # Neither flag supplied (or --infer given an empty string).
        print("No valid option provided. Use --train or --infer.")
    elif wants_training:
        main()
    else:
        interface(text_to_summarise)