Lin0He committed on
Commit
74353d2
·
1 Parent(s): 1a8668c

Upload 2 files

Browse files
Files changed (2) hide show
  1. text_summary.py +245 -0
  2. text_summary_4sets_2_550.pth +3 -0
text_summary.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.optim as optim
5
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
6
+ from torch import cuda
7
+ from transformers import GPT2LMHeadModel,GPT2Tokenizer, GPT2Config
8
+ import argparse
9
+
10
+
11
+ #from google.colab import drive
12
+ #drive.mount('/content/drive')
13
+
14
# Select the compute device once, at import time: the Metal backend ('mps')
# when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
15
+
16
+ #!pip install datasets
17
+
18
+ '''
19
+ from datasets import load_dataset
20
+ dataset1 = load_dataset("dair-ai/emotion")
21
+ for split, data in dataset1.items():
22
+ data.to_csv(f"emotion_{split}.csv", index = None)
23
+ '''
24
+
25
def read_reviews(data_path,
                 base_dir="/content/drive/MyDrive/Text_summary_datasets/",
                 sample_size=1250):
    """Load the supported CSV datasets and merge them into one training frame.

    Each file is read from ``base_dir``, rows containing nulls are dropped,
    and its dataset-specific columns are normalised to three columns:

    * ``Text``     - the source text,
    * ``Summary``  - the target summary (for ``emotion_train.csv`` the
      numeric ``label`` is replaced by its emotion word),
    * ``training`` - ``Text + 'TL;DR' + Summary``, the string the language
      model is fine-tuned on.

    Args:
        data_path: Iterable of CSV file names. Supported names are
            ``emotion_train.csv``, ``amazon_review.csv``,
            ``kindle_review.csv`` and ``tweet_train.csv``.
        base_dir: Directory that contains the CSV files.
        sample_size: Number of rows drawn from every file (fixed seed 42).

    Returns:
        A ``pandas.DataFrame`` with columns ``Summary``, ``Text`` and
        ``training`` and a fresh 0..n-1 index. Empty (but with those
        columns) when ``data_path`` is empty.

    Raises:
        ValueError: If a file name is not one of the supported datasets.
            ``DataFrame.sample`` is also expected to raise ``ValueError``
            when a file has fewer than ``sample_size`` usable rows.
    """
    import os  # local import so this function does not rely on file-level imports

    output_columns = ['Summary', 'Text', 'training']
    emotion_names = {0: 'sad', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    # file name -> (column holding the text, column holding the summary)
    source_columns = {
        "emotion_train.csv": ("text", "label"),
        "amazon_review.csv": ("Text", "Summary"),
        "kindle_review.csv": ("reviewText", "summary"),
        "tweet_train.csv": ("content", "c_summary"),
    }

    frames = []
    for path in data_path:
        if path not in source_columns:
            raise ValueError(
                f"Unsupported dataset file {path!r}; expected one of {sorted(source_columns)}"
            )
        text_col, summary_col = source_columns[path]

        df = pd.read_csv(os.path.join(base_dir, path))
        # Remove null values:
        df.dropna(inplace=True)

        # Read both source series before writing, because for some datasets
        # the source and target column names coincide.
        text = df[text_col]
        summary = df[summary_col]
        if path == "emotion_train.csv":
            # Replace the numerical class ids with their emotion words.
            summary = summary.replace(emotion_names)

        df['Text'] = text
        df['Summary'] = summary
        df['training'] = df['Text'] + 'TL;DR' + df['Summary']

        # Sampling depends only on the row index and the seed, so selecting
        # the three output columns first does not change which rows are drawn.
        frames.append(df[output_columns].sample(n=sample_size, random_state=42))

    if not frames:
        return pd.DataFrame(columns=output_columns)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(frames, ignore_index=True)[output_columns]
56
+
57
+
58
+
59
+ #reviews.head(1800)
60
+
61
class GPT2ReviewDataset(Dataset):
    """Pre-tokenised, fixed-length training sequences.

    Every input string gets the tokenizer's EOS token appended, is encoded
    (the encoder truncates at 512 tokens) and is then padded or cut by
    ``pad_truncate`` so that all stored tensors have ``max_len + 4`` ids.
    """

    def __init__(self, tokenizer, reviews, max_len):
        """Encode every string in ``reviews`` eagerly.

        Args:
            tokenizer: Object providing ``eos_token``, ``eos_token_id`` and
                ``encode(text, max_length=..., truncation=...)``.
            reviews: Iterable of training strings.
            max_len: Base sequence length; stored sequences are 4 ids longer.
        """
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        # Encode -> pad/truncate -> tensor, one entry per input string.
        self.result = [
            torch.tensor(
                self.pad_truncate(
                    self.tokenizer.encode(text + self.eos, max_length = 512, truncation = True)
                )
            )
            for text in self.reviews
        ]

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.result)

    def __getitem__(self, item):
        """Return the token tensor stored at position ``item``."""
        return self.result[item]

    def pad_truncate(self, name):
        """Bring a list of token ids to exactly ``max_len + 4`` entries.

        Shorter lists are right-padded with the EOS id; longer lists are cut
        to ``max_len + 3`` ids and closed with one EOS id; lists that already
        have the target length are returned unchanged.
        """
        target = self.max_len + 4
        current = len(name)
        if current == target:
            return name
        if current > target:
            return name[:target - 1] + [self.eos_id]
        return name + [self.eos_id] * (target - current)
98
+
99
def train(model, optimizer, dl, epochs,
          save_path='/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth',
          save_every=50,
          log_every=50):
    """Fine-tune ``model`` as a causal language model on the batches in ``dl``.

    Each batch is used both as input and as labels, and the first element of
    the model output is taken as the loss.

    Args:
        model: Module called as ``model(batch, labels=batch)``.
        optimizer: Optimizer over ``model``'s parameters.
        dl: Iterable of token-id tensors (e.g. a ``DataLoader``).
        epochs: Number of passes over ``dl``.
        save_path: Where ``torch.save(model, ...)`` writes checkpoints.
            Pass ``None`` to disable checkpointing.
        save_every: Write a checkpoint every this many steps (and at the end
            of every epoch). A falsy value disables the periodic saves.
        log_every: Print the loss every this many steps; falsy disables it.

    NOTE(review): this function does not change the model's train/eval mode.
    Hugging Face documents that ``from_pretrained`` returns models in eval
    mode, so dropout is presumably inactive here - confirm that is intended.
    """
    # Move batches to wherever the model's weights live, so the two can never
    # end up on different devices regardless of what the caller did.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for _ in range(epochs):
        idx = -1  # stays -1 when the loader yields no batches
        for idx, batch in enumerate(dl):
            if model_device is not None:
                batch = batch.to(model_device)

            with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()

            # Serialising the whole model is expensive; do it periodically
            # instead of after every optimizer step.
            if save_path is not None and save_every and idx % save_every == 0:
                torch.save(model, save_path)
            if log_every and idx % log_every == 0:
                print("loss: %f, %d" % (loss.item(), idx))

        # Make sure the on-disk checkpoint reflects the end of the epoch.
        if save_path is not None and idx >= 0:
            torch.save(model, save_path)
113
+
114
def main():
    """Fine-tune the pretrained ``gpt2`` checkpoint on the four summary datasets.

    Loads and merges the datasets, builds the tokenised dataset and loader,
    trains for three epochs and writes the final model to Google Drive.
    """
    data_path = ["emotion_train.csv","kindle_review.csv", "amazon_review.csv", "tweet_train.csv"]
    reviews = read_reviews(data_path)

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    config = GPT2Config.from_pretrained("gpt2")
    model.config = config
    # The training loop moves batches to `device`; the weights must live on
    # the same device or the forward pass fails with a device mismatch.
    model.to(device)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    max_length = 250
    # Create the optimizer after the model has been placed on its device.
    optimizer = optim.Adam(params = model.parameters(), lr=3e-4)

    dataset = GPT2ReviewDataset(tokenizer, reviews['training'], max_len = max_length)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    train(model=model, optimizer=optimizer, dl=dataloader, epochs=3)

    torch.save(model, '/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth')
134
+
135
+
136
def topk(probs, n=9):
    """Sample one index from the ``n`` highest-scoring entries of ``probs``.

    ``probs`` holds raw scores (logits); they are soft-maxed, the ``n``
    largest probabilities are renormalised to sum to one, and a single index
    is drawn from that reduced distribution with NumPy's global RNG.

    Returns:
        The sampled position in ``probs`` as a plain ``int``.
    """
    distribution = torch.softmax(probs, dim=-1)

    # Keep only the n most likely candidates and their positions.
    top_probs, top_ids = torch.topk(distribution, k=n)

    # Renormalise the shortlist and hand it to NumPy on the CPU.
    weights = (top_probs / top_probs.sum()).cpu().detach().numpy()

    # Draw one shortlist slot according to the renormalised weights.
    picked = np.random.choice(n, 1, p=weights)
    return int(top_ids[picked][0])
154
+
155
def model_infer(model, tokenizer, review, max_length=30):
    """Continue ``review`` token by token using top-k sampling.

    The prompt is encoded, then the model is repeatedly run on the sequence
    so far and ``topk`` picks the next token from the logits of the last
    position. Generation stops as soon as the EOS token is sampled or after
    ``max_length`` new tokens.

    Args:
        model: Causal LM whose output exposes ``.logits``.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        review: Prompt text (callers append the ``TL;DR`` marker themselves).
        max_length: Upper bound on the number of generated tokens.

    Returns:
        The decoded prompt followed by the generated continuation; the EOS
        token itself is never included.

    Raises:
        ValueError: If ``review`` encodes to no tokens.
    """
    result = tokenizer.encode(review)
    if not result:
        raise ValueError("review must encode to at least one token")

    # Run on the device that holds the model's weights; fall back to the
    # module-level default only for a parameter-less model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    with torch.no_grad():
        for _ in range(max_length):
            # Feed the whole sequence so far and sample from the last position.
            input_ids = torch.tensor(result).unsqueeze(0).to(target_device)
            logits = model(input_ids).logits[0, -1]
            next_id = topk(logits)

            # Every sampled token, including the very first, ends generation
            # when it is EOS.
            if next_id == tokenizer.eos_token_id:
                break
            result.append(next_id)

    return tokenizer.decode(result)
189
+
190
def interface(input):
    """Summarise ``input`` with the fine-tuned checkpoint and print the result.

    Six candidate summaries are sampled for the prompt ``input + "TL;DR"``;
    they are ordered by length and the one at index 3 is printed after the
    label ``summary:`` and returned.

    Args:
        input: Text to summarise. (The name shadows the builtin but is kept
            because it is part of the function's signature.)

    Returns:
        The selected summary string.
    """
    marker = "TL;DR"
    n_samples = 6

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # NOTE(security): the checkpoint is a fully pickled model, so loading it
    # executes arbitrary pickle code - only load files you trust.
    # Recent PyTorch defaults to weights_only=True, which rejects such files;
    # older releases do not know the keyword at all, hence the fallback.
    try:
        model = torch.load('text_summary_4sets_2_550.pth', map_location=device, weights_only=False)
    except TypeError:
        model = torch.load('text_summary_4sets_2_550.pth', map_location=device)
    model.eval()

    prompt = input + marker
    # The generated text is whatever follows the prompt's final marker. Split
    # on the marker rather than on a character offset, which breaks when
    # decoding does not reproduce the prompt character for character.
    markers_in_prompt = prompt.count(marker)

    result_text = []
    for _ in range(n_samples):
        generated = model_infer(model, tokenizer, prompt)
        result_text.append(generated.split(marker, markers_in_prompt)[-1].strip())

    summary = sorted(result_text, key=len)[n_samples // 2]
    print("summary:", summary)
    return summary
205
+
206
+ '''
207
+
208
+
209
+ sample = 'Today was a hard day. I woke up feeling anxious and stressed about a meeting I had at work. The meeting did not go as I had hoped and I left disappointed. I tried to focus on other things and stay positive, but it was hard. I spent most of the evening starving and eating junk food. Not the best way to deal with my emotions, but it’s something I’m working on. Hope tomorrow will be a better day.TL;DR'
210
+
211
+ summary = model_infer(model, tokenizer, sample).strip()
212
+
213
+ sample
214
+
215
+ summary[len(sample):]
216
+
217
+ sample = 'Today was much better than yesterday. I wake up feeling more rested and ready to tackle the day. I had a productive day at work and even managed to finish a project I was struggling with. After work, I met some friends for a yoga class and it was just what I needed to relax and unwind. We went out for dinner afterwards and had a really nice time. Overall, it was a much better day than yesterday and I feel more positive about things.TL;DR'
218
+
219
+ summary = model_infer(model, tokenizer, sample).strip()
220
+
221
+ summary[len(sample):]
222
+
223
+ sample = 'Today was a beautiful day. I had a good night’s sleep and was ready to start the day. I went to work and had a productive morning. I even managed to finish a project I’d been working on for weeks. After work, I ran to clear my head. It was a beautiful day and the weather was perfect for it. I came home and cooked dinner with my partner. We had a nice conversation over dinner and then spent the evening watching a movie. Overall, it was a pretty relaxing and enjoyable day.'
224
+
225
+ summary = model_infer(model, tokenizer, sample + 'TL;DR').strip()
226
+ summary[len(sample)+5:]
227
+ '''
228
+
229
if __name__ == '__main__':
    # Command-line entry point: `--train` fine-tunes the model, `--infer TEXT`
    # summarises TEXT. When both are given, training takes precedence.
    parser = argparse.ArgumentParser(description= "parser")
    parser.add_argument("--train", action="store_true", help="Train the model")
    parser.add_argument("--infer", type=str, help="Interact with the model")
    args = parser.parse_args()

    if not (args.train or args.infer):
        # Neither flag given (or --infer was an empty string).
        print("No valid option provided. Use --train or --infer.")
    elif args.train:
        main()
    else:
        interface(args.infer)
text_summary_4sets_2_550.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02371a7ea71a71db6526fe99a4388877cd54d94652e770b922031687ddaf9169
3
+ size 510431223