Lin0He committed on
Commit
9d6d2ac
·
1 Parent(s): 281a5cc

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. text_summary.py +248 -0
text_summary.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.optim as optim
5
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
6
+ from torch import cuda
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from transformers import GPT2LMHeadModel,GPT2Tokenizer, GPT2Config
9
+ import argparse
10
+
11
+
12
+ #from google.colab import drive
13
+ #drive.mount('/content/drive')
14
+
15
# Device used for both training and inference: Apple-silicon MPS when it is
# available, otherwise the CPU. `getattr` guards torch builds that do not
# expose `torch.backends.mps` at all, where the attribute access itself
# would raise AttributeError.
_mps_backend = getattr(torch.backends, 'mps', None)
device = 'mps' if _mps_backend is not None and _mps_backend.is_available() else 'cpu'
16
+
17
+ #!pip install datasets
18
+
19
+ '''
20
+ from datasets import load_dataset
21
+ dataset1 = load_dataset("dair-ai/emotion")
22
+ for split, data in dataset1.items():
23
+ data.to_csv(f"emotion_{split}.csv", index = None)
24
+ '''
25
+
26
def read_reviews(data_path, base_dir="/content/drive/MyDrive/Text_summary_datasets/", n_samples=1250):
    """Load, normalise and sub-sample the review/summary CSV files.

    Every file is read from ``base_dir``, rows containing any null value are
    dropped, and the file-specific columns are mapped onto a common schema:

    * ``Text``     - the source text
    * ``Summary``  - the target summary (for ``emotion_train.csv`` the numeric
      ``label`` is replaced by its emotion word)
    * ``training`` - ``Text + 'TL;DR' + Summary``, the string the model is
      fine-tuned on

    Args:
        data_path: iterable of CSV file names. The names
            ``emotion_train.csv``, ``amazon_review.csv``,
            ``kindle_review.csv`` and ``tweet_train.csv`` get their columns
            remapped; any other file is used as-is and must already contain
            the three output columns.
        base_dir: directory the file names are resolved against.
        n_samples: maximum number of rows drawn from each file (sampling uses
            a fixed ``random_state=42``). Files with fewer rows contribute
            all of their rows instead of raising.

    Returns:
        A ``pandas.DataFrame`` with columns ``['Summary', 'Text', 'training']``
        and a fresh 0..N-1 index; empty (but with those columns) when
        ``data_path`` is empty.

    Raises:
        KeyError: if a file lacks the columns expected for it.
    """
    import os  # local import: keeps this block self-contained

    emotion_names = {0: 'sad', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    # file name -> (text column, summary column, optional label->word mapping)
    sources = {
        "emotion_train.csv": ('text', 'label', emotion_names),
        "amazon_review.csv": ('Text', 'Summary', None),
        "kindle_review.csv": ('reviewText', 'summary', None),
        "tweet_train.csv": ('content', 'c_summary', None),
    }
    columns = ['Summary', 'Text', 'training']

    frames = []
    for path in data_path:
        df = pd.read_csv(os.path.join(base_dir, path)).dropna()

        if path in sources:
            text_col, summary_col, label_names = sources[path]
            text = df[text_col]
            summary = df[summary_col]
            if label_names is not None:
                # Replace the numerical class ids with words using the mapping.
                summary = summary.replace(label_names)
            df = df.assign(Text=text, Summary=summary,
                           training=text + 'TL;DR' + summary)

        # Cap the sample size so small files do not make `sample` raise.
        sampled = df.sample(n=min(n_samples, len(df)), random_state=42)
        frames.append(sampled[columns])

    if not frames:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(frames, ignore_index=True)
57
+
58
+
59
+
60
+ #reviews.head(1800)
61
+
62
class GPT2ReviewDataset(Dataset):
    """Fixed-length token-id tensors built from ``text + 'TL;DR' + summary`` strings.

    Each input string gets the tokenizer's EOS token appended, is encoded
    (truncated to 512 ids by the tokenizer call) and is then padded or cut by
    ``pad_truncate`` so that every item holds exactly ``max_len + 4`` ids.
    All items are tokenised eagerly in ``__init__``.
    """

    def __init__(self, tokenizer, reviews, max_len):
        """Tokenise every string in ``reviews`` up front.

        Args:
            tokenizer: object exposing ``eos_token``, ``eos_token_id`` and
                ``encode(text, max_length=..., truncation=...)``.
            reviews: iterable of training strings.
            max_len: length budget used by ``pad_truncate``.
        """
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        # Encode (with EOS appended), normalise the length, wrap in a tensor.
        self.result = [
            torch.tensor(
                self.pad_truncate(
                    self.tokenizer.encode(text + self.eos, max_length=512, truncation=True)
                )
            )
            for text in self.reviews
        ]

    def __len__(self):
        """Number of tokenised items."""
        return len(self.result)

    def __getitem__(self, item):
        """Return the pre-computed id tensor at position ``item``."""
        return self.result[item]

    def pad_truncate(self, name):
        """Return ``name`` brought to exactly ``max_len + 4`` ids.

        Shorter lists are right-padded with the EOS id; longer lists are cut
        and their last kept position is replaced by the EOS id; lists that
        already have the target length are returned unchanged.
        """
        extra_length = 4
        target = self.max_len + extra_length
        current = len(name)

        if current < target:
            return name + [self.eos_id] * (target - current)
        if current > target:
            return name[:target - 1] + [self.eos_id]
        return name
101
+
102
def train(model, optimizer, dl, epochs,
          checkpoint_path='/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth',
          save_every=1):
    """Fine-tune ``model`` as a causal language model on the batches of ``dl``.

    Each batch is used as both input and labels, so the loss is the model's
    own language-modelling loss (``output[0]``).

    Args:
        model: module callable as ``model(batch, labels=batch)`` whose first
            output is the loss.
        optimizer: optimizer built over ``model``'s parameters.
        dl: iterable yielding tensors of token ids.
        epochs: number of passes over ``dl``.
        checkpoint_path: file the whole model object is pickled to with
            ``torch.save``.
        save_every: checkpoint every ``save_every`` steps (``1`` keeps the
            previous save-after-every-step behaviour; ``0``/``None``
            disables in-loop checkpointing).

    NOTE: the module's train/eval mode is left exactly as the caller set it;
    this function does not call ``model.train()``.
    """
    # Batches are moved to `device` below, so the model has to live there as
    # well; otherwise the forward pass fails with a device mismatch.
    model.to(device)

    for epoch in range(epochs):
        for idx, batch in enumerate(dl):
            print(idx)
            with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()

            if save_every and idx % save_every == 0:
                torch.save(model, checkpoint_path)

            if idx % 50 == 0:
                # .item() turns the 0-dim loss tensor into a plain float.
                print("loss: %f, %d" % (loss.item(), idx))
116
+
117
def main():
    """Fine-tune the stock ``gpt2`` checkpoint on the four summary datasets.

    Loads and samples the CSV files via ``read_reviews``, tokenises the
    ``training`` column into fixed-length examples, trains for 3 epochs with
    Adam (lr=3e-4, batch size 8, shuffled) and finally pickles the whole
    model to the Drive checkpoint path.
    """
    data_path = ["emotion_train.csv", "kindle_review.csv", "amazon_review.csv", "tweet_train.csv"]
    reviews = read_reviews(data_path)

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    # Reload the stock 'gpt2' configuration and attach it to the model.
    config = GPT2Config.from_pretrained("gpt2")
    model.config = config
    # Move the model to the training device *before* the optimizer is built,
    # so the batches (sent to `device` inside train) and the weights agree.
    model.to(device)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    max_length = 250
    optimizer = optim.Adam(params=model.parameters(), lr=3e-4)

    dataset = GPT2ReviewDataset(tokenizer, reviews['training'], max_len=max_length)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    train(model=model, optimizer=optimizer, dl=dataloader, epochs=3)

    torch.save(model, '/content/drive/MyDrive/Text_summary_datasets/text_summary_4sets.pth')
137
+
138
+
139
def topk(probs, n=9):
    """Sample one token id from the ``n`` most likely entries of a logits vector.

    Args:
        probs: 1-D tensor of unnormalised scores (logits), one per token id.
        n: size of the candidate pool. It is clamped to the number of scores,
            so a pool larger than the vocabulary no longer raises.

    Returns:
        The sampled token id as a Python ``int``. Sampling uses NumPy's global
        random state (``np.random.choice``).
    """
    # The scores are softmaxed to convert them to probabilities.
    probs = torch.softmax(probs, dim=-1)

    # torch.topk raises if k exceeds the number of entries, so clamp it.
    k = min(n, probs.shape[-1])
    top_probs, top_ids = torch.topk(probs, k=k)

    # Move to CPU/NumPy and renormalise the pool in float64: np.random.choice
    # rejects weight vectors whose sum drifts from 1, which low-precision
    # tensors can trigger.
    weights = top_probs.detach().cpu().numpy().astype(np.float64)
    weights = weights / weights.sum()

    # Draw a position inside the pool according to the renormalised weights.
    choice = int(np.random.choice(k, p=weights))

    return int(top_ids[choice])
157
+
158
def model_infer(model, tokenizer, review, max_length=30):
    """Continue ``review`` with up to ``max_length`` sampled tokens.

    The prompt is encoded, then the model is repeatedly run on the growing
    sequence; the next token is drawn with ``topk`` from the logits of the
    last position. Generation stops as soon as the EOS token is drawn (the
    EOS itself is not kept) or after ``max_length`` new tokens.

    Args:
        model: causal LM whose output exposes ``.logits``.
        tokenizer: object with ``encode``, ``decode`` and ``eos_token_id``.
        review: prompt text (callers append ``'TL;DR'`` themselves).
        max_length: maximum number of generated tokens.

    Returns:
        The decoded prompt followed by the generated continuation.

    Raises:
        ValueError: if ``review`` encodes to zero tokens.
    """
    result = tokenizer.encode(review)
    if not result:
        raise ValueError("review must encode to at least one token")

    with torch.no_grad():
        for _ in range(max_length):
            # Feed the current sequence to the model and sample the next id
            # from the logits at the final time step.
            input_ids = torch.tensor(result).unsqueeze(0).to(device)
            logits = model(input_ids).logits[0, -1]
            next_id = topk(logits)

            # Every sampled token - including the very first one - is checked
            # against EOS before it is appended.
            if next_id == tokenizer.eos_token_id:
                break
            result.append(next_id)

    # Reached on EOS or once the token budget is used up.
    return tokenizer.decode(result)
192
+
193
def interface(input):
    """Summarise ``input`` with the fine-tuned checkpoint and print the result.

    Six candidate summaries are sampled for the prompt ``input + "TL;DR"``;
    they are ordered by length and the fourth-shortest one is printed and
    returned.

    Args:
        input: text to summarise. (The name shadows the builtin but is kept
            so existing callers are unaffected.)

    Returns:
        The selected summary string.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # SECURITY: the checkpoint is a whole pickled module, so loading it
    # executes arbitrary pickle code - only load files you trust.
    # `weights_only=False` is required for such checkpoints on torch
    # versions where weights-only loading is the default. Loading onto
    # `device` (instead of a hard-coded 'mps') keeps CPU-only hosts working.
    model = torch.load('text_summary_4sets_2_550.pth',
                       map_location=torch.device(device),
                       weights_only=False)
    model.eval()  # inference: make sure dropout is disabled

    prompt = input + "TL;DR"
    result_text = []
    for _ in range(6):
        generated = model_infer(model, tokenizer, prompt)
        if generated.startswith(prompt):
            candidate = generated[len(prompt):]
        else:
            # decode() did not reproduce the prompt verbatim; fall back to
            # cutting at the first marker instead of trusting its length.
            candidate = generated.partition("TL;DR")[2]
        result_text.append(candidate.strip())

    chosen = sorted(result_text, key=len)[3]
    print("summary:", chosen)
    return chosen
208
+
209
+ '''
210
+
211
+
212
+ sample = 'Today was a hard day. I woke up feeling anxious and stressed about a meeting I had at work. The meeting did not go as I had hoped and I left disappointed. I tried to focus on other things and stay positive, but it was hard. I spent most of the evening starving and eating junk food. Not the best way to deal with my emotions, but it’s something I’m working on. Hope tomorrow will be a better day.TL;DR'
213
+
214
+ summary = model_infer(model, tokenizer, sample).strip()
215
+
216
+ sample
217
+
218
+ summary[len(sample):]
219
+
220
+ sample = 'Today was much better than yesterday. I wake up feeling more rested and ready to tackle the day. I had a productive day at work and even managed to finish a project I was struggling with. After work, I met some friends for a yoga class and it was just what I needed to relax and unwind. We went out for dinner afterwards and had a really nice time. Overall, it was a much better day than yesterday and I feel more positive about things.TL;DR'
221
+
222
+ summary = model_infer(model, tokenizer, sample).strip()
223
+
224
+ summary[len(sample):]
225
+
226
+ sample = 'Today was a beautiful day. I had a good night’s sleep and was ready to start the day. I went to work and had a productive morning. I even managed to finish a project I’d been working on for weeks. After work, I ran to clear my head. It was a beautiful day and the weather was perfect for it. I came home and cooked dinner with my partner. We had a nice conversation over dinner and then spent the evening watching a movie. Overall, it was a pretty relaxing and enjoyable day.'
227
+
228
+ summary = model_infer(model, tokenizer, sample + 'TL;DR').strip()
229
+ summary[len(sample)+5:]
230
+ '''
231
+
232
if __name__ == '__main__':
    # Command-line entry point: `--train` fine-tunes the model,
    # `--infer TEXT` prints a summary for TEXT.
    cli = argparse.ArgumentParser(description="parser")
    cli.add_argument("--train", action="store_true", help="Train the model")
    cli.add_argument("--infer", type=str, help="Interact with the model")
    args = cli.parse_args()

    # Training takes precedence when both options are supplied; an absent or
    # empty --infer value counts as "nothing requested".
    if not (args.train or args.infer):
        print("No valid option provided. Use --train or --infer.")
    elif args.train:
        main()
    else:
        interface(args.infer)