Lin0He committed
Commit 03c7b32 · 1 Parent(s): e9b3d8a

Upload pipeline.py

Files changed (1)
pipeline.py +59 -64
pipeline.py CHANGED
@@ -13,80 +13,75 @@ import torch
 from typing import Dict, List, Any
 from transformers import pipeline, AutoModel, AutoTokenizer
 
-class PreTrainedPipeline():
-    def __init__(self, path=""):
-        # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained("Lin0He/text-summary-gpt2-short")
-        self.model = AutoModel.from_pretrained("Lin0He/text-summary-gpt2-short")
-
-    def topk(probs, n=9):
-        # The scores are initially softmaxed to convert to probabilities
-        probs = torch.softmax(probs, dim=-1)
-
-        # PyTorch has its own topk method, which we use here
-        tokensProb, topIx = torch.topk(probs, k=n)
-
-        # The new selection pool (9 choices) is normalized
-        tokensProb = tokensProb / torch.sum(tokensProb)
-
-        # Send to CPU for numpy handling
-        tokensProb = tokensProb.cpu().detach().numpy()
-
-        # Make a random choice from the pool based on the new prob distribution
-        choice = np.random.choice(n, 1, p=tokensProb)  #[np.argmax(tokensProb)]#
-        tokenId = topIx[choice][0]
-
-        return int(tokenId)
-
-    def model_infer(model, tokenizer, review, max_length=30):
-        # Preprocess the init token (task designator)
-        review_encoded = self.tokenizer.encode(review)
-        result = review_encoded
-        initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)
-
-        with torch.set_grad_enabled(False):
-            # Feed the init token to the model
-            output = self.model(initial_input)
-
-            # Flatten the logits at the final time step
-            logits = output.logits[0, -1]
-
-            # Make a top-k choice and append to the result
-            #choices = [topk(logits) for i in range(5)]
-            choices = self.topk(logits)
-            result.append(choices)
-
-            # For max_length times:
-            for _ in range(max_length):
-                # Feed the current sequence to the model and make a choice
-                input = torch.tensor(result).unsqueeze(0).to(device)
-                output = self.model(input)
-                logits = output.logits[0, -1]
-                res_id = self.topk(logits)
-
-                # If the chosen token is EOS, return the result
-                if res_id == self.tokenizer.eos_token_id:
-                    return self.tokenizer.decode(result)
-                else:  # Append to the sequence
-                    result.append(res_id)
-
-        # IF no EOS is generated, return after the max_len
-        return self.tokenizer.decode(result)
-
-    def predict(text):
-        result_text = []
-        for i in range(6):
-            summary = self.model_infer(self.model, self.tokenizer, input+"TL;DR").strip()
-            result_text.append(summary[len(input)+5:])
-        return sorted(result_text, key=len)[3]
-        #print("summary:", sorted(result_text, key=len)[3])
-
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        # process input
-        inputs = data.pop("inputs", data)
-        # process input text
-        prediction = self.predict(inputs)
-        return {"text": prediction}
+def topk(probs, n=9):
+    # The scores are initially softmaxed to convert to probabilities
+    probs = torch.softmax(probs, dim=-1)
+    # PyTorch has its own topk method, which we use here
+    tokensProb, topIx = torch.topk(probs, k=n)
+    # The new selection pool (9 choices) is normalized
+    tokensProb = tokensProb / torch.sum(tokensProb)
+    # Send to CPU for numpy handling
+    tokensProb = tokensProb.cpu().detach().numpy()
+    # Make a random choice from the pool based on the new prob distribution
+    choice = np.random.choice(n, 1, p=tokensProb)  #[np.argmax(tokensProb)]#
+    tokenId = topIx[choice][0]
+    return int(tokenId)
+
+def model_infer(model, tokenizer, review, max_length=60):
+    # Preprocess the init token (task designator)
+    review_encoded = tokenizer.encode(review)
+    result = review_encoded
+    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)
+
+    with torch.set_grad_enabled(False):
+        # Feed the init token to the model
+        output = model(initial_input)
+
+        # Flatten the logits at the final time step
+        logits = output.logits[0, -1]
+
+        # Make a top-k choice and append to the result
+        #choices = [topk(logits) for i in range(5)]
+        choices = topk(logits)
+        result.append(choices)
+
+        # For max_length times:
+        for _ in range(max_length):
+            # Feed the current sequence to the model and make a choice
+            input = torch.tensor(result).unsqueeze(0).to(device)
+            output = model(input)
+            logits = output.logits[0, -1]
+            res_id = topk(logits)
+
+            # If the chosen token is EOS, return the result
+            if res_id == tokenizer.eos_token_id:
+                return tokenizer.decode(result)
+            else:  # Append to the sequence
+                result.append(res_id)
+
+    # If no EOS is generated, return after max_length
+    return tokenizer.decode(result)
+
+def predict(text, model, tokenizer):
+    result_text = []
+    for i in range(6):
+        summary = model_infer(model, tokenizer, text + "TL;DR").strip()
+        result_text.append(summary[len(text) + 5:])
+    # Return the (upper) median-length candidate of the six samples
+    return sorted(result_text, key=len)[3]
+    #print("summary:", sorted(result_text, key=len)[3])
+
+class PreTrainedPipeline():
+    def __init__(self, path=""):
+        # load model and tokenizer from path
+        self.tokenizer = AutoTokenizer.from_pretrained("Lin0He/text-summary-gpt2-short")
+        self.model = AutoModel.from_pretrained("Lin0He/text-summary-gpt2-short")
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        # process input
+        inputs = data.pop("inputs", data)
+        # process input text
+        # predict is now module-level, so call it directly
+        prediction = predict(inputs, self.model, self.tokenizer)
+        return {"text": prediction}
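
A quick way to sanity-check the rearranged handler locally is a sketch like the one below. It is not part of the commit: it assumes pipeline.py is importable from the working directory and that the file's unshown first twelve lines define `device` and `import numpy as np`, both of which the hunk references. One caveat worth checking: `model_infer` reads `output.logits`, which GPT-2 weights loaded via plain `AutoModel` do not expose; if that attribute is missing at runtime, `AutoModelForCausalLM` is likely the intended loader.

# Hypothetical local smoke test -- a sketch, not the repo's documented usage.
from pipeline import PreTrainedPipeline

pipe = PreTrainedPipeline()
# The handler pops "inputs" from the payload, samples six "TL;DR" completions,
# and returns the median-length one as {"text": ...}.
out = pipe({"inputs": "The battery lasts two full days and the screen is bright, but the speakers are tinny."})
print(out["text"])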