Lin0He committed
Commit f36ca98 · 1 Parent(s): 1da9760

Delete pipeline.py

Files changed (1):
  pipeline.py +0 -93
pipeline.py DELETED
@@ -1,93 +0,0 @@
- '''
- # upload model
- import torch
- from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
-
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
- model = torch.load('text_summary_4sets_2_550.pth', map_location=torch.device('mps'))
-
- model.push_to_hub("Lin0He/text-summary-gpt2-short")
- tokenizer.push_to_hub("Lin0He/text-summary-gpt2-short")
- '''
- import torch
- import numpy as np
- from typing import Dict, List, Any
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- def topk(logits, n=9):
-     # Softmax the raw scores to convert them to probabilities
-     probs = torch.softmax(logits, dim=-1)
-     # Keep only the n most likely tokens
-     tokensProb, topIx = torch.topk(probs, k=n)
-     # Renormalize the reduced pool (n choices) so it sums to 1
-     tokensProb = tokensProb / torch.sum(tokensProb)
-     # Send to CPU for numpy handling
-     tokensProb = tokensProb.cpu().detach().numpy()
-     # Sample one token from the pool under the new distribution
-     choice = np.random.choice(n, 1, p=tokensProb)
-     tokenId = topIx[choice[0]]
-     return int(tokenId)
-
- def model_infer(model, tokenizer, review, max_length=60):
-     # Encode the prompt (input text plus the task designator)
-     review_encoded = tokenizer.encode(review)
-     result = review_encoded
-     initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)
-
-     with torch.no_grad():
-         # Feed the prompt to the model
-         output = model(initial_input)
-
-         # Take the logits at the final time step
-         logits = output.logits[0, -1]
-
-         # Make a top-k choice and append it to the result
-         result.append(topk(logits))
-
-         # Generate up to max_length further tokens
-         for _ in range(max_length):
-             # Feed the current sequence back in and sample the next token
-             input_ids = torch.tensor(result).unsqueeze(0).to(device)
-             output = model(input_ids)
-             logits = output.logits[0, -1]
-             res_id = topk(logits)
-
-             # If the chosen token is EOS, stop and return the result
-             if res_id == tokenizer.eos_token_id:
-                 return tokenizer.decode(result)
-             # Otherwise append it to the sequence
-             result.append(res_id)
-
-     # If no EOS was generated, return after max_length tokens
-     return tokenizer.decode(result)
-
- def predict(text, model, tokenizer):
-     # Sample six candidate summaries and return the median-length one
-     result_text = []
-     for _ in range(6):
-         summary = model_infer(model, tokenizer, text + "TL;DR").strip()
-         result_text.append(summary[len(text) + 5:])
-     return sorted(result_text, key=len)[3]
-
- class PreTrainedPipeline():
-     def __init__(self, path=""):
-         # Load model and tokenizer from the Hub
-         self.tokenizer = AutoTokenizer.from_pretrained("Lin0He/text-summary-gpt2-short")
-         self.model = AutoModelForCausalLM.from_pretrained("Lin0He/text-summary-gpt2-short").to(device)
-         self.model.eval()
-
-     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-         # Unwrap the input text
-         inputs = data.pop("inputs", data)
-         # Generate a summary for it
-         prediction = predict(inputs, self.model, self.tokenizer)
-         return {"generated_text": prediction}
-
-
- '''
- predictor = pipeline("summarization", model=model, tokenizer=tokenizer)
- result = predictor("Input text for prediction")
- print(result)
- '''
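
For reference, the deleted file followed the custom-handler convention for Hugging Face inference: a PreTrainedPipeline class that takes a {"inputs": ...} payload. A minimal local smoke test might have looked like the sketch below; the class name and payload shape come from the file above, while the import path and sample text are assumptions.

# Hypothetical local smoke test for the deleted handler; assumes the repo
# is checked out so that pipeline.py is importable from the working directory.
from pipeline import PreTrainedPipeline

handler = PreTrainedPipeline(path=".")
payload = {"inputs": "Long article text to summarize..."}
result = handler(payload)  # -> {"generated_text": "..."}
print(result["generated_text"])

The hand-rolled sampling loop in model_infer is roughly equivalent to stock transformers generation with model.generate(input_ids, do_sample=True, top_k=9, max_new_tokens=60), stopping early at the EOS token.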