Lin0He committed
Commit f36ca98 · 1 Parent(s): 1da9760

Delete pipeline.py

Files changed (1):
  pipeline.py +0 -93
pipeline.py DELETED
@@ -1,93 +0,0 @@
- '''
- # upload model
- import torch
- from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
-
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
- model = torch.load('text_summary_4sets_2_550.pth', map_location=torch.device('mps'))
-
- model.push_to_hub("Lin0He/text-summary-gpt2-short")
- tokenizer.push_to_hub("Lin0He/text-summary-gpt2-short")
- '''
- import torch
- import numpy as np
- from typing import Dict, List, Any
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- def topk(logits, n=9):
-     # Softmax the raw scores to convert them to probabilities
-     probs = torch.softmax(logits, dim=-1)
-     # Keep only the n most likely tokens
-     tokensProb, topIx = torch.topk(probs, k=n)
-     # Renormalize the reduced pool (n choices) so it sums to 1
-     tokensProb = tokensProb / torch.sum(tokensProb)
-     # Send to CPU for numpy handling
-     tokensProb = tokensProb.cpu().detach().numpy()
-     # Sample one token from the pool under the new distribution
-     choice = np.random.choice(n, 1, p=tokensProb)
-     tokenId = topIx[choice[0]]
-     return int(tokenId)
-
- def model_infer(model, tokenizer, review, max_length=60):
-     # Encode the prompt (input text plus the task designator)
-     review_encoded = tokenizer.encode(review)
-     result = review_encoded
-     initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)
-
-     with torch.no_grad():
-         # Feed the prompt to the model
-         output = model(initial_input)
-
-         # Take the logits at the final time step
-         logits = output.logits[0, -1]
-
-         # Make a top-k choice and append it to the result
-         result.append(topk(logits))
-
-         # Generate up to max_length further tokens
-         for _ in range(max_length):
-             # Feed the current sequence back in and sample the next token
-             input_ids = torch.tensor(result).unsqueeze(0).to(device)
-             output = model(input_ids)
-             logits = output.logits[0, -1]
-             res_id = topk(logits)
-
-             # If the chosen token is EOS, stop and return the result
-             if res_id == tokenizer.eos_token_id:
-                 return tokenizer.decode(result)
-             # Otherwise append it to the sequence
-             result.append(res_id)
-
-     # If no EOS was generated, return after max_length tokens
-     return tokenizer.decode(result)
-
- def predict(text, model, tokenizer):
-     # Sample six candidate summaries and return the median-length one
-     result_text = []
-     for _ in range(6):
-         summary = model_infer(model, tokenizer, text + "TL;DR").strip()
-         result_text.append(summary[len(text) + 5:])
-     return sorted(result_text, key=len)[3]
-
- class PreTrainedPipeline():
-     def __init__(self, path=""):
-         # Load model and tokenizer from the Hub
-         self.tokenizer = AutoTokenizer.from_pretrained("Lin0He/text-summary-gpt2-short")
-         self.model = AutoModelForCausalLM.from_pretrained("Lin0He/text-summary-gpt2-short").to(device)
-         self.model.eval()
-
-     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-         # Unwrap the input text
-         inputs = data.pop("inputs", data)
-         # Generate a summary for it
-         prediction = predict(inputs, self.model, self.tokenizer)
-         return {"generated_text": prediction}
-
-
- '''
- predictor = pipeline("summarization", model=model, tokenizer=tokenizer)
- result = predictor("Input text for prediction")
- print(result)
- '''
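
For reference, the deleted file followed the custom-handler convention for Hugging Face inference: a PreTrainedPipeline class that takes a {"inputs": ...} payload. A minimal local smoke test might have looked like the sketch below; the class name and payload shape come from the file above, while the import path and sample text are assumptions.

# Hypothetical local smoke test for the deleted handler; assumes the repo
# is checked out so that pipeline.py is importable from the working directory.
from pipeline import PreTrainedPipeline

handler = PreTrainedPipeline(path=".")
payload = {"inputs": "Long article text to summarize..."}
result = handler(payload)  # -> {"generated_text": "..."}
print(result["generated_text"])

The hand-rolled sampling loop in model_infer is roughly equivalent to stock transformers generation with model.generate(input_ids, do_sample=True, top_k=9, max_new_tokens=60), stopping early at the EOS token.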