'''
# Upload the fine-tuned model and tokenizer to the Hugging Face Hub.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = torch.load('text_summary_4sets_2_550.pth', map_location=torch.device('mps'))
# push_to_hub takes the target repo as repo_id; it has no repo_name argument.
model.push_to_hub(repo_id="Lin0He/text-summary-gpt2-short")
tokenizer.push_to_hub(repo_id="Lin0He/text-summary-gpt2-short")
'''

import torch
import numpy as np
from typing import Dict, Any
from transformers import AutoModelForCausalLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'


def topk(logits, n=9):
    # Softmax the raw scores to convert them to probabilities.
    probs = torch.softmax(logits, dim=-1)
    # Keep the n most likely tokens via PyTorch's built-in top-k.
    tokensProb, topIx = torch.topk(probs, k=n)
    # Renormalize the selection pool (n choices) so it sums to 1.
    tokensProb = tokensProb / torch.sum(tokensProb)
    # Move to CPU for numpy handling.
    tokensProb = tokensProb.cpu().detach().numpy()
    # Sample one token from the pool according to the renormalized distribution.
    choice = np.random.choice(n, 1, p=tokensProb)
    tokenId = topIx[choice][0]
    return int(tokenId)


def model_infer(model, tokenizer, review, max_length=300):
    result_text = []
    for _ in range(6):
        # Encode the prompt (input text plus task designator); copy it so
        # appending sampled tokens does not mutate the encoded prompt.
        review_encoded = tokenizer.encode(review)
        result = list(review_encoded)
        initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

        # Disable gradient tracking for the whole generation loop, not just
        # the first forward pass.
        with torch.no_grad():
            # Feed the prompt to the model.
            output = model(initial_input)
            # Take the logits at the final time step.
            logits = output.logits[0, -1]
            # Make a top-k choice and append it to the result.
            result.append(topk(logits))

            # Generate up to max_length further tokens.
            for _ in range(max_length):
                # Feed the current sequence to the model and sample the next token.
                input_ids = torch.tensor(result).unsqueeze(0).to(device)
                output = model(input_ids)
                logits = output.logits[0, -1]
                res_id = topk(logits)

                # If the chosen token is EOS, return the decoded result.
                if res_id == tokenizer.eos_token_id:
                    return tokenizer.decode(result)
                # Otherwise append it to the sequence.
                result.append(res_id)

        # No EOS was generated: keep the max_length candidate.
        result_text.append(tokenizer.decode(result))

    # Of the six truncated candidates, return the one of median length.
    return sorted(result_text, key=len)[3]


class EndpointHandler():
    def __init__(self, path=""):
        # Load model and tokenizer from the Hub (the local path is unused).
        self.tokenizer = AutoTokenizer.from_pretrained("Lin0He/text-summary-gpt2-short")
        self.model = AutoModelForCausalLM.from_pretrained("Lin0He/text-summary-gpt2-short")
        # Move the model to the same device the input tensors are sent to.
        self.model.to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> str:
        # Extract the input text.
        inputs = data.pop("inputs", data)
        # Append the "TL;DR" task designator and generate a summary.
        prediction = model_infer(self.model, self.tokenizer, inputs + "TL;DR")
        return prediction
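

# A minimal local smoke test (a sketch, not part of the endpoint contract):
# it mirrors how Hugging Face Inference Endpoints invoke the handler, i.e.
# a dict carrying an "inputs" key. The sample text below is an assumption
# for illustration only.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {"inputs": "The article describes a new approach to model "
                         "deployment that reduces cold-start latency."}
    print(handler(payload))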