"""Minimal Gradio front-end answering questions with a local GGML Nous-Hermes-13B model."""
import gradio as gr
#import transformers
#from transformers import pipeline
from llama_cpp import Llama

# llama_cpp's Llama(model_path=...) requires a LOCAL file path -- it cannot
# stream weights from a URL.  Download the file first, e.g. from:
# https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_K_S.bin
# (The original code overwrote `model` with that URL, which Llama cannot open.)
model = "nous-hermes-13b.ggmlv3.q4_K_S.bin"

# Lazily-created singleton: loading the multi-GB model is expensive, so do it
# once on first request instead of inside every call to question_answer().
_llm = None


def _get_llm():
    """Return the shared Llama instance, creating it on first use."""
    global _llm
    if _llm is None:
        _llm = Llama(model_path=model)
    return _llm


def question_answer(context, question):
    """Answer *question* about *context* using the local llama.cpp model.

    Parameters
    ----------
    context : str
        Background text pasted into the first textbox.
    question : str
        The question typed into the second textbox.

    Returns
    -------
    str
        The completion text produced by the model (echo of the prompt
        included, because ``echo=True`` is passed below).
    """
    text = context + "\n\nQuestion: \"\"\"\n" + question + "\nPlease use markdown formatting for answer. \nAnswer:\n"
    llm = _get_llm()
    output = llm(text, max_tokens=33, stop=["### Response", "\n"], echo=True)
    print(output)
    # llama_cpp returns a plain dict (see the sample payload below), NOT an
    # object with attributes -- `output.choices[0].text` raises AttributeError.
    return output["choices"][0]["text"]


'''
Output is of the form:
{
  "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
  "object": "text_completion",
  "created": 1679561337,
  "model": "./models/7B/ggml-model.bin",
  "choices": [
    {
      "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
      "index": 0,
      "logprobs": None,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 14,
    "completion_tokens": 28,
    "total_tokens": 42
  }
}
'''
#generator = pipeline(model=model, device_map="auto")
#return generator(text)

# question_answer returns a single string, so declare exactly one output
# component (the original listed two, which Gradio rejects when the function
# yields only one value).
app = gr.Interface(fn=question_answer, inputs=["text", "text"], outputs="textbox")
app.launch()