import gradio as gr
import spaces
# import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama, LlamaGrammar
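
# Minimal Hugging Face ZeroGPU Space: download a GGUF Llama 2 model, load it with
# llama-cpp-python, and answer a fixed prompt using grammar-constrained decoding.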


# zero = torch.Tensor([0]).cuda()
# print(f'zero.device: {zero.device}') # <-- 'cpu' 🤔

# ZeroGPU: @spaces.GPU allocates a GPU only for the duration of the call, so the model
# is downloaded and loaded inside the decorated function rather than at import time.
@spaces.GPU
def greet(n):
    global llm
    llm = load_model(download_model())

    # print(f'zero.device: {zero.device}') # <-- 'cuda:0' 🤗
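
    # GBNF grammar for constrained decoding: the output must match `root`, which here only
    # expands to `sentence`; the remaining rules are defined but not referenced from root.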
    grammar = LlamaGrammar.from_string('''
    root ::= sentence
    answer ::= (weather | complaint | yesno | gen)
    weather ::= ("Sunny." | "Cloudy." | "Rainy.")
    complaint ::= "I don't like talking about the weather."
    yesno ::= ("Yes." | "No.")
    gen ::= "1. " [A-Z] [a-z] [a-z]*
    sentence ::= [A-Z] [A-Za-z0-9 ,-]* ("." | "!" | "?")
    ''')

    prompts = [
        "How's the weather in London?",
        "How's the weather in Munich?",
        "How's the weather in Barcelona?",
    ]

    # Run a grammar-constrained completion on the first prompt only.
    print(f'Making inference... {prompts[0]}')
    output = llm(
        prompts[0],
        max_tokens=512,
        temperature=0.4,
        grammar=grammar,
    )
    print(f'Returned..... {output}')

    # The generated text is the first completion choice.
    s = output['choices'][0]['text']
    print(f'{s} , len(s) = {len(s)}')
    print(output['choices'])
    print(output['choices'][0]['text'])
    print()

    return f"Hello {s} Tensor"

def download_model():
    REPO_ID = "TheBloke/Llama-2-7B-GGUF"
    FILENAME = "llama-2-7b.Q5_K_S.gguf"

    print(f'Downloading model {REPO_ID}/{FILENAME}')
    # hf_hub_download returns the local path of the cached file.
    m = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f'Downloaded to: {m}')
    return m

def load_model(fp):
    print(f'Loading model: {fp}')
    # n_gpu_layers=-1 offloads all layers to the GPU allocated by @spaces.GPU.
    llm = Llama(
        model_path=fp,
        n_gpu_layers=-1,
        verbose=True,
    )
    return llm

# The numeric input is currently unused by greet(); submitting just triggers one generation.
demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
demo.launch(share=False)