import gradio as gr
import spaces
# import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama, LlamaGrammar

# zero = torch.Tensor([0]).cuda()
# print(f'zero.device: {zero.device}')  # <-- 'cpu' 🤗

@spaces.GPU  # ZeroGPU attaches a CUDA device only while this function runs
def greet(n):  # `n` comes from the gr.Number input; the demo does not use it
    global llm
    llm = load_model(download_model())
    # print(f'zero.device: {zero.device}')  # <-- 'cuda:0' 🤗
    grammar = LlamaGrammar.from_string('''
    root ::= sentence
    answer ::= (weather | complaint | yesno | gen)
    weather ::= ("Sunny." | "Cloudy." | "Rainy.")
    complaint ::= "I don't like talking about the weather."
    yesno ::= ("Yes." | "No.")
    gen ::= "1. " [A-Z] [a-z] [a-z]*
    sentence ::= [A-Z] [A-Za-z0-9 ,-]* ("." | "!" | "?")
    ''')
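    # Grammar-constrained decoding: llama.cpp generates from the `root` rule, so
    # output must match `sentence` (a capitalised sentence ending in . ! or ?).
    # The answer/weather/complaint/yesno/gen rules are defined but unreachable
    # from root; pointing root at `answer` instead would force the canned replies.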
    prompts = [
        "How's the weather in London?",
        "How's the weather in Munich?",
        "How's the weather in Barcelona?",
    ]
    print(f'Making inference... {prompts[0]}')
    output = llm(
        prompts[0],
        max_tokens=512,
        temperature=0.4,
        grammar=grammar,
    )
    print(f'Returned..... {output}')
    # llama-cpp-python returns an OpenAI-style completion dict
    s = output['choices'][0]['text']
    print(f'{s} , len(s) = {len(s)}')
    return f"Hello {s} Tensor"

def download_model():
    REPO_ID = "TheBloke/Llama-2-7B-GGUF"
    FILENAME = "llama-2-7b.Q5_K_S.gguf"
    print(f'Downloading model {REPO_ID}/{FILENAME}')
    m = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f'Downloaded to: {m}')
    return m
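
# Note: hf_hub_download caches the file locally (under ~/.cache/huggingface/hub
# by default) and returns its path, so repeat calls skip the multi-GB download.
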
def load_model(fp):
    print(f'Loading model: {fp}')
    llm = Llama(
        model_path=fp,
        n_gpu_layers=-1,  # offload all layers to the GPU (needs a CUDA build of llama-cpp-python)
        verbose=True,
    )
    return llm
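
# Design note: greet() downloads and loads the model on every call, which keeps
# all CUDA initialisation inside the @spaces.GPU context, at the cost of
# reloading the weights per request.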

demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
demo.launch(share=False)
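
# To try this locally: pip install gradio spaces huggingface_hub llama-cpp-python,
# then run `python app.py` and open the URL Gradio prints (http://127.0.0.1:7860 by default).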