import gradio as gr
import spaces
# import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama, LlamaGrammar

# zero = torch.Tensor([0]).cuda()
# print(f'zero.device: {zero.device}')  # <-- 'cpu' 🤗

@spaces.GPU  # ZeroGPU attaches a CUDA device only while this function runs
def greet(n):  # `n` comes from the gr.Number input; the demo does not use it
    global llm
    llm = load_model(download_model())
    # print(f'zero.device: {zero.device}')  # <-- 'cuda:0' 🤗
    grammar = LlamaGrammar.from_string('''
    root ::= sentence
    answer ::= (weather | complaint | yesno | gen)
    weather ::= ("Sunny." | "Cloudy." | "Rainy.")
    complaint ::= "I don't like talking about the weather."
    yesno ::= ("Yes." | "No.")
    gen ::= "1. " [A-Z] [a-z] [a-z]*
    sentence ::= [A-Z] [A-Za-z0-9 ,-]* ("." | "!" | "?")
    ''')
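    # Grammar-constrained decoding: llama.cpp generates from the `root` rule, so
    # output must match `sentence` (a capitalised sentence ending in . ! or ?).
    # The answer/weather/complaint/yesno/gen rules are defined but unreachable
    # from root; pointing root at `answer` instead would force the canned replies.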
    prompts = [
        "How's the weather in London?",
        "How's the weather in Munich?",
        "How's the weather in Barcelona?",
    ]
    print(f'Making inference... {prompts[0]}')
    output = llm(
        prompts[0],
        max_tokens=512,
        temperature=0.4,
        grammar=grammar,
    )
    print(f'Returned..... {output}')
    # llama-cpp-python returns an OpenAI-style completion dict
    s = output['choices'][0]['text']
    print(f'{s} , len(s) = {len(s)}')
    return f"Hello {s} Tensor"

def download_model():
    REPO_ID = "TheBloke/Llama-2-7B-GGUF"
    FILENAME = "llama-2-7b.Q5_K_S.gguf"
    print(f'Downloading model {REPO_ID}/{FILENAME}')
    m = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f'Downloaded to: {m}')
    return m
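
# Note: hf_hub_download caches the file locally (under ~/.cache/huggingface/hub
# by default) and returns its path, so repeat calls skip the multi-GB download.
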
def load_model(fp):
    print(f'Loading model: {fp}')
    llm = Llama(
        model_path=fp,
        n_gpu_layers=-1,  # offload all layers to the GPU (needs a CUDA build of llama-cpp-python)
        verbose=True,
    )
    return llm
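
# Design note: greet() downloads and loads the model on every call, which keeps
# all CUDA initialisation inside the @spaces.GPU context, at the cost of
# reloading the weights per request.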

demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
demo.launch(share=False)
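
# To try this locally: pip install gradio spaces huggingface_hub llama-cpp-python,
# then run `python app.py` and open the URL Gradio prints (http://127.0.0.1:7860 by default).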