Ll2

Paused

App Files Files Community

Ll2 / app.py

Makhinur

Update app.py

265cb8e verified 9 months ago

raw

history blame

10.1 kB

	import os
	from typing import Iterator
	import gradio as gr
	from text_generation import Client

	# The Hugging Face access token is read once at import time; the module
	# refuses to load when the variable is absent.
	HF_TOKEN = os.getenv("HF_TOKEN")
	if HF_TOKEN is None:
	    raise ValueError("Please set the HF_TOKEN environment variable.")

	# Model identifier and the inference API URL derived from it.
	model_id = 'codellama/CodeLlama-34b-Instruct-hf'
	API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

	# text_generation client; every request carries the bearer token header.
	client = Client(API_URL, headers={"Authorization": "Bearer " + HF_TOKEN})

	# Marker strings; a streamed token containing either one ends generation.
	EOS_STRING = "</s>"
	EOT_STRING = "<EOT>"

	# Flag passed to launch(share=...). Environment variables are always strings,
	# so the raw value cannot serve as a boolean: "0" or "false" would be truthy.
	# Unset, empty, and explicit negative spellings mean False; any other value
	# still means True, as it did before.
	HF_PUBLIC = os.environ.get("HF_PUBLIC", "").strip().lower() not in {
	    "", "0", "false", "no", "off",
	}

	# Initial value of the "System prompt" textbox; also the system prompt used
	# by process_example. The trailing backslashes suppress the newlines right
	# after the opening quotes and before the closing quotes.
	DEFAULT_SYSTEM_PROMPT = """\
	You are a helpful, respectful and honest assistant with a deep knowledge of code and software design. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
	"""
	# Upper bound of the "Max new tokens" slider; generate() rejects larger values.
	MAX_MAX_NEW_TOKENS = 4096
	# Initial position of the "Max new tokens" slider.
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Threshold compared against the input length in check_input_token_length.
	MAX_INPUT_TOKEN_LENGTH = 4000

	# Markdown shown at the top of the page via gr.Markdown(DESCRIPTION).
	DESCRIPTION = """
	# Code Llama 34B Chat

	This Space demonstrates model [CodeLlama-34b-Instruct](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) by Meta, a Code Llama model with 34B parameters fine-tuned for chat instructions and specialized on code tasks. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).

	🔎 For more details about the Code Llama family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/codellama) or [the paper](https://huggingface.co/papers/2308.12950).

	🏃🏻 Check out our [Playground](https://huggingface.co/spaces/codellama/codellama-playground) for a super-fast code completion demo that leverages a streaming [inference endpoint](https://huggingface.co/inference-endpoints).

	"""

	# Markdown footer rendered via gr.Markdown(LICENSE); links to the license
	# and acceptable use policy files.
	LICENSE = """
	<p/>

	---
	As a derivate work of Code Llama by Meta,
	this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/codellama-2-34b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/codellama-2-34b-chat/blob/main/USE_POLICY.md).
	"""

	def get_prompt(message: str, chat_history: list[tuple[str, str]],
	               system_prompt: str) -> str:
	    """Assemble the full instruction-format prompt for one request.

	    The prompt opens with the system prompt wrapped in <<SYS>> tags, then
	    replays every (user_input, response) pair of ``chat_history`` and ends
	    with the new ``message`` followed by an open ``[/INST]`` marker.

	    Only the very first user text of the conversation is kept verbatim;
	    every later user text (including ``message`` when any history exists)
	    is stripped of surrounding whitespace. Responses are always stripped.
	    """
	    segments = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
	    turn_number = 0
	    for turn_number, (user_input, response) in enumerate(chat_history, start=1):
	        if turn_number > 1:
	            # Turns after the first get their user text stripped.
	            user_input = user_input.strip()
	        segments.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
	    if turn_number:
	        # At least one earlier turn exists, so the new message is not first.
	        message = message.strip()
	    segments.append(f'{message} [/INST]')
	    return ''.join(segments)

	def run(message: str,
	        chat_history: list[tuple[str, str]],
	        system_prompt: str,
	        max_new_tokens: int = 1024,
	        temperature: float = 0.1,
	        top_p: float = 0.9,
	        top_k: int = 50) -> Iterator[str]:
	    """Stream the model's reply, yielding the text accumulated so far.

	    Builds the prompt with get_prompt, requests a sampled token stream from
	    the module-level client, and yields the growing reply after each token.
	    Streaming stops at the first token whose text contains EOS_STRING or
	    EOT_STRING; that token's text is not added to the reply.
	    """
	    stop_markers = (EOS_STRING, EOT_STRING)
	    token_stream = client.generate_stream(
	        get_prompt(message, chat_history, system_prompt),
	        max_new_tokens=max_new_tokens,
	        do_sample=True,
	        top_p=top_p,
	        top_k=top_k,
	        temperature=temperature,
	    )

	    generated = ""
	    for chunk in token_stream:
	        piece = chunk.token.text
	        if any(marker in piece for marker in stop_markers):
	            break
	        generated += piece
	        yield generated
	    return generated

	def clear_and_save_textbox(message: str) -> tuple[str, str]:
	    """Return an empty string for the textbox plus the message to stash.

	    The event chains route the first element back to the textbox and the
	    second to the saved-input state.
	    """
	    cleared_textbox = ''
	    return cleared_textbox, message

	def display_input(message: str,
	                  history: list[tuple[str, str]]) -> list[tuple[str, str]]:
	    """Append ``message`` to ``history`` as a turn with an empty reply.

	    The list is modified in place and the same list object is returned.
	    """
	    pending_turn = (message, '')
	    history.append(pending_turn)
	    return history

	def delete_prev_fn(
	        history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
	    """Remove the last turn from ``history`` and return its user message.

	    Returns the (in-place shortened) history together with the removed
	    turn's user text. An empty history, or a falsy user text, yields ''.
	    """
	    try:
	        last_user_text, _reply = history.pop()
	    except IndexError:
	        # Nothing to remove: hand back the untouched history.
	        return history, ''
	    return history, last_user_text or ''

	def generate(
	    message: str,
	    history_with_input: list[tuple[str, str]],
	    system_prompt: str,
	    max_new_tokens: int,
	    temperature: float,
	    top_p: float,
	    top_k: int,
	) -> Iterator[list[tuple[str, str]]]:
	    """Yield successive chat histories as the reply to ``message`` streams in.

	    ``history_with_input`` is expected to end with the pending turn for
	    ``message``; that last entry is dropped and re-emitted with the reply
	    text accumulated so far. If the stream produces no text at all, a single
	    history with an empty reply is yielded.

	    Raises:
	        ValueError: if ``max_new_tokens`` exceeds MAX_MAX_NEW_TOKENS.
	    """
	    if max_new_tokens > MAX_MAX_NEW_TOKENS:
	        raise ValueError

	    history = history_with_input[:-1]
	    produced_any = False
	    for partial_reply in run(message, history, system_prompt, max_new_tokens,
	                             temperature, top_p, top_k):
	        produced_any = True
	        yield history + [(message, partial_reply)]
	    if not produced_any:
	        # Keep the pending turn visible even when nothing was generated.
	        yield history + [(message, '')]

	def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
	    """Run one full generation for ``message`` and return the final history.

	    Uses an empty history, DEFAULT_SYSTEM_PROMPT and fixed sampling settings
	    (1024 new tokens, temperature 1, top-p 0.95, top-k 50). The stream is
	    drained so that only the last yielded history is returned, paired with
	    an empty string.
	    """
	    updates = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
	    for final_history in updates:
	        continue
	    return '', final_history

	def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
	    """Reject a request whose accumulated input text is too long.

	    The length is the combined character count of the system prompt, every
	    earlier turn (user text and reply) and the new message. Characters are
	    used as a cheap stand-in for tokens; no tokenizer is involved.

	    Previously only ``len(message)`` plus the *number* of turns was counted,
	    so the history text and the system prompt never contributed.

	    Args:
	        message: The new user message.
	        chat_history: Turns shown in the chatbot; may already end with the
	            pending ``(message, '')`` turn.
	        system_prompt: The system prompt that will be sent with the request.

	    Raises:
	        gr.Error: if the combined length exceeds MAX_INPUT_TOKEN_LENGTH.
	    """
	    prior_turns = list(chat_history or [])
	    # The submit chains append the pending message as a trailing turn with an
	    # empty reply before this check runs; drop it so it is not counted twice.
	    if prior_turns and prior_turns[-1][0] == message and not prior_turns[-1][1]:
	        prior_turns.pop()

	    history_length = sum(
	        len(user_text or '') + len(reply or '')
	        for user_text, reply in prior_turns
	    )
	    input_token_length = len(message or '') + len(system_prompt or '') + history_length
	    if input_token_length > MAX_INPUT_TOKEN_LENGTH:
	        raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')

	# Build the Gradio page and wire its events.
	# NOTE(review): leading indentation is missing in this copy of the file, so the
	# nesting of the `with` blocks below cannot be read off the text — restore it
	# from the original app.py before running.
	with gr.Blocks(css='style.css') as demo:
	gr.Markdown(DESCRIPTION)
	gr.DuplicateButton(value='Duplicate Space for private use',
	elem_id='duplicate-button')

	# Chat area: conversation display, message textbox and Submit button.
	with gr.Group():
	chatbot = gr.Chatbot(label='Chatbot')
	with gr.Row():
	textbox = gr.Textbox(
	container=False,
	show_label=False,
	placeholder='Type a message...',
	scale=10,
	)
	submit_button = gr.Button('Submit',
	variant='primary',
	scale=1,
	min_width=0)
	# Conversation controls.
	with gr.Row():
	retry_button = gr.Button('🔄 Retry', variant='secondary')
	undo_button = gr.Button('↩️ Undo', variant='secondary')
	clear_button = gr.Button('🗑️ Clear', variant='secondary')

	# Holds the last submitted message between the steps of an event chain.
	saved_input = gr.State()

	# Generation settings; these components are passed as inputs to generate().
	with gr.Accordion(label='Advanced options', open=False):
	system_prompt = gr.Textbox(label='System prompt',
	value=DEFAULT_SYSTEM_PROMPT,
	lines=6)
	max_new_tokens = gr.Slider(
	label='Max new tokens',
	minimum=1,
	maximum=MAX_MAX_NEW_TOKENS,
	step=1,
	value=DEFAULT_MAX_NEW_TOKENS,
	)
	temperature = gr.Slider(
	label='Temperature',
	minimum=0.1,
	maximum=4.0,
	step=0.1,
	value=0.1,
	)
	top_p = gr.Slider(
	label='Top-p (nucleus sampling)',
	minimum=0.05,
	maximum=1.0,
	step=0.05,
	value=0.9,
	)
	top_k = gr.Slider(
	label='Top-k',
	minimum=1,
	maximum=1000,
	step=1,
	value=10,
	)

	gr.Markdown(LICENSE)

	# Enter in the textbox: clear the box and stash the message, echo it into
	# the chat, check the input length, then (via .success) stream the reply.
	textbox.submit(
	fn=clear_and_save_textbox,
	inputs=textbox,
	outputs=[textbox, saved_input],
	api_name=False,
	queue=False,
	).then(
	fn=display_input,
	inputs=[saved_input, chatbot],
	outputs=chatbot,
	api_name=False,
	queue=False,
	).then(
	fn=check_input_token_length,
	inputs=[saved_input, chatbot, system_prompt],
	api_name=False,
	queue=False,
	).success(
	fn=generate,
	inputs=[
	saved_input,
	chatbot,
	system_prompt,
	max_new_tokens,
	temperature,
	top_p,
	top_k,
	],
	outputs=chatbot,
	api_name=False,
	)

	# Submit button: the same four-step chain as textbox.submit above. The result
	# is bound to button_event_preprocess, which is not referenced again in the
	# visible code.
	button_event_preprocess = submit_button.click(
	fn=clear_and_save_textbox,
	inputs=textbox,
	outputs=[textbox, saved_input],
	api_name=False,
	queue=False,
	).then(
	fn=display_input,
	inputs=[saved_input, chatbot],
	outputs=chatbot,
	api_name=False,
	queue=False,
	).then(
	fn=check_input_token_length,
	inputs=[saved_input, chatbot, system_prompt],
	api_name=False,
	queue=False,
	).success(
	fn=generate,
	inputs=[
	saved_input,
	chatbot,
	system_prompt,
	max_new_tokens,
	temperature,
	top_p,
	top_k,
	],
	outputs=chatbot,
	api_name=False,
	)

	# Retry: remove the last turn, show its message again, and regenerate.
	# Unlike the submit chains, this one does not call check_input_token_length.
	retry_button.click(
	fn=delete_prev_fn,
	inputs=chatbot,
	outputs=[chatbot, saved_input],
	api_name=False,
	queue=False,
	).then(
	fn=display_input,
	inputs=[saved_input, chatbot],
	outputs=chatbot,
	api_name=False,
	queue=False,
	).then(
	fn=generate,
	inputs=[
	saved_input,
	chatbot,
	system_prompt,
	max_new_tokens,
	temperature,
	top_p,
	top_k,
	],
	outputs=chatbot,
	api_name=False,
	)

	# Undo: remove the last turn and put its message back into the textbox.
	undo_button.click(
	fn=delete_prev_fn,
	inputs=chatbot,
	outputs=[chatbot, saved_input],
	api_name=False,
	queue=False,
	).then(
	fn=lambda x: x,
	inputs=[saved_input],
	outputs=textbox,
	api_name=False,
	queue=False,
	)

	# Clear: reset the chat to an empty list and the saved input to ''.
	clear_button.click(
	fn=lambda: ([], ''),
	outputs=[chatbot, saved_input],
	queue=False,
	api_name=False,
	)

	# Configure the request queue (max_size=32), then start the app; HF_PUBLIC
	# is forwarded as the share flag.
	queued_demo = demo.queue(max_size=32)
	queued_demo.launch(share=HF_PUBLIC)