## Due to a small bug when installing exllamav2 from the dev branch directly, we need the CUDA paths set up first
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##
import gradio as gr
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
import base64
import spaces
import os
import sys
import torch
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)
from PIL import Image
import requests
default_bpw = "4.0bpw"
available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw",
]
dirs = {}
for model in available_models:
    dirs[model] = snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)
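# `dirs` now maps each bpw label to the local snapshot path of that quant, so run_inference
# can load whichever quant the user selects without downloading it again.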
@spaces.GPU
def run_inference(message, history, model_picked):
    local_dir = dirs[model_picked]
    print(message)
    print(history)

    # Load the model only once a GPU has been allocated (ZeroGPU)
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = 16384

    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress = True)

    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
    model.load_autosplit(cache, progress = True)
    tokenizer = ExLlamaV2Tokenizer(config)

    generator = ExLlamaV2DynamicGenerator(
        model = model,
        cache = cache,
        tokenizer = tokenizer
    )
    # Build the prompt in the Mistral instruct template from the chat history
    prompt = ""
    image_prompt = ""
    images_embeddings = []
    for couple in history:
        if type(couple[0]) is tuple:
            # User turn containing images: embed each one under a {{IMAGE_n}} text alias
            images_embeddings += [
                vision_model.get_image_embeddings(
                    model = model,
                    tokenizer = tokenizer,
                    image = img,
                    text_alias = alias,
                )
                for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}", Image.open(path)) for i, path in enumerate(couple[0])]
            ]
            image_prompt = ""
            for i in range(len(couple[0])):
                image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(couple[0]) + i + 1) + "}}"
        elif couple[0]:
            # User turn containing text: prepend any pending image aliases, then append the assistant reply
            prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
            prompt += couple[1] + "</s>"

    if type(message) is dict:
        # New multimodal message: embed the attached files, then append the text
        images_embeddings += [
            vision_model.get_image_embeddings(
                model = model,
                tokenizer = tokenizer,
                image = img,
                text_alias = alias,
            )
            for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}", Image.open(path['path'] if type(path) is dict else path)) for i, path in enumerate(message['files'])]
        ]
        image_prompt = ""
        for i in range(len(message['files'])):
            image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(message['files']) + i + 1) + "}}"
        prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
    else:
        prompt += "[INST]" + image_prompt + message + "[/INST]"
    print(prompt)

    # Generate the response
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = 1024,
        temperature = 0.15,
        add_bos = True,
        encode_special_tokens = True,
        decode_special_tokens = True,
        stop_conditions = [tokenizer.eos_token_id],
        gen_settings = ExLlamaV2Sampler.Settings.greedy(),
        embeddings = images_embeddings
    )

    # Return only the model's reply to the latest instruction
    result = output.split("[/INST]")[-1]
    print(result)
    return result
description = """
A demo chat interface for Pixtral 12B EXL2 quants, deployed using **ExllamaV2**!

The model is loaded once a GPU becomes available. By default this Space loads Pixtral at 4.0bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.

The version of ExllamaV2 running here is the dev branch, not master: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).

At **4.0bpw with a 16k context size, the model fits in less than 12GB of VRAM**!

The current settings are:
- Context size: 16k tokens
- Max output: 1024 tokens
- Temperature: 0.15

You can select other quants and experiment!

Thanks, turboderp!
"""
examples = [
    [
        {"text": "What are the similarities and differences between these two experiments?", "files": ["test_image_1.jpg", "test_image_2.jpg"]},
    ]
]
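# Note: the example above assumes test_image_1.jpg and test_image_2.jpg are present in the
# Space repository; they are not downloaded by this script.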
drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)

demo = gr.ChatInterface(fn=run_inference, examples=examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs=drop)
demo.queue().launch()