Spaces:

Renjith95
/

Renj-portfolio-ai-bot

Runtime error

App Files Files Community

Renj-portfolio-ai-bot / app.py

Renjith95

Update app.py (#1)

453eab3 verified 7 months ago

raw

history blame

3.01 kB

	import os
	import gradio as gr
	from transformers import TextStreamer
	from peft import PeftModel
	from unsloth import FastLanguageModel

	# ---------------------------------------------------------------------------
	# Model configuration and loading (runs once at import time).
	# ---------------------------------------------------------------------------

	# Hub id of the fine-tuned model. Not referenced by the loading code below,
	# which loads a 4-bit base model and applies a separate adapter repo instead.
	model_name = "Renjith95/renj-portfolio-finetuned-model"

	# Hugging Face access token read from the environment; None if HF_TOKEN is unset.
	# Never print or log this value: it is a secret.
	auth_token = os.getenv("HF_TOKEN")

	# Settings forwarded to FastLanguageModel.from_pretrained.
	max_seq_length = 2048  # Maximum sequence length requested from the loader.
	dtype = None  # None lets the loader auto-detect (float16 on T4/V100, bfloat16 on Ampere+).
	load_in_4bit = True  # 4-bit quantization to reduce memory usage. Can be False.

	# Reference list of pre-quantized 4-bit checkpoints published by unsloth.
	# Informational only: nothing below reads this list.
	fourbit_models = [
	    "unsloth/mistral-7b-bnb-4bit",
	    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
	    "unsloth/llama-2-7b-bnb-4bit",
	    "unsloth/llama-2-13b-bnb-4bit",
	    "unsloth/codellama-34b-bnb-4bit",
	    "unsloth/tinyllama-bnb-4bit",
	    "unsloth/gemma-7b-bnb-4bit",  # New Google 6 trillion tokens model 2.5x faster!
	    "unsloth/gemma-2b-bnb-4bit",
	]  # More models at https://huggingface.co/unsloth

	# Load the quantized base model together with its tokenizer.
	model, tokenizer = FastLanguageModel.from_pretrained(
	    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
	    max_seq_length=max_seq_length,
	    dtype=dtype,
	    load_in_4bit=load_in_4bit,
	    token=auth_token,  # Needed for gated/private repos.
	)

	# Apply the fine-tuned adapter on top of the base model. `token` replaces the
	# deprecated `use_auth_token` keyword and matches the base-model call above.
	model = PeftModel.from_pretrained(
	    model,
	    "Renjith95/renj-portfolio-finetuned-adapter",
	    token=auth_token,
	)

	# Switch unsloth's model into its inference mode before serving requests.
	FastLanguageModel.for_inference(model)

	# Streams decoded tokens to stdout when passed to generate(streamer=...).
	# Created here but not passed to any generate() call in this file.
	text_streamer = TextStreamer(tokenizer, skip_prompt=True)

	# NOTE(review): unsloth's documentation recommends importing `unsloth` before
	# `transformers`/`peft` so its patches apply - confirm the import order.
	"""
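	Chat handler for the Gradio UI. Responses are generated by the model loaded in this file; the `huggingface_hub` Inference API is not used here. For more information on that hosted alternative, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference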
	"""
	def _history_to_messages(history):
	    """Convert Gradio chat history into a list of role/content dicts.

	    Two history layouts are accepted:

	    * a sequence of ``(user_msg, assistant_msg)`` pairs, and
	    * a sequence of ``{"role": ..., "content": ...}`` dicts.

	    Entries whose content is ``None`` are skipped so that no ``None`` value
	    reaches the chat template.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            role = turn.get("role")
	            content = turn.get("content")
	            if role and content is not None:
	                messages.append({"role": role, "content": content})
	            continue

	        user_msg, assistant_msg = turn
	        if user_msg is not None:
	            messages.append({"role": "user", "content": user_msg})
	        if assistant_msg is not None:
	            messages.append({"role": "assistant", "content": assistant_msg})
	    return messages


	def respond(message, history):
	    """Generate the assistant's reply to ``message`` given the prior chat.

	    Args:
	        message: The latest user message (text).
	        history: Previous turns as supplied by ``gr.ChatInterface``; either
	            ``(user, assistant)`` pairs or role/content dicts.

	    Returns:
	        The decoded text of the newly generated tokens only (the prompt is
	        stripped), with special tokens removed.
	    """
	    messages = _history_to_messages(history)
	    messages.append({"role": "user", "content": message})

	    # Render the conversation with the tokenizer's chat template and move the
	    # resulting token ids to the model's device.
	    inputs = tokenizer.apply_chat_template(
	        messages,
	        tokenize=True,
	        add_generation_prompt=True,
	        return_tensors="pt",
	    ).to(model.device)

	    outputs = model.generate(
	        input_ids=inputs,
	        max_new_tokens=512,
	        use_cache=True,
	        # Sampling must be enabled explicitly; without it temperature/top_p
	        # are ignored and decoding is greedy.
	        do_sample=True,
	        temperature=0.7,
	        top_p=0.95,
	    )

	    # generate() returns prompt + completion; keep only the completion.
	    prompt_length = inputs.shape[1]
	    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

	# Chat UI: Gradio calls `respond(message, history)` for every user message.
	demo = gr.ChatInterface(
	    fn=respond,
	    title="Renj Chatbot",
	    description="Ask me anything about my portfolio and projects.",
	)

	# Start the server only when this file is executed as a script.
	# share=True asks Gradio to create a public share link.
	if __name__ == "__main__":
	    demo.launch(share=True)