Spaces:

Renjith95
/

Renj-portfolio-ai-bot

Runtime error

File size: 3,014 Bytes

c5a91ee
bffceb3
453eab3
 
 
 
c5a91ee
 
453eab3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5a91ee
bffceb3
453eab3
 
 
bffceb3
 
 
453eab3
 
c5a91ee
 
 
bffceb3
 
c5a91ee
bffceb3
c5a91ee
 
 
 
 
 
 
453eab3
c5a91ee
453eab3
 
c5a91ee
453eab3
 
 
 
bffceb3
 
453eab3
 
bffceb3
 
 
910698f

import os
import gradio as gr
from transformers import TextStreamer
from peft import PeftModel
from unsloth import FastLanguageModel

# ---------------------------------------------------------------------------
# Model setup: load the 4-bit base model, attach the fine-tuned adapter, and
# switch the result to inference mode. Runs once, at import time.
# ---------------------------------------------------------------------------

# Repo id of the fine-tuned model. Nothing below reads it (the base model plus
# adapter are loaded instead); it is kept as a module-level name.
model_name = "Renjith95/renj-portfolio-finetuned-model"
# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is unset).
auth_token = os.getenv("HF_TOKEN")

# Settings for loading the base model.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: the loading call below does not read it.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=auth_token,  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
# Wrap the base model with the fine-tuned adapter. `token=` is used instead of
# the deprecated `use_auth_token=` so the credential is passed the same way as
# in the base-model call above.
model = PeftModel.from_pretrained(
    model,
    "Renjith95/renj-portfolio-finetuned-adapter",
    token=auth_token,
)
FastLanguageModel.for_inference(model)

# Streamer that skips the prompt when printing generated text. No code visible
# in this file passes it to `generate`; it is kept as a module-level name.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)


def respond(message, history):
    """Generate the assistant's reply to ``message`` given the earlier turns.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation. Two layouts are accepted:
            a sequence of ``(user_msg, assistant_msg)`` pairs, or a sequence
            of ``{"role": ..., "content": ...}`` dicts. ``None`` or an empty
            sequence means there is no prior conversation.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off and special tokens are skipped).
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Role/content style history: already in chat-template shape.
            if turn.get("content") is not None:
                messages.append(
                    {"role": turn["role"], "content": turn["content"]}
                )
            continue

        user_msg, assistant_msg = turn
        # Drop incomplete pairs as a whole. NOTE(review): presumably the chat
        # template expects alternating user/assistant turns, so keeping half
        # of a pair would be worse than skipping it -- confirm for this model.
        if user_msg is None or assistant_msg is None:
            continue
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=512,
        use_cache=True,
        # Sampling must be switched on explicitly; without it decoding is
        # greedy and `temperature` / `top_p` have no effect.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

    # `outputs[0]` holds prompt + completion; keep only the tokens that come
    # after the prompt length.
    prompt_length = inputs.shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    return response


# Chat UI: a Gradio ChatInterface that uses `respond` as its chat function.
demo = gr.ChatInterface(
    fn=respond,
    title="Renj Chatbot",
    description="Ask me anything about my portfolio and projects.",
)

if __name__ == "__main__":
    # Start the app only when run as a script, requesting a share link.
    demo.launch(share=True)