# Spaces:
# Sleeping
# Sleeping
# import gradio as gr | |
# from huggingface_hub import InferenceClient | |
# """ | |
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference | |
# """ | |
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") | |
# ## None type | |
# def respond( | |
# message: str, | |
# history: list[tuple[str, str]], # This will not be used | |
# system_message: str, | |
# max_tokens: int, | |
# temperature: float, | |
# top_p: float, | |
# ): | |
# messages = [{"role": "system", "content": system_message}] | |
# # Append only the latest user message | |
# messages.append({"role": "user", "content": message}) | |
# response = "" | |
# try: | |
# # Generate response from the model | |
# for message in client.chat_completion( | |
# messages, | |
# max_tokens=max_tokens, | |
# stream=True, | |
# temperature=temperature, | |
# top_p=top_p, | |
# ): | |
# if message.choices[0].delta.content is not None: | |
# token = message.choices[0].delta.content | |
# response += token | |
# yield response | |
# except Exception as e: | |
# yield f"An error occurred: {e}" | |
# ], | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
## Running smoothly CHATBOT | |
# import gradio as gr | |
# from huggingface_hub import InferenceClient | |
# """ | |
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference | |
# """ | |
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") | |
# def respond( | |
# message: str, | |
# history: list[tuple[str, str]], # This will not be used | |
# system_message: str, | |
# max_tokens: int, | |
# temperature: float, | |
# top_p: float, | |
# ): | |
# # Build the messages list | |
# messages = [{"role": "system", "content": system_message}] | |
# messages.append({"role": "user", "content": message}) | |
# response = "" | |
# try: | |
# # Generate response from the model | |
# for msg in client.chat_completion( | |
# messages=messages, | |
# max_tokens=max_tokens, | |
# stream=True, | |
# temperature=temperature, | |
# top_p=top_p, | |
# ): | |
# if msg.choices[0].delta.content is not None: | |
# token = msg.choices[0].delta.content | |
# response += token | |
# yield response | |
# except Exception as e: | |
# yield f"An error occurred: {e}" | |
# """ | |
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface | |
# """ | |
# demo = gr.ChatInterface( | |
# respond, | |
# additional_inputs=[ | |
# gr.Textbox(value="You are a friendly Chatbot.", label="System message"), | |
# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), | |
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), | |
# gr.Slider( | |
# minimum=0.1, | |
# maximum=1.0, | |
# value=0.95, | |
# step=0.05, | |
# label="Top-p (nucleus sampling)", | |
# ), | |
# ], | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
### 20aug | |
# import os | |
# import time | |
# import spaces | |
# import torch | |
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig | |
# import gradio as gr | |
# from threading import Thread | |
# MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"] | |
# HF_TOKEN = os.environ.get("HF_API_TOKEN", None) | |
# MODEL = os.environ.get("MODEL_ID") | |
# TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>" | |
# PLACEHOLDER = """ | |
# <center> | |
# <p>Hi! How can I help you today?</p> | |
# </center> | |
# """ | |
# CSS = """ | |
# .duplicate-button { | |
# margin: auto !important; | |
# color: white !important; | |
# background: black !important; | |
# border-radius: 100vh !important; | |
# } | |
# h3 { | |
# text-align: center; | |
# } | |
# """ | |
# device = "cuda" # for GPU usage or "cpu" for CPU usage | |
# quantization_config = BitsAndBytesConfig( | |
# load_in_4bit=True, | |
# bnb_4bit_compute_dtype=torch.bfloat16, | |
# bnb_4bit_use_double_quant=True, | |
# bnb_4bit_quant_type= "nf4") | |
# tokenizer = AutoTokenizer.from_pretrained(MODEL) | |
# model = AutoModelForCausalLM.from_pretrained( | |
# MODEL, | |
# torch_dtype=torch.bfloat16, | |
# device_map="auto", | |
# quantization_config=quantization_config) | |
# @spaces.GPU() | |
# def stream_chat( | |
# message: str, | |
# history: list, | |
# system_prompt: str, | |
# temperature: float = 0.8, | |
# max_new_tokens: int = 1024, | |
# top_p: float = 1.0, | |
# top_k: int = 20, | |
# penalty: float = 1.2, | |
# ): | |
# print(f'message: {message}') | |
# print(f'history: {history}') | |
# conversation = [ | |
# {"role": "system", "content": system_prompt} | |
# ] | |
# for prompt, answer in history: | |
# conversation.extend([ | |
# {"role": "user", "content": prompt}, | |
# {"role": "assistant", "content": answer}, | |
# ]) | |
# conversation.append({"role": "user", "content": message}) | |
# input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device) | |
# streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) | |
# generate_kwargs = dict( | |
# input_ids=input_ids, | |
# max_new_tokens = max_new_tokens, | |
# do_sample = False if temperature == 0 else True, | |
# top_p = top_p, | |
# top_k = top_k, | |
# temperature = temperature, | |
# repetition_penalty=penalty, | |
# eos_token_id=[128001,128008,128009], | |
# streamer=streamer, | |
# ) | |
# with torch.no_grad(): | |
# thread = Thread(target=model.generate, kwargs=generate_kwargs) | |
# thread.start() | |
# buffer = "" | |
# for new_text in streamer: | |
# buffer += new_text | |
# yield buffer | |
# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER) | |
# with gr.Blocks(css=CSS, theme="soft") as demo: | |
# gr.HTML(TITLE) | |
# gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") | |
# gr.ChatInterface( | |
# fn=stream_chat, | |
# chatbot=chatbot, | |
# fill_height=True, | |
# additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), | |
# additional_inputs=[ | |
# gr.Textbox( | |
# value="You are a helpful assistant", | |
# label="System Prompt", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0, | |
# maximum=1, | |
# step=0.1, | |
# value=0.8, | |
# label="Temperature", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=128, | |
# maximum=8192, | |
# step=1, | |
# value=1024, | |
# label="Max new tokens", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=1.0, | |
# step=0.1, | |
# value=1.0, | |
# label="top_p", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=1, | |
# maximum=20, | |
# step=1, | |
# value=20, | |
# label="top_k", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=2.0, | |
# step=0.1, | |
# value=1.2, | |
# label="Repetition penalty", | |
# render=False, | |
# ), | |
# ], | |
# examples=[ | |
# ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."], | |
# ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."], | |
# ["Tell me a random fun fact about the Roman Empire."], | |
# ["Show me a code snippet of a website's sticky header in CSS and JavaScript."], | |
# ], | |
# cache_examples=False, | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
import os | |
import gradio as gr | |
from huggingface_hub import InferenceClient | |
# Hugging Face configuration: id of the hosted model that chat requests are sent to.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Placeholder only -- never commit a real access token to source control.
# token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# Create the inference client with its defaults: no model or token is passed
# here; the model is supplied per request via `model=model_name`.
# NOTE(review): presumably credentials come from the environment or a cached
# login -- confirm before deploying against a gated model.
inference_client = InferenceClient()
def chat_completion(message, history):
    """Return the model's reply to ``message`` for the Gradio chat UI.

    Args:
        message: Latest user message typed into the chat box.
        history: Previous turns supplied by ``gr.ChatInterface``. It is
            accepted to satisfy the callback signature but is not sent to
            the model; every request is single-turn.

    Returns:
        The assistant's reply text (an empty string if the model returned
        no content).
    """
    # Chat requests go through `InferenceClient.chat_completion(...)`;
    # `inference_client.chat` is not a callable request method.
    response = inference_client.chat_completion(
        messages=[{"role": "user", "content": message}],
        model=model_name,
        max_tokens=500,
        stream=False,
    )
    # A non-streaming response carries the full reply under `message`;
    # `delta` only exists on the chunks of a streamed response.
    reply = response.choices[0].message.content
    # Normalise a missing reply to "" so the UI always receives a string.
    return reply or ""
# Create the Gradio chat interface, using chat_completion as the reply handler.
chatbot = gr.ChatInterface(fn=chat_completion)
# Start serving the interface.
# NOTE(review): this runs whenever the module is executed or imported because
# there is no `if __name__ == "__main__":` guard -- confirm that is intended.
chatbot.launch()