# Hugging Face Spaces page banner captured with this file (Space status: Sleeping).
# import gradio as gr | |
# from huggingface_hub import InferenceClient | |
# """ | |
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference | |
# """ | |
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") | |
# ## None type | |
# def respond( | |
# message: str, | |
# history: list[tuple[str, str]], # This will not be used | |
# system_message: str, | |
# max_tokens: int, | |
# temperature: float, | |
# top_p: float, | |
# ): | |
# messages = [{"role": "system", "content": system_message}] | |
# # Append only the latest user message | |
# messages.append({"role": "user", "content": message}) | |
# response = "" | |
# try: | |
# # Generate response from the model | |
# for message in client.chat_completion( | |
# messages, | |
# max_tokens=max_tokens, | |
# stream=True, | |
# temperature=temperature, | |
# top_p=top_p, | |
# ): | |
# if message.choices[0].delta.content is not None: | |
# token = message.choices[0].delta.content | |
# response += token | |
# yield response | |
# except Exception as e: | |
# yield f"An error occurred: {e}" | |
# ], | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
## Running smoothly: CHATBOT
# import gradio as gr | |
# from huggingface_hub import InferenceClient | |
# """ | |
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference | |
# """ | |
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") | |
# def respond( | |
# message: str, | |
# history: list[tuple[str, str]], # This will not be used | |
# system_message: str, | |
# max_tokens: int, | |
# temperature: float, | |
# top_p: float, | |
# ): | |
# # Build the messages list | |
# messages = [{"role": "system", "content": system_message}] | |
# messages.append({"role": "user", "content": message}) | |
# response = "" | |
# try: | |
# # Generate response from the model | |
# for msg in client.chat_completion( | |
# messages=messages, | |
# max_tokens=max_tokens, | |
# stream=True, | |
# temperature=temperature, | |
# top_p=top_p, | |
# ): | |
# if msg.choices[0].delta.content is not None: | |
# token = msg.choices[0].delta.content | |
# response += token | |
# yield response | |
# except Exception as e: | |
# yield f"An error occurred: {e}" | |
# """ | |
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface | |
# """ | |
# demo = gr.ChatInterface( | |
# respond, | |
# additional_inputs=[ | |
# gr.Textbox(value="You are a friendly Chatbot.", label="System message"), | |
# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), | |
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), | |
# gr.Slider( | |
# minimum=0.1, | |
# maximum=1.0, | |
# value=0.95, | |
# step=0.05, | |
# label="Top-p (nucleus sampling)", | |
# ), | |
# ], | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
### 26 aug Use a pipeline as a high-level Logic | |
# import spaces | |
# import os | |
# import subprocess | |
# from llama_cpp import Llama | |
# from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType | |
# from llama_cpp_agent.providers import LlamaCppPythonProvider | |
# from llama_cpp_agent.chat_history import BasicChatHistory | |
# from llama_cpp_agent.chat_history.messages import Roles | |
# import gradio as gr | |
# from huggingface_hub import hf_hub_download | |
# huggingface_token = os.getenv("HF_TOKEN") | |
# # Download the Meta-Llama-3.1-8B-Instruct model | |
# hf_hub_download( | |
# repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", | |
# filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf", | |
# local_dir="./models", | |
# token=huggingface_token | |
# ) | |
# llm = None | |
# llm_model = None | |
# @spaces.GPU(duration=120) | |
# def respond( | |
# message, | |
# history: list[tuple[str, str]], | |
# model, | |
# system_message, | |
# max_tokens, | |
# temperature, | |
# top_p, | |
# top_k, | |
# repeat_penalty, | |
# ): | |
# chat_template = MessagesFormatterType.GEMMA_2 | |
# global llm | |
# global llm_model | |
# # Load model only if it's not already loaded or if a new model is selected | |
# if llm is None or llm_model != model: | |
# try: | |
# llm = Llama( | |
# model_path=f"models/{model}", | |
# flash_attn=True, | |
# n_gpu_layers=81, # Adjust based on available GPU resources | |
# n_batch=1024, | |
# n_ctx=8192, | |
# ) | |
# llm_model = model | |
# except Exception as e: | |
# return f"Error loading model: {str(e)}" | |
# provider = LlamaCppPythonProvider(llm) | |
# agent = LlamaCppAgent( | |
# provider, | |
# system_prompt=f"{system_message}", | |
# predefined_messages_formatter_type=chat_template, | |
# debug_output=True | |
# ) | |
# settings = provider.get_provider_default_settings() | |
# settings.temperature = temperature | |
# settings.top_k = top_k | |
# settings.top_p = top_p | |
# settings.max_tokens = max_tokens | |
# settings.repeat_penalty = repeat_penalty | |
# settings.stream = True | |
# messages = BasicChatHistory() | |
# # Add user and assistant messages to the history | |
# for msn in history: | |
# user = {'role': Roles.user, 'content': msn[0]} | |
# assistant = {'role': Roles.assistant, 'content': msn[1]} | |
# messages.add_message(user) | |
# messages.add_message(assistant) | |
# # Stream the response | |
# try: | |
# stream = agent.get_chat_response( | |
# message, | |
# llm_sampling_settings=settings, | |
# chat_history=messages, | |
# returns_streaming_generator=True, | |
# print_output=False | |
# ) | |
# outputs = "" | |
# for output in stream: | |
# outputs += output | |
# yield outputs | |
# except Exception as e: | |
# yield f"Error during response generation: {str(e)}" | |
# description = """<p align="center">Using the Meta-Llama-3.1-8B-Instruct Model</p>""" | |
# demo = gr.ChatInterface( | |
# respond, | |
# additional_inputs=[ | |
# gr.Dropdown([ | |
# 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf' | |
# ], | |
# value="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf", | |
# label="Model" | |
# ), | |
# gr.Textbox(value="You are a helpful assistant.", label="System message"), | |
# gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"), | |
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), | |
# gr.Slider( | |
# minimum=0.1, | |
# maximum=1.0, | |
# value=0.95, | |
# step=0.05, | |
# label="Top-p", | |
# ), | |
# gr.Slider( | |
# minimum=0, | |
# maximum=100, | |
# value=40, | |
# step=1, | |
# label="Top-k", | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=2.0, | |
# value=1.1, | |
# step=0.1, | |
# label="Repetition penalty", | |
# ), | |
# ], | |
# retry_btn="Retry", | |
# undo_btn="Undo", | |
# clear_btn="Clear", | |
# submit_btn="Send", | |
# title="Chat with Meta-Llama-3.1-8B-Instruct using llama.cpp", | |
# description=description, | |
# chatbot=gr.Chatbot( | |
# scale=1, | |
# likeable=False, | |
# show_copy_button=True | |
# ) | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
####03 3.1 8b | |
# import os | |
# import time | |
# import spaces | |
# import torch | |
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig | |
# import gradio as gr | |
# from threading import Thread | |
# MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"] | |
# HF_TOKEN = os.environ.get("HF_API_TOKEN",None) | |
# print(HF_TOKEN,"######$$$$$$$$$$$$$$$") | |
# MODEL = os.environ.get("MODEL_ID","meta-llama/Meta-Llama-3.1-8B-Instruct") | |
# TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>" | |
# PLACEHOLDER = """ | |
# <center> | |
# <p>Hi! How can I help you today?</p> | |
# </center> | |
# """ | |
# CSS = """ | |
# .duplicate-button { | |
# margin: auto !important; | |
# color: white !important; | |
# background: black !important; | |
# border-radius: 100vh !important; | |
# } | |
# h3 { | |
# text-align: center; | |
# } | |
# """ | |
# device = "cuda" # for GPU usage or "cpu" for CPU usage | |
# quantization_config = BitsAndBytesConfig( | |
# load_in_4bit=True, | |
# bnb_4bit_compute_dtype=torch.bfloat16, | |
# bnb_4bit_use_double_quant=True, | |
# bnb_4bit_quant_type= "nf4") | |
# tokenizer = AutoTokenizer.from_pretrained(MODEL) | |
# model = AutoModelForCausalLM.from_pretrained( | |
# MODEL, | |
# torch_dtype=torch.bfloat16, | |
# device_map="auto", | |
# quantization_config=quantization_config) | |
# @spaces.GPU() | |
# def stream_chat( | |
# message: str, | |
# history: list, | |
# system_prompt: str, | |
# temperature: float = 0.8, | |
# max_new_tokens: int = 1024, | |
# top_p: float = 1.0, | |
# top_k: int = 20, | |
# penalty: float = 1.2, | |
# ): | |
# print(f'message: {message}') | |
# print(f'history: {history}') | |
# conversation = [ | |
# {"role": "system", "content": system_prompt} | |
# ] | |
# for prompt, answer in history: | |
# conversation.extend([ | |
# {"role": "user", "content": prompt}, | |
# {"role": "assistant", "content": answer}, | |
# ]) | |
# conversation.append({"role": "user", "content": message}) | |
# input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device) | |
# streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) | |
# generate_kwargs = dict( | |
# input_ids=input_ids, | |
# max_new_tokens = max_new_tokens, | |
# do_sample = False if temperature == 0 else True, | |
# top_p = top_p, | |
# top_k = top_k, | |
# temperature = temperature, | |
# repetition_penalty=penalty, | |
# eos_token_id=[128001,128008,128009], | |
# streamer=streamer, | |
# ) | |
# with torch.no_grad(): | |
# thread = Thread(target=model.generate, kwargs=generate_kwargs) | |
# thread.start() | |
# buffer = "" | |
# for new_text in streamer: | |
# buffer += new_text | |
# yield buffer | |
# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER) | |
# with gr.Blocks(css=CSS, theme="soft") as demo: | |
# gr.HTML(TITLE) | |
# gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") | |
# gr.ChatInterface( | |
# fn=stream_chat, | |
# chatbot=chatbot, | |
# fill_height=True, | |
# additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), | |
# additional_inputs=[ | |
# gr.Textbox( | |
# value="You are a helpful assistant", | |
# label="System Prompt", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0, | |
# maximum=1, | |
# step=0.1, | |
# value=0.8, | |
# label="Temperature", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=128, | |
# maximum=8192, | |
# step=1, | |
# value=1024, | |
# label="Max new tokens", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=1.0, | |
# step=0.1, | |
# value=1.0, | |
# label="top_p", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=1, | |
# maximum=20, | |
# step=1, | |
# value=20, | |
# label="top_k", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=2.0, | |
# step=0.1, | |
# value=1.2, | |
# label="Repetition penalty", | |
# render=False, | |
# ), | |
# ], | |
# examples=[ | |
# ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."], | |
# ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."], | |
# ["Tell me a random fun fact about the Roman Empire."], | |
# ["Show me a code snippet of a website's sticky header in CSS and JavaScript."], | |
# ], | |
# cache_examples=False, | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
###########new clientkey | |
# import os | |
# import time | |
# import spaces | |
# import torch | |
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer | |
# import gradio as gr | |
# from threading import Thread | |
# MODEL = "THUDM/LongWriter-llama3.1-8b" | |
# TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>" | |
# PLACEHOLDER = """ | |
# <center> | |
# <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p> | |
# </center> | |
# """ | |
# CSS = """ | |
# .duplicate-button { | |
# margin: auto !important; | |
# color: white !important; | |
# background: black !important; | |
# border-radius: 100vh !important; | |
# } | |
# h3 { | |
# text-align: center; | |
# } | |
# """ | |
# device = "cuda" if torch.cuda.is_available() else "cpu" | |
# tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True) | |
# model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto") | |
# model = model.eval() | |
# @spaces.GPU() | |
# def stream_chat( | |
# message: str, | |
# history: list, | |
# system_prompt: str, | |
# temperature: float = 0.5, | |
# max_new_tokens: int = 32768, | |
# top_p: float = 1.0, | |
# top_k: int = 50, | |
# ): | |
# print(f'message: {message}') | |
# print(f'history: {history}') | |
# full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n" | |
# for prompt, answer in history: | |
# full_prompt += f"[INST]{prompt}[/INST]{answer}" | |
# full_prompt += f"[INST]{message}[/INST]" | |
# inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device) | |
# context_length = inputs.input_ids.shape[-1] | |
# streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) | |
# generate_kwargs = dict( | |
# inputs=inputs.input_ids, | |
# max_new_tokens=max_new_tokens, | |
# do_sample=True, | |
# top_p=top_p, | |
# top_k=top_k, | |
# temperature=temperature, | |
# num_beams=1, | |
# streamer=streamer, | |
# ) | |
# thread = Thread(target=model.generate, kwargs=generate_kwargs) | |
# thread.start() | |
# buffer = "" | |
# for new_text in streamer: | |
# buffer += new_text | |
# yield buffer | |
# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER) | |
# with gr.Blocks(css=CSS, theme="soft") as demo: | |
# gr.HTML(TITLE) | |
# gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") | |
# gr.ChatInterface( | |
# fn=stream_chat, | |
# chatbot=chatbot, | |
# fill_height=True, | |
# additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), | |
# additional_inputs=[ | |
# gr.Textbox( | |
# value="You are a helpful assistant capable of generating long-form content.", | |
# label="System Prompt", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0, | |
# maximum=1, | |
# step=0.1, | |
# value=0.5, | |
# label="Temperature", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=1024, | |
# maximum=32768, | |
# step=1024, | |
# value=32768, | |
# label="Max new tokens", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=0.0, | |
# maximum=1.0, | |
# step=0.1, | |
# value=1.0, | |
# label="Top p", | |
# render=False, | |
# ), | |
# gr.Slider( | |
# minimum=1, | |
# maximum=100, | |
# step=1, | |
# value=50, | |
# label="Top k", | |
# render=False, | |
# ), | |
# ], | |
# examples=[ | |
# ["Write a 5000-word comprehensive guide on machine learning for beginners."], | |
# ["Create a detailed 3000-word business plan for a sustainable energy startup."], | |
# ["Compose a 2000-word short story set in a futuristic underwater city."], | |
# ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."], | |
# ], | |
# cache_examples=False, | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model and constants
MODEL = "THUDM/LongWriter-llama3.1-8b"
TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
# HTML shown inside the empty chatbot before the first message.
PLACEHOLDER = """
<center>
<p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
</center>
"""
# Custom styling for the "Duplicate Space" button and level-3 headings.
CSS = """
.duplicate-button {
margin: auto !important;
color: white !important;
background: black !important;
border-radius: 100vh !important;
}
h3 {
text-align: center;
}
"""

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository; consider pinning a `revision` so that code cannot change
# underneath the app.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
).eval()  # inference only
def _build_prompt(system_prompt: str, history: list, message: str) -> str:
    """Render the system prompt, past turns and the new message as one prompt string."""
    past_turns = "".join(
        f"[INST]{prompt}[/INST]{answer}" for prompt, answer in history
    )
    return f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n{past_turns}[INST]{message}[/INST]"


def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.5,
    max_new_tokens: int = 4096,  # Lowered max tokens for efficiency
    top_p: float = 1.0,
    top_k: int = 50,
    max_prompt_tokens: int = 2048,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        history: Previous turns as ``(user_text, assistant_text)`` pairs.
        system_prompt: Text placed in the ``<<SYS>>`` section of the prompt.
        temperature: Sampling temperature. A value of 0 (or below) switches
            to greedy decoding, because sampling requires a strictly
            positive temperature.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k sampling cutoff (used only when sampling).
        max_prompt_tokens: Token budget for the prompt. The oldest history
            turns are dropped first so the newest message is preserved.

    Yields:
        The reply accumulated so far after each streamed chunk. On failure a
        single ``"An error occurred: ..."`` string is yielded instead of
        raising.
    """
    try:
        # Drop the oldest turns until the prompt fits. Plain right-side
        # truncation would cut off the newest message instead.
        turns = list(history or [])
        full_prompt = _build_prompt(system_prompt, turns, message)
        while turns and len(tokenizer(full_prompt)["input_ids"]) > max_prompt_tokens:
            del turns[0]
            full_prompt = _build_prompt(system_prompt, turns, message)

        # Tokenize input; truncation now only triggers when the system prompt
        # plus the newest message alone exceed the budget.
        inputs = tokenizer(
            full_prompt,
            truncation=True,
            max_length=max_prompt_tokens,
            return_tensors="pt",
        ).to(model.device)  # the model is placed by device_map="auto"

        # Setup TextIteratorStreamer for streaming response
        streamer = TextIteratorStreamer(
            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
        )

        # Generation parameters; **inputs also forwards the attention mask.
        generate_kwargs = dict(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=1,
            streamer=streamer,
        )
        if temperature > 0:
            generate_kwargs.update(
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
            )
        else:
            generate_kwargs["do_sample"] = False

        worker_errors = []

        def _generate():
            # Runs in the worker thread. A failure is recorded and the stream
            # is closed so the consumer below does not wait for the timeout.
            try:
                model.generate(**generate_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        # Generate text in a separate thread to avoid blocking
        thread = Thread(target=_generate)
        thread.start()

        # Stream response
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer

        thread.join()
        if worker_errors:
            # Re-raise here so the handler below reports it to the user.
            raise worker_errors[0]
    except Exception as e:
        yield f"An error occurred: {str(e)}"
# Gradio setup
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)


def _slider(label, minimum, maximum, step, value):
    """Build one unrendered generation-parameter slider."""
    return gr.Slider(
        minimum=minimum,
        maximum=maximum,
        step=step,
        value=value,
        label=label,
        render=False,
    )


with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )
    # The extra inputs are created with render=False and handed to
    # ChatInterface together with the accordion they belong to.
    # Their order matches stream_chat's parameters after (message, history).
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful assistant capable of generating long-form content.",
                label="System Prompt",
                render=False,
            ),
            _slider("Temperature", 0, 1, 0.1, 0.5),
            # Maximum reduced to a more manageable value
            _slider("Max new tokens", 1024, 4096, 1024, 4096),
            _slider("Top p", 0.0, 1.0, 0.1, 1.0),
            _slider("Top k", 1, 100, 1, 50),
        ],
        # examples=[
        #     ["Write a 5000-word comprehensive guide on machine learning for beginners."],
        #     ["Create a detailed 3000-word business plan for a sustainable energy startup."],
        #     ["Compose a 2000-word short story set in a futuristic underwater city."],
        #     ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
        # ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()