Does not run on a Colab T4.
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=100, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=1000, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 256.19it/s]
100%|██████████| 32/32 [00:00<00:00, 2681.67it/s]
100%|██████████| 32/32 [00:00<00:00, 46.53it/s]
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-13 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
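The RuntimeError says the rotary-embedding matmul is mixing a cpu tensor with a cuda:0 tensor, i.e. the input ids are on the GPU but some part of the quantized model apparently is not. A minimal diagnostic sketch, assuming the HQQ-wrapped model behaves like a regular torch.nn.Module (and noting that .cuda() may or may not move HQQ's packed weights):

# List every device the model's parameters/buffers currently live on;
# the traceback implies at least one of them is still 'cpu'.
devices = {p.device for p in model.parameters()} | {b.device for b in model.buffers()}
print("model devices:", devices)

# Plain-PyTorch workaround attempt -- hedged: depending on how hqq registers its
# quantized weights, .cuda() might not move everything.
if any(d.type == "cpu" for d in devices):
    model = model.cuda()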
The same error occurs with the 4-bit model:

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=100, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=1000, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 209.51it/s]
100%|██████████| 32/32 [00:00<00:00, 3212.56it/s]
100%|██████████| 32/32 [00:00<00:00, 451.46it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Exception in thread Thread-14 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
The same error also shows up in a fresh runtime (Python 3.11), even with max_new_tokens=10:

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=10, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=10, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 464.39it/s]
100%|██████████| 32/32 [00:00<00:00, 4014.17it/s]
100%|██████████| 32/32 [00:00<00:00, 993.45it/s]
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-13 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/usr/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 2252, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 3251, in _sample
outputs = self(**model_inputs, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 1163, in forward
outputs = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 889, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 159, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 0>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.11/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
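Independently of the device error, every run also warns that no attention_mask / pad_token_id was passed. A hedged variant of process_conversation that returns the attention mask as well (process_conversation_with_mask is a hypothetical helper, and it assumes a transformers version whose apply_chat_template accepts return_dict=True; it does not touch the cpu/cuda problem):

def process_conversation_with_mask(chat):
    conversation = []
    if chat['system_prompt']:
        conversation.append({"role": "system", "content": chat['system_prompt']})
    for user, assistant in chat['chat_history']:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": chat['message']})
    # return_dict=True yields both input_ids and attention_mask as tensors
    inputs = tokenizer.apply_chat_template(conversation, return_dict=True, return_tensors="pt")
    return {k: v.to('cuda') for k, v in inputs.items()}

# chat_processor would then build its kwargs from the whole dict, e.g.:
#   generate_params = dict(process_conversation_with_mask(chat), streamer=streamer, max_new_tokens=max_new_tokens, ...)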
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
# Instead of using a separate thread, we'll use the streamer directly in the main thread.
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=10, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    # Removed the thread and instead call generate directly
    model.generate(**generate_params)

    outputs = []
    for text in streamer:  # Now this loop will wait for text from the streamer
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=10, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 230.17it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:251: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 2021.59it/s]
100%|██████████| 32/32 [00:00<00:00, 414.25it/s]
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
RuntimeError Traceback (most recent call last)
in <cell line: 0>()
58 ###################################################################################################
59
---> 60 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
61 'chat_history':[],
62 'message':"How can I build a car?"
13 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, x, position_ids)
157 device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
158 with torch.autocast(device_type=device_type, enabled=False):
--> 159 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
160 emb = torch.cat((freqs, freqs), dim=-1)
161 cos = emb.cos()
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Can you replace HQQModelForCausalLM with AutoHQQHFModel?
from hqq.models.hf.base import AutoHQQHFModel
...
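For reference, a minimal loading sketch with AutoHQQHFModel could look like the following; the compute_dtype/device keyword arguments follow the hqq README, so treat the exact signature as an assumption for whichever hqq version is installed:

import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Passing device='cuda' at load time is intended to keep every weight/buffer on the GPU,
# which is what the cpu/cuda mismatch above is complaining about.
model = AutoHQQHFModel.from_quantized(model_id, compute_dtype=torch.float16, device='cuda')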