Does not run on a Colab T4.
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=100, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=1000, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 256.19it/s]
100%|██████████| 32/32 [00:00<00:00, 2681.67it/s]
100%|██████████| 32/32 [00:00<00:00, 46.53it/s]
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:590: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:595: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-13 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
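The RuntimeError says the rotary-embedding matmul is mixing a cpu tensor with a cuda:0 tensor, i.e. the input ids are on the GPU but some part of the quantized model apparently is not. A minimal diagnostic sketch, assuming the HQQ-wrapped model behaves like a regular torch.nn.Module (and noting that .cuda() may or may not move HQQ's packed weights):

# List every device the model's parameters/buffers currently live on;
# the traceback implies at least one of them is still 'cpu'.
devices = {p.device for p in model.parameters()} | {b.device for b in model.buffers()}
print("model devices:", devices)

# Plain-PyTorch workaround attempt -- hedged: depending on how hqq registers its
# quantized weights, .cuda() might not move everything.
if any(d.type == "cpu" for d in devices):
    model = model.cuda()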
The same error occurs with the 4-bit model:

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=100, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=1000, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 209.51it/s]
100%|██████████| 32/32 [00:00<00:00, 3212.56it/s]
100%|██████████| 32/32 [00:00<00:00, 451.46it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Exception in thread Thread-14 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
result = self._sample(
File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 59>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
The same error also shows up in a fresh runtime (Python 3.11), even with max_new_tokens=10:

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=10, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=10, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 464.39it/s]
100%|██████████| 32/32 [00:00<00:00, 4014.17it/s]
100%|██████████| 32/32 [00:00<00:00, 993.45it/s]
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-13 (generate):
Traceback (most recent call last):
File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/usr/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 2252, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 3251, in _sample
outputs = self(**model_inputs, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 1163, in forward
outputs = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 889, in forward
position_embeddings = self.rotary_emb(hidden_states, position_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 159, in forward
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Empty Traceback (most recent call last)
in <cell line: 0>()
57 ###################################################################################################
58
---> 59 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
60 'chat_history':[],
61 'message':"How can I build a car?"
2 frames
/usr/lib/python3.11/queue.py in get(self, block, timeout)
177 remaining = endtime - time()
178 if remaining <= 0.0:
--> 179 raise Empty
180 self.not_empty.wait(remaining)
181 item = self._get()
Empty:
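Independently of the device error, every run also warns that no attention_mask / pad_token_id was passed. A hedged variant of process_conversation that returns the attention mask as well (process_conversation_with_mask is a hypothetical helper, and it assumes a transformers version whose apply_chat_template accepts return_dict=True; it does not touch the cpu/cuda problem):

def process_conversation_with_mask(chat):
    conversation = []
    if chat['system_prompt']:
        conversation.append({"role": "system", "content": chat['system_prompt']})
    for user, assistant in chat['chat_history']:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": chat['message']})
    # return_dict=True yields both input_ids and attention_mask as tensors
    inputs = tokenizer.apply_chat_template(conversation, return_dict=True, return_tensors="pt")
    return {k: v.to('cuda') for k, v in inputs.items()}

# chat_processor would then build its kwargs from the whole dict, e.g.:
#   generate_params = dict(process_conversation_with_mask(chat), streamer=streamer, max_new_tokens=max_new_tokens, ...)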
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)
##########################################################################################################
import transformers
# Instead of using a separate thread, we'll use the streamer directly in the main thread.
from threading import Thread
from sys import stdout

def print_flush(data):
    stdout.write("\r" + data)
    stdout.flush()

# Adapted from https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/app.py
def process_conversation(chat):
    system_prompt = chat['system_prompt']
    chat_history = chat['chat_history']
    message = chat['message']

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to('cuda')

def chat_processor(chat, max_new_tokens=10, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        {"input_ids": process_conversation(chat)},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    # Removed the thread and instead call generate directly
    model.generate(**generate_params)

    outputs = []
    for text in streamer:  # Now this loop will wait for text from the streamer
        outputs.append(text)
        print_flush("".join(outputs))
    return outputs
###################################################################################################
outputs = chat_processor({'system_prompt': "You are a helpful assistant.",
                          'chat_history': [],
                          'message': "How can I build a car?"},
                         max_new_tokens=10, do_sample=False)
Fetching 9 files: 100% 9/9 [00:00<00:00, 230.17it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:251: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 2021.59it/s]
100%|██████████| 32/32 [00:00<00:00, 414.25it/s]
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:628: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/usr/local/lib/python3.11/dist-packages/transformers/generation/configuration_utils.py:633: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
RuntimeError Traceback (most recent call last)
in <cell line: 0>()
58 ###################################################################################################
59
---> 60 outputs = chat_processor({'system_prompt':"You are a helpful assistant.",
61 'chat_history':[],
62 'message':"How can I build a car?"
13 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, x, position_ids)
157 device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
158 with torch.autocast(device_type=device_type, enabled=False):
--> 159 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
160 emb = torch.cat((freqs, freqs), dim=-1)
161 cos = emb.cos()
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
Can you replace HQQModelForCausalLM with AutoHQQHFModel?
from hqq.models.hf.base import AutoHQQHFModel
...
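For reference, a minimal loading sketch with AutoHQQHFModel could look like the following; the compute_dtype/device keyword arguments follow the hqq README, so treat the exact signature as an assumption for whichever hqq version is installed:

import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel

model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf-4bit_g64-HQQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Passing device='cuda' at load time is intended to keep every weight/buffer on the GPU,
# which is what the cpu/cuda mismatch above is complaining about.
model = AutoHQQHFModel.from_quantized(model_id, compute_dtype=torch.float16, device='cuda')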