Spaces:
Runtime error
Runtime error
| import torch | |
| import os | |
| import shutil | |
| from transformers import AutoTokenizer | |
| from transformers import AutoModelForCausalLM | |
| from moe_infinity import MoE | |
| from typing import List, Tuple, Optional, Union | |
| from lm_eval.api.registry import register_model | |
| from src.backend.hflm_with_measurement import HFLMWithMeasurement | |
class MoEHFLM(HFLMWithMeasurement):
    """Evaluation-harness model wrapper that loads its checkpoint via MoE-Infinity.

    The model is built by :meth:`_create_model` using ``MoE`` from
    ``moe_infinity``. Expert weights are offloaded under
    ``<offload_path>/moe-infinity-offloads``; that directory is wiped when the
    wrapper is constructed and again when it is destroyed.
    """

    # Sub-directory of ``offload_path`` that is handed to MoE-Infinity and
    # removed on construction and destruction.
    _OFFLOAD_DIRNAME = "moe-infinity-offloads"
    # Device passed to the parent as ``device_map`` and to which ``generate``
    # inputs are moved.
    _TARGET_DEVICE = "cuda:0"

    def __init__(
        self,
        pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
        moe_config: Optional[dict] = None,
        offload_path=os.path.expanduser("~"),
        device_memory_ratio=0.75,
        use_chat_template=True,
        *args,
        **kwargs,
    ):
        """Store the MoE-Infinity settings, then delegate to the parent constructor.

        Args:
            pretrained: Checkpoint name or path given to ``MoE`` and to the parent.
            moe_config: Extra MoE-Infinity options; they override the defaults
                assembled in :meth:`_create_model`. ``None`` means no overrides.
            offload_path: Parent directory of the offload directory.
            device_memory_ratio: Default ``device_memory_ratio`` MoE-Infinity option.
            use_chat_template: Whether :meth:`tok_batch_encode` wraps each input
                in a single-turn user chat message.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor. ``device`` is dropped
                and ``device_map`` is overwritten.
        """
        # These attributes are assigned before super().__init__() because
        # _create_model() reads them; presumably the parent constructor is what
        # invokes _create_model() -- TODO confirm against HFLMWithMeasurement.
        self.checkpoint = pretrained
        self.moe_config = moe_config if moe_config is not None else {}
        self.offload_path = offload_path
        self.device_memory_ratio = device_memory_ratio
        self.use_chat_template = use_chat_template

        # The device is fixed below, so a caller-supplied one is discarded.
        kwargs.pop("device", None)
        # Start from a clean offload directory.
        self._remove_offload_dir()
        kwargs["device_map"] = self._TARGET_DEVICE
        super().__init__(*args, **kwargs, pretrained=pretrained)

    def _offload_dir(self) -> str:
        """Return the default offload directory below ``self.offload_path``."""
        return os.path.join(self.offload_path, self._OFFLOAD_DIRNAME)

    def _remove_offload_dir(self) -> None:
        """Delete the default offload directory if it exists."""
        offload_dir = self._offload_dir()
        if os.path.exists(offload_dir):
            shutil.rmtree(offload_dir)

    def __del__(self):
        """Release MoE-Infinity engine resources and delete the offload directory.

        ``__del__`` also runs on instances whose ``__init__`` failed part-way,
        so every attribute is looked up defensively instead of assuming that
        the model was created.
        """
        engine = getattr(getattr(self, "_model", None), "engine", None)
        try:
            if engine is not None:
                engine.clean_up()  # clean up hooks
                engine.archer_engine.clean_up_resources()  # clean up resources
        finally:
            # Remove the offloaded model even when the engine cleanup failed,
            # so the files are not left behind on disk.
            if hasattr(self, "offload_path"):
                self._remove_offload_dir()

    def _create_model(self, *args, **kwargs):
        """
        Initializes the MoE model from MoE-infinity with the provided configuration.

        The positional and keyword arguments are accepted but not used. The
        result is stored in ``self._model``.
        """
        # Defaults first; entries of the user-provided config win on conflict.
        final_moe_config = {
            "offload_path": self._offload_dir(),
            "device_memory_ratio": self.device_memory_ratio,
            **self.moe_config,
        }
        target_device = self._TARGET_DEVICE

        # dirty fix, to be removed when MoE-infinity supports move input to correct device
        def move_inputs_to_device(func):
            def wrapper(*call_args, **call_kwargs):
                # Ensure every tensor in the input is on the same device as the model.
                call_args = [
                    arg.to(target_device) if isinstance(arg, torch.Tensor) else arg
                    for arg in call_args
                ]
                call_kwargs = {
                    key: value.to(target_device) if isinstance(value, torch.Tensor) else value
                    for key, value in call_kwargs.items()
                }
                return func(*call_args, **call_kwargs)

            return wrapper

        self._model = MoE(self.checkpoint, final_moe_config)
        self._model.generate = move_inputs_to_device(self._model.generate)

    # NOTE(review): lm-eval's HFLM exposes ``max_length`` as a property; no
    # ``@property`` decorator is visible on this override -- confirm which form
    # the callers expect before relying on ``self.max_length`` as a value.
    def max_length(self):
        """Return the maximum sequence length to use for this model.

        Resolution order: a manually set ``self._max_length``, then the first
        of ``n_positions`` / ``max_position_embeddings`` / ``n_ctx`` found on
        ``self.model.model.config``, then the tokenizer's ``model_max_length``,
        and finally ``self._DEFAULT_MAX_LENGTH``.
        """
        if self._max_length:  # if max length manually set, return it
            return self._max_length

        config = self.model.model.config
        for attr in ("n_positions", "max_position_embeddings", "n_ctx"):
            if hasattr(config, attr):
                return getattr(config, attr)

        if hasattr(self.tokenizer, "model_max_length"):
            # This literal equals int(1e30); presumably it is the tokenizer's
            # "no limit configured" sentinel, so fall back to the default.
            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
                return self._DEFAULT_MAX_LENGTH
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

    def tok_batch_encode(
        self,
        strings: List[str],
        padding_side: str = "left",
        left_truncate_len: Optional[int] = None,
        truncation: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of strings into padded ``input_ids`` and ``attention_mask``.

        Args:
            strings: Raw input strings.
            padding_side: Padding side set on the tokenizer for this call only.
            left_truncate_len: If truthy, keep only the last
                ``left_truncate_len`` columns of both returned tensors.
            truncation: Forwarded to the tokenizer.

        Returns:
            The ``(input_ids, attention_mask)`` tensors of the batch.
        """
        if self.use_chat_template:
            # Best effort: if templating fails for any input, the whole batch
            # is encoded from the raw strings instead.
            try:
                strings = [
                    self.tokenizer.apply_chat_template(
                        [{"role": "user", "content": f"{input_string}"}],
                        tokenize=False,
                    )
                    for input_string in strings
                ]
            except Exception as err:
                print(f"failed to update input string with chat template: {self._model} ({err!r})")

        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
        old_padding_side = self.tokenizer.padding_side
        self.tokenizer.padding_side = padding_side
        try:
            encoding = self.tokenizer(
                strings,
                truncation=truncation,
                padding="longest",
                return_tensors="pt",
                add_special_tokens=False,
            )
        finally:
            # Always restore the tokenizer state, even if encoding raised.
            self.tokenizer.padding_side = old_padding_side

        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        if left_truncate_len:
            input_ids = input_ids[:, -left_truncate_len:]
            attention_mask = attention_mask[:, -left_truncate_len:]
        return input_ids, attention_mask