Upload 2 files
- mm_utils.py  +0 -31
- mmalaya_arch.py  +44 -1
mm_utils.py
CHANGED
@@ -21,15 +21,6 @@ DEFAULT_IM_START_TOKEN = "<im_start>"
 DEFAULT_IM_END_TOKEN = "<im_end>"


-def disable_torch_init():
-    """
-    Disable the redundant torch default initialization to accelerate model creation.
-    """
-    import torch
-    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
-    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
-
-
 def load_image_from_base64(image):
     return Image.open(BytesIO(base64.b64decode(image)))

@@ -63,28 +54,6 @@ def process_images(images, image_processor, model_cfg):
     return new_images


-def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
-    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
-
-    def insert_separator(X, sep):
-        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
-
-    input_ids = []
-    offset = 0
-    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
-        offset = 1
-        input_ids.append(prompt_chunks[0][0])
-
-    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
-        input_ids.extend(x[offset:])
-
-    if return_tensors is not None:
-        if return_tensors == 'pt':
-            return torch.tensor(input_ids, dtype=torch.long)
-        raise ValueError(f'Unsupported tensor type: {return_tensors}')
-    return input_ids
-
-
 def get_model_name_from_path(model_path):
     model_path = model_path.strip("/")
     model_paths = model_path.split("/")
mmalaya_arch.py
CHANGED
@@ -4,6 +4,7 @@ import torch
 import torch.nn as nn
 from transformers import Blip2Model, Blip2Processor, Blip2Config
 from .mm_utils import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+from .mm_utils import conv_templates


 class BLIP2VisionTower(nn.Module):
@@ -265,6 +266,48 @@ class MMAlayaMetaForCausalLM(ABC):

         return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

-    def
+    def initialize_tokenizer(self, tokenizer):
         tokenizer.add_tokens([DEFAULT_IMAGE_TOKEN], special_tokens=True)
         self.resize_token_embeddings(len(tokenizer))
+
+    def prepare_for_inference(
+        self,
+        prompt,
+        tokenizer,
+        image,
+        image_token_index=IMAGE_TOKEN_INDEX,
+        return_tensors=None
+    ):
+        # Load the conversation template
+        conv = conv_templates["mmalaya_llama"].copy()
+        inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
+        conv.append_message(conv.roles[0], inp)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+        def insert_separator(X, sep):
+            return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
+
+        input_ids = []
+        offset = 0
+        if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+            offset = 1
+            input_ids.append(prompt_chunks[0][0])
+
+        for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+            input_ids.extend(x[offset:])
+
+        if return_tensors is not None:
+            if return_tensors == 'pt':
+                return torch.tensor(input_ids, dtype=torch.long)
+            raise ValueError(f'Unsupported tensor type: {return_tensors}')
+
+        # Build the generation stopping criteria
+        stopping_criteria = KeywordsStoppingCriteria([conv.sep2], tokenizer, input_ids)
+        # Preprocess the image with the vision tower's image processor
+        image_processor = self.get_vision_tower().image_processor
+        image_tensor = image_processor(image, return_tensors='pt')['pixel_values'].half().cuda()
+
+        return input_ids, image_tensor, stopping_criteria