Marco committed
Commit · e6f1577
1 Parent(s): 1165597

Removed the preprocess fn from the modeling
modeling_ovis.py  CHANGED  (+0 -69)
@@ -480,75 +480,6 @@ class Ovis(OvisPreTrainedModel):
         pad_sequence = torch.nn.utils.rnn.pad_sequence([i.flip(dims=[0]) for i in sequences],batch_first=True, padding_value=padding_value).flip(dims=[1])
         return pad_sequence[:,-self.config.multimodal_max_length:]
 
-    def preprocess_inputs(
-            self,
-            text_or_conversations: Union[List[Dict], str],
-            images: Optional[List[PIL.Image.Image]],
-            max_partition=9,
-            generation_preface='',
-            return_labels=False,
-            propagate_exception=True,
-            frame_selector=None,
-            frame_selector_kwargs=None
-    ):
-        # convert text to conversations
-        if isinstance(text_or_conversations, str):
-            conversations = [{
-                "from": "human",
-                "value": text_or_conversations
-            }]
-        elif isinstance(text_or_conversations, list):
-            conversations = text_or_conversations
-        else:
-            raise ValueError(f'Invalid type of `text_or_conversations`, expected `List[Dict]` or `str`,'
-                             f' but got {type(text_or_conversations)}')
-
-        if frame_selector is not None:
-            frame_selector_kwargs = frame_selector_kwargs or {}
-            conversations, images = frame_selector(conversations=conversations, frames=images, **frame_selector_kwargs)
-
-        # format conversations
-        prompt, raw_input_ids, raw_labels = self.get_conversation_formatter().format(
-            conversations, generation_preface=generation_preface)
-
-        # place image placeholders
-        input_ids = []
-        labels = []
-        pixel_values = []
-        invalidate_label = False
-        image_token_indices = [i for i, v in enumerate(raw_input_ids) if v == IMAGE_TOKEN_ID]
-        last_image_token_index = -1
-        for i in range(len(image_token_indices)):
-            head = 0 if i == 0 else image_token_indices[i - 1] + 1
-            tail = image_token_indices[i]
-            last_image_token_index = tail
-            input_ids.extend(raw_input_ids[head:tail])
-            labels.extend(raw_labels[head:tail])
-            try:
-                image = images[i]
-                raw_pixel_values, image_placeholders = self.visual_tokenizer.preprocess_image(
-                    image, max_partition=max_partition)
-            except Exception as e:
-                if propagate_exception:
-                    raise e
-                logging.exception(e)
-                invalidate_label = True
-                raw_pixel_values, image_placeholders = self.visual_tokenizer.mock_input()
-            input_ids.extend(image_placeholders)
-            labels.extend([IGNORE_ID] * len(image_placeholders))
-            pixel_values.append(raw_pixel_values)
-        input_ids.extend(raw_input_ids[last_image_token_index + 1:])
-        labels.extend(raw_labels[last_image_token_index + 1:])
-
-        # return tensors
-        input_ids = torch.tensor(input_ids, dtype=torch.long)
-        labels = torch.tensor([IGNORE_ID] * len(labels) if invalidate_label else labels, dtype=torch.long)
-        pixel_values = torch.cat(pixel_values, dim=0) if len(pixel_values) > 0 else None
-
-        if return_labels:
-            return prompt, input_ids, pixel_values, labels
-        else:
-            return prompt, input_ids, pixel_values
 
     def save_pretrained(
             self,
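For context, the sketch below shows how the removed method could be called, inferred only from the signature visible in the diff above; the checkpoint id, image file, and prompt string are illustrative assumptions, not something this commit documents.

import PIL.Image
from transformers import AutoModelForCausalLM

# Assumed setup: an Ovis checkpoint loaded at a revision *before* this commit,
# since preprocess_inputs no longer exists on the modeling class afterwards.
# The repo id is a placeholder.
model = AutoModelForCausalLM.from_pretrained("AIDC-AI/Ovis2-8B", trust_remote_code=True)

# A plain string is wrapped into a one-turn "human" conversation;
# max_partition=9 matches the default in the removed signature.
prompt, input_ids, pixel_values = model.preprocess_inputs(
    "<image>\nDescribe this image.",
    [PIL.Image.open("example.jpg")],  # hypothetical image file
    max_partition=9,
)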