Update joycaption.py
joycaption.py  CHANGED  (+132, -130)
@@ -267,144 +267,146 @@ load_text_model(MODEL_PATH, None, LOAD_IN_NF4, True)
 
 @spaces.GPU()
 @torch.inference_mode()
-@demo.queue()
 def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_length: Union[str, int], extra_options: list[str], name_input: str, custom_prompt: str,
                     max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, model_name: str=MODEL_PATH, progress=gr.Progress(track_tqdm=True)) -> tuple[str, str]:
 
+    try:
+        global tokenizer, text_model, image_adapter, pixtral_model, pixtral_processor, text_model_client, use_inference_client
+        torch.cuda.empty_cache()
+        gc.collect()
 
+        # 'any' means no length specified
+        length = None if caption_length == "any" else caption_length
 
+        if isinstance(length, str):
+            try:
+                length = int(length)
+            except ValueError:
+                pass
+
+        # Build prompt
+        if length is None:
+            map_idx = 0
+        elif isinstance(length, int):
+            map_idx = 1
+        elif isinstance(length, str):
+            map_idx = 2
+        else:
+            raise ValueError(f"Invalid caption length: {length}")
+
+        prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]
+
+        # Add extra options
+        if len(extra_options) > 0:
+            prompt_str += " " + " ".join(extra_options)
+
+        # Add name, length, word_count
+        prompt_str = prompt_str.format(name=name_input, length=caption_length, word_count=caption_length)
 
+        if custom_prompt.strip() != "":
+            prompt_str = custom_prompt.strip()
+
+        # For debugging
+        print(f"Prompt: {prompt_str}")
+
+        # Pixtral
+        if model_name in PIXTRAL_PATHS:
+            print(f"pixtral_model: {type(pixtral_model)}") #
+            print(f"pixtral_processor: {type(pixtral_processor)}") #
+            input_images = [input_image.convert("RGB")]
+            input_prompt = "[INST]Caption this image:\n[IMG][/INST]"
+            inputs = pixtral_processor(images=input_images, text=input_prompt, return_tensors="pt").to(device)
+            generate_ids = pixtral_model.generate(**inputs, max_new_tokens=max_new_tokens)
+            output = pixtral_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+            return input_prompt, output.strip()
+
+        # Preprocess image
+        # NOTE: I found the default processor for so400M to have worse results than just using PIL directly
+        #image = clip_processor(images=input_image, return_tensors='pt').pixel_values
+        image = input_image.resize((384, 384), Image.LANCZOS)
+        pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+        pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+        pixel_values = pixel_values.to(device)
+
+        # Embed image
+        # This results in Batch x Image Tokens x Features
+        with torch.amp.autocast_mode.autocast(device, enabled=True):
+            vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+            image_features = vision_outputs.hidden_states
+            embedded_images = image_adapter(image_features)
+            embedded_images = embedded_images.to(device)
+
+        # Build the conversation
+        convo = [
+            {
+                "role": "system",
+                "content": "You are a helpful image captioner.",
+            },
+            {
+                "role": "user",
+                "content": prompt_str,
+            },
+        ]
+
+        # Format the conversation
+        convo_string = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = True)
+        assert isinstance(convo_string, str)
+
+        # Tokenize the conversation
+        # prompt_str is tokenized separately so we can do the calculations below
+        convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
+        prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
+        assert isinstance(convo_tokens, torch.Tensor) and isinstance(prompt_tokens, torch.Tensor)
+        convo_tokens = convo_tokens.squeeze(0)   # Squeeze just to make the following easier
+        prompt_tokens = prompt_tokens.squeeze(0)
+
+        # Calculate where to inject the image
+        eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
+        assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}"
+
+        preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]   # Number of tokens before the prompt
 
+        # Embed the tokens
+        convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to(device))
 
+        # Construct the input
+        input_embeds = torch.cat([
+            convo_embeds[:, :preamble_len],   # Part before the prompt
+            embedded_images.to(dtype=convo_embeds.dtype),   # Image
+            convo_embeds[:, preamble_len:],   # The prompt and anything after it
+        ], dim=1).to(device)
+
+        input_ids = torch.cat([
+            convo_tokens[:preamble_len].unsqueeze(0),
+            torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),   # Dummy tokens for the image (TODO: Should probably use a special token here so as not to confuse any generation algorithms that might be inspecting the input)
+            convo_tokens[preamble_len:].unsqueeze(0),
+        ], dim=1).to(device)
+        attention_mask = torch.ones_like(input_ids)
+
+        # Debugging
+        #print(f"Input to model: {repr(tokenizer.decode(input_ids[0]))}")
+
+        text_model.to(device)
+        generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens,
+                                           do_sample=True, suppress_tokens=None, top_p=top_p, temperature=temperature)
+
+        # Trim off the prompt
+        generate_ids = generate_ids[:, input_ids.shape[1]:]
+        if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+            generate_ids = generate_ids[:, :-1]
+
+        caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+
+        return prompt_str, caption.strip()
+
+    except Exception as e:
+        print(e)
+    # https://huggingface.co/docs/transformers/v4.44.2/main_classes/text_generation#transformers.FlaxGenerationMixin.generate
+    # https://github.com/huggingface/transformers/issues/6535
+    # https://zenn.dev/hijikix/articles/8c445f4373fdcc ja
+    # https://github.com/ggerganov/llama.cpp/discussions/7712
+    # https://huggingface.co/docs/huggingface_hub/guides/inference#openai-compatibility
+    # https://huggingface.co/docs/huggingface_hub/v0.24.6/en/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation
 
 
 def is_repo_name(s):
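For reference, here is a minimal standalone sketch (not part of the commit) of the image-injection step used above: the chat template is tokenized once, the second <|eot_id|> marks the end of the preamble, and the image embeddings are spliced in at that point. All tensors, sizes, and index values below are dummies chosen purely for illustration.

```python
import torch

# Dummy stand-ins; in joycaption.py these come from the tokenizer, the vision tower,
# and the image adapter. The sizes are made up for this sketch.
hidden_size = 16
convo_tokens = torch.arange(12)                    # tokenized chat template (1-D)
prompt_tokens = torch.arange(4)                    # tokenized prompt_str (1-D)
eot_id_indices = [2, 9]                            # positions of the two <|eot_id|> tokens
convo_embeds = torch.randn(1, convo_tokens.shape[0], hidden_size)   # embed_tokens(convo_tokens)
embedded_images = torch.randn(1, 3, hidden_size)   # Batch x ImageTokens x Features

# Number of tokens before the user prompt: second <|eot_id|> minus the prompt length
preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]

# Splice the image embeddings between the preamble and the prompt
input_embeds = torch.cat([
    convo_embeds[:, :preamble_len],
    embedded_images.to(dtype=convo_embeds.dtype),
    convo_embeds[:, preamble_len:],
], dim=1)

# Keep input_ids the same length by padding the image span with dummy token ids
input_ids = torch.cat([
    convo_tokens[:preamble_len].unsqueeze(0),
    torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
    convo_tokens[preamble_len:].unsqueeze(0),
], dim=1)

assert input_embeds.shape[1] == input_ids.shape[1] == convo_tokens.shape[0] + embedded_images.shape[1]
```

The zero ids only keep input_ids aligned with input_embeds for generate(); the model conditions on the spliced embeddings, and the TODO in the diff notes that a dedicated placeholder token would be cleaner.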
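For context, a hypothetical call to the updated function. The argument names and defaults are taken from the signature above, while "Descriptive" as a CAPTION_TYPE_MAP key and example.jpg are assumptions for illustration. Because the new wrapper only prints a caught exception, a failed call now returns None instead of a (prompt, caption) tuple, which the sketch checks for.

```python
from PIL import Image

# Assumes joycaption.py's globals (tokenizer, text_model, image_adapter, ...) have
# already been loaded via load_text_model(MODEL_PATH, None, LOAD_IN_NF4, True).
image = Image.open("example.jpg")          # stand-in input image

result = stream_chat_mod(
    input_image=image,
    caption_type="Descriptive",            # assumed key of CAPTION_TYPE_MAP
    caption_length="any",                  # "any" means no length constraint
    extra_options=[],
    name_input="",
    custom_prompt="",
)

if result is not None:                     # None means the try/except above caught an error
    prompt_used, caption = result
    print(prompt_used)
    print(caption)
```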