Spaces: Running on Zero

Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoConfig, AutoModelForCausalLM, pipeline as translation_pipeline
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
@@ -9,15 +9,32 @@ import os
 import time
 from Upsample import RealESRGAN
 import spaces  # Import spaces for ZeroGPU compatibility
+import re
+
+# Initialize the translation pipeline (Korean → English)
+translator = translation_pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+
+def translate_if_korean(prompt: str) -> str:
+    """Translate the prompt to English if it contains Korean."""
+    if re.search(r'[ㄱ-ㅎㅏ-ㅣ가-힣]', prompt):
+        try:
+            translation = translator(prompt)[0]['translation_text']
+            return translation
+        except Exception as e:
+            print(f"Translation error: {e}")
+            return prompt
+    return prompt
 
 # Load model and processor
 model_path = "deepseek-ai/Janus-Pro-7B"
 config = AutoConfig.from_pretrained(model_path)
 language_config = config.language_config
 language_config._attn_implementation = 'eager'
-vl_gpt = AutoModelForCausalLM.from_pretrained(
-    …
-    …
+vl_gpt = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    language_config=language_config,
+    trust_remote_code=True
+)
 if torch.cuda.is_available():
     vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
 else:
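A note on the new helper: `translate_if_korean` gates on a Hangul character class (jamo ㄱ-ㅎ/ㅏ-ㅣ plus composed syllables 가-힣) before invoking the Opus-MT ko→en pipeline, and falls back to the raw prompt on any failure. A minimal standalone sketch of the same pattern, assuming the Helsinki-NLP/opus-mt-ko-en weights can be downloaded in your environment (the sample prompts and printed translations are illustrative, not from this Space):

```python
import re
from transformers import pipeline

# Jamo (ㄱ-ㅎ, ㅏ-ㅣ) plus composed syllables (가-힣) cover typical Korean text
KOREAN_RE = re.compile(r'[ㄱ-ㅎㅏ-ㅣ가-힣]')

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def translate_if_korean(prompt: str) -> str:
    if not KOREAN_RE.search(prompt):
        return prompt                  # no Korean detected: pass through unchanged
    try:
        return translator(prompt)[0]["translation_text"]
    except Exception as e:             # on any failure, fall back to the raw prompt
        print(f"Translation error: {e}")
        return prompt

print(translate_if_korean("푸른 바다 위의 범선"))  # e.g. "A sailboat on the blue sea"
print(translate_if_korean("a red bicycle"))         # unchanged
```

Building the pipeline once at module import, as the commit does, avoids reloading the translation model on every generation request.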
@@ -28,16 +45,14 @@ tokenizer = vl_chat_processor.tokenizer
 cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 # SR model
-sr_model = RealESRGAN(torch.device(…
+sr_model = RealESRGAN(torch.device(cuda_device), scale=2)
 sr_model.load_weights('weights/RealESRGAN_x2.pth', download=False)
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 def multimodal_understanding(image, question, seed, top_p, temperature):
-    # …
+    # (snip) body of the existing multimodal understanding function, unchanged...
     torch.cuda.empty_cache()
-
-    # Set seed
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
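The super-resolution model is now built with an explicit device and scale. A usage sketch, assuming the bundled `Upsample.RealESRGAN` wrapper follows the common ai-forever Real-ESRGAN API where `predict()` takes and returns a PIL image (the `predict` call and the file names here are assumptions, not shown in this diff):

```python
import torch
from PIL import Image
from Upsample import RealESRGAN  # local module bundled with this Space

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sr_model = RealESRGAN(device, scale=2)  # x2 upscaler, matching RealESRGAN_x2.pth
sr_model.load_weights('weights/RealESRGAN_x2.pth', download=False)

lr_image = Image.open('generated.png').convert('RGB')  # e.g. a 384x384 generation
sr_image = sr_model.predict(lr_image)                  # assumed API: PIL in, 2x PIL out
sr_image.save('generated_x2.png')
```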
@@ -90,9 +105,11 @@ def generate(input_ids, width, height, temperature: float = 1,
     pkv = None
     for i in range(image_token_num_per_image):
         with torch.no_grad():
-            outputs = vl_gpt.language_model.model(
-                …
-                …
+            outputs = vl_gpt.language_model.model(
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                past_key_values=pkv
+            )
             pkv = outputs.past_key_values
             hidden_states = outputs.last_hidden_state
             logits = vl_gpt.gen_head(hidden_states[:, -1, :])
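The reconstructed call is the standard transformers KV-cache loop: each step feeds only the newest embedding and passes the returned `past_key_values` back in, so earlier positions are never re-encoded. A generic, runnable sketch of the same pattern with a small public model (gpt2 here is purely illustrative; Janus applies the identical loop to image-token embeddings via `inputs_embeds` instead of `input_ids`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

input_ids = tok("The quick brown", return_tensors="pt").input_ids
inputs = input_ids   # first step sees the whole prompt
pkv = None
with torch.no_grad():
    for _ in range(5):
        out = model(input_ids=inputs, use_cache=True, past_key_values=pkv)
        pkv = out.past_key_values                          # cache grows by one step
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        inputs = next_token                                # feed only the new token
        input_ids = torch.cat([input_ids, next_token], dim=1)

print(tok.decode(input_ids[0]))
```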
@@ -107,8 +124,10 @@ def generate(input_ids, width, height, temperature: float = 1,
             img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
             inputs_embeds = img_embeds.unsqueeze(dim=1)
 
-    patches = vl_gpt.gen_vision_model.decode_code(
-        …
+    patches = vl_gpt.gen_vision_model.decode_code(
+        generated_tokens.to(dtype=torch.int),
+        shape=[parallel_size, 8, width // patch_size, height // patch_size]
+    )
     return generated_tokens.to(dtype=torch.int), patches
 
 def unpack(dec, width, height, parallel_size=5):
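The `shape` argument lays the flat token sequence back onto a latent grid before the vision decoder turns it into pixels. With the values this demo is usually run at (a 384x384 canvas and 16-px patches; both are assumptions here, not shown in this hunk), the arithmetic works out as:

```python
width = height = 384   # assumed: the demo's usual canvas size
patch_size = 16        # assumed: Janus' vision patch size
parallel_size = 5      # images generated per prompt

shape = [parallel_size, 8, width // patch_size, height // patch_size]
print(shape)                                            # [5, 8, 24, 24]
print((width // patch_size) * (height // patch_size))   # 576 tokens per image
```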
@@ -121,6 +140,9 @@ def unpack(dec, width, height, parallel_size=5):
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0):
+    # Translation: if the input prompt contains Korean, convert it to English
+    prompt = translate_if_korean(prompt)
+
     torch.cuda.empty_cache()
     if seed is not None:
         torch.manual_seed(seed)
@@ -140,16 +162,20 @@ def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0):
     )
     text = text + vl_chat_processor.image_start_tag
     input_ids = torch.LongTensor(tokenizer.encode(text))
-    output, patches = generate(
-        …
+    output, patches = generate(
+        input_ids,
+        width // 16 * 16,
+        height // 16 * 16,
+        cfg_weight=guidance,
+        parallel_size=parallel_size,
+        temperature=t2i_temperature
+    )
+    images = unpack(
+        patches,
+        width // 16 * 16,
+        height // 16 * 16,
+        parallel_size=parallel_size
+    )
 
     stime = time.time()
     ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
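The `width // 16 * 16` idiom snaps an arbitrary requested dimension down to the nearest multiple of 16, so the canvas always divides evenly into 16-px patches. A quick check of what it does to a few inputs:

```python
# floor-divide then multiply: rounds down to a multiple of 16
for w in (384, 390, 400, 511):
    print(w, '->', w // 16 * 16)   # 384->384, 390->384, 400->400, 511->496
```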
@@ -231,8 +257,7 @@ with gr.Blocks(css=custom_css, title="Multimodal & T2I Demo") as demo:
     gr.Examples(
         label="Multimodal Understanding Examples",
         examples=[
-            ["explain this meme", "doge.png"],
-            ["이 이미지를 설명해줘", "korean_example.png"]
+            ["explain this meme", "doge.png"]
         ],
         inputs=[question_input, image_input],
     )
@@ -273,4 +298,3 @@ with gr.Blocks(css=custom_css, title="Multimodal & T2I Demo") as demo:
     gr.Markdown("<footer style='text-align:center; padding:20px 0;'>Join our community on <a href='https://discord.gg/openfreeai' target='_blank'>Discord</a></footer>")
 
 demo.launch(share=True)
-