Enhance README with image loading function and update model usage; add inference mode to generate methods

Files changed (2)

README.md (changed)

@@ -56,25 +56,29 @@ library_name: t5
 Below is a minimal usage example in Python. You can load the model with `AutoModel.from_pretrained(...)` and simply call `.generate(...)` or `.generate_batch(...)` to create images.
 ```python
-import torch
 from PIL import Image
 from transformers import AutoModel
 from torchvision.transforms import functional as F
 # 1. Load the model
-model = AutoModel.from_pretrained("blowing-up-groundhogs/emuru")
 model.cuda()  # Move the model to the GPU (requires a CUDA-capable device)
 # 2. Prepare your inputs
-style_text = "A beautiful watercolor style"
-gen_text = "A majestic mountain with a rainbow"
-style_img = Image.open("my_style_image.png").convert("RGB")
-# Convert the style image to a suitable tensor
-style_img = F.to_tensor(style_img)
-style_img = F.resize((style_img.width * 64 // style_img.height, 64))    # Example resize
-style_img = F.normalize(style_img, [0.5], [0.5])  # Normalize to [-1, 1]
-style_img = style_img.unsqueeze(0).cuda()
 # 3. Generate an image
 generated_pil_image = model.generate(
@@ -84,7 +88,7 @@ generated_pil_image = model.generate(
     max_new_tokens=64
 )
-# 4. Save or display the result
 generated_pil_image.save("generated_image.png")
 ```

 Below is a minimal usage example in Python. You can load the model with `AutoModel.from_pretrained(...)` and simply call `.generate(...)` or `.generate_batch(...)` to create images.
 ```python
 from PIL import Image
 from transformers import AutoModel
+from huggingface_hub import hf_hub_download
 from torchvision.transforms import functional as F
+def load_image(img_path):
+    img = Image.open(img_path).convert("RGB")
+    # Resize to a fixed height of 64 pixels, scaling the width to preserve the aspect ratio
+    img = img.resize((img.width * 64 // img.height, 64))
+    img = F.to_tensor(img)
+    img = F.normalize(img, [0.5], [0.5])
+    return img.unsqueeze(0)
 # 1. Load the model
+model = AutoModel.from_pretrained("blowing-up-groundhogs/emuru", trust_remote_code=True)
 model.cuda()  # Move the model to the GPU (requires a CUDA-capable device)
 # 2. Prepare your inputs
+style_text = 'THE JOLLY IS "U"'
+gen_text = 'EMURU'
+img_path = hf_hub_download(repo_id="blowing-up-groundhogs/emuru", filename="sample.png")
+style_img = load_image(img_path)
+style_img = style_img.cuda()
 # 3. Generate an image
 generated_pil_image = model.generate(
     max_new_tokens=64
 )
+# 4. Save the result
 generated_pil_image.save("generated_image.png")
 ```

modeling_emuru.py (changed)

@@ -108,6 +108,7 @@ class Emuru(PreTrainedModel):
         mse_loss = self.mse_criterion(vae_latent, z_sequence)
         return mse_loss, pred_latent, z
     def generate(
         self,
         style_text: str,
@@ -139,6 +140,7 @@ class Emuru(PreTrainedModel):
         imgs = (imgs + 1) / 2
         return F.to_pil_image(imgs[0, ..., style_img.size(-1):img_ends.item()].detach().cpu())
     def generate_batch(
         self,
         style_texts: List[str],

         mse_loss = self.mse_criterion(vae_latent, z_sequence)
         return mse_loss, pred_latent, z
+    @torch.inference_mode()
     def generate(
         self,
         style_text: str,
         imgs = (imgs + 1) / 2
         return F.to_pil_image(imgs[0, ..., style_img.size(-1):img_ends.item()].detach().cpu())
+    @torch.inference_mode()
     def generate_batch(
         self,
         style_texts: List[str],