Update pipeline.py
pipeline.py  CHANGED  (+29 -19)
@@ -660,14 +660,14 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
 
     def __init__(
         self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
+        image_size=512,                   # Force 512 resolution
+        in_channels=4,
+        model_channels=320,
+        out_channels=4,
+        num_res_blocks=[2, 2, 2, 2],
+        attention_resolutions=[8, 4, 2],  # Adjusted for 512x512
+        channel_mult=[1, 2, 4, 8],
         dropout=0,
-        channel_mult=(1, 2, 4, 8),
         conv_resample=True,
         dims=2,
         num_classes=None,
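A quick sanity check on the new defaults: in this UNet family the width at each resolution level is model_channels * channel_mult[level], so the commit's 320 base channels and [1, 2, 4, 8] multiplier imply the widths below. This is a standalone sketch, not code from the repository.

model_channels = 320
channel_mult = [1, 2, 4, 8]
widths = [model_channels * m for m in channel_mult]
print(widths)  # [320, 640, 1280, 2560] channels at the four resolution levels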
@@ -688,7 +688,15 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
     ):
         super().__init__()
         assert context_dim is not None
-
+
+        # Add resolution validation
+        assert image_size in [256, 512], "Only 256/512 resolutions supported"
+        super().__init__()
+
+        # Modify attention resolutions for 512
+        if image_size == 512:
+            attention_resolutions = [16, 8, 4]
+
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
 
@@ -730,7 +738,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                 f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                 f"attention will still not be set."
             )
-
+
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
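For context on the attention_resolutions override: in LDM-style UNets such as this one, each entry is a downsample factor of the input grid, and a block gets self-attention when its current factor ds is in attention_resolutions. The sketch below assumes the model runs on an 8x-downsampled VAE latent (64x64 for a 512x512 image); the latent size and level count are assumptions for illustration, not taken from this commit.

latent_size = 512 // 8               # assumed: VAE latent for a 512x512 image
channel_mult = [1, 2, 4, 8]          # one entry per resolution level
attention_resolutions = [8, 4, 2]    # the new default from this commit

ds = 1
for level in range(len(channel_mult)):
    size = latent_size // ds
    print(f"level {level}: {size}x{size}, ds={ds}, attention={ds in attention_resolutions}")
    if level < len(channel_mult) - 1:
        ds *= 2                      # downsampled once between levels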
@@ -1418,20 +1426,18 @@ class MVDreamPipeline(DiffusionPipeline):
         return torch.zeros_like(image_embeds), image_embeds
 
     def encode_image_latents(self, image, device, num_images_per_prompt):
-
         dtype = next(self.image_encoder.parameters()).dtype
-
-        image = (
-            torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2).to(device=device)
-        )  # [1, 3, H, W]
+
+        # Change interpolation size to match target resolution
+        image = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2).to(device=device)
         image = 2 * image - 1
-        image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
+        image = F.interpolate(image, (512, 512), mode='bilinear', align_corners=False)  # Changed from 256
         image = image.to(dtype=dtype)
 
         posterior = self.vae.encode(image).latent_dist
-        latents = posterior.sample() * self.vae.config.scaling_factor
+        latents = posterior.sample() * self.vae.config.scaling_factor
         latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
-
+
         return torch.zeros_like(latents), latents
 
     @torch.no_grad()
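To see what the new interpolation size means for tensor shapes, here is a standalone walk-through of encode_image_latents. The VAE call is replaced by the standard 8x downsample factor of SD-style VAEs, which is an assumption rather than code from this file.

import numpy as np
import torch
import torch.nn.functional as F

image = np.random.rand(512, 512, 3).astype(np.float32)        # HWC image in [0, 1]
x = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2)   # [1, 3, 512, 512]
x = 2 * x - 1                                                  # VAE expects inputs in [-1, 1]
x = F.interpolate(x, (512, 512), mode='bilinear', align_corners=False)
assert x.shape == (1, 3, 512, 512)

latent_hw = 512 // 8   # assumed 8x VAE downsample -> latents of shape [1, 4, 64, 64]
print(f"expected latent shape: [1, 4, {latent_hw}, {latent_hw}]")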
@@ -1439,8 +1445,8 @@ class MVDreamPipeline(DiffusionPipeline):
         self,
         prompt: str = "",
         image: Optional[np.ndarray] = None,
-        height: int = 256,
-        width: int = 256,
+        height: int = 512,
+        width: int = 512,
         elevation: float = 0,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.0,
@@ -1454,6 +1460,10 @@ class MVDreamPipeline(DiffusionPipeline):
         num_frames: int = 4,
         device=torch.device("cuda:0"),
     ):
+        # Add resolution validation
+        if height != 512 or width != 512:
+            raise ValueError("Current implementation requires 512x512 resolution")
+
         self.unet = self.unet.to(device=device)
         self.vae = self.vae.to(device=device)
         self.text_encoder = self.text_encoder.to(device=device)
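Finally, a usage sketch for the modified __call__ signature. The repository id and the trust_remote_code loading path are illustrative assumptions, not taken from this commit; only the height/width/num_frames/device arguments mirror the signature in this diff.

import torch
from diffusers import DiffusionPipeline

# Hypothetical checkpoint id; substitute the repo this pipeline.py ships with.
pipe = DiffusionPipeline.from_pretrained(
    "your-org/mvdream-512-diffusers",
    torch_dtype=torch.float16,
    trust_remote_code=True,          # loads the custom MVDreamPipeline from pipeline.py
)

images = pipe(
    prompt="a photo of an astronaut riding a horse",
    height=512,                      # anything other than 512x512 now raises ValueError
    width=512,
    num_frames=4,
    num_inference_steps=50,
    guidance_scale=7.0,
    device=torch.device("cuda:0"),
)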