Spaces:

Himanshu1804
/

Voxify

Runtime error

App Files Community

himanshu1844 committed on Apr 4

Commit

1d4e95e

1 Parent(s): c221508

add model

Browse files

Files changed (3) hide show

app.py +3 -1
model.py +3 -165
setup.py +7 -0

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
-from Voxify import VoxifyInfereence
 import torchaudio
 voxify=VoxifyInfereence(name="declare-lab/TangoFlux")
 def gradio_generate(prompt, steps, guidance,duration=10):
@@ -14,6 +14,8 @@ def gradio_generate(prompt, steps, guidance,duration=10):
     return filename
 description_text = """
 * Powered by **Stability AI**
 Generate high quality and faithful audio in just a few seconds using <b>VOXIFY</b> by providing a text prompt. <b>VOXIFY</b> was trained from scratch and underwent alignment to follow human instructions using a new method called <b>CLAP-Ranked Preference Optimization (CRPO)</b>.

 import gradio as gr
 import torchaudio
+from Voxify import VoxifyInfereence
 voxify=VoxifyInfereence(name="declare-lab/TangoFlux")
 def gradio_generate(prompt, steps, guidance,duration=10):
     return filename
 description_text = """
 * Powered by **Stability AI**
 Generate high quality and faithful audio in just a few seconds using <b>VOXIFY</b> by providing a text prompt. <b>VOXIFY</b> was trained from scratch and underwent alignment to follow human instructions using a new method called <b>CLAP-Ranked Preference Optimization (CRPO)</b>.

model.py CHANGED Viewed

@@ -1,25 +1,17 @@
-from transformers  import T5EncoderModel,T5TokenizerFast
 import torch
 from diffusers import   FluxTransformer2DModel
 from torch import nn
 from typing import List
 from diffusers import FlowMatchEulerDiscreteScheduler
-from diffusers.training_utils import compute_density_for_timestep_sampling
 import copy
 import torch.nn.functional as F
 import numpy as np
 from tqdm import tqdm
-from typing import Optional,Union,List
-from datasets import load_dataset, Audio
 from math import pi
 import inspect
-import yaml
-import random
 class StableAudioPositionalEmbedding(nn.Module):
     """Used for continuous time
     Adapted from stable audio open.
@@ -38,7 +30,6 @@ class StableAudioPositionalEmbedding(nn.Module):
         fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
         fouriered = torch.cat((times, fouriered), dim=-1)
         return fouriered
 class DurationEmbedder(nn.Module):
     """
     A simple linear projection model to map numbers to a latent space.
@@ -350,157 +341,4 @@ class Voxify(nn.Module):
             latents = scheduler.step(noise_pred, t, latents).prev_sample
-        return latents
-    def forward(self,
-                latents,
-                prompt,
-                duration=torch.tensor([10]),
-                sft=True
-                ):
-        device = latents.device
-        audio_seq_length = self.audio_seq_len
-        bsz = latents.shape[0]
-        encoder_hidden_states, boolean_encoder_mask = self.encode_text(prompt)
-        duration_hidden_states = self.encode_duration(duration)
-        mask_expanded = boolean_encoder_mask.unsqueeze(-1).expand_as(encoder_hidden_states)
-        masked_data = torch.where(mask_expanded, encoder_hidden_states, torch.tensor(float('nan')))
-        pooled = torch.nanmean(masked_data, dim=1)
-        pooled_projection = self.fc(pooled)
-        ## Add duration hidden states to encoder hidden states
-        encoder_hidden_states = torch.cat([encoder_hidden_states,duration_hidden_states],dim=1) ## (bs,seq_len,dim)
-        txt_ids = torch.zeros(bsz,encoder_hidden_states.shape[1],3).to(device)
-        audio_ids = torch.arange(audio_seq_length).unsqueeze(0).unsqueeze(-1).repeat(bsz,1,3).to(device)
-        if sft:
-            if self.uncondition:
-                mask_indices = [k for k in range(len(prompt)) if random.random() < 0.1]
-                if len(mask_indices) > 0:
-                    encoder_hidden_states[mask_indices] = 0
-            noise = torch.randn_like(latents)
-            u = compute_density_for_timestep_sampling(
-                    weighting_scheme='logit_normal',
-                    batch_size=bsz,
-                    logit_mean=0,
-                    logit_std=1,
-                    mode_scale=None,
-                )
-            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
-            timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)
-            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
-            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise
-            model_pred =  self.transformer(
-                                    hidden_states=noisy_model_input,
-                                    encoder_hidden_states=encoder_hidden_states,
-                                    pooled_projections=pooled_projection,
-                                    img_ids=audio_ids,
-                                    txt_ids=txt_ids,
-                                    guidance=None,
-                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
-                                    timestep=timesteps/1000,
-                                    return_dict=False)[0]
-            target = noise - latents
-            loss = torch.mean(
-                        ( (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
-                        1,
-                    )
-            loss = loss.mean()
-            raw_model_loss, raw_ref_loss,implicit_acc,epsilon_diff = 0,0,0,0 ## default this to 0 if doing sft
-        else:
-            encoder_hidden_states = encoder_hidden_states.repeat(2, 1, 1)
-            pooled_projection = pooled_projection.repeat(2,1)
-            noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1) ## Have to sample same noise for preferred and rejected
-            u = compute_density_for_timestep_sampling(
-                    weighting_scheme='logit_normal',
-                    batch_size=bsz//2,
-                    logit_mean=0,
-                    logit_std=1,
-                    mode_scale=None,
-                )
-            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
-            timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)
-            timesteps = timesteps.repeat(2)
-            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
-            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise
-            model_pred =  self.transformer(
-                                    hidden_states=noisy_model_input,
-                                    encoder_hidden_states=encoder_hidden_states,
-                                    pooled_projections=pooled_projection,
-                                    img_ids=audio_ids,
-                                    txt_ids=txt_ids,
-                                    guidance=None,
-                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
-                                    timestep=timesteps/1000,
-                                    return_dict=False)[0]
-            target = noise - latents
-            model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
-            model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
-            model_losses_w, model_losses_l = model_losses.chunk(2)
-            model_diff = model_losses_w - model_losses_l
-            raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())
-            with torch.no_grad():
-                ref_preds = self.ref_transformer(
-                                    hidden_states=noisy_model_input,
-                                    encoder_hidden_states=encoder_hidden_states,
-                                    pooled_projections=pooled_projection,
-                                    img_ids=audio_ids,
-                                    txt_ids=txt_ids,
-                                    guidance=None,
-                                    timestep=timesteps/1000,
-                                    return_dict=False)[0]
-                ref_loss = F.mse_loss(ref_preds.float(), target.float(), reduction="none")
-                ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape))))
-                ref_losses_w, ref_losses_l = ref_loss.chunk(2)
-                ref_diff = ref_losses_w - ref_losses_l
-                raw_ref_loss = ref_loss.mean()
-            epsilon_diff = torch.max(torch.zeros_like(model_losses_w),
-                                      ref_losses_w-model_losses_w).mean()
-            scale_term = -0.5 * self.beta_dpo
-            inside_term = scale_term * (model_diff - ref_diff)
-            implicit_acc = (scale_term * (model_diff - ref_diff)  > 0).sum().float() / inside_term.size(0)
-            loss = -1 * F.logsigmoid(inside_term).mean()  + model_losses_w.mean()
-        return loss, raw_model_loss, raw_ref_loss, implicit_acc,epsilon_diff

 import torch
+from transformers  import T5EncoderModel,T5TokenizerFast
 from diffusers import   FluxTransformer2DModel
 from torch import nn
 from typing import List
 from diffusers import FlowMatchEulerDiscreteScheduler
 import copy
 import torch.nn.functional as F
 import numpy as np
 from tqdm import tqdm
 from math import pi
 import inspect
+from typing import Optional,Union,List
 class StableAudioPositionalEmbedding(nn.Module):
     """Used for continuous time
     Adapted from stable audio open.
         fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
         fouriered = torch.cat((times, fouriered), dim=-1)
         return fouriered
 class DurationEmbedder(nn.Module):
     """
     A simple linear projection model to map numbers to a latent space.
             latents = scheduler.step(noise_pred, t, latents).prev_sample
+        return latents

setup.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import os
+requirement_path = "requirements.txt"
+install_requires = []
+if os.path.isfile(requirement_path):
+    with open(requirement_path) as f:
+        install_requires = f.read().splitlines()
+setup(name="mypackage", install_requires=install_requires, [...])