import gradio as gr
import os
from typing import Any, Callable, Iterator, Optional, Tuple, Union

import noisereduce as nr
import numpy as np
import spaces
import torch
import torch.nn as nn
from transformers import AutoTokenizer, VitsModel

# Access token for the model repositories (read from the Space secret "key_").
token = os.environ.get("key_")
# tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)

# Cache of loaded VITS models, keyed by repository id.
models = {}
def remove_noise_nr(audio_data, sr=16000):
    """Apply spectral-gating noise reduction to a waveform."""
    reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
    return reduced_noise
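# Usage sketch (hypothetical, not wired into the app): the waveform returned by
# modelspeech() below could be denoised before playback, e.g.
#   sr, wav = modelspeech(TXT, "wasmdashai/vits-ar-sa-huba-v2", 0.8)
#   clean = remove_noise_nr(wav, sr=sr)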
def _inference_forward_stream(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        padding_mask: Optional[torch.Tensor] = None,
        chunk_size: int = 32,  # Chunk size for streaming output
        is_streaming: bool = True,
    ) -> Iterator[torch.Tensor]:
        """Generates speech waveforms in a streaming fashion."""
        if attention_mask is not None:
            padding_mask = attention_mask.unsqueeze(-1).float()
        else:
            padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()
        text_encoder_output = self.text_encoder(
            input_ids=input_ids,
            padding_mask=padding_mask,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
        hidden_states = hidden_states.transpose(1, 2)
        input_padding_mask = padding_mask.transpose(1, 2)
        prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
        prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances
        if self.config.use_stochastic_duration_prediction:
            log_duration = self.duration_predictor(
                hidden_states,
                input_padding_mask,
                speaker_embeddings,
                reverse=True,
                noise_scale=self.noise_scale_duration,
            )
        else:
            log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
        length_scale = 1.0 / self.speaking_rate
        duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
        # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
        # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
        batch_size, _, output_length, input_length = attn_mask.shape
        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
        valid_indices = indices.unsqueeze(0) < cum_duration
        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
        # Expand prior distribution
        prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
        prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
        prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
        latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
        spectrogram = latents * output_padding_mask
        if is_streaming:
            for i in range(0, spectrogram.size(-1), chunk_size):
                with torch.no_grad():
                    wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
                yield wav.squeeze().cpu().numpy()
        else:
            wav = self.decoder(spectrogram, speaker_embeddings)
            yield wav.squeeze().cpu().numpy()
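# Consumption sketch (hypothetical): the generator above can also drive chunked
# playback, assuming `model` and `inputs` are prepared as in modelspeech() below:
#   for chunk in _inference_forward_stream(
#       model,
#       input_ids=inputs.input_ids.cuda(),
#       attention_mask=inputs.attention_mask.cuda(),
#       speaker_embeddings=None,
#       chunk_size=32,
#       is_streaming=True,
#   ):
#       ...  # each `chunk` is a 1-D numpy waveform segment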
@spaces.GPU
def get_model(name_model):
    """Load (and cache) a VITS model together with the matching tokenizer."""
    global models

    # The English model ships its own tokenizer; the Arabic models share "wasmdashai/vtk".
    if name_model == 'wasmdashai/vits-en-v1':
        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1", token=token)
    else:
        tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk", token=token)

    if name_model in models:
        return models[name_model], tokenizer

    models[name_model] = VitsModel.from_pretrained(name_model, token=token).cuda()

    # Apply weight normalisation to the decoder and flow convolutions before inference.
    models[name_model].decoder.apply_weight_norm()
    # torch.nn.utils.weight_norm(self.decoder.conv_pre)
    # torch.nn.utils.weight_norm(self.decoder.conv_post)
    for flow in models[name_model].flow.flows:
        torch.nn.utils.weight_norm(flow.conv_pre)
        torch.nn.utils.weight_norm(flow.conv_post)

    return models[name_model], tokenizer
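# Usage sketch (hypothetical): get_model() can also be called outside the Gradio
# handlers; repeated calls for the same repository reuse the cached model.
#   model, tokenizer = get_model("wasmdashai/vits-ar-sa-huba-v2")
#   batch = tokenizer("مرحبا", return_tensors="pt")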
# ZeroGPU sanity check: outside a @spaces.GPU function this tensor still reports 'cpu'.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔

# Default Arabic demo text (a warm greeting: "Peace be upon you... welcome, how are you? ...").
TXT = """السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس"""
@spaces.GPU
def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.8):
    """Synthesise `text` with the selected VITS model and return (sampling_rate, waveform)."""
    model, tokenizer = get_model(name_model)

    inputs = tokenizer(text, return_tensors="pt")

    # The UI slider (0.1-1.0) sets the speaking rate; lower values slow the speech down.
    model.speaking_rate = speaking_rate
    with torch.no_grad():
        wav = list(
            _inference_forward_stream(
                model,
                input_ids=inputs.input_ids.cuda(),
                attention_mask=inputs.attention_mask.cuda(),
                speaker_embeddings=None,
                is_streaming=False,
            )
        )[0]
    # with torch.no_grad():
    #     wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)  # .detach()

    return (model.config.sampling_rate, wav)
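# Usage sketch (hypothetical): calling the handler directly and writing the result
# to disk, assuming scipy is available in the environment:
#   from scipy.io import wavfile
#   sr, wav = modelspeech("السلام عليكم", "wasmdashai/vits-ar-sa-huba-v2", 0.8)
#   wavfile.write("output.wav", sr, wav)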
model_choices = gr.Dropdown(
    choices=[
        "wasmdashai/vits-ar-sa-huba-v1",
        "wasmdashai/vits-ar-sa-huba-v2",
        "wasmdashai/vits-ar-sa-A",
        "wasmdashai/vits-ar-ye-sa",
        "wasmdashai/vits-ar-sa-M-v1",
        "wasmdashai/vits-en-v1",
    ],
    label="اختر النموذج",  # "Choose the model"
    value="wasmdashai/vits-ar-sa-huba-v2",
)
demo = gr.Interface(
    fn=modelspeech,
    inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
    outputs=["audio"],
)
demo.queue()
demo.launch()
