Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update app.py: add Catalan to Whisper and change the text-to-speech model
Browse files
    	
        app.py
    CHANGED
    
    | @@ -2,7 +2,6 @@ import gradio as gr | |
| 2 | 
             
            import numpy as np
         | 
| 3 | 
             
            import torch
         | 
| 4 | 
             
            from datasets import load_dataset
         | 
| 5 | 
            -
             | 
| 6 | 
             
            from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
         | 
| 7 |  | 
| 8 |  | 
| @@ -14,15 +13,21 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", | |
| 14 | 
             
            # load text-to-speech checkpoint and speaker embeddings
         | 
| 15 | 
             
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
         | 
| 16 |  | 
| 17 | 
            -
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
         | 
|  | |
|  | |
|  | |
| 18 | 
             
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
         | 
| 19 | 
            -
             | 
|  | |
|  | |
|  | |
| 20 | 
             
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
         | 
| 21 | 
             
            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
         | 
| 22 |  | 
| 23 |  | 
| 24 | 
             
            def translate(audio):
         | 
| 25 | 
            -
                outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
         | 
| 26 | 
             
                return outputs["text"]
         | 
| 27 |  | 
| 28 |  | 
|  | |
| 2 | 
             
            import numpy as np
         | 
| 3 | 
             
            import torch
         | 
| 4 | 
             
            from datasets import load_dataset
         | 
|  | |
| 5 | 
             
            from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
         | 
| 6 |  | 
| 7 |  | 
|  | |
| 13 | 
             
            # load text-to-speech checkpoint and speaker embeddings
         | 
| 14 | 
             
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
         | 
| 15 |  | 
| 16 | 
            +
            # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
         | 
| 17 | 
            +
            model = SpeechT5ForTextToSpeech.from_pretrained(
         | 
| 18 | 
            +
                "JanLilan/speecht5_finetuned_openslr-slr69-cat"
         | 
| 19 | 
            +
            ).to(device)
         | 
| 20 | 
             
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
         | 
| 21 | 
            +
            # We will try to translate with this voice embedding and see what happens. Otherwise:
         | 
| 22 | 
            +
            # dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
         | 
| 23 | 
            +
            # dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
         | 
| 24 | 
            +
            # etc.
         | 
| 25 | 
             
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
         | 
| 26 | 
             
            speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
         | 
| 27 |  | 
| 28 |  | 
| 29 | 
             
            def translate(audio):
         | 
| 30 | 
            +
                outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "cat"})
         | 
| 31 | 
             
                return outputs["text"]
         | 
| 32 |  | 
| 33 |  |