	Update app.py
app.py CHANGED
@@ -1,8 +1,8 @@
+
 import gradio as gr
 import numpy as np
 import torch
 from datasets import load_dataset
-
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
@@ -12,14 +12,14 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
-
-
-model = SpeechT5ForTextToSpeech.from_pretrained(
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
+model_id = "ckandemir/speecht5_finetuned_voxpopuli_fr"  # update with your model id
+# pipe = pipeline("automatic-speech-recognition", model=model_id)
+model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[
+speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
 
+processor = SpeechT5Processor.from_pretrained(model_id)
 
 replacements = [
     ("á", "a"),
@@ -70,9 +70,8 @@ def speech_to_speech_translation(audio):
 
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in 
-[
-
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and [Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish](https://huggingface.co/Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish) checkpoint for text-to-speech, which is based on Microsoft's
+[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned in Spanish Audio dataset:
 
 """
 
@@ -99,3 +98,5 @@ with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
 
 demo.launch()
+
+
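The commit only touches the loading code; the body of speech_to_speech_translation and its helpers sit outside the changed hunks. A minimal sketch of how the processor, model, vocoder, and speaker embedding loaded above are typically wired together in a cascaded STST app follows. The helper names translate and synthesise, the Whisper generation arguments, and the int16 conversion for Gradio are assumptions and are not part of this commit.

import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_id = "ckandemir/speecht5_finetuned_voxpopuli_fr"  # checkpoint from the diff above

# Speech translation (source speech -> text) and text-to-speech components,
# loaded the same way as in the updated app.py.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
processor = SpeechT5Processor.from_pretrained(model_id)
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)


def translate(audio):
    # Whisper's "translate" task maps source speech in any language to English text;
    # other target languages would need a different checkpoint or post-processing.
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]


def synthesise(text):
    # Tokenise the text and generate a 16 kHz waveform with the fine-tuned SpeechT5
    # checkpoint, conditioning on the x-vector speaker embedding loaded above.
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    return speech.cpu()


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Gradio's Audio output expects (sample_rate, int16 numpy array).
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech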
