speech-to-speech-translation

Runtime error

leofltt committed on Mar 22, 2024

Commit

9ff0018

verified ·

1 Parent(s): 3999b52

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,8 +8,6 @@ from transformers import BarkModel, BarkProcessor
 from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-SAMPLE_RATE = 16000
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
@@ -20,7 +18,7 @@ bark_processor = BarkProcessor.from_pretrained("suno/bark")
 def translate(audio):
-    inputs = asr_processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
     generated_ids = asr_model.generate(inputs["input_features"],attention_mask=inputs["attention_mask"],
     forced_bos_token_id=asr_processor.tokenizer.lang_code_to_id["it"],)
     translation = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)
@@ -37,7 +35,7 @@ def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return SAMPLE_RATE, synthesised_speech
 title = "Cascaded STST"

 from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 asr_model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 def translate(audio):
+    inputs = asr_processor(audio, sampling_rate=16000, return_tensors="pt")
     generated_ids = asr_model.generate(inputs["input_features"],attention_mask=inputs["attention_mask"],
     forced_bos_token_id=asr_processor.tokenizer.lang_code_to_id["it"],)
     translation = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+    return 16000, synthesised_speech
 title = "Cascaded STST"