Spaces:
Runtime error
Runtime error
File size: 4,589 Bytes
ae71d4b 27a58ec ae71d4b 6e9b076 ae71d4b 6e9b076 ae71d4b caaf71e ae71d4b b68b40e 9d22775 b68b40e ae71d4b bc0b828 7e14a8d b68b40e 7e14a8d ae71d4b bbb7e65 caaf71e ae71d4b bbb7e65 ae71d4b a1ec3a4 ae71d4b 6a34194 ae71d4b caaf71e ae71d4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import gradio as gr
import librosa
import numpy as np
import torch
import requests
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Model identifier shared by the text processor and the text-to-speech model.
checkpoint = "microsoft/speecht5_tts"

# All three components are instantiated once, at module import, and reused
# by every call to predict().
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Three-letter speaker code -> path of the ``.npy`` file holding that
# speaker's embedding vector. The paths are relative, so they resolve
# against the process's working directory.
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}
def getNews(search_key):
    """Search NewsAPI's ``/v2/everything`` endpoint and return the raw body.

    Args:
        search_key: Free-text query, sent as the ``q`` parameter.

    Returns:
        The response body decoded as UTF-8. The body is returned as-is
        (no JSON parsing and no HTTP status check), so error responses are
        passed back to the caller as text as well.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    # NOTE(security): the API key is committed in source. It should be
    # rotated and supplied through configuration instead.
    response = requests.get(
        "https://newsapi.org/v2/everything",
        # Let requests URL-encode the values: concatenating the user's text
        # into the URL let characters such as "&", "#" or "=" corrupt the
        # query or inject extra parameters.
        params={
            "q": search_key,
            "apiKey": "3bca07c913ec4703a23f6ba03e15b30b",
        },
        # Without a timeout a stalled connection blocks the caller forever.
        timeout=10,
    )
    return response.content.decode("utf-8")
def getHeadlines():
    """Fetch NewsAPI's US top headlines and return the raw response body.

    Returns:
        The response body decoded as UTF-8. The body is returned as-is
        (no JSON parsing and no HTTP status check).

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    # NOTE(security): the API key is committed in source. It should be
    # rotated and supplied through configuration instead.
    response = requests.get(
        "https://newsapi.org/v2/top-headlines",
        params={
            "country": "us",
            "apiKey": "3bca07c913ec4703a23f6ba03e15b30b",
        },
        # Without a timeout a stalled connection blocks the caller forever.
        timeout=10,
    )
    return response.content.decode("utf-8")
def _random_speaker_embedding():
    """Return a scrambled copy of one randomly chosen bundled embedding."""
    # load one of the provided speaker embeddings at random
    names = list(speaker_embeddings)
    chosen = names[np.random.randint(len(names))]
    embedding = np.load(speaker_embeddings[chosen])
    # randomly shuffle the elements
    np.random.shuffle(embedding)
    # randomly flip the sign of about half the values; the mask is sized
    # from the loaded array rather than a hard-coded length
    signs = np.where(np.random.rand(*embedding.shape) >= 0.5, 1.0, -1.0)
    embedding *= signs
    return embedding


def predict(text, speaker):
    """Synthesize speech for the news search results matching ``text``.

    ``text`` is used as a search query for ``getNews``; the raw response
    body it returns (not parsed headlines) is what gets tokenized and
    spoken.

    Args:
        text: Query string from the UI. ``None``, empty, or whitespace-only
            input yields an empty clip without any network or model call.
        speaker: Either ``"Surprise Me!"`` for a randomized voice, or a
            label whose first three characters are a key of
            ``speaker_embeddings`` (e.g. ``"BDL (male)"``).

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a 1-D ``int16``
        NumPy array, as expected by the Gradio audio output.

    Raises:
        KeyError: If ``speaker[:3]`` is not a known speaker code.
    """
    if not text or not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=getNews(text), return_tensors="pt")

    # limit input length
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        speaker_embedding = _random_speaker_embedding()
    else:
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    # add a leading batch dimension before handing the vector to the model
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
    # around when cast to int16 and produce a loud click.
    waveform = np.clip(speech.numpy(), -1.0, 1.0)
    return (16000, (waveform * 32767).astype(np.int16))
# Heading shown at the top of the Gradio page.
title = "SpeechT5: Speech Synthesis"

# Introductory HTML rendered under the title.
description = """
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
"""

# Footer HTML with references and credits.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
</div>
"""

# Clickable sample inputs: each row is [input text, speaker label], in the
# same order as the interface's two input components.
examples = [
    [
        "It is not in the stars to hold our destiny but in ourselves.",
        "BDL (male)",
    ],
    [
        "The octopus and Oliver went to the opera in October.",
        "CLB (female)",
    ],
    [
        "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
        "RMS (male)",
    ],
    [
        "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
        "SLT (female)",
    ],
    [
        "A synonym for cinnamon is a cinnamon synonym.",
        "BDL (male)",
    ],
    [
        "How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.",
        "CLB (female)",
    ],
]
# Labels offered in the speaker selector. predict() treats "Surprise Me!"
# specially and otherwise uses the first three characters as the speaker code.
speaker_choices = [
    "BDL (male)",
    "CLB (female)",
    "KSP (male)",
    "RMS (male)",
    "SLT (female)",
    "Surprise Me!",
]

# Assemble the UI: a text box and a speaker selector feed predict(), whose
# (rate, samples) result is played back by the audio component.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(label="Speaker", choices=speaker_choices, value="BDL (male)"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()
|