Spaces:
Runtime error
Runtime error
File size: 3,222 Bytes
ae71d4b 27a58ec ae71d4b 6e9b076 ae71d4b 6e9b076 ae71d4b 9483da5 ae71d4b b68b40e 9483da5 9d22775 b68b40e ae71d4b 32fb746 ae71d4b 9483da5 a4591c0 27fd4f5 bbb7e65 9483da5 2142965 ae71d4b bbb7e65 ae71d4b a1ec3a4 ae71d4b 6a34194 ae71d4b 9483da5 27fd4f5 caaf71e 867b7ff ae71d4b 2142965 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import os

import gradio as gr
import librosa
import numpy as np
import requests
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Lookup table from a short speaker code to a speaker-embedding file path
# (relative to the working directory).
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

# The text processor and the acoustic model are loaded from the same
# checkpoint id; the vocoder comes from its own checkpoint.
checkpoint = "microsoft/speecht5_tts"

processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
def getNews(search_key):
    """Query NewsAPI's ``/v2/everything`` endpoint and return the raw body.

    Args:
        search_key: Free-text search phrase, sent as the ``q`` parameter.

    Returns:
        The response body decoded as UTF-8 text. The body is returned
        unparsed and the HTTP status is not checked, so an error response
        from the API is returned as text as well.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the request exceeds the timeout.
    """
    # NOTE(review): the fallback key is committed to the repository and
    # should be rotated. Set NEWSAPI_KEY in the environment to override it.
    api_key = os.environ.get("NEWSAPI_KEY", "3bca07c913ec4703a23f6ba03e15b30b")

    # Let requests percent-encode the query. Concatenating it into the URL
    # meant a search phrase containing "&", "#" or "=" corrupted the query
    # string (a "#" even cut off the apiKey parameter).
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={"q": search_key, "pagesize": 3, "apiKey": api_key},
        # Without a timeout a stalled connection blocks the caller forever.
        timeout=10,
    )
    return response.content.decode("utf-8")
def getHeadlines():
    """Query NewsAPI's ``/v2/top-headlines`` endpoint for ``country=us``.

    Returns:
        The response body decoded as UTF-8 text. The body is returned
        unparsed and the HTTP status is not checked, so an error response
        from the API is returned as text as well.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the request exceeds the timeout.
    """
    # NOTE(review): the fallback key is committed to the repository and
    # should be rotated. Set NEWSAPI_KEY in the environment to override it.
    api_key = os.environ.get("NEWSAPI_KEY", "3bca07c913ec4703a23f6ba03e15b30b")

    response = requests.get(
        "https://newsapi.org/v2/top-headlines",
        params={"country": "us", "apiKey": api_key},
        # Without a timeout a stalled connection blocks the caller forever.
        timeout=10,
    )
    return response.content.decode("utf-8")
def predict(text, preset):
    """Look up news for *text* and synthesize the result as speech.

    Args:
        text: Search phrase from the "Input Text" box; forwarded to
            ``getNews``.
        preset: Value of the "Preset" radio button. Accepted for interface
            compatibility but not used by this function.

    Returns:
        A ``(sample_rate, samples)`` tuple. The sample rate is always 16000
        and ``samples`` is a NumPy ``int16`` array. ``None``, empty, or
        whitespace-only text yields a zero-length array without contacting
        the news API or running the model.
    """
    # Guard against None as well as blank input so a missing textbox value
    # does not raise AttributeError on ``.strip()``.
    if text is None or not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    # NOTE(review): getNews returns the raw response body, so that whole
    # string is what gets tokenized and spoken -- confirm this is intended.
    inputs = processor(text=getNews(text), return_tensors="pt")

    # Truncate to the maximum number of text positions the model supports.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # NOTE(review): allow_pickle=True permits arbitrary object loading; it
    # is kept because the on-disk format of this file is not visible here.
    speaker_embedding = np.load(
        'spkemb/cmu_us_awb_arctic-wav-arctic_a0002.npy', allow_pickle=True
    )
    # Add a leading batch dimension before handing it to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clip before scaling: any sample outside [-1, 1] would overflow the
    # int16 range on conversion and produce loud artifacts.
    pcm = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
    return (16000, pcm)
# Page text handed to gr.Interface further down as its `title`,
# `description` and `article` arguments.

# Plain-text page title.
title = "SpeechT5: Speech Synthesis"
# Short introduction to the model; contains inline HTML markup.
description = """
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
"""
# HTML block with reference links (paper, original repository and weights)
# and a credit for the speaker-embedding source.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>
</div>
"""
# Example rows for the interface: one value per input component, in order
# (input text, preset). The preset must be one of the "Preset" radio
# choices; the previous value "BDL (male)" was not among them, so the
# example row carried an invalid selection.
examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "KPop"],
]
# Input widgets, in the order `predict` receives them: (text, preset).
text_input = gr.Text(label="Input Text")
preset_input = gr.Radio(
    label="Preset",
    choices=["US", "International", "Technology", "KPop", "Surprise Me!"],
    value="KPop",
)

# `predict` returns (sample_rate, int16 samples), hence type="numpy".
audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input, preset_input],
    outputs=[audio_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start the web app; share=True asks Gradio for a public share link.
demo.launch(share=True)
|