Spaces:
Configuration error
Configuration error
File size: 3,626 Bytes
2337a8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
from typing import Dict, Optional, Union
import numpy as np
from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic
def text_to_semantic(
text: str,
history_prompt: Optional[Union[Dict, str]] = None,
temp: float = 0.7,
silent: bool = False,
):
"""Generate semantic array from text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
Returns:
numpy semantic array to be fed into `semantic_to_waveform`
"""
x_semantic = generate_text_semantic(
text,
history_prompt=history_prompt,
temp=temp,
silent=silent,
use_kv_caching=True
)
return x_semantic
def semantic_to_waveform(
semantic_tokens: np.ndarray,
history_prompt: Optional[Union[Dict, str]] = None,
temp: float = 0.7,
silent: bool = False,
output_full: bool = False,
):
"""Generate audio array from semantic input.
Args:
semantic_tokens: semantic token output from `text_to_semantic`
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
output_full: return full generation to be used as a history prompt
Returns:
numpy audio array at sample frequency 24khz
"""
coarse_tokens = generate_coarse(
semantic_tokens,
history_prompt=history_prompt,
temp=temp,
silent=silent,
use_kv_caching=True
)
fine_tokens = generate_fine(
coarse_tokens,
history_prompt=history_prompt,
temp=0.5,
)
audio_arr = codec_decode(fine_tokens)
if output_full:
full_generation = {
"semantic_prompt": semantic_tokens,
"coarse_prompt": coarse_tokens,
"fine_prompt": fine_tokens,
}
return full_generation, audio_arr
return audio_arr
def save_as_prompt(filepath, full_generation):
assert(filepath.endswith(".npz"))
assert(isinstance(full_generation, dict))
assert("semantic_prompt" in full_generation)
assert("coarse_prompt" in full_generation)
assert("fine_prompt" in full_generation)
np.savez(filepath, **full_generation)
def generate_audio(
text: str,
history_prompt: Optional[Union[Dict, str]] = None,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
silent: bool = False,
output_full: bool = False,
):
"""Generate audio array from input text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
output_full: return full generation to be used as a history prompt
Returns:
numpy audio array at sample frequency 24khz
"""
semantic_tokens = text_to_semantic(
text,
history_prompt=history_prompt,
temp=text_temp,
silent=silent,
)
out = semantic_to_waveform(
semantic_tokens,
history_prompt=history_prompt,
temp=waveform_temp,
silent=silent,
output_full=output_full,
)
if output_full:
full_generation, audio_arr = out
return full_generation, audio_arr
else:
audio_arr = out
return audio_arr
|