Spaces:
Runtime error
Runtime error
| from typing import Dict, Optional, Union | |
| import numpy as np | |
| from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic | |
def generate_with_settings(text_prompt, semantic_temp=0.6, eos_p=0.2, coarse_temp=0.7, fine_temp=0.5, voice_name=None, output_full=False):
    """Generate audio from text with a separate temperature for each stage.

    Runs the semantic, coarse and fine generators in sequence and decodes
    the fine tokens with `codec_decode`.

    Args:
        text_prompt: text handed to `generate_text_semantic`
        semantic_temp: temperature for the semantic stage
        eos_p: forwarded to the semantic stage as `min_eos_p`
        coarse_temp: temperature for the coarse stage
        fine_temp: temperature for the fine stage
        voice_name: forwarded to every stage as `history_prompt`
        output_full: also return the intermediate tokens of each stage

    Returns:
        the decoded output of `codec_decode`, or a
        `(full_generation, decoded)` tuple when `output_full` is set
    """
    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=voice_name,
        temp=semantic_temp,
        min_eos_p=eos_p,
        use_kv_caching=True,
    )
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=voice_name,
        temp=coarse_temp,
        use_kv_caching=True,
    )
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=voice_name,
        temp=fine_temp,
    )
    decoded = codec_decode(fine_tokens)

    if not output_full:
        return decoded

    # Same keys that `save_as_prompt` checks for before writing a prompt file.
    stages = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }
    return stages, decoded
def text_to_semantic(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
):
    """Turn text into semantic tokens.

    Thin wrapper around `generate_text_semantic` that always enables
    key/value caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    fine_temp: float = 0.5,
):
    """Generate audio array from semantic input.

    Runs the coarse generator, then the fine generator, and decodes the
    fine tokens with `codec_decode`.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: temperature of the coarse stage
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar (forwarded to the coarse stage only)
        output_full: return full generation to be used as a history prompt
        fine_temp: temperature of the fine stage; defaults to 0.5, the
            value that used to be hard-coded here

    Returns:
        numpy audio array at sample frequency 24khz, or a
        `(full_generation, audio_arr)` tuple when `output_full` is set
    """
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    # The fine stage has its own temperature, independent of `temp`.
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=history_prompt,
        temp=fine_temp,
    )
    audio_arr = codec_decode(fine_tokens)
    if output_full:
        full_generation = {
            "semantic_prompt": semantic_tokens,
            "coarse_prompt": coarse_tokens,
            "fine_prompt": fine_tokens,
        }
        return full_generation, audio_arr
    return audio_arr
def save_as_prompt(filepath, full_generation):
    """Save a full generation as an `.npz` file usable as a history prompt.

    Args:
        filepath: destination path (`str` or `os.PathLike`) ending in ".npz"
        full_generation: dict holding at least the keys "semantic_prompt",
            "coarse_prompt" and "fine_prompt"; every entry is written to
            the archive under its key

    Raises:
        AssertionError: if the path does not end in ".npz", if
            `full_generation` is not a dict, or if a required key is
            missing. The type is kept from the earlier `assert`-based
            checks, but is now raised explicitly so validation still runs
            under `python -O`.
    """
    import os  # local import so this block does not depend on file-level imports

    # Normalize path-like objects (e.g. pathlib.Path) to a plain string.
    path = os.fspath(filepath)
    if not path.endswith(".npz"):
        raise AssertionError(f"filepath must end with '.npz', got {path!r}")
    if not isinstance(full_generation, dict):
        raise AssertionError(
            f"full_generation must be a dict, got {type(full_generation).__name__}"
        )
    required = ("semantic_prompt", "coarse_prompt", "fine_prompt")
    missing = [key for key in required if key not in full_generation]
    if missing:
        raise AssertionError(f"full_generation is missing keys: {missing}")
    np.savez(path, **full_generation)
def generate_audio(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Generate an audio array from input text.

    Chains `text_to_semantic` and `semantic_to_waveform`, passing the same
    `history_prompt` and `silent` settings to both.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: temperature forwarded to `text_to_semantic`
            (1.0 more diverse, 0.0 more conservative)
        waveform_temp: temperature forwarded to `semantic_to_waveform`
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz, or a
        `(full_generation, audio_arr)` tuple when `output_full` is set
    """
    tokens = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
        silent=silent,
    )
    result = semantic_to_waveform(
        tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=output_full,
    )
    if not output_full:
        return result

    # With output_full the waveform stage yields a (generation, audio) pair.
    generation, waveform = result
    return generation, waveform