ylacombe committed on
Commit
c675805
·
1 Parent(s): 69e573b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from threading import Thread
3
+
4
+ from transformers import AutoProcessor
5
+ from transformers import set_seed
6
+
7
+ from utils.vocos_bark import BarkModel
8
+ from scipy.io.wavfile import write
9
+ from pydub import AudioSegment
10
+
11
+ import numpy as np
12
+
13
+ import os
14
+ import gradio as gr
15
+ import uuid
16
+ import io
17
+
18
+
19
# Fix the random seed (transformers.set_seed) at import time so generation
# starts from the same RNG state on every launch.
+ set_seed(0)
20
+
21
def _grab_best_device(use_gpu: bool = True) -> str:
    """Return the name of the torch device the model should run on.

    Args:
        use_gpu: When true (the default), prefer CUDA if it is available.
            When false, always fall back to the CPU.

    Returns:
        ``"cuda"`` if a GPU was requested and CUDA is available,
        otherwise ``"cpu"``.
    """
    # Check the flag first so CUDA is never probed when the caller opted out.
    if use_gpu and torch.cuda.is_available():
        return "cuda"
    return "cpu"
27
+
28
# Module-level setup: choose the device, then load the processor and model once
# at import time so every request reuses them.
+ device = _grab_best_device()
29
+
30
# Hub repository id passed to both from_pretrained() calls below.
+ HUB_PATH = "suno/bark"
31
+
32
# Processor used by generate_audio to turn text (plus optional voice preset)
# into model inputs; it also exposes the available speaker embeddings.
+ processor = AutoProcessor.from_pretrained(HUB_PATH)
33
+
34
# Sorted preset keys containing "speaker"; used as the dropdown choices and to
# validate the voice_preset argument of generate_audio.
+ speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])
35
+
36
# Sample rate returned to Gradio alongside the waveform
# (presumably Bark's output rate in Hz -- TODO confirm).
+ SAMPLE_RATE = 24_000
37
+
38
+
39
+ # import model
40
# NOTE(review): indentation was lost in this view, so it is unclear whether
# to_bettertransformer() runs only in the non-CPU branch or for both branches;
# confirm against the real app.py before changing this section.
+ if device == "cpu":
41
+ bark = BarkModel.from_pretrained(HUB_PATH)
42
+ else:
43
+ bark = BarkModel.from_pretrained(HUB_PATH).to(device)
44
+ bark = bark.to_bettertransformer()
45
+
46
+
47
+ # streaming inference
48
def generate_audio(text, voice_preset=None, lag=0):
    """Synthesize speech for ``text`` with Bark and return it for ``gr.Audio``.

    Args:
        text: The prompt Bark should speak.
        voice_preset: A key from ``speaker_embeddings``. Any other value
            (including ``None``) falls back to no speaker preset.
        lag: Unused; kept only so existing callers that pass it keep working.

    Returns:
        A ``(SAMPLE_RATE, waveform)`` tuple, where ``waveform`` is the model
        output squeezed and moved to the CPU as a NumPy array.

    Raises:
        gr.Error: If ``text`` is ``None``, empty, or only whitespace, so the
            UI shows a clear message instead of running generation on nothing.
    """
    if text is None or not text.strip():
        raise gr.Error("Please enter some text for Bark to say.")

    # Unknown presets (e.g. an empty dropdown) silently mean "no preset".
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    # The processor takes a batch of prompts; this demo always sends one.
    inputs = processor([text], voice_preset=voice_preset).to(device)

    # Generation runs synchronously in the calling thread; the whole waveform
    # is produced before anything is returned.
    waveform = bark.generate(
        **inputs, coarse_temperature=0.8, semantic_temperature=0.5
    )

    return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())
63
+
64
+
65
+
66
# Builds the Gradio UI: a text box and a voice-preset dropdown feed
# generate_audio, and the returned audio is played by an Audio component.
# NOTE(review): indentation was lost in this view, so the exact nesting of the
# Group/Row containers below cannot be determined here -- confirm against the
# real app.py before restructuring the layout.
+ # Gradio blocks demo
67
+ with gr.Blocks() as demo_blocks:
68
+ gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
69
+ gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio 🦾! </h3>""")
70
+ with gr.Group():
71
+ with gr.Row():
72
+ inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
73
# Dropdown choices are the preset keys collected in speaker_embeddings;
# nothing is selected by default (value=None).
+ dd = gr.Dropdown(
74
+ speaker_embeddings,
75
+ value=None,
76
+ label="Available voice presets",
77
+ info="Defaults to no speaker embeddings!"
78
+ )
79
+
80
+ with gr.Row():
81
+ btn = gr.Button("Bark with Vocos TTS")
82
+
83
+ with gr.Row():
84
# type="numpy" matches the (sample_rate, array) tuple generate_audio returns.
+ out_audio = gr.Audio(type="numpy", autoplay=True)
85
# Only text and preset are wired as inputs, so generate_audio's `lag`
# parameter always keeps its default.
+ btn.click(generate_audio, [inp_text, dd], out_audio)
86
+
87
# Enable the request queue, then start the app with debug=True.
+ demo_blocks.queue().launch(debug=True)