Spaces:
Configuration error
Configuration error
Upload 4 files
Browse files
- fake_classifier.ipynb +112 -0
- long_form_generation.ipynb +400 -0
- memory_profiling_bark.ipynb +201 -0
- use_small_models_on_cpu.ipynb +142 -0
fake_classifier.ipynb
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "e330c1de",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import torchaudio\n",
|
11 |
+
"from transformers import HubertModel\n",
|
12 |
+
"from sklearn.metrics import PrecisionRecallDisplay"
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "code",
|
17 |
+
"execution_count": null,
|
18 |
+
"id": "2ac3dd88",
|
19 |
+
"metadata": {},
|
20 |
+
"outputs": [],
|
21 |
+
"source": [
|
22 |
+
"# use hubert from HF for feature embedding\n",
|
23 |
+
"model = HubertModel.from_pretrained(\"facebook/hubert-base-ls960\")\n",
|
24 |
+
"arr, sr = torchaudio.load(\"my_audio.wav\")\n",
|
25 |
+
"if sr != 16_000:\n",
|
26 |
+
" arr = torchaudio.functional.resample(arr, sr, 16_000)\n",
|
27 |
+
"# use intermediate layer\n",
|
28 |
+
"hidden_state = model(arr[None], output_hidden_states=True).hidden_states[6]\n",
|
29 |
+
"# take mean over time\n",
|
30 |
+
"feats = hidden_state.detach().cpu().numpy().squeeze().mean(0)"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": null,
|
36 |
+
"id": "03a602e0",
|
37 |
+
"metadata": {},
|
38 |
+
"outputs": [],
|
39 |
+
"source": [
|
40 |
+
"# load sk-learn classifier from here: https://dl.suno-models.io/bark/models/v0/classifier.pkl\n",
|
41 |
+
"with open(\"classifier.pkl\", \"rb\") as f:\n",
|
42 |
+
" clf = pickle.load(f)"
|
43 |
+
]
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"cell_type": "markdown",
|
47 |
+
"id": "8e423794",
|
48 |
+
"metadata": {},
|
49 |
+
"source": [
|
50 |
+
"### Precision-recall curve on test set"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"attachments": {
|
55 |
+
"image.png": {
|
56 |
+
"image/png": ""
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"cell_type": "markdown",
|
60 |
+
"id": "e1486424",
|
61 |
+
"metadata": {},
|
62 |
+
"source": [
|
63 |
+
"![image.png](attachment:image.png)"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"cell_type": "code",
|
68 |
+
"execution_count": null,
|
69 |
+
"id": "668856bf",
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": []
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": null,
|
77 |
+
"id": "c87326bd",
|
78 |
+
"metadata": {},
|
79 |
+
"outputs": [],
|
80 |
+
"source": []
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": null,
|
85 |
+
"id": "decdbf09",
|
86 |
+
"metadata": {},
|
87 |
+
"outputs": [],
|
88 |
+
"source": []
|
89 |
+
}
|
90 |
+
],
|
91 |
+
"metadata": {
|
92 |
+
"kernelspec": {
|
93 |
+
"display_name": "Python 3 (ipykernel)",
|
94 |
+
"language": "python",
|
95 |
+
"name": "python3"
|
96 |
+
},
|
97 |
+
"language_info": {
|
98 |
+
"codemirror_mode": {
|
99 |
+
"name": "ipython",
|
100 |
+
"version": 3
|
101 |
+
},
|
102 |
+
"file_extension": ".py",
|
103 |
+
"mimetype": "text/x-python",
|
104 |
+
"name": "python",
|
105 |
+
"nbconvert_exporter": "python",
|
106 |
+
"pygments_lexer": "ipython3",
|
107 |
+
"version": "3.8.15"
|
108 |
+
}
|
109 |
+
},
|
110 |
+
"nbformat": 4,
|
111 |
+
"nbformat_minor": 5
|
112 |
+
}
|
long_form_generation.ipynb
ADDED
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "39ea4bed",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import os\n",
|
11 |
+
"\n",
|
12 |
+
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
13 |
+
"\n",
|
14 |
+
"\n",
|
15 |
+
"from IPython.display import Audio\n",
|
16 |
+
"import nltk # we'll use this to split into sentences\n",
|
17 |
+
"import numpy as np\n",
|
18 |
+
"\n",
|
19 |
+
"from bark.generation import (\n",
|
20 |
+
" generate_text_semantic,\n",
|
21 |
+
" preload_models,\n",
|
22 |
+
")\n",
|
23 |
+
"from bark.api import semantic_to_waveform\n",
|
24 |
+
"from bark import generate_audio, SAMPLE_RATE"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"cell_type": "code",
|
29 |
+
"execution_count": 29,
|
30 |
+
"id": "776964b6",
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [],
|
33 |
+
"source": [
|
34 |
+
"preload_models()"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": null,
|
40 |
+
"id": "1d03f4d2",
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [],
|
43 |
+
"source": []
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"cell_type": "markdown",
|
47 |
+
"id": "74a025a4",
|
48 |
+
"metadata": {},
|
49 |
+
"source": [
|
50 |
+
"# Simple Long-Form Generation\n",
|
51 |
+
"We split longer text into sentences using `nltk` and generate the sentences one by one."
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": 33,
|
57 |
+
"id": "57b06e2a",
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [],
|
60 |
+
"source": [
|
61 |
+
"script = \"\"\"\n",
|
62 |
+
"Hey, have you heard about this new text-to-audio model called \"Bark\"? \n",
|
63 |
+
"Apparently, it's the most realistic and natural-sounding text-to-audio model \n",
|
64 |
+
"out there right now. People are saying it sounds just like a real person speaking. \n",
|
65 |
+
"I think it uses advanced machine learning algorithms to analyze and understand the \n",
|
66 |
+
"nuances of human speech, and then replicates those nuances in its own speech output. \n",
|
67 |
+
"It's pretty impressive, and I bet it could be used for things like audiobooks or podcasts. \n",
|
68 |
+
"In fact, I heard that some publishers are already starting to use Bark to create audiobooks. \n",
|
69 |
+
"It would be like having your own personal voiceover artist. I really think Bark is going to \n",
|
70 |
+
"be a game-changer in the world of text-to-audio technology.\n",
|
71 |
+
"\"\"\".replace(\"\\n\", \" \").strip()"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": 34,
|
77 |
+
"id": "f747f804",
|
78 |
+
"metadata": {},
|
79 |
+
"outputs": [],
|
80 |
+
"source": [
|
81 |
+
"sentences = nltk.sent_tokenize(script)"
|
82 |
+
]
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"cell_type": "code",
|
86 |
+
"execution_count": 35,
|
87 |
+
"id": "17400a9b",
|
88 |
+
"metadata": {
|
89 |
+
"scrolled": true
|
90 |
+
},
|
91 |
+
"outputs": [
|
92 |
+
{
|
93 |
+
"name": "stderr",
|
94 |
+
"output_type": "stream",
|
95 |
+
"text": [
|
96 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:02<00:00, 43.03it/s]\n",
|
97 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 17/17 [00:06<00:00, 2.45it/s]\n",
|
98 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 22.73it/s]\n",
|
99 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 33/33 [00:13<00:00, 2.52it/s]\n",
|
100 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 66.30it/s]\n",
|
101 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 11/11 [00:04<00:00, 2.46it/s]\n",
|
102 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 20.99it/s]\n",
|
103 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 35/35 [00:14<00:00, 2.46it/s]\n",
|
104 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:03<00:00, 25.63it/s]\n",
|
105 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 29/29 [00:11<00:00, 2.50it/s]\n",
|
106 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 23.90it/s]\n",
|
107 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 30/30 [00:12<00:00, 2.46it/s]\n",
|
108 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 53.24it/s]\n",
|
109 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 14/14 [00:05<00:00, 2.51it/s]\n",
|
110 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 50.63it/s]\n",
|
111 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:05<00:00, 2.57it/s]\n"
|
112 |
+
]
|
113 |
+
}
|
114 |
+
],
|
115 |
+
"source": [
|
116 |
+
"SPEAKER = \"v2/en_speaker_6\"\n",
|
117 |
+
"silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n",
|
118 |
+
"\n",
|
119 |
+
"pieces = []\n",
|
120 |
+
"for sentence in sentences:\n",
|
121 |
+
" audio_array = generate_audio(sentence, history_prompt=SPEAKER)\n",
|
122 |
+
" pieces += [audio_array, silence.copy()]\n"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": null,
|
128 |
+
"id": "04cf77f9",
|
129 |
+
"metadata": {},
|
130 |
+
"outputs": [],
|
131 |
+
"source": [
|
132 |
+
"Audio(np.concatenate(pieces), rate=SAMPLE_RATE)"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": null,
|
138 |
+
"id": "ac2d4625",
|
139 |
+
"metadata": {},
|
140 |
+
"outputs": [],
|
141 |
+
"source": []
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "markdown",
|
145 |
+
"id": "6d13249b",
|
146 |
+
"metadata": {},
|
147 |
+
"source": [
|
148 |
+
"# $ \\\\ $"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "markdown",
|
153 |
+
"id": "cdfc8bf5",
|
154 |
+
"metadata": {},
|
155 |
+
"source": [
|
156 |
+
"# Advanced Long-Form Generation\n",
|
157 |
+
"Somtimes Bark will hallucinate a little extra audio at the end of the prompt.\n",
|
158 |
+
"We can solve this issue by lowering the threshold for bark to stop generating text. \n",
|
159 |
+
"We use the `min_eos_p` kwarg in `generate_text_semantic`"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": 37,
|
165 |
+
"id": "62807fd0",
|
166 |
+
"metadata": {},
|
167 |
+
"outputs": [
|
168 |
+
{
|
169 |
+
"name": "stderr",
|
170 |
+
"output_type": "stream",
|
171 |
+
"text": [
|
172 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:02<00:00, 38.05it/s]\n",
|
173 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 18/18 [00:07<00:00, 2.46it/s]\n",
|
174 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:03<00:00, 32.28it/s]\n",
|
175 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 21/21 [00:08<00:00, 2.54it/s]\n",
|
176 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 55.78it/s]\n",
|
177 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 14/14 [00:05<00:00, 2.57it/s]\n",
|
178 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:06<00:00, 14.73it/s]\n",
|
179 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 35/35 [00:14<00:00, 2.47it/s]\n",
|
180 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:02<00:00, 40.29it/s]\n",
|
181 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 18/18 [00:07<00:00, 2.56it/s]\n",
|
182 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:03<00:00, 32.92it/s]\n",
|
183 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 20/20 [00:08<00:00, 2.47it/s]\n",
|
184 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 68.87it/s]\n",
|
185 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12/12 [00:04<00:00, 2.62it/s]\n",
|
186 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:02<00:00, 47.64it/s]\n",
|
187 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:06<00:00, 2.46it/s]\n"
|
188 |
+
]
|
189 |
+
}
|
190 |
+
],
|
191 |
+
"source": [
|
192 |
+
"GEN_TEMP = 0.6\n",
|
193 |
+
"SPEAKER = \"v2/en_speaker_6\"\n",
|
194 |
+
"silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n",
|
195 |
+
"\n",
|
196 |
+
"pieces = []\n",
|
197 |
+
"for sentence in sentences:\n",
|
198 |
+
" semantic_tokens = generate_text_semantic(\n",
|
199 |
+
" sentence,\n",
|
200 |
+
" history_prompt=SPEAKER,\n",
|
201 |
+
" temp=GEN_TEMP,\n",
|
202 |
+
" min_eos_p=0.05, # this controls how likely the generation is to end\n",
|
203 |
+
" )\n",
|
204 |
+
"\n",
|
205 |
+
" audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)\n",
|
206 |
+
" pieces += [audio_array, silence.copy()]\n",
|
207 |
+
"\n"
|
208 |
+
]
|
209 |
+
},
|
210 |
+
{
|
211 |
+
"cell_type": "code",
|
212 |
+
"execution_count": null,
|
213 |
+
"id": "133fec46",
|
214 |
+
"metadata": {},
|
215 |
+
"outputs": [],
|
216 |
+
"source": [
|
217 |
+
"Audio(np.concatenate(pieces), rate=SAMPLE_RATE)"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"cell_type": "code",
|
222 |
+
"execution_count": null,
|
223 |
+
"id": "6eee9f5a",
|
224 |
+
"metadata": {},
|
225 |
+
"outputs": [],
|
226 |
+
"source": []
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"cell_type": "markdown",
|
230 |
+
"id": "be8e125e",
|
231 |
+
"metadata": {},
|
232 |
+
"source": [
|
233 |
+
"# $ \\\\ $"
|
234 |
+
]
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"cell_type": "markdown",
|
238 |
+
"id": "03a16c1b",
|
239 |
+
"metadata": {},
|
240 |
+
"source": [
|
241 |
+
"# Make a Long-Form Dialog with Bark"
|
242 |
+
]
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"cell_type": "markdown",
|
246 |
+
"id": "06c5eff8",
|
247 |
+
"metadata": {},
|
248 |
+
"source": [
|
249 |
+
"### Step 1: Format a script and speaker lookup"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"cell_type": "code",
|
254 |
+
"execution_count": 14,
|
255 |
+
"id": "5238b297",
|
256 |
+
"metadata": {},
|
257 |
+
"outputs": [
|
258 |
+
{
|
259 |
+
"data": {
|
260 |
+
"text/plain": [
|
261 |
+
"['Samantha: Hey, have you heard about this new text-to-audio model called \"Bark\"?',\n",
|
262 |
+
" \"John: No, I haven't. What's so special about it?\",\n",
|
263 |
+
" \"Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.\",\n",
|
264 |
+
" 'John: Wow, that sounds amazing. How does it work?',\n",
|
265 |
+
" 'Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.',\n",
|
266 |
+
" \"John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?\",\n",
|
267 |
+
" 'Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.',\n",
|
268 |
+
" 'John: I can imagine. It would be like having your own personal voiceover artist.',\n",
|
269 |
+
" 'Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology.']"
|
270 |
+
]
|
271 |
+
},
|
272 |
+
"execution_count": 14,
|
273 |
+
"metadata": {},
|
274 |
+
"output_type": "execute_result"
|
275 |
+
}
|
276 |
+
],
|
277 |
+
"source": [
|
278 |
+
"speaker_lookup = {\"Samantha\": \"v2/en_speaker_9\", \"John\": \"v2/en_speaker_2\"}\n",
|
279 |
+
"\n",
|
280 |
+
"# Script generated by chat GPT\n",
|
281 |
+
"script = \"\"\"\n",
|
282 |
+
"Samantha: Hey, have you heard about this new text-to-audio model called \"Bark\"?\n",
|
283 |
+
"\n",
|
284 |
+
"John: No, I haven't. What's so special about it?\n",
|
285 |
+
"\n",
|
286 |
+
"Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.\n",
|
287 |
+
"\n",
|
288 |
+
"John: Wow, that sounds amazing. How does it work?\n",
|
289 |
+
"\n",
|
290 |
+
"Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.\n",
|
291 |
+
"\n",
|
292 |
+
"John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?\n",
|
293 |
+
"\n",
|
294 |
+
"Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.\n",
|
295 |
+
"\n",
|
296 |
+
"John: I can imagine. It would be like having your own personal voiceover artist.\n",
|
297 |
+
"\n",
|
298 |
+
"Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology.\"\"\"\n",
|
299 |
+
"script = script.strip().split(\"\\n\")\n",
|
300 |
+
"script = [s.strip() for s in script if s]\n",
|
301 |
+
"script"
|
302 |
+
]
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"cell_type": "markdown",
|
306 |
+
"id": "ee547efd",
|
307 |
+
"metadata": {},
|
308 |
+
"source": [
|
309 |
+
"### Step 2: Generate the audio for every speaker turn"
|
310 |
+
]
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"cell_type": "code",
|
314 |
+
"execution_count": 15,
|
315 |
+
"id": "203e5081",
|
316 |
+
"metadata": {},
|
317 |
+
"outputs": [
|
318 |
+
{
|
319 |
+
"name": "stderr",
|
320 |
+
"output_type": "stream",
|
321 |
+
"text": [
|
322 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:02<00:00, 34.03it/s]\n",
|
323 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 22/22 [00:08<00:00, 2.55it/s]\n",
|
324 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 71.58it/s]\n",
|
325 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 11/11 [00:04<00:00, 2.65it/s]\n",
|
326 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 22.75it/s]\n",
|
327 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 33/33 [00:13<00:00, 2.53it/s]\n",
|
328 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 70.76it/s]\n",
|
329 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 11/11 [00:04<00:00, 2.63it/s]\n",
|
330 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 20.46it/s]\n",
|
331 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 36/36 [00:14<00:00, 2.47it/s]\n",
|
332 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 20.18it/s]\n",
|
333 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 37/37 [00:14<00:00, 2.51it/s]\n",
|
334 |
+
"100%|ββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:04<00:00, 23.04it/s]\n",
|
335 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 32/32 [00:12<00:00, 2.48it/s]\n",
|
336 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 54.64it/s]\n",
|
337 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 14/14 [00:05<00:00, 2.58it/s]\n",
|
338 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:03<00:00, 31.71it/s]\n",
|
339 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 24/24 [00:09<00:00, 2.56it/s]\n"
|
340 |
+
]
|
341 |
+
}
|
342 |
+
],
|
343 |
+
"source": [
|
344 |
+
"pieces = []\n",
|
345 |
+
"silence = np.zeros(int(0.5*SAMPLE_RATE))\n",
|
346 |
+
"for line in script:\n",
|
347 |
+
" speaker, text = line.split(\": \")\n",
|
348 |
+
" audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker], )\n",
|
349 |
+
" pieces += [audio_array, silence.copy()]"
|
350 |
+
]
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"cell_type": "markdown",
|
354 |
+
"id": "7c54bada",
|
355 |
+
"metadata": {},
|
356 |
+
"source": [
|
357 |
+
"### Step 3: Concatenate all of the audio and play it"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
{
|
361 |
+
"cell_type": "code",
|
362 |
+
"execution_count": null,
|
363 |
+
"id": "27a56842",
|
364 |
+
"metadata": {},
|
365 |
+
"outputs": [],
|
366 |
+
"source": [
|
367 |
+
"Audio(np.concatenate(pieces), rate=SAMPLE_RATE)"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": null,
|
373 |
+
"id": "a1bc5877",
|
374 |
+
"metadata": {},
|
375 |
+
"outputs": [],
|
376 |
+
"source": []
|
377 |
+
}
|
378 |
+
],
|
379 |
+
"metadata": {
|
380 |
+
"kernelspec": {
|
381 |
+
"display_name": "Python 3 (ipykernel)",
|
382 |
+
"language": "python",
|
383 |
+
"name": "python3"
|
384 |
+
},
|
385 |
+
"language_info": {
|
386 |
+
"codemirror_mode": {
|
387 |
+
"name": "ipython",
|
388 |
+
"version": 3
|
389 |
+
},
|
390 |
+
"file_extension": ".py",
|
391 |
+
"mimetype": "text/x-python",
|
392 |
+
"name": "python",
|
393 |
+
"nbconvert_exporter": "python",
|
394 |
+
"pygments_lexer": "ipython3",
|
395 |
+
"version": "3.9.16"
|
396 |
+
}
|
397 |
+
},
|
398 |
+
"nbformat": 4,
|
399 |
+
"nbformat_minor": 5
|
400 |
+
}
|
memory_profiling_bark.ipynb
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "90641144",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Bark Memory Profiling\n",
|
9 |
+
"Bark has two ways to reduce GPU memory: \n",
|
10 |
+
" - Small models: a smaller version of the model. This can be set by using the environment variable `SUNO_USE_SMALL_MODELS`\n",
|
11 |
+
" - offloading models to CPU: Holding only one model at a time on the GPU, and shuttling the models to the CPU in between generations. \n",
|
12 |
+
"\n",
|
13 |
+
"# $ \\\\ $\n",
|
14 |
+
"## First, we'll use the most memory efficient configuration"
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "code",
|
19 |
+
"execution_count": 1,
|
20 |
+
"id": "39ea4bed",
|
21 |
+
"metadata": {},
|
22 |
+
"outputs": [],
|
23 |
+
"source": [
|
24 |
+
"import os\n",
|
25 |
+
"\n",
|
26 |
+
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
|
27 |
+
"os.environ[\"SUNO_USE_SMALL_MODELS\"] = \"1\"\n",
|
28 |
+
"os.environ[\"SUNO_OFFLOAD_CPU\"] = \"1\"\n",
|
29 |
+
"\n",
|
30 |
+
"from bark.generation import (\n",
|
31 |
+
" generate_text_semantic,\n",
|
32 |
+
" preload_models,\n",
|
33 |
+
")\n",
|
34 |
+
"from bark import generate_audio, SAMPLE_RATE\n",
|
35 |
+
"\n",
|
36 |
+
"import torch"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": 2,
|
42 |
+
"id": "66b0c006",
|
43 |
+
"metadata": {},
|
44 |
+
"outputs": [
|
45 |
+
{
|
46 |
+
"name": "stderr",
|
47 |
+
"output_type": "stream",
|
48 |
+
"text": [
|
49 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:01<00:00, 62.17it/s]\n",
|
50 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 10/10 [00:03<00:00, 2.74it/s]\n"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"name": "stdout",
|
55 |
+
"output_type": "stream",
|
56 |
+
"text": [
|
57 |
+
"max memory usage = 2396MB\n"
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"source": [
|
62 |
+
"torch.cuda.reset_peak_memory_stats()\n",
|
63 |
+
"preload_models()\n",
|
64 |
+
"audio_array = generate_audio(\"madam I'm adam\", history_prompt=\"v2/en_speaker_5\")\n",
|
65 |
+
"max_utilization = torch.cuda.max_memory_allocated()\n",
|
66 |
+
"print(f\"max memory usage = {max_utilization / 1024 / 1024:.0f}MB\")"
|
67 |
+
]
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"cell_type": "code",
|
71 |
+
"execution_count": null,
|
72 |
+
"id": "9922dd2d",
|
73 |
+
"metadata": {},
|
74 |
+
"outputs": [],
|
75 |
+
"source": []
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"cell_type": "code",
|
79 |
+
"execution_count": null,
|
80 |
+
"id": "bdbe578e",
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [],
|
83 |
+
"source": []
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "markdown",
|
87 |
+
"id": "213d1b5b",
|
88 |
+
"metadata": {},
|
89 |
+
"source": [
|
90 |
+
"# Memory Profiling:\n",
|
91 |
+
"We can profile the memory consumption of 4 scenarios\n",
|
92 |
+
" - Small models, offloading to CPU\n",
|
93 |
+
" - Large models, offloading to CPU\n",
|
94 |
+
" - Small models, not offloading to CPU\n",
|
95 |
+
" - Large models, not offloading to CPU"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 1,
|
101 |
+
"id": "417d5e9c",
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"import os\n",
|
106 |
+
"\n",
|
107 |
+
"from bark.generation import (\n",
|
108 |
+
" generate_text_semantic,\n",
|
109 |
+
" preload_models,\n",
|
110 |
+
" models,\n",
|
111 |
+
")\n",
|
112 |
+
"import bark.generation\n",
|
113 |
+
"\n",
|
114 |
+
"from bark.api import semantic_to_waveform\n",
|
115 |
+
"from bark import generate_audio, SAMPLE_RATE\n",
|
116 |
+
"\n",
|
117 |
+
"import torch\n",
|
118 |
+
"import time"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": 2,
|
124 |
+
"id": "cd83b45d",
|
125 |
+
"metadata": {},
|
126 |
+
"outputs": [
|
127 |
+
{
|
128 |
+
"name": "stdout",
|
129 |
+
"output_type": "stream",
|
130 |
+
"text": [
|
131 |
+
"Small models True, offloading to CPU: True\n",
|
132 |
+
"\tmax memory usage = 967MB, time 4s\n",
|
133 |
+
"\n",
|
134 |
+
"Small models False, offloading to CPU: True\n",
|
135 |
+
"\tmax memory usage = 2407MB, time 8s\n",
|
136 |
+
"\n",
|
137 |
+
"Small models True, offloading to CPU: False\n",
|
138 |
+
"\tmax memory usage = 2970MB, time 3s\n",
|
139 |
+
"\n",
|
140 |
+
"Small models False, offloading to CPU: False\n",
|
141 |
+
"\tmax memory usage = 7824MB, time 6s\n",
|
142 |
+
"\n"
|
143 |
+
]
|
144 |
+
}
|
145 |
+
],
|
146 |
+
"source": [
|
147 |
+
"global models\n",
|
148 |
+
"\n",
|
149 |
+
"for offload_models in (True, False):\n",
|
150 |
+
" # this setattr is needed to do on the fly\n",
|
151 |
+
" # the easier way to do this is with `os.environ[\"SUNO_OFFLOAD_CPU\"] = \"1\"`\n",
|
152 |
+
" setattr(bark.generation, \"OFFLOAD_CPU\", offload_models)\n",
|
153 |
+
" for use_small_models in (True, False):\n",
|
154 |
+
" models = {}\n",
|
155 |
+
" torch.cuda.empty_cache()\n",
|
156 |
+
" torch.cuda.reset_peak_memory_stats()\n",
|
157 |
+
" preload_models(\n",
|
158 |
+
" text_use_small=use_small_models,\n",
|
159 |
+
" coarse_use_small=use_small_models,\n",
|
160 |
+
" fine_use_small=use_small_models,\n",
|
161 |
+
" force_reload=True,\n",
|
162 |
+
" )\n",
|
163 |
+
" t0 = time.time()\n",
|
164 |
+
" audio_array = generate_audio(\"madam I'm adam\", history_prompt=\"v2/en_speaker_5\", silent=True)\n",
|
165 |
+
" dur = time.time() - t0\n",
|
166 |
+
" max_utilization = torch.cuda.max_memory_allocated()\n",
|
167 |
+
" print(f\"Small models {use_small_models}, offloading to CPU: {offload_models}\")\n",
|
168 |
+
" print(f\"\\tmax memory usage = {max_utilization / 1024 / 1024:.0f}MB, time {dur:.0f}s\\n\")"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"execution_count": null,
|
174 |
+
"id": "bfe5fa06",
|
175 |
+
"metadata": {},
|
176 |
+
"outputs": [],
|
177 |
+
"source": []
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"metadata": {
|
181 |
+
"kernelspec": {
|
182 |
+
"display_name": "Python 3 (ipykernel)",
|
183 |
+
"language": "python",
|
184 |
+
"name": "python3"
|
185 |
+
},
|
186 |
+
"language_info": {
|
187 |
+
"codemirror_mode": {
|
188 |
+
"name": "ipython",
|
189 |
+
"version": 3
|
190 |
+
},
|
191 |
+
"file_extension": ".py",
|
192 |
+
"mimetype": "text/x-python",
|
193 |
+
"name": "python",
|
194 |
+
"nbconvert_exporter": "python",
|
195 |
+
"pygments_lexer": "ipython3",
|
196 |
+
"version": "3.9.16"
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"nbformat": 4,
|
200 |
+
"nbformat_minor": 5
|
201 |
+
}
|
use_small_models_on_cpu.ipynb
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6a682b61",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Benchmarking small models on CPU\n",
|
9 |
+
" - We can enable small models with the `SUNO_USE_SMALL_MODELS` environment variable"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 1,
|
15 |
+
"id": "9500dd93",
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [],
|
18 |
+
"source": [
|
19 |
+
"import os\n",
|
20 |
+
"\n",
|
21 |
+
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
|
22 |
+
"os.environ[\"SUNO_USE_SMALL_MODELS\"] = \"1\"\n",
|
23 |
+
"\n",
|
24 |
+
"from IPython.display import Audio\n",
|
25 |
+
"import numpy as np\n",
|
26 |
+
"\n",
|
27 |
+
"from bark import generate_audio, preload_models, SAMPLE_RATE\n",
|
28 |
+
"\n",
|
29 |
+
"import time"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 2,
|
35 |
+
"id": "4e3454b6",
|
36 |
+
"metadata": {},
|
37 |
+
"outputs": [
|
38 |
+
{
|
39 |
+
"name": "stderr",
|
40 |
+
"output_type": "stream",
|
41 |
+
"text": [
|
42 |
+
"No GPU being used. Careful, inference might be very slow!\n"
|
43 |
+
]
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"name": "stdout",
|
47 |
+
"output_type": "stream",
|
48 |
+
"text": [
|
49 |
+
"CPU times: user 5.52 s, sys: 2.34 s, total: 7.86 s\n",
|
50 |
+
"Wall time: 4.33 s\n"
|
51 |
+
]
|
52 |
+
}
|
53 |
+
],
|
54 |
+
"source": [
|
55 |
+
"%%time\n",
|
56 |
+
"preload_models()"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 3,
|
62 |
+
"id": "f6024e5f",
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [
|
65 |
+
{
|
66 |
+
"name": "stderr",
|
67 |
+
"output_type": "stream",
|
68 |
+
"text": [
|
69 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 100/100 [00:10<00:00, 9.89it/s]\n",
|
70 |
+
"100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 15/15 [00:43<00:00, 2.90s/it]\n"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "stdout",
|
75 |
+
"output_type": "stream",
|
76 |
+
"text": [
|
77 |
+
"took 62s to generate 6s of audio\n"
|
78 |
+
]
|
79 |
+
}
|
80 |
+
],
|
81 |
+
"source": [
|
82 |
+
"t0 = time.time()\n",
|
83 |
+
"text = \"In the light of the moon, a little egg lay on a leaf\"\n",
|
84 |
+
"audio_array = generate_audio(text)\n",
|
85 |
+
"generation_duration_s = time.time() - t0\n",
|
86 |
+
"audio_duration_s = audio_array.shape[0] / SAMPLE_RATE\n",
|
87 |
+
"\n",
|
88 |
+
"print(f\"took {generation_duration_s:.0f}s to generate {audio_duration_s:.0f}s of audio\")"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": 4,
|
94 |
+
"id": "2dcce86c",
|
95 |
+
"metadata": {},
|
96 |
+
"outputs": [
|
97 |
+
{
|
98 |
+
"data": {
|
99 |
+
"text/plain": [
|
100 |
+
"10"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
"execution_count": 4,
|
104 |
+
"metadata": {},
|
105 |
+
"output_type": "execute_result"
|
106 |
+
}
|
107 |
+
],
|
108 |
+
"source": [
|
109 |
+
"os.cpu_count()"
|
110 |
+
]
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"cell_type": "code",
|
114 |
+
"execution_count": null,
|
115 |
+
"id": "3046eddb",
|
116 |
+
"metadata": {},
|
117 |
+
"outputs": [],
|
118 |
+
"source": []
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"metadata": {
|
122 |
+
"kernelspec": {
|
123 |
+
"display_name": "Python 3 (ipykernel)",
|
124 |
+
"language": "python",
|
125 |
+
"name": "python3"
|
126 |
+
},
|
127 |
+
"language_info": {
|
128 |
+
"codemirror_mode": {
|
129 |
+
"name": "ipython",
|
130 |
+
"version": 3
|
131 |
+
},
|
132 |
+
"file_extension": ".py",
|
133 |
+
"mimetype": "text/x-python",
|
134 |
+
"name": "python",
|
135 |
+
"nbconvert_exporter": "python",
|
136 |
+
"pygments_lexer": "ipython3",
|
137 |
+
"version": "3.9.16"
|
138 |
+
}
|
139 |
+
},
|
140 |
+
"nbformat": 4,
|
141 |
+
"nbformat_minor": 5
|
142 |
+
}
|