Spaces:
Sleeping
Sleeping
Commit
·
b4e6550
1
Parent(s):
bb8f724
Refactor
Browse files
app.py
CHANGED
@@ -40,7 +40,7 @@ SEED = 42
|
|
40 |
def numpy_to_mp3(audio_array, sampling_rate):
|
41 |
# Normalize audio_array if it's floating-point
|
42 |
if np.issubdtype(audio_array.dtype, np.floating):
|
43 |
-
max_val = np.max(np.abs(audio_array))
|
44 |
audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
|
45 |
audio_array = audio_array.astype(np.int16)
|
46 |
|
@@ -66,19 +66,21 @@ sampling_rate = model.audio_encoder.config.sampling_rate
|
|
66 |
frame_rate = model.audio_encoder.config.frame_rate
|
67 |
|
68 |
|
69 |
-
|
70 |
-
@spaces.GPU
|
71 |
-
def generate_base(subject, setting):
|
72 |
-
|
73 |
messages = [{"role": "sytem", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
|
74 |
"You want to write a bed time story for your child. They will give you the subject and setting "
|
75 |
"and you will write the entire story. It should be targetted at children 5 and younger and take about "
|
76 |
"a minute to read")},
|
77 |
{"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"}]
|
78 |
-
gr.Info("Generating story", duration=3)
|
79 |
response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
|
80 |
gr.Info("Story Generated", duration=3)
|
81 |
story = response.choices[0].message.content
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
model_input = story.replace("\n", " ").strip()
|
84 |
model_input_tokens = nltk.sent_tokenize(model_input)
|
@@ -86,7 +88,7 @@ def generate_base(subject, setting):
|
|
86 |
play_steps_in_s = 4.0
|
87 |
play_steps = int(frame_rate * play_steps_in_s)
|
88 |
|
89 |
-
gr.Info("Generating Audio")
|
90 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
91 |
story_tokens = prompt_tokenizer(model_input_tokens, return_tensors="pt", padding=True).to(device)
|
92 |
description_tokens = description_tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").to(device)
|
@@ -95,22 +97,16 @@ def generate_base(subject, setting):
|
|
95 |
attention_mask=description_tokens.attention_mask,
|
96 |
prompt_attention_mask=story_tokens.attention_mask)
|
97 |
speech_output = [output.cpu().numpy() for output in speech_output]
|
98 |
-
|
99 |
-
|
100 |
|
101 |
-
|
102 |
-
def stream_audio(state):
    """Yield one encoded audio clip per generated sentence, pausing between clips.

    Args:
        state: Mapping with an ``"audio"`` entry (the generated clips) and a
            ``"text"`` entry (the matching sentences).

    Yields:
        A ``(story, mp3)`` pair for each clip, where ``mp3`` is the result of
        ``numpy_to_mp3`` for that clip.
    """
    clips = state["audio"]
    texts = state["text"]

    gr.Info("Reading Story")

    # Pairing with the sentences caps the loop at the shorter of the two
    # sequences; the sentence text itself is not used.
    for _, clip in zip(texts, clips):
        seconds = round(clip.shape[0] / sampling_rate, 2)
        print(f"Sample of length: {seconds} seconds")
        # NOTE(review): `story` is not a local here; it resolves to whatever
        # module-level `story` is bound to at call time -- confirm intent.
        yield story, numpy_to_mp3(clip, sampling_rate=sampling_rate)
        # Fixed 5-second pause before the next clip is produced.
        time.sleep(5)
|
114 |
|
115 |
|
116 |
with gr.Blocks() as block:
|
@@ -122,19 +118,20 @@ with gr.Blocks() as block:
|
|
122 |
)
|
123 |
with gr.Group():
|
124 |
with gr.Row():
|
125 |
-
subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"])
|
126 |
-
setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"])
|
127 |
with gr.Row():
|
128 |
run_button = gr.Button("Generate Story", variant="primary")
|
129 |
with gr.Row():
|
130 |
with gr.Group():
|
131 |
-
audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True)
|
132 |
story = gr.Textbox(label="Story")
|
133 |
|
134 |
inputs = [subject, setting]
|
135 |
outputs = [story, audio_out]
|
136 |
state = gr.State()
|
137 |
-
|
|
|
138 |
|
139 |
block.queue()
|
140 |
block.launch(share=True)
|
|
|
40 |
def numpy_to_mp3(audio_array, sampling_rate):
|
41 |
# Normalize audio_array if it's floating-point
|
42 |
if np.issubdtype(audio_array.dtype, np.floating):
|
43 |
+
max_val = np.max(np.abs(audio_array)) + 1
|
44 |
audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
|
45 |
audio_array = audio_array.astype(np.int16)
|
46 |
|
|
|
66 |
frame_rate = model.audio_encoder.config.frame_rate
|
67 |
|
68 |
|
69 |
+
def generate_story(subject: str, setting: str) -> tuple:
    """Ask the chat model for a short bedtime story about ``subject`` in ``setting``.

    Args:
        subject: Main character of the story (inserted into the user prompt).
        setting: Where the story takes place (inserted into the user prompt).

    Returns:
        A 3-tuple ``(None, None, story)``. The click handler in this file maps
        the three values to the story textbox, the audio player and the hidden
        story state, so only the last slot carries the generated text.
    """
    # The role must be spelled "system" for the instructions to be treated as
    # the system prompt. Each fragment ends with a space (or the next one
    # starts a new clause) so the implicit concatenation reads as sentences.
    system_prompt = (
        "You are an award-winning children's bedtime story author lauded for your inventive stories. "
        "You want to write a bed time story for your child. They will give you the subject and setting "
        "and you will write the entire story. It should be targetted at children 5 and younger and take about "
        "a minute to read"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"},
    ]
    # A random seed per request so repeated clicks do not return the same story.
    response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
    gr.Info("Story Generated", duration=3)
    story = response.choices[0].message.content
    return None, None, story
|
79 |
+
|
80 |
+
|
81 |
+
@spaces.GPU
|
82 |
+
def generate_base(story):
|
83 |
+
|
84 |
|
85 |
model_input = story.replace("\n", " ").strip()
|
86 |
model_input_tokens = nltk.sent_tokenize(model_input)
|
|
|
88 |
play_steps_in_s = 4.0
|
89 |
play_steps = int(frame_rate * play_steps_in_s)
|
90 |
|
91 |
+
gr.Info("Generating Audio", duration=3)
|
92 |
description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
|
93 |
story_tokens = prompt_tokenizer(model_input_tokens, return_tensors="pt", padding=True).to(device)
|
94 |
description_tokens = description_tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").to(device)
|
|
|
97 |
attention_mask=description_tokens.attention_mask,
|
98 |
prompt_attention_mask=story_tokens.attention_mask)
|
99 |
speech_output = [output.cpu().numpy() for output in speech_output]
|
100 |
+
return None, None, speech_output
|
101 |
+
|
102 |
|
103 |
+
def stream_audio(hidden_story, speech_output):
    """Stream the generated clips to the UI one at a time.

    Args:
        hidden_story: Value passed through unchanged as the first element of
            every yielded pair.
        speech_output: Iterable of audio arrays; each must expose ``shape[0]``.

    Yields:
        ``(hidden_story, (sampling_rate, clip))`` for each clip, in order.
    """
    gr.Info("Reading Story")

    for clip in speech_output:
        # Log the clip duration in seconds (sample count / sampling rate).
        seconds = round(clip.shape[0] / sampling_rate, 2)
        print(f"Sample of length: {seconds} seconds")
        yield hidden_story, (sampling_rate, clip)
|
|
|
|
|
110 |
|
111 |
|
112 |
with gr.Blocks() as block:
|
|
|
118 |
)
|
119 |
with gr.Group():
|
120 |
with gr.Row():
|
121 |
+
subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"], label="Subject")
|
122 |
+
setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"], label="Setting")
|
123 |
with gr.Row():
|
124 |
run_button = gr.Button("Generate Story", variant="primary")
|
125 |
with gr.Row():
|
126 |
with gr.Group():
|
127 |
+
audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True, format="wav")
|
128 |
story = gr.Textbox(label="Story")
|
129 |
|
130 |
inputs = [subject, setting]
|
131 |
outputs = [story, audio_out]
|
132 |
state = gr.State()
|
133 |
+
hidden_story = gr.State()
|
134 |
+
run_button.click(generate_story, inputs=inputs, outputs=[story, audio_out, hidden_story]).success(fn=generate_base, inputs=hidden_story, outputs=[story, audio_out, state]).success(stream_audio, inputs=[hidden_story, state], outputs=[story, audio_out])
|
135 |
|
136 |
block.queue()
|
137 |
block.launch(share=True)
|