freddyaboulton (HF Staff) committed
Commit b4e6550 · 1 Parent(s): bb8f724
Files changed (1):
  1. app.py +19 -22

app.py CHANGED
@@ -40,7 +40,7 @@ SEED = 42
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
+        max_val = np.max(np.abs(audio_array)) + 1
         audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
         audio_array = audio_array.astype(np.int16)
 
@@ -66,19 +66,21 @@ sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
 
-
-@spaces.GPU
-def generate_base(subject, setting):
-
+def generate_story(subject: str, setting: str) -> str:
     messages = [{"role": "sytem", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
                                               "You want to write a bed time story for your child. They will give you the subject and setting "
                                               "and you will write the entire story. It should be targetted at children 5 and younger and take about "
                                               "a minute to read")},
                 {"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"}]
-    gr.Info("Generating story", duration=3)
     response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
     gr.Info("Story Generated", duration=3)
     story = response.choices[0].message.content
+    return None, None, story
+
+
+@spaces.GPU
+def generate_base(story):
+
 
     model_input = story.replace("\n", " ").strip()
     model_input_tokens = nltk.sent_tokenize(model_input)
@@ -86,7 +88,7 @@ def generate_base(subject, setting):
     play_steps_in_s = 4.0
     play_steps = int(frame_rate * play_steps_in_s)
 
-    gr.Info("Generating Audio")
+    gr.Info("Generating Audio", duration=3)
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
     story_tokens = prompt_tokenizer(model_input_tokens, return_tensors="pt", padding=True).to(device)
     description_tokens = description_tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").to(device)
@@ -95,22 +97,16 @@ def generate_base(subject, setting):
                                   attention_mask=description_tokens.attention_mask,
                                   prompt_attention_mask=story_tokens.attention_mask)
     speech_output = [output.cpu().numpy() for output in speech_output]
-    gr.Info("Generated Audio")
-    return None, None, {"audio": speech_output, "text": model_input_tokens}
+    return None, None, speech_output
+
 
-import time
-def stream_audio(state):
-    speech_output = state["audio"]
-    sentences = state["text"]
+def stream_audio(hidden_story, speech_output):
 
     gr.Info("Reading Story")
 
-    story = ""
-    for sentence, new_audio in zip(sentences, speech_output):
+    for new_audio in speech_output:
        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-        story += f"{sentence}\n"
-        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-        time.sleep(5)
+        yield hidden_story, (sampling_rate, new_audio) #numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
 
 with gr.Blocks() as block:
@@ -122,19 +118,20 @@ with gr.Blocks() as block:
     )
     with gr.Group():
         with gr.Row():
-            subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"])
-            setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"])
+            subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"], label="Subject")
+            setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"], label="Setting")
        with gr.Row():
            run_button = gr.Button("Generate Story", variant="primary")
    with gr.Row():
        with gr.Group():
-            audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True)
+            audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True, format="wav")
            story = gr.Textbox(label="Story")
 
    inputs = [subject, setting]
    outputs = [story, audio_out]
    state = gr.State()
-    run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
+    hidden_story = gr.State()
+    run_button.click(generate_story, inputs=inputs, outputs=[story, audio_out, hidden_story]).success(fn=generate_base, inputs=hidden_story, outputs=[story, audio_out, state]).success(stream_audio, inputs=[hidden_story, state], outputs=[story, audio_out])
 
 block.queue()
 block.launch(share=True)
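
For readers following the change, here is a minimal, self-contained sketch (not part of the commit) of the three-step event chain it introduces: generate_story fills a hidden gr.State with the story text, generate_base turns it into per-sentence audio chunks, and stream_audio yields (sampling_rate, ndarray) tuples to the streaming gr.Audio component. The LLM and TTS calls are replaced with dummy stand-ins, and SAMPLING_RATE plus the chunking logic are illustrative assumptions only.

    # Sketch only: dummy generators stand in for the chat-completion and TTS calls.
    import numpy as np
    import gradio as gr

    SAMPLING_RATE = 16000  # assumption: the real app reads this from the model config


    def generate_story(subject, setting):
        # Stand-in for client.chat_completion(); the story lands in a hidden State.
        story = f"A short story about a {subject} in the {setting}. The end."
        return None, None, story


    def generate_base(story):
        # Stand-in for the TTS generate() call: one silent chunk per sentence.
        sentences = [s for s in story.split(".") if s.strip()]
        chunks = [np.zeros(SAMPLING_RATE, dtype=np.int16) for _ in sentences]
        return None, None, chunks


    def stream_audio(hidden_story, chunks):
        # Yield (sampling_rate, ndarray) tuples so the streaming gr.Audio plays them in order.
        for chunk in chunks:
            yield hidden_story, (SAMPLING_RATE, chunk)


    with gr.Blocks() as block:
        subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess"], label="Subject")
        setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom"], label="Setting")
        run_button = gr.Button("Generate Story", variant="primary")
        audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True, format="wav")
        story = gr.Textbox(label="Story")
        state = gr.State()
        hidden_story = gr.State()
        run_button.click(
            generate_story, inputs=[subject, setting], outputs=[story, audio_out, hidden_story]
        ).success(
            generate_base, inputs=hidden_story, outputs=[story, audio_out, state]
        ).success(
            stream_audio, inputs=[hidden_story, state], outputs=[story, audio_out]
        )

    block.queue()
    block.launch()

The chained .success() calls mirror the new run_button.click(...).success(...).success(...) wiring in the diff above; yielding raw (sampling_rate, array) tuples is what lets the streaming Audio component begin playback chunk by chunk.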