Commit c2cf399
Parent: fe3096b
Update app.py

app.py CHANGED
@@ -575,20 +575,17 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
     wav_bytestream = (reduced_noise * 32767).astype(np.int16)
     wav_bytestream = wav_bytestream.tobytes()
-
+
+    # Directly encode the WAV bytestream to base64
+    base64_audio = base64.b64encode(wav_bytestream).decode('utf8')
+
     if audio_stream is not None:
-
-
-
-
-
-
-        f.setframerate(24000)
-        f.writeframes(wav_bytestream)
-
-        return (history , gr.Audio(value=audio_unique_filename, autoplay=True))
-    else:
-        return (history , gr.Audio(value=wav_bytestream, autoplay=True))
+        return (history, base64_audio)
+    else:
+        # Handle the case where the audio stream is None (e.g., silent response)
+        return (history, None)
+
+
 except RuntimeError as e:
     if "device-side assert" in str(e):
         # cannot do anything on cuda device side error, need tor estart
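Note that in the new path, wav_bytestream is raw int16 PCM at 24000 Hz: the wave-module calls that previously wrote a WAV header (f.setframerate, f.writeframes) are removed, so base64_audio encodes headerless samples rather than a complete .wav file. Below is a minimal sketch of how a consumer could wrap the decoded bytes back into a playable WAV container; it assumes mono audio and reuses the 24000 Hz rate and 16-bit width visible in the diff, and the helper name is illustrative, not part of app.py.

import base64
import io
import wave

def pcm_base64_to_wav_bytes(base64_audio: str, sample_rate: int = 24000, channels: int = 1) -> bytes:
    # Decode the base64 string back into the raw int16 PCM produced in app.py.
    pcm_bytes = base64.b64decode(base64_audio)

    # Wrap the raw samples in a WAV container so ordinary players can open them.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)      # mono is an assumption here
        wav_file.setsampwidth(2)             # 2 bytes per sample, matching .astype(np.int16)
        wav_file.setframerate(sample_rate)   # 24000 Hz, matching sr=24000 above
        wav_file.writeframes(pcm_bytes)
    return buffer.getvalue()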
@@ -634,14 +631,12 @@ def generate_story_and_speech(input_text, chatbot_role):
     history_tuples = [tuple(entry) for entry in last_history]

     synthesized_speech = generate_speech_for_sentence(history_tuples, chatbot_role, story_text, return_as_byte=True)
-
-
-
-
-
-
-        speech_audio_base64 = base64.b64encode(speech_audio_bytes).decode('utf8')
-        return {"text": story_text.strip(), "audio": speech_audio_base64}
+
+    if synthesized_speech:
+        # Retrieve the base64 audio string from the tuple
+        base64_audio = synthesized_speech[1]
+        return {"text": story_text.strip(), "audio": base64_audio}
+
     else:
         return {"text": "Failed to generate story (no synthesized speech)", "audio": None}

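With this change, generate_story_and_speech returns a dict holding the story text and the base64 audio string (or None when no speech was synthesized). A hedged usage sketch follows, assuming app.py's functions are in scope and reusing the pcm_base64_to_wav_bytes helper sketched above; the prompt text and the "Pirate" role are placeholder values, not taken from this commit.

# Hypothetical caller: the prompt and chatbot_role below are illustrative placeholders.
result = generate_story_and_speech("Tell me a short story.", "Pirate")

print(result["text"])

if result["audio"] is not None:
    # Decode the base64 PCM and save a playable file using the helper sketched above.
    with open("story.wav", "wb") as out_file:
        out_file.write(pcm_base64_to_wav_bytes(result["audio"]))
else:
    print("No audio was synthesized for this story.")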