freddyaboulton (HF Staff) committed
Commit b4e6550 · 1 Parent(s): bb8f724
Files changed (1):
  1. app.py +19 -22

app.py CHANGED
@@ -40,7 +40,7 @@ SEED = 42
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
+        max_val = np.max(np.abs(audio_array)) + 1
         audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
         audio_array = audio_array.astype(np.int16)
 
@@ -66,19 +66,21 @@ sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
 
-
-@spaces.GPU
-def generate_base(subject, setting):
-
+def generate_story(subject: str, setting: str) -> str:
     messages = [{"role": "sytem", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
                                               "You want to write a bed time story for your child. They will give you the subject and setting "
                                               "and you will write the entire story. It should be targetted at children 5 and younger and take about "
                                               "a minute to read")},
                 {"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"}]
-    gr.Info("Generating story", duration=3)
     response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
     gr.Info("Story Generated", duration=3)
     story = response.choices[0].message.content
+    return None, None, story
+
+
+@spaces.GPU
+def generate_base(story):
+
 
     model_input = story.replace("\n", " ").strip()
     model_input_tokens = nltk.sent_tokenize(model_input)
@@ -86,7 +88,7 @@ def generate_base(subject, setting):
     play_steps_in_s = 4.0
     play_steps = int(frame_rate * play_steps_in_s)
 
-    gr.Info("Generating Audio")
+    gr.Info("Generating Audio", duration=3)
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
     story_tokens = prompt_tokenizer(model_input_tokens, return_tensors="pt", padding=True).to(device)
     description_tokens = description_tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").to(device)
@@ -95,22 +97,16 @@ def generate_base(subject, setting):
                                   attention_mask=description_tokens.attention_mask,
                                   prompt_attention_mask=story_tokens.attention_mask)
     speech_output = [output.cpu().numpy() for output in speech_output]
-    gr.Info("Generated Audio")
-    return None, None, {"audio": speech_output, "text": model_input_tokens}
+    return None, None, speech_output
+
 
-import time
-def stream_audio(state):
-    speech_output = state["audio"]
-    sentences = state["text"]
+def stream_audio(hidden_story, speech_output):
 
     gr.Info("Reading Story")
 
-    story = ""
-    for sentence, new_audio in zip(sentences, speech_output):
+    for new_audio in speech_output:
        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-        story += f"{sentence}\n"
-        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
-        time.sleep(5)
+        yield hidden_story, (sampling_rate, new_audio) #numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
 
 with gr.Blocks() as block:
@@ -122,19 +118,20 @@ with gr.Blocks() as block:
     )
     with gr.Group():
         with gr.Row():
-            subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"])
-            setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"])
+            subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"], label="Subject")
+            setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"], label="Setting")
        with gr.Row():
            run_button = gr.Button("Generate Story", variant="primary")
    with gr.Row():
        with gr.Group():
-            audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True)
+            audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True, format="wav")
            story = gr.Textbox(label="Story")
 
    inputs = [subject, setting]
    outputs = [story, audio_out]
    state = gr.State()
-    run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
+    hidden_story = gr.State()
+    run_button.click(generate_story, inputs=inputs, outputs=[story, audio_out, hidden_story]).success(fn=generate_base, inputs=hidden_story, outputs=[story, audio_out, state]).success(stream_audio, inputs=[hidden_story, state], outputs=[story, audio_out])
 
 block.queue()
 block.launch(share=True)
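
For readers following the change, here is a minimal, self-contained sketch (not part of the commit) of the three-step event chain it introduces: generate_story fills a hidden gr.State with the story text, generate_base turns it into per-sentence audio chunks, and stream_audio yields (sampling_rate, ndarray) tuples to the streaming gr.Audio component. The LLM and TTS calls are replaced with dummy stand-ins, and SAMPLING_RATE plus the chunking logic are illustrative assumptions only.

    # Sketch only: dummy generators stand in for the chat-completion and TTS calls.
    import numpy as np
    import gradio as gr

    SAMPLING_RATE = 16000  # assumption: the real app reads this from the model config


    def generate_story(subject, setting):
        # Stand-in for client.chat_completion(); the story lands in a hidden State.
        story = f"A short story about a {subject} in the {setting}. The end."
        return None, None, story


    def generate_base(story):
        # Stand-in for the TTS generate() call: one silent chunk per sentence.
        sentences = [s for s in story.split(".") if s.strip()]
        chunks = [np.zeros(SAMPLING_RATE, dtype=np.int16) for _ in sentences]
        return None, None, chunks


    def stream_audio(hidden_story, chunks):
        # Yield (sampling_rate, ndarray) tuples so the streaming gr.Audio plays them in order.
        for chunk in chunks:
            yield hidden_story, (SAMPLING_RATE, chunk)


    with gr.Blocks() as block:
        subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess"], label="Subject")
        setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom"], label="Setting")
        run_button = gr.Button("Generate Story", variant="primary")
        audio_out = gr.Audio(label="Bed time story", streaming=True, autoplay=True, format="wav")
        story = gr.Textbox(label="Story")
        state = gr.State()
        hidden_story = gr.State()
        run_button.click(
            generate_story, inputs=[subject, setting], outputs=[story, audio_out, hidden_story]
        ).success(
            generate_base, inputs=hidden_story, outputs=[story, audio_out, state]
        ).success(
            stream_audio, inputs=[hidden_story, state], outputs=[story, audio_out]
        )

    block.queue()
    block.launch()

The chained .success() calls mirror the new run_button.click(...).success(...).success(...) wiring in the diff above; yielding raw (sampling_rate, array) tuples is what lets the streaming Audio component begin playback chunk by chunk.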