ylacombe committed on
Commit 4ae5d02 · 1 Parent(s): e31acce

Update app.py

Files changed (1): app.py +118 -37

app.py CHANGED
@@ -1,10 +1,15 @@
+from huggingface_hub import InferenceClient
 from gradio_client import Client
 import torch
 import nltk # we'll use this to split into sentences
 import numpy as np
 from transformers import BarkModel, AutoProcessor
 nltk.download('punkt')
+
 import gradio as gr
+import os
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
+
 
 def _grab_best_device(use_gpu=True):
     if torch.cuda.device_count() > 0 and use_gpu:
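Note on the unchanged context above: `nltk.download('punkt')` fetches the Punkt models behind `nltk.sent_tokenize`, which the app uses further down to split the generated story into sentences before synthesis. A minimal sketch of that call in isolation (the sample story text is illustrative):

```python
import nltk

nltk.download('punkt')  # Punkt sentence-tokenizer models used by sent_tokenize

story = "Once upon a time, a panda set off on an adventure. A caterpillar came along."
sentences = nltk.sent_tokenize(story)
# ['Once upon a time, a panda set off on an adventure.', 'A caterpillar came along.']
print(sentences)
```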
@@ -15,11 +20,9 @@ def _grab_best_device(use_gpu=True):
 
 device = _grab_best_device()
 
-
-BATCH_SIZE = 8
 SYST_PROMPT="""You're the storyteller, crafting a short tale for young listeners. Please abide by these guidelines:
-- Keep your sentences concise and easy to understand.
-- There should be only the narrator speaking. No dialogues."""
+- Keep your sentences short, concise and easy to understand.
+- There should be only the narrator speaking. If there are dialogues, they should be indirect."""
 
 #story_prompt = "A panda going on an adventure with a caterpillar. This is a story teaching a wonderful life lesson."
 story_prompt = "A princess breaks free from a dragon's grip. This evokes women's empowerment and freedom."
@@ -27,10 +30,24 @@ temperature = 0.9
 top_p = 0.6
 repetition_penalty = 1.2
 
-text_client = Client("https://ysharma-explore-llamav2-with-tgi.hf.space/")
-image_client = Client("prodia/fast-stable-diffusion")
-image_negative_prompt = "ultrarealistic, soft lighting, 8k, ugly"
-image_positive_prompt = ". Cartoon, anime"
+TIMEOUT = int(os.environ.get("TIMEOUT", 45))
+
+temperature = 0.9
+top_p = 0.6
+repetition_penalty = 1.2
+
+
+
+
+# TODO: requirements: accelerate optimum
+
+text_client = InferenceClient(
+    "mistralai/Mistral-7B-Instruct-v0.1",
+    timeout=TIMEOUT,
+)
+image_client = Client("https://openskyml-fast-sdxl-stable-diffusion-xl.hf.space/--replicas/ffe2bn2dk/")
+image_negative_prompt = "ultrarealistic, soft lighting, 8k, ugly, text, blurry"
+image_positive_prompt = ""
 image_seed = 9
 
 processor = AutoProcessor.from_pretrained("suno/bark")
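This hunk swaps the LLaMA-2 Gradio Space for a direct `huggingface_hub.InferenceClient` on Mistral-7B-Instruct and points the image client at a fast-SDXL Space. `gradio_client.Client.submit()` (used later in this commit) is non-blocking and returns a `Job`, so the image renders while Bark synthesizes audio. A minimal sketch of that submit/result pattern; the positional arguments mirror the commit's call, and their labels (steps, guidance, width, height, seed) are my assumption about the Space's API:

```python
from gradio_client import Client

client = Client("https://openskyml-fast-sdxl-stable-diffusion-xl.hf.space/--replicas/ffe2bn2dk/")

# submit() returns immediately with a Job handle; work runs in the background
job = client.submit(
    "A princess breaks free from a dragon's grip.",           # prompt
    "ultrarealistic, soft lighting, 8k, ugly, text, blurry",  # negative prompt
    25,    # assumed: inference steps
    7,     # assumed: guidance scale
    1024,  # assumed: width
    1024,  # assumed: height
    9,     # assumed: seed
    fn_index=0,
)
# ... do other work here ...
img = job.result()  # blocks until the image is ready
```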
@@ -41,29 +58,97 @@ voice_preset = "v2/en_speaker_6"
 
 # convert to bettertransformer
 model = model.to_bettertransformer()
+BATCH_SIZE = 16
 
 # enable CPU offload
 model.enable_cpu_offload()
 
+# MISTRAL ONLY
+default_system_understand_message = (
+    "I understand, I am a Mistral chatbot."
+)
+system_understand_message = os.environ.get(
+    "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
+)
+
+# Mistral formatter
+def format_prompt(message):
+    prompt = (
+        "<s>[INST]" + SYST_PROMPT + "[/INST]" + system_understand_message + "</s>"
+    )
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+
+def generate_story(
+    story_prompt,
+    temperature=0.9,
+    max_new_tokens=1024,
+    top_p=0.95,
+    repetition_penalty=1.0,
+):
+    temperature = float(temperature)
+    if temperature < 1e-2:
+        temperature = 1e-2
+    top_p = float(top_p)
+
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        do_sample=True,
+        seed=42,
+    )
+
+    try:
+        output = text_client.text_generation(
+            format_prompt(story_prompt),
+            **generate_kwargs,
+            details=False,
+            return_full_text=False,
+        )
+    except Exception as e:
+        if "Too Many Requests" in str(e):
+            print("ERROR: Too many requests on mistral client")
+            gr.Warning("Unfortunately Mistral is unable to process")
+            output = "Unfortunately I am not able to process your request now, too many people are asking me!"
+        elif "Model not loaded on the server" in str(e):
+            print("ERROR: Mistral server down")
+            gr.Warning("Unfortunately Mistral LLM is unable to process")
+            output = "Unfortunately I am not able to process your request now, I have a problem with Mistral!"
+        else:
+            print("Unhandled Exception: ", str(e))
+            gr.Warning("Unfortunately Mistral is unable to process")
+            output = "I do not know what happened but I could not understand you."
+        return output
+
+    return output
 
 
 def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
 
 
-    story = text_client.predict(
-        story_prompt,
-        SYST_PROMPT,
-        temperature,
-        4096,
-        temperature,
-        repetition_penalty,
-        api_name="/chat"
-    )
+    story = generate_story(story_prompt)
 
+    print(story)
 
     model_input = story.replace("\n", " ").strip()
     model_input = nltk.sent_tokenize(model_input)
 
+    print("text generated - now calling for image")
+    job_img = image_client.submit(
+        story_prompt+image_positive_prompt, # str in 'parameter_11' Textbox component
+        image_negative_prompt, # str in 'parameter_12' Textbox component
+        25,
+        7,
+        1024,
+        1024,
+        image_seed,
+        fn_index=0,
+    )
+    print("image called - now generating audio")
+
     pieces = []
     for i in range(0, len(model_input), BATCH_SIZE):
         inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
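`format_prompt` above hand-rolls the Mistral-7B-Instruct chat template: the system prompt and a canned acknowledgement form one closed `<s>[INST]...[/INST]...</s>` turn, and the user's story prompt opens the next `[INST]` block for the model to complete. To make the wire format concrete, here is roughly what the client ends up sending (sample values, abbreviated system prompt):

```python
SYST_PROMPT = "You're the storyteller, crafting a short tale for young listeners."
system_understand_message = "I understand, I am a Mistral chatbot."

def format_prompt(message):
    # one completed system turn, then the open user turn
    prompt = "<s>[INST]" + SYST_PROMPT + "[/INST]" + system_understand_message + "</s>"
    prompt += f"[INST] {message} [/INST]"
    return prompt

print(format_prompt("A panda going on an adventure with a caterpillar."))
# <s>[INST]You're the storyteller, crafting a short tale for young listeners.[/INST]I understand, I am a Mistral chatbot.</s>[INST] A panda going on an adventure with a caterpillar. [/INST]
```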
@@ -71,35 +156,26 @@ def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
         if len(inputs) != 0:
             inputs = processor(inputs, voice_preset=voice_preset)
 
-            speech_output = model.generate(**inputs.to(device)).cpu().numpy()
+            speech_output, output_lengths = model.generate(**inputs.to(device), return_output_lengths=True, min_eos_p=0.2)
 
+            speech_output = [output[:length].cpu().numpy() for (output, length) in zip(speech_output, output_lengths)]
+
+            print(f"{i}-th part generated")
             pieces += [*speech_output, silence.copy()]
 
-    #job_img = image_client.submit(
-    #    story_prompt+image_positive_prompt, # str in 'parameter_11' Textbox component
-    #    image_negative_prompt, # str in 'parameter_12' Textbox component
-    #    "absolutereality_v181.safetensors [3d9d4d2b]", # str (Option from: ['absolutereality_V16.safetensors [37db0fc3]', 'absolutereality_v181.safetensors [3d9d4d2b]', 'analog-diffusion-1.0.ckpt [9ca13f02]', 'anythingv3_0-pruned.ckpt [2700c435]', 'anything-v4.5-pruned.ckpt [65745d25]', 'anythingV5_PrtRE.safetensors [893e49b9]', 'AOM3A3_orangemixs.safetensors [9600da17]', 'childrensStories_v13D.safetensors [9dfaabcb]', 'childrensStories_v1SemiReal.safetensors [a1c56dbb]', 'childrensStories_v1ToonAnime.safetensors [2ec7b88b]', 'cyberrealistic_v33.safetensors [82b0d085]', 'deliberate_v2.safetensors [10ec4b29]', 'deliberate_v3.safetensors [afd9d2d4]', 'dreamlike-anime-1.0.safetensors [4520e090]', 'dreamlike-diffusion-1.0.safetensors [5c9fd6e0]', 'dreamlike-photoreal-2.0.safetensors [fdcf65e7]', 'dreamshaper_6BakedVae.safetensors [114c8abb]', 'dreamshaper_7.safetensors [5cf5ae06]', 'dreamshaper_8.safetensors [9d40847d]', 'edgeOfRealism_eorV20.safetensors [3ed5de15]', 'EimisAnimeDiffusion_V1.ckpt [4f828a15]', 'elldreths-vivid-mix.safetensors [342d9d26]', 'epicrealism_naturalSinRC1VAE.safetensors [90a4c676]', 'ICantBelieveItsNotPhotography_seco.safetensors [4e7a3dfd]', 'juggernaut_aftermath.safetensors [5e20c455]', 'lyriel_v16.safetensors [68fceea2]', 'mechamix_v10.safetensors [ee685731]', 'meinamix_meinaV9.safetensors [2ec66ab0]', 'meinamix_meinaV11.safetensors [b56ce717]', 'openjourney_V4.ckpt [ca2f377f]', 'portraitplus_V1.0.safetensors [1400e684]', 'Realistic_Vision_V1.4-pruned-fp16.safetensors [8d21810b]', 'Realistic_Vision_V2.0.safetensors [79587710]', 'Realistic_Vision_V4.0.safetensors [29a7afaa]', 'Realistic_Vision_V5.0.safetensors [614d1063]', 'redshift_diffusion-V10.safetensors [1400e684]', 'revAnimated_v122.safetensors [3f4fefd9]', 'rundiffusionFX25D_v10.safetensors [cd12b0ee]', 'rundiffusionFX_v10.safetensors [cd4e694d]', 'sdv1_4.ckpt [7460a6fa]', 'v1-5-pruned-emaonly.safetensors [d7049739]', 'shoninsBeautiful_v10.safetensors [25d8c546]', 'theallys-mix-ii-churned.safetensors [5d9225a4]', 'timeless-1.0.ckpt [7c4971d4]', 'toonyou_beta6.safetensors [980f6b15]'])
-    #    25,
-    #    "Euler a",
-    #    7,
-    #    512,
-    #    512,
-    #    image_seed,
-    #    "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png,https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png", # str (path to directory with images and a file associating images with captions called captions.json)
-    #    fn_index=0
-    #)
-
+    print("Calling image")
 
-    #img = job_img.result()
+    # TODO: if error catch it
+    img = job_img.result()
 
-    return story, (sampling_rate, np.concatenate(pieces))
+    return story, (sampling_rate, np.concatenate(pieces)), img
 
 
 
 
 # Gradio blocks demo
 with gr.Blocks() as demo_blocks:
-    gr.Markdown("""<h1 align="center">🐶Children story<</h1>""")
+    gr.Markdown("""<h1 align="center">🐶Children story</h1>""")
     gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio (v3.40.0 onwards)🦾! </h3>""")
     with gr.Group():
         with gr.Row():
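The audio hunk above switches `model.generate` to `return_output_lengths=True`: with a batch of sentences, Bark pads every waveform to the longest one, so each clip has to be trimmed back to its reported length before concatenation (and `min_eos_p=0.2` lets generation stop earlier, reducing trailing babble). A minimal sketch of the trim-and-stitch step, with stand-in tensors in place of real model output and a hypothetical quarter-second pause (the real app defines its own `silence`):

```python
import numpy as np
import torch

sampling_rate = 24_000  # Bark outputs 24 kHz audio

# Stand-ins for model.generate(..., return_output_lengths=True) output
speech_output = torch.randn(2, 48_000)           # padded (batch, max_len) waveforms
output_lengths = torch.tensor([36_000, 48_000])  # valid samples per clip

# Cut each padded waveform back to its true length
trimmed = [wav[:length].cpu().numpy() for wav, length in zip(speech_output, output_lengths)]

# Stitch clips with a pause, as the app's `pieces` list does
silence = np.zeros(int(0.25 * sampling_rate))  # hypothetical quarter-second gap
audio = np.concatenate([*trimmed, silence.copy()])
print(audio.shape)  # (90000,)
```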
@@ -114,11 +190,16 @@ with gr.Blocks() as demo_blocks:
 
     with gr.Row():
         btn = gr.Button("Create a story")
-
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_output = gr.Image(elem_id="gallery")
     with gr.Row():
         out_audio = gr.Audio(
             streaming=False, autoplay=True) # needed to stream output audio
         out_text = gr.Text()
-    btn.click(generate_audio_and_image, [inp_text], [out_text, out_audio] ) #[out_audio]) #, out_count])
+    btn.click(generate_audio_and_image, [inp_text], [out_text, out_audio, image_output] ) #[out_audio]) #, out_count])
+
+
 
 demo_blocks.queue().launch(debug=True)
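For reference, the three return values of `generate_audio_and_image` map positionally onto the `outputs` list in `btn.click`: the story string fills `out_text`, the `(sampling_rate, waveform)` tuple fills `out_audio`, and the image fills `image_output`. A minimal self-contained sketch of the same wiring, with a stand-in handler in place of the app's:

```python
import numpy as np
import gradio as gr

def tell_story(prompt):  # stand-in for generate_audio_and_image
    sampling_rate = 24_000
    waveform = np.zeros(sampling_rate)  # one second of silence
    return f"A story about {prompt}.", (sampling_rate, waveform), None  # None leaves the image empty

with gr.Blocks() as demo:
    inp_text = gr.Textbox(label="Story prompt")
    btn = gr.Button("Create a story")
    image_output = gr.Image()
    out_audio = gr.Audio(autoplay=True)
    out_text = gr.Text()
    # outputs are filled in the order the handler returns them
    btn.click(tell_story, [inp_text], [out_text, out_audio, image_output])

demo.queue().launch()
```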
 