Sourikta commited on
Commit
a8b76e1
·
verified ·
1 Parent(s): 1181c71

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +417 -0
app.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import spaces
4
+ import json
5
+ import re
6
+ import random
7
+ import numpy as np
8
+ from gradio_client import Client, handle_file
9
+ hf_token = os.environ.get("HF_TOKEN")
10
+
11
+ MAX_SEED = np.iinfo(np.int32).max
12
+
13
def check_api(model_name):
    """Probe the Hugging Face Space backing `model_name` and report reachability.

    Parameters
    ----------
    model_name : str
        One of the model labels offered in the UI dropdown.

    Returns
    -------
    str or None
        "api ready" if a gradio client could be built, "api not ready yet" on
        any failure, or None for an unknown model name (preserving the original
        fall-through behaviour).
    """
    # Space id per model, plus whether the Space is private (needs the HF token).
    spaces_by_model = {
        "MAGNet": ("fffiloni/MAGNet", False),
        "AudioLDM-2": ("fffiloni/audioldm2-text2audio-text2music-API", True),
        "Riffusion": ("fffiloni/spectrogram-to-music", False),
        "Mustango": ("fffiloni/mustango-API", True),
        "MusicGen": ("https://facebook-musicgen.hf.space/", False),
        "Stable Audio Open": ("fffiloni/Stable-Audio-A10", True) if False else ("fffiloni/Stable-Audio-Open-A10", True),
    }
    entry = spaces_by_model.get(model_name)
    if entry is None:
        return None
    space_id, needs_token = entry
    try:
        # Building the client fails when the Space is asleep or unreachable.
        # Passing hf_token=None is equivalent to omitting it for public Spaces.
        Client(space_id, hf_token=hf_token if needs_token else None)
        return "api ready"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return "api not ready yet"
50
+
51
+
52
+ from moviepy.editor import VideoFileClip
53
+ from moviepy.audio.AudioClip import AudioClip
54
+
55
def extract_audio(video_in, output_audio='audio.wav'):
    """Extract the audio track of a video file to a WAV file.

    Parameters
    ----------
    video_in : str
        Path to the input video file.
    output_audio : str
        Destination .wav path. Defaults to 'audio.wav', the name the original
        hard-coded, so existing callers are unaffected.

    Returns
    -------
    str
        Path of the written .wav file.
    """
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        # 44100 Hz is the conventional sample rate for .wav output.
        audio_clip.write_audiofile(output_audio, fps=44100)
        print("Audio extraction complete.")
    finally:
        # Close the clip to release the underlying reader processes/file handles
        # (the original leaked them on every call).
        video_clip.close()
    return output_audio
68
+
69
+
70
+
71
def get_caption(image_in):
    """Caption an image with the Kosmos-2 Space and return the caption text.

    Parameters
    ----------
    image_in : str
        Filepath of the image to caption.

    Returns
    -------
    str
        The caption assembled from the Kosmos-2 token stream.
    """
    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    kosmos2_result = kosmos2_client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions"
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result holds the token list; the first token
    # is skipped and the rest are concatenated back into a sentence.
    data = kosmos2_result[1]
    sentence = ''.join(item['token'] for item in data[1:])

    return sentence
95
+
96
def get_caption_from_MD(image_in):
    """Describe an image via the moondream1 Space and return its answer.

    Alternative captioner to `get_caption` (currently unused by `infer`).
    """
    md_client = Client("https://vikhyatk-moondream1.hf.space/")
    answer = md_client.predict(
        image_in,                         # filepath for the 'image' Image component
        "Describe precisely the image.",  # question for the 'Question' Textbox component
        api_name="/answer_question"
    )
    print(answer)
    return answer
105
+
106
def get_magnet(prompt):
    """Generate music from `prompt` with the MAGNet Space; return the audio path."""
    magnet_client = Client("fffiloni/MAGNet")
    prediction = magnet_client.predict(
        model="facebook/magnet-small-10secs",   # smallest/fastest 10s checkpoint
        model_path="",                          # no custom model
        text=prompt,
        temperature=3,
        topp=0.9,
        max_cfg_coef=10,
        min_cfg_coef=1,
        decoding_steps1=20,
        decoding_steps2=10,
        decoding_steps3=10,
        decoding_steps4=10,
        span_score="prod-stride1 (new!)",
        api_name="/predict_full"
    )
    print(prediction)
    # The Space returns a tuple; index 1 carries the generated audio.
    return prediction[1]
126
+
127
def get_audioldm(prompt):
    """Generate a 10-second audio clip from `prompt` with AudioLDM-2."""
    audioldm_client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
    random_seed = random.randint(0, MAX_SEED)
    prediction = audioldm_client.predict(
        text=prompt,
        negative_prompt="Low quality.",
        duration=10,            # seconds, within the Space's 5-15 range
        guidance_scale=6.5,
        random_seed=random_seed,
        n_candidates=3,         # waveforms generated per call (1-5)
        api_name="/text2audio"
    )
    print(prediction)
    return prediction
142
+
143
def get_riffusion(prompt):
    """Generate music from `prompt` with the Riffusion spectrogram Space."""
    riffusion_client = Client("fffiloni/spectrogram-to-music")
    prediction = riffusion_client.predict(
        prompt=prompt,
        negative_prompt="",
        audio_input=None,   # no audio conditioning
        duration=10,        # seconds (5-10 supported by the Space)
        api_name="/predict"
    )
    print(prediction)
    # Index 1 of the returned tuple is the generated audio.
    return prediction[1]
154
+
155
def get_mustango(prompt):
    """Generate music from `prompt` with the Mustango Space."""
    mustango_client = Client("fffiloni/mustango-API", hf_token=hf_token)
    prediction = mustango_client.predict(
        prompt=prompt,
        steps=200,      # diffusion steps (100-200)
        guidance=6,     # guidance scale (1-10)
        api_name="/predict"
    )
    print(prediction)
    return prediction
165
+
166
def get_musicgen(prompt):
    """Generate music from `prompt` with the facebook/musicgen Space."""
    musicgen_client = Client("https://facebook-musicgen.hf.space/")
    prediction = musicgen_client.predict(
        prompt,     # description for the 'Describe your music' Textbox
        None,       # no melody-conditioning audio file
        fn_index=0  # legacy positional endpoint on this Space
    )
    print(prediction)
    # Index 1 of the returned tuple is the generated audio.
    return prediction[1]
175
+
176
def get_stable_audio_open(prompt):
    """Generate a 10-second clip from `prompt` with Stable Audio Open."""
    sao_client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
    prediction = sao_client.predict(
        prompt=prompt,
        seconds_total=10,
        steps=100,
        cfg_scale=7,
        api_name="/predict"
    )
    print(prediction)
    return prediction
187
+
188
import re       # NOTE(review): duplicate of the top-of-file import; harmless
import torch
from transformers import pipeline

# Candidate instruction-following models for prompt rewriting.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # NOTE(review): unused — kept for reference only

# Loaded once at import time; bfloat16 + device_map="auto" lets transformers
# place the model on the available accelerator(s).
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
196
+
197
# System prompt fed to the LLM: rewrite an image caption into one short
# ready-to-use musical prompt. NOTE(review): the f-prefix is unnecessary
# (no placeholders) but harmless; "succintly" typo kept — this is a runtime
# string and changing it would alter the LLM input.
standard_sys = f"""
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model.
For example, if a user says, "a picture of a man in a black suit and tie riding a black dragon", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"
"""

# Mustango-specific variant that additionally demands a chord progression.
# NOTE(review): currently unused — get_musical_prompt always picks standard_sys.
mustango_sys = f"""
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model, you MUST include chords progression.
For example, if a user says, "a painting of three old women having tea party", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"The song is an instrumental. The song is in medium tempo with a classical guitar playing a lilting melody in accompaniment style. The song is emotional and romantic. The song is a romantic instrumental song. The chord sequence is Gm, F6, Ebm. The time signature is 4/4. This song is in Adagio. The key of this song is G minor."
"""
212
+
213
@spaces.GPU(enable_queue=True)
def get_musical_prompt(user_prompt, chosen_model):
    """Turn an image caption into a short musical prompt via the zephyr pipeline.

    Parameters
    ----------
    user_prompt : str
        The image caption to rewrite.
    chosen_model : str
        Currently unused: every model gets `standard_sys` (a Mustango-specific
        system prompt exists but its selection was disabled upstream).

    Returns
    -------
    str
        The generated musical prompt with leading newlines stripped.
    """
    agent_maker_sys = standard_sys

    instruction = f"""
<|system|>
{agent_maker_sys}</s>
<|user|>
"""

    prompt = f"{instruction.strip()}\n{user_prompt}</s>"
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    # Drop everything between the system tag and the assistant tag so only the
    # freshly generated text remains.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)

    print(f"SUGGESTED Musical prompt: {cleaned_text}")
    return cleaned_text.lstrip("\n")
237
+
238
def infer(image_in, chosen_model, api_status):
    """Full pipeline: caption the image, build a musical prompt, generate music.

    Parameters
    ----------
    image_in : str or None
        Filepath of the reference image.
    chosen_model : str or None
        Dropdown value naming the music model.
    api_status : str
        Latest `check_api` result for the chosen model.

    Returns
    -------
    tuple
        (caption textbox update, retry-button visibility update, audio output).

    Raises
    ------
    gr.Error
        On missing image, missing/unknown model, or an unreachable API.
    """
    if image_in is None:  # was `== None`; identity test is the correct idiom
        raise gr.Error("Please provide an image input")

    # Was `chosen_model == []`, which missed None/"" — the dropdown's actual
    # empty values. Truthiness covers all of them.
    if not chosen_model:
        raise gr.Error("Please pick a model")

    if api_status == "api not ready yet":
        raise gr.Error("This model is not ready yet, you can pick another one instead :)")

    gr.Info("Getting image caption with Kosmos-2...")
    user_prompt = get_caption(image_in)
    #user_prompt = get_caption_from_MD(image_in)

    gr.Info("Building a musical prompt according to the image caption ...")
    musical_prompt = get_musical_prompt(user_prompt, chosen_model)

    # Dispatch table replaces the six-way if/elif chain.
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
        "Stable Audio Open": get_stable_audio_open,
    }
    generate = generators.get(chosen_model)
    if generate is None:
        # Original fell through with `music_o` unbound, crashing with NameError;
        # surface an explicit UI error instead.
        raise gr.Error(f"Unknown model: {chosen_model}")
    gr.Info(f"Now calling {chosen_model} for music...")
    music_o = generate(musical_prompt)

    return gr.update(value=musical_prompt, interactive=True), gr.update(visible=True), music_o
275
+
276
def retry(chosen_model, caption):
    """Regenerate music from a (possibly user-edited) prompt, skipping captioning.

    Parameters
    ----------
    chosen_model : str
        Dropdown value naming the music model.
    caption : str
        The musical prompt to reuse (the editable textbox content).

    Returns
    -------
    The chosen generator's audio output, or None for an unknown model
    (preserving the original fall-through behaviour).
    """
    musical_prompt = caption

    # Dispatch table replaces the six-way if/elif chain duplicated from infer().
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
        "Stable Audio Open": get_stable_audio_open,
    }
    generate = generators.get(chosen_model)
    if generate is None:
        return None
    gr.Info(f"Now calling {chosen_model} for music...")
    return generate(musical_prompt)
300
+
301
demo_title = "Image to Music V2"
description = "Get music from a picture, compare text-to-music models"

# Page-level styling: centered column and a larger font for the prompt box.
css = """
#col-container {
    margin: 0 auto;
    max-width: 980px;
    text-align: left;
}
#inspi-prompt textarea {
    font-size: 20px;
    line-height: 24px;
    font-weight: 600;
}
"""

with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):

        gr.HTML(f"""
        <h2 style="text-align: center;">{demo_title}</h2>
        <p style="text-align: center;">{description}</p>
        """)

        with gr.Row():

            # Left column: inputs (image, model choice, API status, submit).
            with gr.Column():
                image_in = gr.Image(
                    label = "Image reference",
                    type = "filepath",
                    elem_id = "image-in"
                )

                with gr.Row():

                    chosen_model = gr.Dropdown(
                        label = "Choose a model",
                        choices = [
                            #"MAGNet",
                            "AudioLDM-2",
                            "Riffusion",
                            "Mustango",
                            #"MusicGen",
                            "Stable Audio Open"
                        ],
                        value = None,
                        filterable = False
                    )

                    # Filled by check_api whenever the dropdown changes.
                    check_status = gr.Textbox(
                        label="API status",
                        interactive=False
                    )

                submit_btn = gr.Button("Make music from my pic !")

                gr.Examples(
                    examples = [
                        ["examples/ocean_poet.jpeg"],
                        ["examples/jasper_horace.jpeg"],
                        ["examples/summer.jpeg"],
                        ["examples/mona_diner.png"],
                        ["examples/monalisa.png"],
                        ["examples/santa.png"],
                        ["examples/winter_hiking.png"],
                        ["examples/teatime.jpeg"],
                        ["examples/news_experts.jpeg"]
                    ],
                    fn = infer,
                    inputs = [image_in, chosen_model],
                    examples_per_page = 4
                )

            # Right column: outputs (editable prompt, retry button, audio).
            with gr.Column():

                caption = gr.Textbox(
                    label = "Inspirational musical prompt",
                    interactive = False,
                    elem_id = "inspi-prompt"
                )

                # Hidden until a first generation succeeds (see infer's outputs).
                retry_btn = gr.Button("Retry with edited prompt", visible=False)

                result = gr.Audio(
                    label = "Music"
                )

    # Probe the selected model's API immediately, bypassing the queue.
    chosen_model.change(
        fn = check_api,
        inputs = chosen_model,
        outputs = check_status,
        queue = False
    )

    # Regenerate music from the (possibly edited) prompt without re-captioning.
    retry_btn.click(
        fn = retry,
        inputs = [chosen_model, caption],
        outputs = [result]
    )

    # Full pipeline: caption -> musical prompt -> music generation.
    submit_btn.click(
        fn = infer,
        inputs = [
            image_in,
            chosen_model,
            check_status
        ],
        outputs =[
            caption,
            retry_btn,
            result
        ]
    )

demo.queue(max_size=16).launch(show_api=False, show_error=True)