import gradio as gr
from PIL import Image

from inference.main import MultiModalPhi2

# Load the finetuned Llava-Phi2 checkpoint for CPU inference.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="GunaKoppula/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)
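
# NOTE: the call convention is inferred from its usage in run() below: the
# wrapper is assumed to be callable as multimodal_phi2(text, audio, image)
# and to return the generated response as a plain string.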

def add_content(chatbot, text, image, audio_upload) -> gr.Chatbot:
    """Echo the user's inputs into the chat history before inference runs."""
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    if audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Abort the event chain if no input of any modality was provided.
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot
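
# Gradio's Chatbot renders each history entry as a (user, bot) pair: a plain
# string is shown as text, while a one-element tuple such as (filepath,) is
# rendered as a media attachment (image or audio player). That is why files
# are appended above as ((image,), None) rather than as bare strings.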

def clear_data():
    """Reset all input widgets and the chat history."""
    return {prompt: None, image: None, audio_upload: None, chatbot: []}

def run(history, text, image, audio_upload):
    """Run inference on the collected inputs and append the model's reply."""
    if text in [None, ""]:
        text = None
    audio = audio_upload
    if image is not None:
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)
    history.append((None, outputs))
    return history, None, None, None
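
# A Gradio event handler must return one value per component in its `outputs`
# list; run() therefore returns the updated history plus None for each of the
# three input components, which clears the form after a successful reply.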

with gr.Blocks(theme="upsatwal/mlsc_tiet") as demo:
    with gr.Row():
        gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")
    with gr.Row():
        gr.Markdown(
            """This is a multimodal implementation of the [Phi2](https://huggingface.co/microsoft/phi-2) model. Please find the source code and training details [here](https://github.com/RaviNaik/ERA-CAPSTONE/MultiModalPhi2).

### Details:
1. LLM Backbone: [Phi2](https://huggingface.co/microsoft/phi-2)
2. Vision Tower: [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336)
3. Audio Model: [Whisper Tiny](https://huggingface.co/openai/whisper-tiny)
4. Pretraining Dataset: [LAION-CC-SBU dataset with BLIP captions (200k samples)](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)
5. Finetuning Dataset: [Instruct 150k dataset based on COCO](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K)
6. Finetuned Model: [GunaKoppula/Llava-Phi2](https://huggingface.co/GunaKoppula/Llava-Phi2)
"""
        )
    with gr.Row():
        chatbot = gr.Chatbot(
            avatar_images=("🧑", "🤖"),
            height=550,
        )
    with gr.Row():
        # Left column: the three input widgets.
        with gr.Column(scale=4):
            # gr.Box was removed in Gradio 4; gr.Group is the closest equivalent.
            with gr.Group():
                with gr.Column():
                    with gr.Row():
                        # Text prompt
                        prompt = gr.Textbox(
                            placeholder="Enter Prompt", lines=2, label="Query", value=None
                        )
                    with gr.Row():
                        # Image input, passed to the handlers as a filepath
                        image = gr.Image(type="filepath", value=None)
                    with gr.Row():
                        # Audio input, also passed as a filepath
                        audio_upload = gr.Audio(type="filepath")
        # Right column: action buttons.
        with gr.Column(scale=2):
            with gr.Row():
                submit = gr.Button(value="Submit")
            with gr.Row():
                clear = gr.Button(value="Clear")
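
    # Event chain: add_content() first validates and echoes the inputs into the
    # chat; only if it completes without raising gr.Error does .success()
    # trigger run(), which performs inference and clears the input widgets.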
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload],
        outputs=[chatbot, prompt, image, audio_upload],
    )

    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, chatbot],
    )
demo.launch()