import json

import gradio as gr
import soundfile as sf
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# -------------------------------
# Model and processor setup
# -------------------------------
model_path = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
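# NOTE: _attn_implementation="eager" is the most portable choice; on CUDA GPUs
# with FlashAttention installed, the Phi-4-multimodal model card suggests
# "flash_attention_2" for faster inference (an optional swap, not required here).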

generation_config = GenerationConfig.from_pretrained(model_path)

# -------------------------------
# Build the prompt for the selected task mode and run model generation
# -------------------------------
def process_task(mode, system_msg, user_msg, image_multi, audio, vs_images, vs_audio):
    """

    根據不同任務模式組合 prompt,並使用 processor 與 model 進行生成

    """
    # -------------------------------
    # Build the prompt and prepare inputs for each mode
    # -------------------------------
    if mode == "Text Chat":
        prompt = f"<|system|>{system_msg}<|end|><|user|>{user_msg}<|end|><|assistant|>"
        inputs = processor(text=prompt, return_tensors='pt').to(model.device)
        
    elif mode == "Tool-enabled Function Calling":
        tools = [{
            "name": "get_weather_updates",
            "description": "Fetches weather updates for a given city using the RapidAPI Weather API.",
            "parameters": {
                "city": {
                    "description": "The name of the city for which to retrieve weather information.",
                    "type": "str",
                    "default": "London"
                }
            }
        }]
        tools_json = json.dumps(tools, ensure_ascii=False)
        prompt = f"<|system|>{system_msg}<|tool|>{tools_json}<|/tool|><|end|><|user|>{user_msg}<|end|><|assistant|>"
        inputs = processor(text=prompt, return_tensors='pt').to(model.device)
        
    elif mode == "Vision-Language":
        # Multi-image upload only (the single-image widget below is disabled);
        # build one <|image_i|> tag per uploaded file.
        if image_multi is not None and len(image_multi) > 0:
            image_tags = ''.join(f"<|image_{i+1}|>" for i in range(len(image_multi)))
            prompt = f"<|user|>{image_tags}{user_msg}<|end|><|assistant|>"
            images = [Image.open(f) for f in image_multi]
            inputs = processor(text=prompt, images=images, return_tensors='pt').to(model.device)
        else:
            return "No image provided."
        
    elif mode == "Speech-Language":
        prompt = f"<|user|><|audio_1|>{user_msg}<|end|><|assistant|>"
        if audio is None:
            return "No audio provided."
        # gr.Audio with the default type="numpy" yields a (sample_rate, data)
        # tuple; otherwise treat the value as a file path.
        if isinstance(audio, tuple):
            sample_rate, audio_data = audio
        else:
            audio_data, sample_rate = sf.read(audio)
        inputs = processor(text=prompt, audios=[(audio_data, sample_rate)], return_tensors='pt').to(model.device)
        
    elif mode == "Vision-Speech":
        prompt = f"<|user|>"
        images = []
        if vs_images is not None and len(vs_images) > 0:
            num = len(vs_images)
            image_tags = ''.join([f"<|image_{i+1}|>" for i in range(num)])
            prompt += image_tags
            for file in vs_images:
                images.append(Image.open(file))
        if vs_audio is None:
            return "No audio provided for vision-speech."
        prompt += "<|audio_1|><|end|><|assistant|>"
        # As above, gr.Audio may pass a (sample_rate, data) tuple or a file path.
        if isinstance(vs_audio, tuple):
            samplerate, audio_data = vs_audio
        else:
            audio_data, samplerate = sf.read(vs_audio)
        inputs = processor(
            text=prompt,
            images=images if images else None,  # avoid passing an empty image list
            audios=[(audio_data, samplerate)],
            return_tensors='pt',
        ).to(model.device)
        
    else:
        return "Invalid mode."

    # -------------------------------
    # Run generation
    # -------------------------------
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        generation_config=generation_config,
    )
    # Strip the prompt tokens so only the newly generated text remains.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response
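
# Minimal smoke test for the text-only path (hypothetical values; uncomment to
# try process_task directly before wiring up the UI):
# print(process_task("Text Chat", "You are a helpful assistant.",
#                    "What is the capital of France?", None, None, None, None))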

# -------------------------------
# Update which input components are visible for the selected task mode
# -------------------------------
def update_visibility(mode):
    # Visibility flags per mode, in output order:
    # (system_msg, user_msg, image_upload_multi, audio_upload, vs_image_upload, vs_audio_upload)
    visibility = {
        "Text Chat":                     (True,  True,  False, False, False, False),
        "Tool-enabled Function Calling": (True,  True,  False, False, False, False),
        "Vision-Language":               (False, True,  True,  False, False, False),
        "Speech-Language":               (False, True,  False, True,  False, False),
        "Vision-Speech":                 (False, False, False, False, True,  True),
    }
    flags = visibility.get(mode)
    if flags is None:
        # Unknown mode: leave all six components unchanged.
        return tuple(gr.update() for _ in range(6))
    return tuple(gr.update(visible=v) for v in flags)

# -------------------------------
# Build the Gradio Blocks interface
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Multi-Modal Prompt Builder & Model Inference")
    # Task mode selector
    mode_radio = gr.Radio(
        choices=["Text Chat", "Vision-Language", "Speech-Language", "Vision-Speech"], #, "Tool-enabled Function Calling"
        label="Select Task Mode",
        value="Text Chat"
    )
    
    # Text inputs (used by Text Chat and Tool-enabled Function Calling)
    system_text = gr.Textbox(label="System Message", value="You are a helpful assistant.", visible=True)
    user_text = gr.Textbox(label="User Message", visible=True)
    
    # Image upload (Vision-Language)
    # image_upload = gr.Image(label="Upload Image (Single)", type="pil", visible=False)
    image_upload_multi = gr.File(label="Upload Image(s) (Multiple)", file_count="multiple", visible=False)
    
    # Audio upload (Speech-Language)
    audio_upload = gr.Audio(label="Upload Audio (wav, mp3, flac)", visible=False)
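    # With the default type="numpy", gr.Audio passes the handler a
    # (sample_rate, data) tuple; process_task accepts either that or a path.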
    
    # Vision-Speech: multi-image upload plus audio upload
    vs_image_upload = gr.File(label="Upload Image(s) for Vision-Speech", file_count="multiple", visible=False)
    vs_audio_upload = gr.Audio(label="Upload Audio for Vision-Speech", visible=False)
    
    # Submit button and result output
    submit_btn = gr.Button("Submit")
    output_text = gr.Textbox(label="Result", lines=6)
    
    # gr.Examples: text examples for some task modes (upload your own images or audio for the rest)
    examples = gr.Examples(
        examples=[
            ["Text Chat", "hi who are you?"],
            # ["Tool-enabled Function Calling", "You are a helpful assistant with some tools.", "What is the weather like in Paris today?"],
            ["Vision-Language", "Describe the image in detail."],
            ["Speech-Language", "Transcribe the audio to text."],
            ["Vision-Speech", ""]
        ],
        inputs=[mode_radio, user_text],
        label="Examples"
    )
    
    # Update component visibility whenever the task mode changes
    mode_radio.change(fn=update_visibility, 
                      inputs=mode_radio, 
                      outputs=[system_text, user_text, image_upload_multi, audio_upload, vs_image_upload, vs_audio_upload])
    
    # On submit, build the prompt from the selected mode and inputs, then generate a response
    submit_btn.click(
        fn=process_task,
        inputs=[mode_radio, system_text, user_text, image_upload_multi, audio_upload, vs_image_upload, vs_audio_upload],
        outputs=output_text
    )

demo.launch()
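
# Optional (standard Gradio launch kwargs): demo.launch(share=True) serves a
# temporary public link; demo.launch(server_name="0.0.0.0") listens on all
# network interfaces.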