import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# List of available models
models = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-1.5B-Instruct"]

# Preload a pipeline per model so switching models does not reload weights
model_pipelines = {
    model: pipeline(task=Tasks.text_generation, model=model) for model in models
}


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    selected_model,
):
    # Pick the preloaded pipeline for the selected model
    pipe = model_pipelines[selected_model]

    # Rebuild the conversation in role/content message format
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    # Run inference, streaming the reply chunk by chunk.
    # NOTE: the sampling keyword names and the OpenAI-style chunk schema
    # (choices[0].delta.content) follow the original snippet; both may vary
    # across ModelScope versions and models, so adjust to your pipeline's output.
    for chunk in pipe(
        messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        token = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
        response += token
        yield response  # yield the partial reply so Gradio streams it live


# Chat interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=768, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Dropdown(choices=models, value=models[0], label="Model"),
    ],
    concurrency_limit=30,  # e.g. handle up to 30 concurrent requests
)

if __name__ == "__main__":
    demo.launch(share=True)
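
# ---------------------------------------------------------------------------
# A minimal sanity check, assuming the `modelscope` and `gradio` packages are
# installed (e.g. `pip install modelscope gradio`): before launching the UI,
# you can call one preloaded pipeline directly to confirm the model downloads
# and generates. The exact output format depends on the model and the
# ModelScope version, so this is a sketch, not a guaranteed schema:
#
#     pipe = model_pipelines[models[0]]
#     print(pipe("Hello! Who are you?"))
# ---------------------------------------------------------------------------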