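"""Gradio chat demo for NQLSG-Qwen2.5-14B-MegaFusion-v9.3 (GGUF).

The model is served with llama-cpp-python through llama-cpp-agent and runs on
a Hugging Face Space GPU that is allocated per request via the `spaces.GPU`
decorator.
"""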
import spaces
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory, BasicChatMessageStore, BasicChatHistoryStrategy
import gradio as gr
from huggingface_hub import hf_hub_download
# Download the GGUF model weights from the Hugging Face Hub
repo_id = "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.3-Q5_K_M-GGUF"
filename = "nqlsg-qwen2.5-14b-megafusion-v9.3-q5_k_m.gguf"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
llm = None
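# The Llama instance is created lazily inside generate_response() so that the
# GPU-backed load happens within the spaces.GPU-allocated call rather than at
# import time.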
@spaces.GPU(duration=90)
def generate_response(
    message,
    history: list[dict],
    system_message: str = "你是Lunzima训练的NQLSG-Qwen2.5-14B-MegaFusion大模型。请严格遵循用户的指示,使用Markdown回复。当处理数学问题时,请分步推导并验证中间结果;编写代码时优先考虑可读性和最佳实践;进行创意写作时发挥想象力和使用隐喻表达。",
    max_tokens: int = 2048,
    temperature: float = 0.6,
    top_k: int = 30,
    top_p: float = 0.9,
    min_p: float = 0.05,
    repeat_penalty: float = 1.2,
):
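    """Stream a chat completion for `message` from the GGUF model.

    The default system prompt (in Chinese) tells the model it is the
    NQLSG-Qwen2.5-14B-MegaFusion model trained by Lunzima, to follow the
    user's instructions strictly and reply in Markdown, to reason step by
    step and verify intermediate results on math problems, to favour
    readability and best practices when writing code, and to use imagination
    and metaphor in creative writing.
    """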
    try:
        # Prompt/message formatter used by llama-cpp-agent for this model.
        chat_template = MessagesFormatterType.OPEN_CHAT
        # Load the model on the first request and reuse it afterwards.
        global llm
        if llm is None:
            llm = Llama(
                model_path=model_path,
                n_ctx=32768,
                n_batch=512,
                flash_attn=True,
                n_gpu_layers=50,
            )
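        # Wrap the llama.cpp model in a llama-cpp-agent provider and copy the
        # UI-supplied sampling parameters into its default settings. Streaming
        # is enabled so tokens can be yielded to Gradio as they are generated.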
        provider = LlamaCppPythonProvider(llm)
        settings = provider.get_provider_default_settings()
        settings.stream = True
        settings.max_tokens = max_tokens
        settings.temperature = temperature
        settings.top_k = top_k
        settings.top_p = top_p
        settings.min_p = min_p
        settings.repeat_penalty = repeat_penalty
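        # Keep a rolling window of the most recent messages (last 20) as
        # context for the agent.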
        chat_history_store = BasicChatMessageStore()
        chat_history = BasicChatHistory(
            message_store=chat_history_store,
            chat_history_strategy=BasicChatHistoryStrategy.last_k_messages,
            k=20,
            llm_provider=provider,
        )
        agent = LlamaCppAgent(
            provider,
            system_prompt=system_message,
            chat_history=chat_history,
            predefined_messages_formatter_type=chat_template,
            debug_output=True,
        )
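        # Request a streaming generator and yield the accumulated text so the
        # Gradio chat window updates incrementally as tokens arrive.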
        stream = agent.get_chat_response(
            message,
            system_prompt=system_message,
            chat_history=chat_history,
            llm_sampling_settings=settings,
            returns_streaming_generator=True,
            print_output=False,
        )
        outputs = ""
        for output in stream:
            outputs += output
            yield outputs
    except Exception as e:
        # Surface the error in the chat window instead of crashing the stream.
        error_str = f"Error occurred: {e}"
        print(error_str)
        yield error_str
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="NQLSG Qwen2.5 14B MegaFusion v9.3 Playground",
)
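# A possible extension (sketch, not wired in): the sampling parameters already
# accepted by generate_response could be exposed in the UI through
# gr.ChatInterface's additional_inputs, e.g.:
#
#     demo = gr.ChatInterface(
#         generate_response,
#         type="messages",
#         title="NQLSG Qwen2.5 14B MegaFusion v9.3 Playground",
#         additional_inputs=[
#             gr.Slider(1, 8192, value=2048, step=1, label="Max tokens"),
#             gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature"),
#         ],
#     )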
if __name__ == "__main__":
    # Launch the Gradio app
    demo.launch()