import spaces
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory, BasicChatMessageStore, BasicChatHistoryStrategy
import gradio as gr
from huggingface_hub import hf_hub_download

# Download the GGUF weights from the Hugging Face Hub (cached locally after the first run).
repo_id = "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.3-Q5_K_M-GGUF"
filename = "nqlsg-qwen2.5-14b-megafusion-v9.3-q5_k_m.gguf"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)

# The Llama instance is created lazily inside the GPU-decorated handler and reused across requests.
llm = None


# Request a ZeroGPU slot for up to 90 seconds per generation call.
@spaces.GPU(duration=90)
def generate_response(
    message,
    history: list[dict],
    # Default system prompt (Chinese): "You are the NQLSG-Qwen2.5-14B-MegaFusion model trained by
    # Lunzima. Strictly follow the user's instructions and reply in Markdown. For math problems,
    # derive step by step and verify intermediate results; when writing code, prioritize readability
    # and best practices; for creative writing, use imagination and metaphor."
    system_message: str = "你是Lunzima训练的NQLSG-Qwen2.5-14B-MegaFusion大模型。请严格遵循用户的指示,使用Markdown回复。当处理数学问题时,请分步推导并验证中间结果;编写代码时优先考虑可读性和最佳实践;进行创意写作时发挥想象力和使用隐喻表达。",
    max_tokens: int = 2048,
    temperature: float = 0.6,
    top_k: int = 30,
    top_p: float = 0.9,
    min_p: float = 0.05,
    repeat_penalty: float = 1.2,
):
    try:
        chat_template = MessagesFormatterType.OPEN_CHAT

        global llm

        # Load the model once; subsequent calls reuse the cached instance.
        if llm is None:
            llm = Llama(
                model_path=model_path,
                n_ctx=32768,
                n_batch=512,
                flash_attn=True,
                n_gpu_layers=50,
            )

        provider = LlamaCppPythonProvider(llm)

        # Apply the user-supplied sampling parameters on top of the provider defaults.
        settings = provider.get_provider_default_settings()
        settings.stream = True
        settings.max_tokens = max_tokens
        settings.temperature = temperature
        settings.top_k = top_k
        settings.top_p = top_p
        settings.min_p = min_p
        settings.repeat_penalty = repeat_penalty

        # Keep only the last 20 messages of the conversation in the prompt.
        chat_history_store = BasicChatMessageStore()
        chat_history = BasicChatHistory(
            message_store=chat_history_store,
            chat_history_strategy=BasicChatHistoryStrategy.last_k_messages,
            k=20,
            llm_provider=provider,
        )

        agent = LlamaCppAgent(
            provider,
            system_prompt=system_message,
            chat_history=chat_history,
            predefined_messages_formatter_type=chat_template,
            debug_output=True,
        )

        # Request a streaming generator so tokens can be forwarded to the UI as they arrive.
        stream = agent.get_chat_response(
            message,
            system_prompt=system_message,
            chat_history=chat_history,
            llm_sampling_settings=settings,
            returns_streaming_generator=True,
            print_output=False,
        )

        outputs = ""
        for output in stream:
            outputs += output
            # Yield the accumulated text so the chat window updates incrementally.
            yield outputs

    except Exception as e:
        error_str = f"Error occurred: {e}"
        print(error_str)
        # A ChatInterface generator must yield a plain string (the assistant reply),
        # not a (history, message) tuple.
        yield error_str


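# A minimal sketch of exercising the generator directly (without the Gradio UI), assuming the
# default sampling parameters; the prompt string and the empty history list are illustrative only:
#
# for partial in generate_response("Introduce yourself in three sentences.", history=[]):
#     print(partial)
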
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="NQLSG Qwen2.5 14B MegaFusion v9.3 Playground",
)
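
# The extra parameters of generate_response (system_message, max_tokens, temperature, top_k, top_p,
# min_p, repeat_penalty) keep their defaults above because no additional UI controls are wired in.
# A minimal sketch of exposing them via gr.ChatInterface's additional_inputs argument; the slider
# ranges and the DEFAULT_SYSTEM_PROMPT constant (the default prompt string factored out) are
# illustrative assumptions, not part of the original app:
#
# demo = gr.ChatInterface(
#     generate_response,
#     type="messages",
#     title="NQLSG Qwen2.5 14B MegaFusion v9.3 Playground",
#     additional_inputs=[
#         gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System message"),
#         gr.Slider(1, 8192, value=2048, step=1, label="Max tokens"),
#         gr.Slider(0.1, 2.0, value=0.6, step=0.05, label="Temperature"),
#         gr.Slider(1, 100, value=30, step=1, label="Top-k"),
#         gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p"),
#         gr.Slider(0.0, 1.0, value=0.05, step=0.01, label="Min-p"),
#         gr.Slider(1.0, 2.0, value=1.2, step=0.05, label="Repeat penalty"),
#     ],
# )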


if __name__ == "__main__":
    demo.launch()