import gradio as gr
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download

def run_command(command, cwd=None):
    """运行系统命令"""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"命令执行失败: {command}")
        print(f"错误信息: {result.stderr}")
        exit(result.returncode)
    else:
        print(f"命令执行成功: {command}")
        print(result.stdout)

def setup_llama_cpp():
    """克隆并编译llama.cpp仓库"""
    run_command('pip install llama-cpp-python')
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')
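# Note: the CMake build above is what produces ./llama.cpp/build/bin/llama-quantize,
# which setup_model() below relies on for Q2_K quantization.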

def setup_model(model_id):
    """Download the model, convert it to GGUF, and return the path to the quantized file."""
    local_dir = model_id.split('/')[-1]
    # Download the Hugging Face checkpoint into a local directory named after the repo.
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    # Convert the checkpoint to a GGUF file with llama.cpp's conversion script.
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
    # Quantize the GGUF file to Q2_K to reduce its size and memory footprint.
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K')
    return quantized_path
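# Example (for illustration): setup_model("ibm-granite/granite-3.1-2b-instruct") downloads into
# ./granite-3.1-2b-instruct, converts it to granite-3.1-2b-instruct.gguf, and returns
# "granite-3.1-2b-instruct-Q2_K.gguf".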

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """调用Llama模型生成回复"""
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response

if __name__ == "__main__":
    MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
    setup_llama_cpp()
    MODEL_PATH = setup_model(MODEL_ID)
    llm = Llama(
        model_path=MODEL_PATH,
        verbose=False,
        n_threads=4,
        n_ctx=32768
    )
    gr.ChatInterface(
        fn=chat_with_model,
        title="Llama GGUF Chatbot",
        description="使用Llama GGUF量化模型进行推理",
        additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
        additional_inputs=[
            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            gr.Slider(100, 4096, 1000, label="Max Tokens"),
            gr.Slider(1, 100, 40, label="Top K"),
            gr.Slider(0, 1, 0.85, label="Top P"),
        ],
    ).queue().launch()
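# Usage sketch (the entry-point name and URL are assumptions, not stated in this file):
#   python app.py
# On the first run this clones and builds llama.cpp, downloads the model, and converts/quantizes
# it before Gradio serves the chat UI (by default at http://127.0.0.1:7860).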