import os
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Hugging Face repository IDs
base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
adapter_repo = "Mat17892/llama_lora_gguf"

# Download the base model GGUF file
print("Downloading base model...")
base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")

# Download the LoRA adapter GGUF file
print("Downloading LoRA adapter...")
lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
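# Note: hf_hub_download caches each file in the local Hugging Face cache and
# returns the cached path, so restarting the app does not re-download the GGUF files.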

# Path to the llama-cli binary built from llama.cpp
llama_cli_path = "./llama.cpp/build/bin/llama-cli"
if not os.path.isfile(llama_cli_path):
    raise FileNotFoundError(f"llama-cli not found at {llama_cli_path}; build llama.cpp first.")
if not os.access(llama_cli_path, os.X_OK):  # Make sure the binary is executable
    os.chmod(llama_cli_path, 0o755)

# Function to run `llama-cli` with base model and adapter
def run_llama_cli(prompt):
    print("Running inference with llama-cli...")
    cmd = [
        llama_cli_path,  # Path to the llama-cli executable
        "-c", "2048",    # Context length
        "-cnv",          # Enable conversational mode
        "-m", base_model_path,
        "--lora", lora_adapter_path,
        "--prompt", prompt,
    ]
    try:
        # Feed stdin from /dev/null so conversational mode doesn't wait for terminal input
        process = subprocess.Popen(cmd, stdin=subprocess.DEVNULL,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()

        if process.returncode != 0:
            print("Error during inference:")
            print(stderr.decode())
            return "Error: Could not generate response."

        output = stdout.decode().strip()
        # llama-cli may echo the prompt before the completion; drop it if present
        if output.startswith(prompt):
            output = output[len(prompt):].strip()
        return output
    except Exception as e:
        print(f"Exception occurred: {e}")
        return "Error: Could not generate response."
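
# Quick sanity check (hypothetical; uncomment to smoke-test inference before launching the UI):
# print(run_llama_cli("User: Say hello in one short sentence.\nAI:"))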

# Gradio interface
def chatbot_fn(user_input, chat_history):
    # Build the full chat history as the prompt
    prompt = ""
    for user, ai in chat_history:
        prompt += f"User: {user}\nAI: {ai}\n"
    prompt += f"User: {user_input}\nAI:"  # Add latest user input

    # Generate response using llama-cli
    response = run_llama_cli(prompt)

    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history
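
# Manual check of the chat handler (hypothetical; uncomment to try it without the UI):
# history, _ = chatbot_fn("Hello!", [])
# print(history[-1][1])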

# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
    chatbot = gr.Chatbot(label="Chat with the Model")

    with gr.Row():
        with gr.Column(scale=4):
            user_input = gr.Textbox(label="Your Message", placeholder="Type a message...")
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send")

    chat_history = gr.State([])

    # Link components
    submit_btn.click(
        chatbot_fn,
        inputs=[user_input, chat_history],
        outputs=[chatbot, chat_history],
        show_progress=True,
    )

# Launch the Gradio app
demo.launch()