# Import necessary libraries
import torch
from unsloth import FastLanguageModel
import gradio as gr

# Load the model and tokenizer
model_name = "BidhanAcharya/FineTunedQWENoncoding"  # Replace with your actual model path
max_seq_length = 512  # Example, adjust according to your model

# Check if a GPU is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
load_in_4bit = torch.cuda.is_available()  # Use 4-bit precision if a GPU is present, otherwise use standard precision

# Load the model and tokenizer with the FastLanguageModel method
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Set the model to inference mode
FastLanguageModel.for_inference(model)

# Move the model to the appropriate device (GPU/CPU).
# 4-bit bitsandbytes models do not support .to() and are already placed on the GPU
# at load time, so only move the model when it was loaded without quantization.
if not load_in_4bit:
    model = model.to(device)

# Define the Alpaca prompt format
alpaca_prompt = "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"

# Gradio function for performing inference
def generate_response(instruction, input_data):
    # Handle case where input data is empty
    if input_data.strip() == "":
        input_data = "No additional input provided."

    # Format the prompt using the instruction and input data
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                instruction,  # user-provided instruction
                input_data,   # optional user input data
                ""            # output (leave blank for generation)
            )
        ],
        return_tensors="pt"
    )

    # Move input tensors to the correct device (GPU/CPU)
    inputs = inputs.to(device)

    # Generate tokens with the model
    generated_tokens = model.generate(**inputs, max_new_tokens=500)

    # Decode only the newly generated tokens (everything after the prompt) into text
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(generated_tokens[0][prompt_length:], skip_special_tokens=True)

    return generated_text

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# FastLanguageModel Inference App")

    instruction_input = gr.Textbox(label="Instruction", placeholder="Enter your instruction here")
    input_data_input = gr.Textbox(label="Input Data (Optional)", placeholder="Enter your input data here (optional)")
    output_text = gr.Textbox(label="Generated Response")

    generate_button = gr.Button("Generate Response")

    # Connect the Gradio button click event to the response generation function
    generate_button.click(
        fn=generate_response,
        inputs=[instruction_input, input_data_input],
        outputs=[output_text]
    )

# Launch the Gradio app
demo.launch()