LLMproj1 committed on
Commit 81aaffe · verified · 1 Parent(s): df25ad5

Upload folder using huggingface_hub

Files changed (1)
  1. app.py +77 -0
app.py ADDED
@@ -0,0 +1,77 @@
+ # -*- coding: utf-8 -*-
+ """Untitled18.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1_vTVH3hBX8wVXIgrW1T2Q4N1DSkWoXV8
+ """
+
+ import gradio as gr
+ import torch
+ from transformers import TextStreamer
+ from unsloth import FastLanguageModel
+
+ # Required packages: unsloth, transformers, torch, gradio
+
+ # Model parameters
+ max_seq_length = 2048  # Choose any length; RoPE scaling is supported internally.
+ dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+.
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.
+
+ # Load the fine-tuned model and tokenizer
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="lora_model",  # the LoRA adapter saved during training
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+ )
+ FastLanguageModel.for_inference(model)  # enable native 2x faster inference
+
+ # Alpaca-style prompt template
+ alpaca_prompt = """
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+ # Generate a response for a single user message
+ def chat_alpaca(message: str, history: list, temperature: float, max_new_tokens: int) -> str:
+     prompt = alpaca_prompt.format(message, "")
+     inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
+
+     # Stream tokens to stdout while generating
+     text_streamer = TextStreamer(tokenizer)
+
+     # Generate the response; do_sample=True so the temperature setting takes effect
+     outputs = model.generate(
+         **inputs,
+         streamer=text_streamer,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         do_sample=True,
+     )
+
+     # Decode only the newly generated tokens, not the echoed prompt
+     generated = outputs[0][inputs["input_ids"].shape[-1]:]
+     return tokenizer.decode(generated, skip_special_tokens=True)
+
+ # Response function for the Gradio interface
+ # (system_message and top_p are exposed in the UI but not yet used by chat_alpaca)
+ def respond(message, history, system_message, max_new_tokens, temperature, top_p):
+     return chat_alpaca(message, history, temperature, max_new_tokens)
+
+ # Create the Gradio chat interface
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+     ],
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
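For a quick smoke test of the generation path without opening the browser UI, the function can be called directly from a Python session. This is a minimal sketch, not part of the commit: it assumes the file above is saved as app.py, that a CUDA GPU is available, and that the "lora_model" adapter directory sits alongside it; importing the module loads the model but does not start the interface, thanks to the __main__ guard.

import app  # module import loads the model and builds the Gradio demo

# Hypothetical one-off query: empty chat history, moderate temperature, short completion
reply = app.chat_alpaca("What is the capital of France?", history=[], temperature=0.7, max_new_tokens=64)
print(reply)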