LPX55 committed
Commit b6ffc30 · verified · 1 parent: 58774f4

Update app.py

Files changed (1):
  1. app.py +31 -72
app.py CHANGED
@@ -1,79 +1,38 @@
-import torch
-import os
+import spaces
 import gradio as gr
-from huggingface_hub import InferenceClient, client
-# Use a pipeline as a high-level helper
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import BitsAndBytesConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
-model_4bit = AutoModelForCausalLM.from_pretrained(
-    "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit",
-    quantization_config=quantization_config,
-    torch_dtype="auto"
-)
-# pipe = pipeline("image-text-to-text", model="")
-# pipe(messages)
-
-client = client(model_4bit)
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [
-        {"role": "user", "content": "Who are you?"},
-    ]
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 
+model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"
+
+@spaces.GPU(duration=180)
+def load_model():
+    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    return model, tokenizer
+
+@spaces.GPU
+def generate_text(prompt, model, tokenizer):
+    inputs = tokenizer(prompt, return_tensors="pt")
+    outputs = model.generate(**inputs, max_length=100)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+def gradio_interface():
+    model, tokenizer = load_model()
+
+    def wrapped_generate(prompt):
+        return generate_text(prompt, model, tokenizer)
+
+    iface = gr.Interface(
+        fn=wrapped_generate,
+        inputs="text",
+        outputs="text",
+        title="Meta-Llama 4 Scout 17B Instruct 4bit bnb"
+    )
+    return iface
 
 if __name__ == "__main__":
-    demo.launch()
+    demo = gradio_interface()
+    demo.launch()
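
Note: as committed, the new app.py still references torch.bfloat16 in the BitsAndBytesConfig call, but the commit also removes the import torch line, so the Space will fail with a NameError at import time. The sketch below is a hedged correction, not the committed code: it restores the torch import, hoists the (already bnb-4bit) model load to module level so it runs once rather than inside a @spaces.GPU(duration=180) call that holds a GPU slot just to load weights, and swaps max_length=100 for max_new_tokens=100 so the prompt length does not eat into the generation budget. device_map="auto" is an assumption added here; the commit does not set it.

import spaces
import torch  # restored: the commit drops this import but still uses torch.bfloat16
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit loading with bfloat16 compute, as in the commit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit"

# Load once at import time so every request reuses the same weights.
# device_map="auto" is an assumption (not in the commit); it lets
# accelerate place the quantized shards.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

@spaces.GPU(duration=180)
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens bounds only the completion; the committed max_length=100
    # also counts the prompt tokens and can silently truncate the output.
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Meta-Llama 4 Scout 17B Instruct 4bit bnb",
)

if __name__ == "__main__":
    demo.launch()

Once the Space is up, the gr.Interface endpoint can be exercised with gradio_client (the Space id below is hypothetical):

from gradio_client import Client

client = Client("LPX55/llama4-scout-4bit")  # hypothetical Space id
print(client.predict("Who are you?", api_name="/predict"))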