Daemontatox commited on
Commit
05fbf52
·
verified ·
1 Parent(s): a777552

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -6,9 +6,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
6
  import gradio as gr
7
  from threading import Thread
8
 
9
- MODEL_LIST = ["CohereForAI/aya-expanse-8b"]
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
- MODEL = "CohereForAI/aya-expanse-8b"
12
 
13
  TITLE = "<h1><center>Mawred T2 Wip </center></h1>"
14
 
@@ -34,20 +34,20 @@ h3 {
34
  device = "cuda" # for GPU usage or "cpu" for CPU usage
35
 
36
  quantization_config = BitsAndBytesConfig(
37
- load_in_4bit=True,
38
- bnb_4bit_compute_dtype=torch.bfloat16,
39
- bnb_4bit_use_double_quant=True,
40
- bnb_4bit_quant_type= "nf4")
41
 
42
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
43
  model = AutoModelForCausalLM.from_pretrained(
44
  MODEL,
45
  torch_dtype=torch.bfloat16,
46
  device_map="auto",
47
- # quantization_config=quantization_config
48
  )
49
 
50
- @spaces.GPU()
51
  def stream_chat(
52
  message: str,
53
  history: list,
 
6
  import gradio as gr
7
  from threading import Thread
8
 
9
+ MODEL_LIST = ["CohereForAI/aya-expanse-32b"]
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
+ MODEL = "CohereForAI/aya-expanse-32b"
12
 
13
  TITLE = "<h1><center>Mawred T2 Wip </center></h1>"
14
 
 
34
  device = "cuda" # for GPU usage or "cpu" for CPU usage
35
 
36
  quantization_config = BitsAndBytesConfig(
37
+ load_in_8bit=True,
38
+ llm_int8_threshold=6.0)
41
 
42
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
43
  model = AutoModelForCausalLM.from_pretrained(
44
  MODEL,
45
  torch_dtype=torch.bfloat16,
46
  device_map="auto",
47
+ quantization_config=quantization_config
48
  )
49
 
50
+ @spaces.GPU(660)
51
  def stream_chat(
52
  message: str,
53
  history: list,