rajsecrets0 committed on
Commit
570c187
·
verified ·
1 Parent(s): 040f3b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -10
app.py CHANGED
@@ -18,6 +18,18 @@ if hf_token is None:
18
  st.error("Missing Hugging Face token. Please set HF_TOKEN in your Space secrets.")
19
  st.stop()
20
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # ---------------------------
22
  # Configure your LLM and embeddings
23
  # ---------------------------
@@ -27,11 +39,10 @@ accurately as possible based on the instructions and context provided.
27
  """
28
  query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
29
 
30
- # Configure BitsAndBytes for quantization
31
- quantization_config = BitsAndBytesConfig(
32
- load_in_8bit=True,
33
- bnb_4bit_compute_dtype=torch.float16
34
- )
35
 
36
  # Initialize the HuggingFaceLLM with your model settings and authentication token
37
  llm = HuggingFaceLLM(
@@ -43,11 +54,7 @@ llm = HuggingFaceLLM(
43
  tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
44
  model_name="meta-llama/Llama-2-7b-chat-hf",
45
  device_map="auto",
46
- model_kwargs={
47
- "torch_dtype": torch.float16,
48
- "quantization_config": quantization_config,
49
- "use_auth_token": hf_token # Pass the HF token for gated access
50
- }
51
  )
52
 
53
  # Set up the embedding model using Langchain's HuggingFaceEmbeddings
 
18
  st.error("Missing Hugging Face token. Please set HF_TOKEN in your Space secrets.")
19
  st.stop()
20
 
21
+ # ---------------------------
22
+ # Configure BitsAndBytes Quantization (only if GPU is available)
23
+ # ---------------------------
24
+ if torch.cuda.is_available():
25
+ quantization_config = BitsAndBytesConfig(
26
+ load_in_8bit=True,
27
+ bnb_4bit_compute_dtype=torch.float16
28
+ )
29
+ else:
30
+ # If no GPU is available, disable bitsandbytes quantization
31
+ quantization_config = None
32
+
33
  # ---------------------------
34
  # Configure your LLM and embeddings
35
  # ---------------------------
 
39
  """
40
  query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
41
 
42
+ # Prepare model_kwargs based on whether quantization is enabled
43
+ model_kwargs = {"torch_dtype": torch.float16}
44
+ if quantization_config is not None:
45
+ model_kwargs["quantization_config"] = quantization_config
 
46
 
47
  # Initialize the HuggingFaceLLM with your model settings and authentication token
48
  llm = HuggingFaceLLM(
 
54
  tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
55
  model_name="meta-llama/Llama-2-7b-chat-hf",
56
  device_map="auto",
57
+ model_kwargs=model_kwargs,
 
 
 
 
58
  )
59
 
60
  # Set up the embedding model using Langchain's HuggingFaceEmbeddings