Spaces:

rajsecrets0
/

NEP_Chatbot

Sleeping

rajsecrets0 committed on Feb 3

Commit

570c187

verified ·

1 Parent(s): 040f3b6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -18,6 +18,18 @@ if hf_token is None:
     st.error("Missing Hugging Face token. Please set HF_TOKEN in your Space secrets.")
     st.stop()
 # ---------------------------
 # Configure your LLM and embeddings
 # ---------------------------
@@ -27,11 +39,10 @@ accurately as possible based on the instructions and context provided.
 """
 query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
-# Configure BitsAndBytes for quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
 # Initialize the HuggingFaceLLM with your model settings and authentication token
 llm = HuggingFaceLLM(
@@ -43,11 +54,7 @@ llm = HuggingFaceLLM(
     tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
     model_name="meta-llama/Llama-2-7b-chat-hf",
     device_map="auto",
-    model_kwargs={
-        "torch_dtype": torch.float16,
-        "quantization_config": quantization_config,
-        "use_auth_token": hf_token  # Pass the HF token for gated access
-    }
 )
 # Set up the embedding model using Langchain's HuggingFaceEmbeddings

     st.error("Missing Hugging Face token. Please set HF_TOKEN in your Space secrets.")
     st.stop()
+# ---------------------------
+# Configure BitsAndBytes Quantization (only if GPU is available)
+# ---------------------------
+if torch.cuda.is_available():
+    quantization_config = BitsAndBytesConfig(
+        load_in_8bit=True,
+        bnb_4bit_compute_dtype=torch.float16
+    )
+else:
+    # If no GPU is available, disable bitsandbytes quantization
+    quantization_config = None
 # ---------------------------
 # Configure your LLM and embeddings
 # ---------------------------
 """
 query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
+# Prepare model_kwargs based on whether quantization is enabled
+model_kwargs = {"torch_dtype": torch.float16}
+if quantization_config is not None:
+    model_kwargs["quantization_config"] = quantization_config
 # Initialize the HuggingFaceLLM with your model settings and authentication token
 llm = HuggingFaceLLM(
     tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
     model_name="meta-llama/Llama-2-7b-chat-hf",
     device_map="auto",
+    model_kwargs=model_kwargs,
 )
 # Set up the embedding model using Langchain's HuggingFaceEmbeddings