Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
from dotenv import load_dotenv
|
3 |
from langchain_community.vectorstores import Qdrant
|
@@ -65,7 +74,7 @@ try:
|
|
65 |
client = QdrantClient(
|
66 |
url=os.getenv("QDRANT_URL"),
|
67 |
api_key=os.getenv("QDRANT_API_KEY"),
|
68 |
-
prefer_grpc=
|
69 |
)
|
70 |
except Exception as e:
|
71 |
logger.error("Failed to connect to Qdrant. Ensure QDRANT_URL and QDRANT_API_KEY are correctly set.")
|
@@ -119,10 +128,31 @@ retriever = db.as_retriever(
|
|
119 |
# timeout=None
|
120 |
|
121 |
# )
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192 )
|
|
|
126 |
llm = HuggingFacePipeline(pipeline=pipe)
|
127 |
|
128 |
|
|
|
# Runtime install of flash-attn (HF Spaces-style startup hook).
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE makes the package fetch a prebuilt
# wheel instead of compiling CUDA kernels on the host.
import os
import subprocess
import sys

subprocess.run(
    # Argv list + shell=False: no shell string parsing, and pinning to
    # sys.executable guarantees we install into the interpreter actually
    # running this app.
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    # Merge with the current environment. Passing only the one variable
    # (as before) REPLACES the child env wholesale, dropping PATH/HOME and
    # potentially breaking pip itself.
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=False,
    check=False,  # best-effort: the app can still start without flash-attn
)
|
8 |
+
|
9 |
+
|
10 |
import os
|
11 |
from dotenv import load_dotenv
|
12 |
from langchain_community.vectorstores import Qdrant
|
|
|
74 |
client = QdrantClient(
|
75 |
url=os.getenv("QDRANT_URL"),
|
76 |
api_key=os.getenv("QDRANT_API_KEY"),
|
77 |
+
prefer_grpc=True
|
78 |
)
|
79 |
except Exception as e:
|
80 |
logger.error("Failed to connect to Qdrant. Ensure QDRANT_URL and QDRANT_API_KEY are correctly set.")
|
|
|
# timeout=None
# )

# 4-bit NF4 quantization with double quantization so the model fits on a
# single GPU; bfloat16 is the compute dtype used for de-quantized matmuls.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_id = "unsloth/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,  # fix: was MODEL_ID (undefined) -> NameError at module load
    # Match the quantization compute dtype; float16 here contradicted
    # bnb_4bit_compute_dtype=torch.bfloat16 configured above.
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
)

# Wrap the HF text-generation pipeline for use as a LangChain LLM.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192)
llm = HuggingFacePipeline(pipeline=pipe)