Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,3 +1,12 @@
+import subprocess
+
+subprocess.run(
+    'pip install flash-attn --no-build-isolation',
+    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
+    shell=True
+)
+
+
 import os
 from dotenv import load_dotenv
 from langchain_community.vectorstores import Qdrant
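The startup hook installs flash-attn at import time. One caveat: passing env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"} to subprocess.run replaces the child's entire environment rather than extending it, so PATH and the CUDA variables vanish for the pip subprocess. A more defensive variant, as a minimal sketch (the import guard and sys.executable usage are suggestions, not part of the commit):

# Minimal sketch, not the committed code: guard the runtime flash-attn install.
import importlib.util
import os
import subprocess
import sys

if importlib.util.find_spec("flash_attn") is None:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
        env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},  # extend, don't replace
        check=True,  # raise CalledProcessError instead of continuing with a broken install
    )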
@@ -65,7 +74,7 @@ try:
     client = QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY"),
-        prefer_grpc=
+        prefer_grpc=True
     )
 except Exception as e:
     logger.error("Failed to connect to Qdrant. Ensure QDRANT_URL and QDRANT_API_KEY are correctly set.")
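prefer_grpc=True tells qdrant-client to use gRPC (port 6334 on Qdrant Cloud) where possible and REST otherwise; the truncated prefer_grpc= on the removed line would have been a syntax error, which would explain the Space's runtime-error status. A quick startup check, sketched under the same QDRANT_URL/QDRANT_API_KEY assumptions, makes a bad endpoint or key fail loudly:

# Sketch only: verify the Qdrant connection at startup (same env vars as app.py).
import os
from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
    prefer_grpc=True,
)
# get_collections() performs a real round trip, so a wrong URL, a blocked gRPC
# port, or a bad API key surfaces here instead of at first query time.
print([c.name for c in client.get_collections().collections])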
@@ -119,10 +128,31 @@ retriever = db.as_retriever(
     # timeout=None
 
 # )
-
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True
+)
+
+
+
+
+model_id = "unsloth/phi-4"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    device_map="cuda",
+    attn_implementation="flash_attention_2",
+    quantization_config=quantization_config
+
+)
+
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192 )
+
 llm = HuggingFacePipeline(pipeline=pipe)
 
 
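One inconsistency worth flagging in the new model-loading block: the BitsAndBytesConfig computes in bfloat16 while from_pretrained requests torch.float16, and FlashAttention 2 itself only supports fp16/bf16. A sketch with the two dtypes aligned (same unsloth/phi-4 checkpoint, otherwise the committed settings):

# Sketch: the same load as the commit, with compute dtypes aligned to bfloat16.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "unsloth/phi-4"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the dequantized matmuls
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # match bnb_4bit_compute_dtype (commit used float16)
    device_map="cuda",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
)

Note also that 4-bit NF4 shrinks only the weights to roughly a quarter of the fp16 footprint; the KV cache for a max_new_tokens=8192 generation is not quantized and still grows with sequence length.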