Update app.py
Browse files
app.py
CHANGED
@@ -151,16 +151,16 @@ retriever = db.as_retriever(
|
|
151 |
# )
|
152 |
|
153 |
|
154 |
-
llm = ChatOpenAI(
|
155 |
-
|
156 |
-
temperature=0.01,
|
157 |
-
|
158 |
-
model="google/gemini-2.0-flash-exp:free",
|
159 |
-
max_tokens=None,
|
160 |
-
timeout=None,
|
161 |
-
|
162 |
|
163 |
-
)
|
164 |
|
165 |
|
166 |
# llm = ChatCerebras(
|
@@ -171,31 +171,31 @@ llm = ChatOpenAI(
|
|
171 |
|
172 |
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
|
181 |
|
182 |
|
183 |
|
184 |
-
|
185 |
-
|
186 |
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
|
194 |
-
|
195 |
|
196 |
-
|
197 |
|
198 |
-
|
199 |
|
200 |
|
201 |
|
|
|
151 |
# )
|
152 |
|
153 |
|
154 |
#llm = ChatOpenAI(
#    base_url="https://openrouter.ai/api/v1",
#    temperature=0.01,
#    api_key=OPENAPI_KEY,
#    model="google/gemini-2.0-flash-exp:free",
#    max_tokens=None,
#    timeout=None,
#    max_retries=3,
#)
166 |
# llm = ChatCerebras(
|
|
|
# Quantized local LLM setup: load Mistral-Nemo in 4-bit NF4 and expose it
# to LangChain via a text-generation pipeline.
#
# BUG FIX: the original passed `load_in_8bit=True` together with
# `bnb_8bit_compute_dtype`, `bnb_8bit_quant_type="nf4"` and
# `bnb_8bit_use_double_quant` — those `bnb_8bit_*` keyword names do not
# exist on BitsAndBytesConfig (NF4 and double quantization are 4-bit
# options, spelled `bnb_4bit_*`), so they were silently ignored and the
# model loaded in plain int8. The nf4/double-quant settings show the
# intent was 4-bit NF4; configure it correctly.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Checkpoint to serve; the tokenizer comes from the same repo.
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # NOTE(review): torch_dtype (non-quantized modules) is float16 while
    # the quantized compute dtype above is bfloat16 — kept as in the
    # original, but consider unifying on bfloat16 if the GPU supports it.
    torch_dtype=torch.float16,
    device_map="cuda",
    # Requires the flash-attn package to be installed.
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
)

# Greedy-ish generation wrapper; max_new_tokens bounds the reply length.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192)

# LangChain-compatible LLM handle used by the rest of the app.
llm = HuggingFacePipeline(pipeline=pipe)