Update app.py
app.py CHANGED
@@ -151,16 +151,16 @@ retriever = db.as_retriever(
 # )
 
 
-
-
-
-
-
-
-
-
+llm = ChatOpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    temperature=0.01,
+    api_key=OPENAPI_KEY,
+    model="google/gemini-2.0-flash-exp:free",
+    max_tokens=None,
+    timeout=None,
+    max_retries=3,
 
-
+)
 
 
 # llm = ChatCerebras(
@@ -171,31 +171,6 @@ retriever = db.as_retriever(
 
 
 
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True
-)
-
-
-
-
-model_id = "meta-llama/Llama-3.2-3B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    device_map="cuda",
-    attn_implementation="flash_attention_2",
-    #quantization_config=quantization_config
-)
-
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192 )
-
-llm = HuggingFacePipeline(pipeline=pipe)
-
 
 
 
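For context, this commit swaps a locally hosted meta-llama/Llama-3.2-3B-Instruct pipeline (along with its commented-out BitsAndBytesConfig 4-bit quantization setup) for a hosted model served through OpenRouter's OpenAI-compatible API. Below is a minimal sketch of how the new block plugs in, assuming LangChain's langchain_openai package and that OPENAPI_KEY is sourced from an environment variable; the diff only shows the ChatOpenAI call, so the import and key lookup are assumptions.

```python
import os

from langchain_openai import ChatOpenAI

# Hypothetical source of the key; app.py defines OPENAPI_KEY elsewhere.
OPENAPI_KEY = os.environ["OPENROUTER_API_KEY"]

# OpenRouter exposes an OpenAI-compatible endpoint, so ChatOpenAI works
# against it by overriding base_url and passing an OpenRouter model slug.
llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    temperature=0.01,
    api_key=OPENAPI_KEY,
    model="google/gemini-2.0-flash-exp:free",
    max_tokens=None,   # no explicit completion-length cap
    timeout=None,      # no per-request timeout
    max_retries=3,     # retry transient API failures
)

# Quick smoke test: invoke() returns an AIMessage; .content holds the text.
print(llm.invoke("Reply with one word: ready?").content)
```

A side effect of this change is that the Space no longer needs a GPU, torch, transformers, or flash-attention at inference time; the only runtime requirement for the LLM is network access to OpenRouter.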