Daemontatox committed · verified
Commit 92b6108 · Parent(s): 52a3d0e

Update app.py

Files changed (1): app.py (+26 −26)
app.py CHANGED

@@ -151,16 +151,16 @@ retriever = db.as_retriever(
 # )
 
 
-llm = ChatOpenAI(
-    base_url="https://openrouter.ai/api/v1",
-    temperature=0.01,
-    api_key=OPENAPI_KEY,
-    model="google/gemini-2.0-flash-exp:free",
-    max_tokens=None,
-    timeout=None,
-    max_retries=3,
-
-)
+#llm = ChatOpenAI(
+#    base_url="https://openrouter.ai/api/v1",
+#    temperature=0.01,
+#    api_key=OPENAPI_KEY,
+#    model="google/gemini-2.0-flash-exp:free",
+#    max_tokens=None,
+#    timeout=None,
+#    max_retries=3,
+
+#)
 
 
 # llm = ChatCerebras(
@@ -171,31 +171,31 @@ llm = ChatOpenAI(
 
 
 
-# quantization_config = BitsAndBytesConfig(
-#     load_in_4bit=True,
-#     bnb_4bit_compute_dtype=torch.bfloat16,
-#     bnb_4bit_quant_type="nf4",
-#     bnb_4bit_use_double_quant=True
-# )
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    bnb_8bit_compute_dtype=torch.bfloat16,
+    bnb_8bit_quant_type="nf4",
+    bnb_8bit_use_double_quant=True
+)
 
 
 
 
-# model_id = "unsloth/phi-4"
-# tokenizer = AutoTokenizer.from_pretrained(model_id)
+model_id = "mistralai/Mistral-Nemo-Instruct-2407"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-# model = AutoModelForCausalLM.from_pretrained(
-#     model_id,
-#     torch_dtype=torch.float16,
-#     device_map="cuda",
-#     attn_implementation="flash_attention_2",
-#     quantization_config=quantization_config
-
-# )
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    device_map="cuda",
+    attn_implementation="flash_attention_2",
+    quantization_config=quantization_config
+
+)
 
-# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192 )
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=8192)
 
-# llm = HuggingFacePipeline(pipeline=pipe)
+llm = HuggingFacePipeline(pipeline=pipe)
 
 
 
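One caveat on the new quantization block: as far as I know, transformers' BitsAndBytesConfig only defines bnb_4bit_* tuning parameters, and "nf4" is a 4-bit quantization type, so the bnb_8bit_compute_dtype / bnb_8bit_quant_type / bnb_8bit_use_double_quant keywords added here are not recognized options and will most likely be dropped as unused kwargs, leaving plain int8 loading. A minimal sketch of the two configurations the commit seems to be choosing between (the bnb_4bit_* names are the documented ones, not part of this commit):

import torch
from transformers import BitsAndBytesConfig

# Plain 8-bit loading: there are no quant_type / double-quant knobs for int8.
int8_config = BitsAndBytesConfig(load_in_8bit=True)

# 4-bit NF4 with double quantization, matching the previously commented-out block.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)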
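Also worth flagging: attn_implementation="flash_attention_2" requires the separate flash-attn package and a supported GPU, so the hard-coded value can make the app fail to start on other hardware. A guarded-fallback sketch (the try/except pattern is a suggestion, not something in this commit; model_id and quantization_config are as defined above):

from transformers import AutoModelForCausalLM

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="cuda",
        attn_implementation="flash_attention_2",
        quantization_config=quantization_config,
    )
except (ImportError, ValueError):
    # flash-attn missing or unsupported: fall back to the default attention implementation.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="cuda",
        quantization_config=quantization_config,
    )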