Upload app.py
app.py
CHANGED
@@ -11,6 +11,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
+    load_in_8bit=True,  # Use 8-bit quantization instead of 4-bit
     torch_dtype=torch.float16,
     trust_remote_code=True
 )
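Side note on the new flag: recent transformers releases deprecate passing load_in_8bit directly to from_pretrained in favor of an explicit BitsAndBytesConfig. A minimal sketch of the equivalent load (assuming the bitsandbytes package and a CUDA device are available; model_path stands in for this Space's actual checkpoint path):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Equivalent 8-bit load expressed through a quantization config.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,                    # same checkpoint path as above
        device_map="auto",
        quantization_config=quant_config,
        torch_dtype=torch.float16,     # dtype for the non-quantized modules
        trust_remote_code=True,
    )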
@@ -58,16 +59,17 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
 
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-
-
-
-
-
-
-
-
-
-
+    with torch.no_grad():  # Disable gradient computation for inference
+        outputs = model.generate(
+            **inputs,
+            max_length=max_length,
+            temperature=temperature,
+            num_return_sequences=1,
+            pad_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+            top_p=top_p,
+            top_k=top_k,
+        )
 
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
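For reference, a hypothetical call into the patched function; the prompt and sampling values below are illustrative, not taken from the Space:

    # Example usage of generate_response; all argument values are placeholders.
    reply = generate_response(
        "Explain 8-bit quantization in one paragraph.",
        max_length=256,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
    )
    print(reply)

One caveat: max_length caps prompt plus completion tokens, so a long prompt can crowd out the reply; generate also accepts max_new_tokens, which bounds only the newly generated tokens.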