padmanabhbosamia committed
Commit 14f2b83 · verified · 1 Parent(s): 0840957

Upload app.py

Files changed (1):
  1. app.py +12 -10

app.py CHANGED
@@ -11,6 +11,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
+    load_in_8bit=True,  # Use 8-bit quantization instead of 4-bit
     torch_dtype=torch.float16,
     trust_remote_code=True
 )
@@ -58,16 +59,17 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9, top_k=
 
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-    outputs = model.generate(
-        **inputs,
-        max_length=max_length,
-        temperature=temperature,
-        num_return_sequences=1,
-        pad_token_id=tokenizer.eos_token_id,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-    )
+    with torch.no_grad():  # Disable gradient computation for inference
+        outputs = model.generate(
+            **inputs,
+            max_length=max_length,
+            temperature=temperature,
+            num_return_sequences=1,
+            pad_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+            top_p=top_p,
+            top_k=top_k,
+        )
 
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
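
A side note on the load_in_8bit=True change: 8-bit loading goes through the bitsandbytes package, and recent transformers releases prefer routing the flag through a BitsAndBytesConfig passed as quantization_config rather than as a bare keyword argument. A minimal sketch of that variant, assuming bitsandbytes is installed and using a placeholder model_path (app.py defines its own):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "your-org/your-model"  # placeholder; app.py defines its own model_path

bnb_config = BitsAndBytesConfig(load_in_8bit=True)  # 8-bit weight quantization via bitsandbytes

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config,  # preferred over the bare load_in_8bit=True kwarg on newer transformers
    torch_dtype=torch.float16,       # dtype for the non-quantized modules
    trust_remote_code=True,
)

The torch.no_grad() wrapper around model.generate() is harmless either way: on current transformers versions generate() already disables gradient tracking internally, so the explicit context manager mainly makes the intent visible in app.py.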