Tonic committed
Commit a926d81 · 1 Parent(s): d3fa67d

Update app.py

Files changed (1)
  1. app.py +6 -14
app.py CHANGED
@@ -10,34 +10,26 @@ model_id = "01-ai/Yi-34B-200K"
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:54'
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load the model and tokenizer using transformers
 tokenizer = YiTokenizer(vocab_file="./tokenizer.model")
-model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
-model = model.to(torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, trust_remote_code=True)
+# model = model.to(torch.bfloat16)
 model = model.to(device)
 
 def run(message, chat_history, max_new_tokens=4056, temperature=3.5, top_p=0.9, top_k=800):
     prompt = get_prompt(message, chat_history)
-
-    # Encode the prompt to tensor
     input_ids = tokenizer.encode(prompt, return_tensors='pt')
-
-    # Move input_ids to the same device as the model
     input_ids = input_ids.to(model.device)
-
-    # Generate a response using the model with adjusted parameters
     response_ids = model.generate(
         input_ids,
         max_length=max_new_tokens + input_ids.shape[1],
-        temperature=temperature,  # Controls randomness. Lower values make text more deterministic.
-        top_p=top_p,  # Nucleus sampling: higher values allow more diversity.
-        top_k=top_k,  # Top-k sampling: limits the number of top tokens considered.
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
         pad_token_id=tokenizer.eos_token_id,
-        do_sample=True  # Enable sampling-based generation
+        do_sample=True
 
     )
 
-    # Decode the response
     response = tokenizer.decode(response_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
     return response
 
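The substantive change in this commit is swapping the plain bfloat16 load for 8-bit quantized loading via load_in_8bit=True. Below is a minimal sketch of the same load using the BitsAndBytesConfig spelling of that option; it assumes the bitsandbytes and accelerate packages are installed alongside transformers, and it is an illustration rather than the app's actual code. Note that with 8-bit weights the explicit model.to(device) call kept in the diff is generally unnecessary, and newer transformers versions typically reject .to() on quantized models, since device placement is handled at load time.

# Sketch only: 8-bit loading with BitsAndBytesConfig (assumes bitsandbytes
# and accelerate are installed; model_id matches the one used in app.py).
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "01-ai/Yi-34B-200K"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # newer spelling of load_in_8bit=True
    device_map="auto",  # lets accelerate place the quantized weights; no model.to(device) needed
    trust_remote_code=True,
)

Compared with the bfloat16 load it replaces, 8-bit weights roughly halve the memory footprint of the 34B checkpoint, which is presumably the motivation for the change.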