Added pad token and attention mask
Browse files
app.py
CHANGED
@@ -77,17 +77,20 @@ def chat_llama3_8b(message: str,
     conversation.append({"role": "user", "content": message})

     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-
+    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
+
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

     generate_kwargs = dict(
         input_ids= input_ids,
+        attention_mask=attention_mask,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         top_p=top_p,
         eos_token_id=terminators,
+        pad_token_id=tokenizer.eos_token_id,
     )
     # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
     if temperature == 0: