Spaces: Running on Zero

Commit · 2062515
1 Parent(s): eb1a863

remove some unneeded lines, fix pipe issue

utils/models.py  +5 -16

utils/models.py  CHANGED
@@ -135,15 +135,10 @@ def run_inference(model_name, context, question):
     # Common arguments for tokenizer loading
     tokenizer_load_args = {"padding_side": "left", "token": True}
 
-    # Determine the Hugging Face model name for the tokenizer
     actual_model_name_for_tokenizer = model_name
     if "icecream" in model_name.lower():
         actual_model_name_for_tokenizer = "meta-llama/llama-3.2-3b-instruct"
 
-    # Note: tokenizer_kwargs (defined earlier, with add_generation_prompt etc.)
-    # is intended for tokenizer.apply_chat_template, not for AutoTokenizer.from_pretrained generally.
-    # If a specific tokenizer (e.g., Qwen) needs special __init__ args that happen to be in tokenizer_kwargs,
-    # that would require more specific handling here. For now, we assume general constructor args.
     tokenizer = AutoTokenizer.from_pretrained(actual_model_name_for_tokenizer, **tokenizer_load_args)
     tokenizer_cache[model_name] = tokenizer
 
@@ -201,8 +196,6 @@ def run_inference(model_name, context, question):
     elif "icecream" in model_name.lower():
 
         print("ICECREAM")
-        # text_input is the list of messages from format_rag_prompt
-        # tokenizer_kwargs (e.g., {"add_generation_prompt": True}) are correctly passed to apply_chat_template
         model_inputs = tokenizer.apply_chat_template(
             text_input,
             tokenize=True,
@@ -211,38 +204,34 @@ def run_inference(model_name, context, question):
             **tokenizer_kwargs,
         )
 
-
+
         model_inputs = model_inputs.to(model.device)
 
         input_ids = model_inputs.input_ids
-        attention_mask = model_inputs.attention_mask
+        attention_mask = model_inputs.attention_mask
 
-        prompt_tokens_length = input_ids.shape[1]
+        prompt_tokens_length = input_ids.shape[1]
 
         with torch.inference_mode():
             # Check interrupt before generation
             if generation_interrupt.is_set():
                 return ""
 
-            # Explicitly pass input_ids, attention_mask, and pad_token_id
-            # tokenizer.pad_token is set to tokenizer.eos_token if None, earlier in the code.
            output_sequences = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 max_new_tokens=512,
-                eos_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
                 pad_token_id=tokenizer.pad_token_id # Addresses the warning
             )
 
-            # output_sequences[0] contains the full sequence (prompt + generation)
-            # Decode only the newly generated tokens
             generated_token_ids = output_sequences[0][prompt_tokens_length:]
             result = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
 
     else: # For other models
         formatted = pipe.tokenizer.apply_chat_template(
             text_input,
-            tokenize=
+            tokenize=False,
             **tokenizer_kwargs,
         )
 
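For the "fix pipe issue" half of this commit: a transformers text-generation pipeline expects a prompt string rather than token ids, so the chat template has to be rendered with tokenize=False before being handed to pipe. A minimal standalone sketch of that pattern follows; the checkpoint name and messages are illustrative assumptions, not the Space's actual format_rag_prompt output.

# Standalone sketch (not part of utils/models.py): render the chat template to a
# string so the text-generation pipeline receives a prompt it can tokenize itself.
from transformers import pipeline

pipe = pipeline("text-generation", model="meta-llama/llama-3.2-3b-instruct")  # assumed checkpoint

text_input = [  # illustrative messages; the Space builds these in format_rag_prompt
    {"role": "system", "content": "Answer using only the provided context."},
    {"role": "user", "content": "Context: ...\n\nQuestion: ..."},
]

# tokenize=False returns the formatted prompt string; tokenize=True would return
# token ids, which the pipeline does not accept as its input.
formatted = pipe.tokenizer.apply_chat_template(
    text_input,
    tokenize=False,
    add_generation_prompt=True,
)

outputs = pipe(formatted, max_new_tokens=512, return_full_text=False)
print(outputs[0]["generated_text"])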
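The manual path kept for the "icecream" branch follows the usual decoder-only pattern: tokenize via the chat template, call model.generate with an explicit attention_mask and pad_token_id, then decode only the tokens after the prompt. A self-contained sketch of that pattern, assuming the same Llama 3.2 3B checkpoint as the diff; the repo's tokenizer caching and generation_interrupt handling are omitted.

# Standalone sketch of the generate-then-slice pattern used in the icecream branch.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/llama-3.2-3b-instruct"  # assumed checkpoint, as in the diff
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # pad-token fallback noted in the removed comments

messages = [{"role": "user", "content": "Summarize the context in one sentence."}]  # illustrative

# return_dict=True so both input_ids and attention_mask come back as tensors.
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

prompt_tokens_length = model_inputs.input_ids.shape[1]

with torch.inference_mode():
    output_sequences = model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=512,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

# output_sequences[0] is prompt + generation; decode only the newly generated tokens.
result = tokenizer.decode(output_sequences[0][prompt_tokens_length:], skip_special_tokens=True)
print(result)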