Update src/txagent/txagent.py
src/txagent/txagent.py  +72 -20  CHANGED
@@ -74,10 +74,20 @@ class TxAgent:
             return f"The model {model_name} is already loaded."
         self.model_name = model_name

-        self.model = LLM(
+        self.model = LLM(
+            model=self.model_name,
+            dtype="float16",
+            max_model_len=131072,
+            max_num_batched_tokens=32768,  # Increased for A100 80GB
+            gpu_memory_utilization=0.9,  # Higher utilization for better performance
+            trust_remote_code=True
+        )
         self.chat_template = Template(self.model.get_tokenizer().chat_template)
         self.tokenizer = self.model.get_tokenizer()
-        logger.info(
+        logger.info(
+            "Model %s loaded with max_model_len=%d, max_num_batched_tokens=%d, gpu_memory_utilization=%.2f",
+            self.model_name, 131072, 32768, 0.9
+        )
         return f"Model {model_name} loaded successfully."

     def load_tooluniverse(self):
@@ -204,7 +214,7 @@ class TxAgent:
         )
         call_result = self.run_multistep_agent(
             full_message, temperature=temperature,
-            max_new_tokens=512, max_token=
+            max_new_tokens=512, max_token=131072,
             call_agent=False, call_agent_level=call_agent_level)
         if call_result is None:
             call_result = "⚠️ No content returned from sub-agent."
@@ -277,7 +287,7 @@ class TxAgent:
         sub_agent_task = "Sub TxAgent plan: " + str(solution_plan)
         call_result = yield from self.run_gradio_chat(
             full_message, history=[], temperature=temperature,
-            max_new_tokens=512, max_token=
+            max_new_tokens=512, max_token=131072,
             call_agent=False, call_agent_level=call_agent_level,
             conversation=None, sub_agent_task=sub_agent_task)
         if call_result is not None and isinstance(call_result, str):
@@ -387,7 +397,7 @@ class TxAgent:
             tools=picked_tools_prompt,
             skip_special_tokens=False,
             max_new_tokens=2048,
-            max_token=
+            max_token=131072,
             check_token_status=True)
         if last_outputs_str is None:
             logger.warning("Token limit exceeded")
@@ -410,7 +420,7 @@ class TxAgent:

     def llm_infer(self, messages, temperature=0.1, tools=None,
                   output_begin_string=None, max_new_tokens=512,
-                  max_token=
+                  max_token=131072, skip_special_tokens=True,
                   model=None, tokenizer=None, terminators=None,
                   seed=None, check_token_status=False):
         if model is None:
@@ -430,21 +440,23 @@ class TxAgent:

         if check_token_status and max_token is not None:
             token_overflow = False
-            num_input_tokens = len(self.tokenizer.encode(prompt,
+            num_input_tokens = len(self.tokenizer.encode(prompt, add_special_tokens=False))
+            logger.info("Input prompt tokens: %d, max_token: %d", num_input_tokens, max_token)
             if num_input_tokens > max_token:
                 torch.cuda.empty_cache()
                 gc.collect()
-                logger.
+                logger.warning("Token overflow: %d > %d", num_input_tokens, max_token)
                 return None, True

         output = model.generate(prompt, sampling_params=sampling_params)
-
-
+        output_text = output[0].outputs[0].text
+        output_tokens = len(self.tokenizer.encode(output_text, add_special_tokens=False))
+        logger.debug("Inference output: %s (output tokens: %d)", output_text[:100], output_tokens)
         torch.cuda.empty_cache()
         gc.collect()
         if check_token_status and max_token is not None:
-            return
-            return
+            return output_text, token_overflow
+        return output_text

     def run_self_agent(self, message: str,
                        temperature: float,
@@ -514,7 +526,7 @@ Function calls' responses:
 \"\"\"
 {function_response}
 \"\"\"
-Summarize the function calls' responses in one sentence with all necessary information.
+Summarize the function calls' l responses in one sentence with all necessary information.
 """
         conversation = [{"role": "user", "content": prompt}]
         output = self.llm_infer(
@@ -559,7 +571,7 @@ Summarize the function calls' responses in one sentence with all necessary infor
                 function_response=function_response,
                 temperature=0.1,
                 max_new_tokens=512,
-                max_token=
+                max_token=131072)
             input_list.insert(last_call_idx + 1, {'role': 'tool', 'content': result_summary})
             status['summarized_index'] = last_call_idx + 2
             idx += 1
@@ -581,7 +593,7 @@ Summarize the function calls' responses in one sentence with all necessary infor
                 function_response=function_response,
                 temperature=0.1,
                 max_new_tokens=512,
-                max_token=
+                max_token=131072)
             tool_calls = json.loads(input_list[last_call_idx]['tool_calls'])
             for tool_call in tool_calls:
                 del tool_call['call_id']
@@ -603,10 +615,10 @@ Summarize the function calls' responses in one sentence with all necessary infor
     def run_gradio_chat(self, message: str,
                         history: list,
                         temperature: float,
-                        max_new_tokens: 2048,
-                        max_token:
-                        call_agent: bool,
-                        conversation: gr.State,
+                        max_new_tokens: int = 2048,
+                        max_token: int = 131072,
+                        call_agent: bool = False,
+                        conversation: gr.State = None,
                         max_round: int = 5,
                         seed: int = None,
                         call_agent_level: int = 0,
@@ -755,4 +767,44 @@ Summarize the function calls' responses in one sentence with all necessary infor
             logger.info("Forced final answer after error: %s", final_answer[:100])
             yield history
             return final_answer
-        return error_msg
+        return error_msg
+
+    def run_gradio_chat_batch(self, messages: List[str],
+                              temperature: float,
+                              max_new_tokens: int = 2048,
+                              max_token: int = 131072,
+                              call_agent: bool = False,
+                              conversation: List = None,
+                              max_round: int = 5,
+                              seed: int = None,
+                              call_agent_level: int = 0):
+        """Run batch inference for multiple messages."""
+        logger.info("Starting batch chat for %d messages", len(messages))
+        batch_results = []
+
+        for message in messages:
+            # Initialize conversation for each message
+            conv = self.initialize_conversation(message, conversation, history=None)
+            picked_tools_prompt, call_agent_level = self.initialize_tools_prompt(
+                call_agent, call_agent_level, message)
+
+            # Run single inference for simplicity (extend for multi-round if needed)
+            output, token_overflow = self.llm_infer(
+                messages=conv,
+                temperature=temperature,
+                tools=picked_tools_prompt,
+                max_new_tokens=max_new_tokens,
+                max_token=max_token,
+                skip_special_tokens=False,
+                seed=seed,
+                check_token_status=True
+            )
+
+            if output is None:
+                logger.warning("Token limit exceeded for message: %s", message[:100])
+                batch_results.append("Token limit exceeded.")
+            else:
+                batch_results.append(output)
+
+        logger.info("Batch chat completed for %d messages", len(messages))
+        return batch_results
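A note on the engine settings in the first hunk: `model`, `dtype`, `max_model_len`, `max_num_batched_tokens`, `gpu_memory_utilization`, and `trust_remote_code` are standard keyword arguments of vLLM's `LLM` constructor, so the new load call can be exercised outside TxAgent. Below is a minimal standalone sketch of the same configuration; the model name, prompt, and sampling settings are placeholders for illustration, not values taken from the file.

```python
from vllm import LLM, SamplingParams

MODEL_NAME = "your/model-id"  # placeholder; TxAgent receives the real name via load_models()

# Mirrors the engine arguments added in the diff above.
llm = LLM(
    model=MODEL_NAME,
    dtype="float16",
    max_model_len=131072,          # 128K-token context window
    max_num_batched_tokens=32768,  # larger scheduling budget, per the "A100 80GB" comment
    gpu_memory_utilization=0.9,
    trust_remote_code=True,
)

tokenizer = llm.get_tokenizer()  # same tokenizer handle that TxAgent caches
params = SamplingParams(temperature=0.1, max_tokens=64)
outputs = llm.generate("Hello from vLLM", sampling_params=params)
print(outputs[0].outputs[0].text)
```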
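The `llm_infer` hunk also changes the method's return contract: with `check_token_status=True` it now returns an `(output_text, token_overflow)` tuple, and `(None, True)` when the encoded prompt exceeds `max_token`; without the flag it returns the text alone. A minimal sketch of a call site written against that contract, assuming `agent` is an already-initialized TxAgent (the conversation content is illustrative):

```python
# Hedged sketch: consuming the patched llm_infer return contract.
conversation = [{"role": "user", "content": "Summarize the last tool response."}]

output, token_overflow = agent.llm_infer(
    messages=conversation,
    temperature=0.1,
    max_new_tokens=512,
    max_token=131072,
    check_token_status=True,
)

if output is None and token_overflow:
    # The prompt exceeded max_token; the method frees GPU memory and returns early.
    answer = "Token limit exceeded."
else:
    answer = output
```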
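Finally, a usage sketch for the new `run_gradio_chat_batch` entry point. As written it runs one `llm_infer` call per message rather than the multi-round tool loop of `run_gradio_chat` (the in-code comment flags multi-round handling as a possible extension) and returns a list of outputs, substituting "Token limit exceeded." where a prompt overflowed. The `agent` instance and the example questions below are illustrative assumptions, not taken from the diff.

```python
# Hedged sketch: exercising run_gradio_chat_batch on a small list of prompts.
# Assumes `agent` is a TxAgent whose model has already been loaded via load_models().
questions = [
    "List the major drug-drug interactions of warfarin.",
    "Which transporters are involved in metformin clearance?",
]

results = agent.run_gradio_chat_batch(
    messages=questions,
    temperature=0.3,
    max_new_tokens=1024,
    max_token=131072,
    call_agent=False,
    seed=100,
)

for question, answer in zip(questions, results):
    print(f"Q: {question}\nA: {answer[:200]}\n")
```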