ddas committed on
Commit
e1561f0
·
unverified ·
1 Parent(s): e965542

trace format updated

Browse files
Files changed (2) hide show
  1. agent.py +5 -5
  2. app.py +5 -1
agent.py CHANGED
@@ -780,7 +780,7 @@ def is_running_on_spaces():
780
  return os.getenv("IS_SPACE", "").lower() == "true"
781
 
782
 
783
- def push_trace_to_explorer(trace_messages, annotations=None, user_info="", evaluation_results=None, model_name=""):
784
  """
785
  Push the complete conversation trace to Invariant Labs Explorer using Push API
786
 
@@ -804,10 +804,8 @@ def push_trace_to_explorer(trace_messages, annotations=None, user_info="", evalu
804
  # Determine dataset based on environment
805
  if is_running_on_spaces():
806
  dataset_name = "public-instruction-challenge"
807
- environment = "Hugging Face Spaces"
808
  else:
809
  dataset_name = "instruction-challenge"
810
- environment = "Local Development"
811
 
812
  # Prepare metadata
813
  metadata = {"pushed_at": datetime.now().isoformat()}
@@ -815,6 +813,8 @@ def push_trace_to_explorer(trace_messages, annotations=None, user_info="", evalu
815
  metadata["user_info"] = user_info.strip()
816
  if model_name and model_name.strip():
817
  metadata["model_name"] = model_name.strip()
 
 
818
 
819
  # Add evaluation results to metadata if provided
820
  if evaluation_results:
@@ -857,7 +857,7 @@ def push_trace_to_explorer(trace_messages, annotations=None, user_info="", evalu
857
  print(f" Error Message: {str(e)}")
858
 
859
 
860
- def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini", defense_enabled=True, user_info="", fasttext_confidence_scores=None):
861
  """
862
  Main tool agent loop implementation with proper tool call tracing:
863
  1. Start with System + User input
@@ -1198,7 +1198,7 @@ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini",
1198
  if fasttext_confidence_scores:
1199
  evaluation_results.update(fasttext_confidence_scores)
1200
 
1201
- push_trace_to_explorer(trace_messages, all_annotations if all_annotations else None, user_info, evaluation_results, model_name)
1202
 
1203
  # Add confirmation to execution log
1204
  final_trace_msg = f"πŸ“Š Trace push completed (with {len(all_annotations)} annotations)"
 
780
  return os.getenv("IS_SPACE", "").lower() == "true"
781
 
782
 
783
+ def push_trace_to_explorer(trace_messages, annotations=None, user_info="", evaluation_results=None, model_name="", attack_email=None):
784
  """
785
  Push the complete conversation trace to Invariant Labs Explorer using Push API
786
 
 
804
  # Determine dataset based on environment
805
  if is_running_on_spaces():
806
  dataset_name = "public-instruction-challenge"
 
807
  else:
808
  dataset_name = "instruction-challenge"
 
809
 
810
  # Prepare metadata
811
  metadata = {"pushed_at": datetime.now().isoformat()}
 
813
  metadata["user_info"] = user_info.strip()
814
  if model_name and model_name.strip():
815
  metadata["model_name"] = model_name.strip()
816
+ if attack_email and attack_email.strip():
817
+ metadata["attack_email"] = attack_email.strip()
818
 
819
  # Add evaluation results to metadata if provided
820
  if evaluation_results:
 
857
  print(f" Error Message: {str(e)}")
858
 
859
 
860
+ def tool_agent_loop(user_query, inbox, system_prompt, model_name="gpt-4o-mini", defense_enabled=True, user_info="", fasttext_confidence_scores=None, attack_email=None):
861
  """
862
  Main tool agent loop implementation with proper tool call tracing:
863
  1. Start with System + User input
 
1198
  if fasttext_confidence_scores:
1199
  evaluation_results.update(fasttext_confidence_scores)
1200
 
1201
+ push_trace_to_explorer(trace_messages, all_annotations if all_annotations else None, user_info, evaluation_results, model_name, attack_email)
1202
 
1203
  # Add confirmation to execution log
1204
  final_trace_msg = f"πŸ“Š Trace push completed (with {len(all_annotations)} annotations)"
app.py CHANGED
@@ -747,6 +747,9 @@ def submit_attack(from_addr, attack_subject, attack_body, model_name="gpt-4o", d
747
  )
748
  INBOX.append(attack_email)
749
 
 
 
 
750
  # Use passed confidence scores or empty defaults
751
  fasttext_confidence_scores = confidence_scores or {
752
  "subject_confidence_scores": [],
@@ -761,7 +764,8 @@ def submit_attack(from_addr, attack_subject, attack_body, model_name="gpt-4o", d
761
  model_name=model_name,
762
  defense_enabled=defense_enabled,
763
  user_info=user_info,
764
- fasttext_confidence_scores=fasttext_confidence_scores
 
765
  )
766
 
767
  # Return execution log, final output, and flagged content separately
 
747
  )
748
  INBOX.append(attack_email)
749
 
750
+ # Create concatenated attack email string for trace logging
751
+ attack_email_string = f"To: [email protected] | From: {from_addr} | Subject: {attack_subject} | Body: {attack_body}"
752
+
753
  # Use passed confidence scores or empty defaults
754
  fasttext_confidence_scores = confidence_scores or {
755
  "subject_confidence_scores": [],
 
764
  model_name=model_name,
765
  defense_enabled=defense_enabled,
766
  user_info=user_info,
767
+ fasttext_confidence_scores=fasttext_confidence_scores,
768
+ attack_email=attack_email_string
769
  )
770
 
771
  # Return execution log, final output, and flagged content separately