Spaces: Running
Terry Zhuo committed
Commit · 7c88c71
1 Parent: fd079ad
update
Files changed:
- app.py +145 -0
- log_reader.py +118 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,145 @@
import logging
import os
from datetime import datetime, timedelta
import gradio as gr
import json
from log_reader import RemoteLogReader

# logging.basicConfig(level=logging.INFO)
# log = logging.getLogger(__name__)

def get_file_data(content: str) -> tuple[str, bool]:
    """Read file content and return IP and vote condition status"""
    try:
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        if not lines:
            return None, False

        # Get IP from first line
        try:
            first_line_data = json.loads(lines[0])
            ip = first_line_data.get('ip')
        except json.JSONDecodeError:
            ip = None

        # Check vote conditions from last line
        try:
            last_line_data = json.loads(lines[-1])
            feedback = last_line_data.get('feedback')
            vote_conditions_met = (last_line_data.get('type') == 'vote' and
                                   isinstance(feedback, dict) and
                                   len(feedback) == 6)
        except json.JSONDecodeError:
            vote_conditions_met = False

        return ip, vote_conditions_met
    except Exception as e:
        logging.error(f"Error processing file content: {e}")
        return None, False

def search_battle_anony(date_str: str, search_query: str) -> list[list]:
    """Search battle_anony conversations for a specific date and query."""
    results = []

    try:
        # Initialize RemoteLogReader
        reader = RemoteLogReader()

        # Get conversation logs for battle_anony mode
        conv_logs = reader.get_conv_logs(date_str)
        battle_anony_logs = conv_logs.get('battle_anony', {})

        # Process each conversation
        for conv_id, logs in battle_anony_logs.items():
            found_query = False
            ip = None

            # Convert messages to file content format for validation check
            content = '\n'.join(json.dumps(msg) for msg in logs)
            ip, is_valid = get_file_data(content)

            if not ip:
                continue

            # Search through messages for the query
            for log in logs:
                if "state" in log and "messages" in log["state"]:
                    messages = log["state"]["messages"]

                    # Search through each message
                    for _, message in messages:
                        if search_query in message:
                            found_query = True
                            break

                    if found_query:
                        break

            if found_query:
                # Convert to list format instead of dict
                results.append([
                    ip,
                    conv_id,
                    'Yes' if is_valid else 'No',
                    'Yes'
                ])

    except Exception as e:
        logging.error(f"Error searching logs for date {date_str}: {e}")

    return results

def create_ui():
    def process_search(date, query):
        if not query.strip():
            return []

        # Convert timestamp to datetime object
        date_obj = datetime.fromtimestamp(date)
        # Convert date to required format (YYYY_MM_DD)
        date_str = date_obj.strftime("%Y_%m_%d")
        results = search_battle_anony(date_str, query)

        if not results:
            # Return empty list with correct structure for DataFrame
            return []
        return results

    with gr.Blocks(title="Battle Search") as app:
        gr.Markdown("# Battle Search")
        gr.Markdown("Search for specific content in battle_anony conversations by date and view IP addresses with validation status.")

        with gr.Row():
            date_input = gr.DateTime(
                label="Select Date",
                include_time=False,
            )

            query_input = gr.Textbox(
                label="Search Query",
                placeholder="Enter text to search for in conversations...",
                lines=1
            )

        with gr.Row():
            search_button = gr.Button("Search")

        with gr.Row():
            table_output = gr.DataFrame(
                headers=['IP', 'Conversation ID', 'Is Valid', 'Contains Query'],
                label="Results Table",
                interactive=False,
                value=[]  # Initialize with empty list
            )

        search_button.click(
            fn=process_search,
            inputs=[date_input, query_input],
            outputs=[table_output]
        )

    return app

if __name__ == "__main__":
    app = create_ui()
    app.launch()
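For reference, a minimal sketch of the per-conversation log format that get_file_data above assumes: the first JSON line carries the client ip, and the last line is a vote record whose feedback dict has six entries. The ip, type, and feedback field names come from the parsing code; the concrete values and the "app" import path are placeholders.

    import json
    from app import get_file_data  # assumes the file above is importable as `app`

    # Hypothetical two-line JSONL content in the shape get_file_data parses.
    sample_lines = [
        json.dumps({"ip": "203.0.113.7", "type": "chat"}),
        json.dumps({"type": "vote",
                    "feedback": {"q1": 1, "q2": 2, "q3": 3, "q4": 4, "q5": 5, "q6": 6}}),
    ]
    ip, is_valid = get_file_data("\n".join(sample_lines))
    print(ip, is_valid)  # expected: 203.0.113.7 True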
log_reader.py
ADDED
@@ -0,0 +1,118 @@
'''
Facade for reading logs on remote storage.
'''

from collections import defaultdict
import json
import os
from typing import Any
from azure.storage.fileshare import ShareServiceClient


class RemoteLogReader:
    '''
    remote log reader
    '''

    LOG_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING") or ""
    LOG_SHARE_NAME = "swearenalogsfileshare"

    IMAGE_DIR_NAME = "serve_images"
    '''
    Directory for storing user uploaded images.
    '''
    CONV_LOG_DIR_NAME = "conv_logs"
    '''
    Directory for conversation logs.
    '''
    SANDBOX_LOG_DIR_NAME = "sandbox_logs"
    '''
    Directory for sandbox logs.
    '''

    CHAT_MODES = ["battle_anony", "battle_named", "direct"]

    def __init__(
        self,
        connection_string: str = LOG_CONNECTION_STRING,
        share_name: str = LOG_SHARE_NAME,
    ):
        if not connection_string:
            raise ValueError("Connection string is required.")
        if not share_name:
            raise ValueError("Share name is required.")

        self.share_service = ShareServiceClient.from_connection_string(
            conn_str=connection_string)
        self.share_client = self.share_service.get_share_client(share=share_name)

    def is_conv_log(self, file_name: str) -> bool:
        return file_name.startswith("conv-log") and file_name.endswith(".json")

    def get_conv_id_from_name(self, file_name: str) -> str:
        return file_name.split("-")[2].strip('.json')

    def is_sandbox_log(self, file_name: str) -> bool:
        return file_name.startswith("sandbox-log") and file_name.endswith(".json")

    def get_file_content(self, file_path: str) -> bytes:
        file_client = self.share_client.get_file_client(file_path)
        file_content = file_client.download_file().readall()
        return file_content

    def get_conv_logs(self, date: str) -> dict[str, defaultdict[str, list[Any]]]:
        '''
        Return conversation logs based on the date.
        Returns a dict:
            mode -> conv_id -> list of logs.
        '''
        conv_logs = {
            mode: defaultdict(list) for mode in self.CHAT_MODES
        }
        for mode in self.CHAT_MODES:
            conv_log_dir = f"{date}/{self.CONV_LOG_DIR_NAME}/{mode}/"
            # check if the directory exists
            if not self.share_client.get_directory_client(conv_log_dir).exists():
                continue
            for file in self.share_client.list_directories_and_files(conv_log_dir):
                if not self.is_conv_log(file.name):
                    continue
                conv_id = self.get_conv_id_from_name(file.name)
                file_content = self.get_file_content(
                    conv_log_dir + file.name).decode("utf-8").strip(' \n')
                for line in file_content.split('\n'):
                    if line:
                        conv_logs[mode][conv_id].append(json.loads(line))
        return conv_logs

    def get_sandbox_logs(self, date: str) -> list[str]:
        '''
        Return sandbox logs based on the date.
        '''
        sandbox_logs = []
        sandbox_log_dir = f"{date}/{self.SANDBOX_LOG_DIR_NAME}/"
        for file in self.share_client.list_directories_and_files(sandbox_log_dir):
            if self.is_sandbox_log(file.name):
                file_content = self.get_file_content(
                    sandbox_log_dir + file.name).decode("utf-8").strip(' \n')
                sandbox_logs.append(json.loads(file_content))
        return sandbox_logs

    def get_image(self, image_id: str) -> bytes:
        '''
        Return image data based on the image id.
        '''
        image_path = f"{self.IMAGE_DIR_NAME}/{image_id}.png"
        return self.get_file_content(image_path)


if __name__ == "__main__":
    # Example usages
    log_reader = RemoteLogReader()
    date = "2025_02_20"
    conv_logs = log_reader.get_conv_logs(date)
    sandbox_logs = log_reader.get_sandbox_logs(date)
    image_data = log_reader.get_image("051fdac24285ff6e219a9ba06d1ac843")
    print(conv_logs)
    print(sandbox_logs)
    print(image_data)
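From the path construction in get_conv_logs, get_sandbox_logs, and get_image, the Azure file share layout this reader assumes looks roughly as follows; the date and IDs are placeholders, only the directory names and file-name prefixes come from the code above.

    2025_02_20/
        conv_logs/
            battle_anony/
                conv-log-<conv_id>.json    # one JSON object per line
            battle_named/
            direct/
        sandbox_logs/
            sandbox-log-<id>.json          # one JSON object per file
    serve_images/
        <image_id>.png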
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio>=4.0.0
pandas>=2.0.0
azure-storage-file-share
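RemoteLogReader reads AZURE_STORAGE_CONNECTION_STRING when log_reader.py is imported (the class-level default is evaluated at class-definition time), so the variable must be set before app.py or log_reader.py is imported. A minimal local-run sketch; the connection string value is a placeholder.

    import os

    # Placeholder value; a real Azure Files connection string is required,
    # and it must be set before importing app/log_reader.
    os.environ.setdefault(
        "AZURE_STORAGE_CONNECTION_STRING",
        "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net",
    )

    from app import create_ui

    create_ui().launch()  # serves the Gradio "Battle Search" UI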