Spaces:
Runtime error
Runtime error
Commit
·
23e06a5
1
Parent(s):
e236b6c
added streamlit files under pages
Browse files- .gitignore +1 -2
- main.py +6 -0
- pages/1_Leaderboard.py +169 -0
- pages/2_Evaluations.py +491 -0
- pages/3_app.py +11 -0
- streamlit_app.py +5 -2
.gitignore
CHANGED
|
@@ -4,5 +4,4 @@
|
|
| 4 |
results/
|
| 5 |
|
| 6 |
*.sqlite
|
| 7 |
-
ux/
|
| 8 |
-
pages/
|
|
|
|
| 4 |
results/
|
| 5 |
|
| 6 |
*.sqlite
|
| 7 |
+
ux/
|
|
|
main.py
CHANGED
|
@@ -30,9 +30,13 @@ def main():
|
|
| 30 |
|
| 31 |
### gpt-4-1106-preview
|
| 32 |
### gpt-3.5-turbo-1106 / gpt-3.5-turbo
|
|
|
|
| 33 |
llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)
|
|
|
|
|
|
|
| 34 |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
| 35 |
|
|
|
|
| 36 |
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
|
| 37 |
index = VectorStoreIndex.from_documents([document], service_context=service_context)
|
| 38 |
|
|
@@ -56,6 +60,8 @@ def main():
|
|
| 56 |
|
| 57 |
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
|
| 58 |
app_id="Direct Query Engine")
|
|
|
|
|
|
|
| 59 |
with tru_recorder as recording:
|
| 60 |
for question in eval_questions:
|
| 61 |
response = query_engine.query(question)
|
|
|
|
| 30 |
|
| 31 |
### gpt-4-1106-preview
|
| 32 |
### gpt-3.5-turbo-1106 / gpt-3.5-turbo
|
| 33 |
+
print("Initializing GPT 3.5 ..")
|
| 34 |
llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)
|
| 35 |
+
|
| 36 |
+
print("Initializing bge-small-en-v1.5 embedding model ..")
|
| 37 |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
| 38 |
|
| 39 |
+
print("Creating vector store ..")
|
| 40 |
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
|
| 41 |
index = VectorStoreIndex.from_documents([document], service_context=service_context)
|
| 42 |
|
|
|
|
| 60 |
|
| 61 |
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
|
| 62 |
app_id="Direct Query Engine")
|
| 63 |
+
|
| 64 |
+
print("Sending each question to llm ..")
|
| 65 |
with tru_recorder as recording:
|
| 66 |
for question in eval_questions:
|
| 67 |
response = query_engine.query(question)
|
pages/1_Leaderboard.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import asyncio
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
# https://github.com/jerryjliu/llama_index/issues/7244:
|
| 8 |
+
asyncio.set_event_loop(asyncio.new_event_loop())
|
| 9 |
+
|
| 10 |
+
from millify import millify
|
| 11 |
+
import numpy as np
|
| 12 |
+
import streamlit as st
|
| 13 |
+
from streamlit_extras.switch_page_button import switch_page
|
| 14 |
+
|
| 15 |
+
from trulens_eval.db_migration import MIGRATION_UNKNOWN_STR
|
| 16 |
+
from trulens_eval.ux.styles import CATEGORY
|
| 17 |
+
|
| 18 |
+
st.runtime.legacy_caching.clear_cache()
|
| 19 |
+
|
| 20 |
+
from trulens_eval import Tru
|
| 21 |
+
from trulens_eval.ux import styles
|
| 22 |
+
from trulens_eval.ux.components import draw_metadata
|
| 23 |
+
|
| 24 |
+
st.set_page_config(page_title="Leaderboard", layout="wide")
|
| 25 |
+
|
| 26 |
+
from trulens_eval.ux.add_logo import add_logo_and_style_overrides
|
| 27 |
+
|
| 28 |
+
add_logo_and_style_overrides()
|
| 29 |
+
|
| 30 |
+
database_url = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def streamlit_app():
    """Render the "App Leaderboard" page.

    Loads all records and feedback definitions from the TruLens database
    selected by the module-level ``database_url`` and, for every distinct
    ``app_id``, draws one row of metrics: record count, mean latency, total
    cost, total tokens, and the mean of each feedback column that has at
    least one non-null value for that app. A "Select App" button stores the
    chosen app in ``st.session_state.app`` and switches to the
    "Evaluations" page.

    Returns:
        None. When there are no records, a placeholder message is written
        and the function returns early.
    """
    tru = Tru(database_url=database_url)
    lms = tru.db

    # Set the title and subtitle of the app
    st.title("App Leaderboard")
    st.write(
        "Average feedback values displayed in the range from 0 (worst) to 1 (best)."
    )
    df, feedback_col_names = lms.get_records_and_feedback([])
    feedback_defs = lms.get_feedback_defs()
    # Feedback name -> True when a higher score is better. The name is the
    # user-supplied one when present, else the implementation's name.
    feedback_directions = {
        (
            row.feedback_json.get("supplied_name", "") or
            row.feedback_json["implementation"]["name"]
        ): row.feedback_json.get("higher_is_better", True)
        for _, row in feedback_defs.iterrows()
    }

    if df.empty:
        st.write("No records yet...")
        return

    # Sorting cannot make the frame empty, so no second emptiness check is
    # needed after this point.
    df = df.sort_values(by="app_id")

    apps = list(df.app_id.unique())
    st.markdown("""---""")

    for app in apps:
        app_df = df.loc[df.app_id == app]
        if app_df.empty:
            continue

        app_json = json.loads(app_df["app_json"].iloc[0])
        metadata = app_json.get("metadata")
        st.header(app, help=draw_metadata(metadata))

        # Only show feedback columns that have at least one value for this app.
        app_feedback_col_names = [
            col_name for col_name in feedback_col_names
            if not app_df[col_name].isna().all()
        ]
        # Four fixed metric columns, one per feedback, plus the button column.
        col1, col2, col3, col4, *feedback_cols, col99 = st.columns(
            5 + len(app_feedback_col_names)
        )
        # Latencies stored as the migration placeholder are excluded from the mean.
        latency_mean = (
            app_df["latency"].
            apply(lambda td: td if td != MIGRATION_UNKNOWN_STR else None).mean()
        )

        col1.metric("Records", len(app_df))
        col2.metric(
            "Average Latency (Seconds)",
            (
                f"{millify(round(latency_mean, 5), precision=2)}"
                if not math.isnan(latency_mean) else "nan"
            ),
        )

        total_cost = sum(cost for cost in app_df.total_cost if cost is not None)
        col3.metric(
            "Total Cost (USD)",
            f"${millify(round(total_cost, 5), precision=2)}",
        )

        total_tokens = sum(
            tokens for tokens in app_df.total_tokens if tokens is not None
        )
        col4.metric("Total Tokens", millify(total_tokens, precision=2))

        # feedback_cols has exactly one column per name in app_feedback_col_names.
        for metric_col, col_name in zip(feedback_cols, app_feedback_col_names):
            mean = app_df[col_name].mean()

            st.write(
                styles.stmetricdelta_hidearrow,
                unsafe_allow_html=True,
            )

            higher_is_better = feedback_directions.get(col_name, True)

            if "distance" in col_name:
                # Distance-style feedback is shown without a category delta.
                metric_col.metric(
                    label=col_name,
                    value=f"{round(mean, 2)}",
                    delta_color="normal"
                )
            else:
                cat = CATEGORY.of_score(mean, higher_is_better=higher_is_better)
                metric_col.metric(
                    label=col_name,
                    value=f"{round(mean, 2)}",
                    delta=f"{cat.icon} {cat.adjective}",
                    delta_color=(
                        "normal" if cat.compare(
                            mean, CATEGORY.PASS[cat.direction].threshold
                        ) else "inverse"
                    ),
                )

        with col99:
            if st.button("Select App", key=f"app-selector-{app}"):
                st.session_state.app = app
                switch_page("Evaluations")

        st.markdown("""---""")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def main():
    """Run the app by delegating to ``streamlit_app``."""
    streamlit_app()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
    # Command-line entry point: read the optional --database-url flag, store
    # it in the module-level ``database_url`` and render the page.
    parser = argparse.ArgumentParser()
    parser.add_argument("--database-url", default=None)

    try:
        args = parser.parse_args()
    except SystemExit as exit_request:
        # argparse raises SystemExit for --help and for invalid arguments.
        # Streamlit currently prevents the program from exiting normally in
        # that case, so the exit is re-issued explicitly with the same code.
        sys.exit(exit_request.code)

    database_url = args.database_url

    main()
|
pages/2_Evaluations.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
from typing import Iterable, Tuple
|
| 4 |
+
|
| 5 |
+
# https://github.com/jerryjliu/llama_index/issues/7244:
|
| 6 |
+
asyncio.set_event_loop(asyncio.new_event_loop())
|
| 7 |
+
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from st_aggrid import AgGrid
|
| 12 |
+
from st_aggrid.grid_options_builder import GridOptionsBuilder
|
| 13 |
+
from st_aggrid.shared import GridUpdateMode
|
| 14 |
+
from st_aggrid.shared import JsCode
|
| 15 |
+
import streamlit as st
|
| 16 |
+
from ux.add_logo import add_logo_and_style_overrides
|
| 17 |
+
from ux.styles import CATEGORY
|
| 18 |
+
|
| 19 |
+
from trulens_eval import Tru
|
| 20 |
+
from trulens_eval.app import Agent
|
| 21 |
+
from trulens_eval.app import ComponentView
|
| 22 |
+
from trulens_eval.app import instrumented_component_views
|
| 23 |
+
from trulens_eval.app import LLM
|
| 24 |
+
from trulens_eval.app import Other
|
| 25 |
+
from trulens_eval.app import Prompt
|
| 26 |
+
from trulens_eval.app import Tool
|
| 27 |
+
from trulens_eval.db import MULTI_CALL_NAME_DELIMITER
|
| 28 |
+
from trulens_eval.react_components.record_viewer import record_viewer
|
| 29 |
+
from trulens_eval.schema import Record
|
| 30 |
+
from trulens_eval.schema import Select
|
| 31 |
+
from trulens_eval.utils.json import jsonify_for_ui
|
| 32 |
+
from trulens_eval.utils.serial import Lens
|
| 33 |
+
from trulens_eval.ux.components import draw_agent_info
|
| 34 |
+
from trulens_eval.ux.components import draw_call
|
| 35 |
+
from trulens_eval.ux.components import draw_llm_info
|
| 36 |
+
from trulens_eval.ux.components import draw_metadata
|
| 37 |
+
from trulens_eval.ux.components import draw_prompt_info
|
| 38 |
+
from trulens_eval.ux.components import draw_tool_info
|
| 39 |
+
from trulens_eval.ux.components import render_selector_markdown
|
| 40 |
+
from trulens_eval.ux.components import write_or_json
|
| 41 |
+
from trulens_eval.ux.styles import cellstyle_jscode
|
| 42 |
+
|
| 43 |
+
st.set_page_config(page_title="Evaluations", layout="wide")
|
| 44 |
+
|
| 45 |
+
st.title("Evaluations")
|
| 46 |
+
|
| 47 |
+
st.runtime.legacy_caching.clear_cache()
|
| 48 |
+
|
| 49 |
+
add_logo_and_style_overrides()
|
| 50 |
+
|
| 51 |
+
tru = Tru()
|
| 52 |
+
lms = tru.db
|
| 53 |
+
|
| 54 |
+
df_results, feedback_cols = lms.get_records_and_feedback([])
|
| 55 |
+
|
| 56 |
+
# TODO: remove code redundancy / redundant database calls
|
| 57 |
+
# Map each feedback name to a direction label. The name is the user-supplied
# one when present, otherwise the implementation's name; a definition without
# an explicit "higher_is_better" flag counts as higher-is-better.
feedback_directions = {}
for _, _def_row in lms.get_feedback_defs().iterrows():
    _feedback_json = _def_row.feedback_json
    _feedback_name = (
        _feedback_json.get("supplied_name", "")
        or _feedback_json["implementation"]["name"]
    )
    if _feedback_json.get("higher_is_better", True):
        feedback_directions[_feedback_name] = "HIGHER_IS_BETTER"
    else:
        feedback_directions[_feedback_name] = "LOWER_IS_BETTER"

# Direction assumed for feedback columns that have no stored definition.
default_direction = "HIGHER_IS_BETTER"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def render_component(query, component, header=True):
    """Draw one app component: optional header, class info, type-specific details.

    Args:
        query: Path of the component within the wrapped app; shown in the
            header and forwarded to the type-specific drawing routine.
        component: Component view to draw. Its ``cls`` and, for components
            without a dedicated routine, its ``json`` are read.
        header: When true, write a "Component <selector>" heading first.
    """
    # Draw the accessor/path within the wrapped app of the component.
    if header:
        selector_md = render_selector_markdown(Select.for_app(query))
        st.markdown(f"##### Component {selector_md}")

    # Draw the python class information of this component, naming the base
    # class as well when it differs from the class itself.
    component_cls = component.cls
    parent_cls = component_cls.base_class()
    class_label = f"__{component_cls!r}__"
    if str(parent_cls) != str(component_cls):
        class_label += f" < __{parent_cls!r}__"
    st.write("Python class: " + class_label)

    # Per-component-type drawing routines, tried in this order.
    typed_drawers = (
        (LLM, draw_llm_info),
        (Prompt, draw_prompt_info),
        (Agent, draw_agent_info),
        (Tool, draw_tool_info),
    )
    for component_type, draw in typed_drawers:
        if isinstance(component, component_type):
            draw(component=component, query=query)
            return

    # No dedicated routine: dump the component json under a fitting title.
    if isinstance(component, Other):
        expander_title = "Uncategorized Component Details:"
    else:
        expander_title = "Unhandled Component Details:"
    with st.expander(expander_title):
        st.json(jsonify_for_ui(component.json))
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def render_record_metrics(app_df: pd.DataFrame, selected_rows: pd.DataFrame):
    """Render record-level metrics (total tokens, cost, latency).

    Cost and latency are shown with a delta against the mean over all rows
    of ``app_df`` that share the selected record's ``app_id``.

    Args:
        app_df: Records to average over; the ``app_id``, ``total_cost`` and
            ``latency`` columns are read.
        selected_rows: Frame whose row labelled ``0`` is the record to show.
    """
    selected_app_id = selected_rows["app_id"][0]
    same_app_df = app_df[app_df["app_id"] == selected_app_id]

    token_col, cost_col, latency_col = st.columns(3)

    token_col.metric(
        label="Total tokens (#)", value=selected_rows["total_tokens"][0]
    )

    record_cost = selected_rows["total_cost"][0]
    cost_vs_mean = record_cost - same_app_df["total_cost"].mean()
    cost_col.metric(
        label="Total cost (USD)",
        value=record_cost,
        delta=f"{cost_vs_mean:.3g}",
        # "inverse": an increase over the average is coloured as bad.
        delta_color="inverse",
    )

    record_latency = selected_rows["latency"][0]
    latency_vs_mean = record_latency - same_app_df["latency"].mean()
    latency_col.metric(
        label="Latency (s)",
        value=record_latency,
        delta=f"{latency_vs_mean:.3g}s",
        delta_color="inverse",
    )
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# Main body of the Evaluations page: an application filter, a "Records" tab
# (record grid plus the details of the selected record) and a
# "Feedback Functions" tab (one histogram per feedback column).
if df_results.empty:
    st.write("No records yet...")

else:
    apps = list(df_results.app_id.unique())
    # Preselect the app stored in the session state when there is one;
    # otherwise preselect every app.
    if "app" in st.session_state:
        app = st.session_state.app
    else:
        app = apps

    st.experimental_set_query_params(app=app)

    options = st.multiselect("Filter Applications", apps, default=app)

    if len(options) == 0:
        st.header("All Applications")
        app_df = df_results

    elif len(options) == 1:
        st.header(options[0])

        app_df = df_results[df_results.app_id.isin(options)]

    else:
        st.header("Multiple Applications Selected")

        app_df = df_results[df_results.app_id.isin(options)]

    tab1, tab2 = st.tabs(["Records", "Feedback Functions"])

    with tab1:
        # Work on a copy so that decoding the text columns below does not
        # write into app_df / df_results (or into a slice of them).
        evaluations_df = app_df.copy()

        # By default the cells in the df are unicode-escaped, so we have to reverse it.
        unescape = np.vectorize(
            lambda x: x.encode('utf-8').decode('unicode-escape')
        )
        evaluations_df['input'] = unescape(evaluations_df['input'].to_numpy())
        evaluations_df['output'] = unescape(
            evaluations_df['output'].to_numpy()
        )

        gb = GridOptionsBuilder.from_dataframe(evaluations_df)

        gb.configure_column("type", header_name="App Type")
        gb.configure_column("record_json", header_name="Record JSON", hide=True)
        gb.configure_column("app_json", header_name="App JSON", hide=True)
        gb.configure_column("cost_json", header_name="Cost JSON", hide=True)
        gb.configure_column("perf_json", header_name="Perf. JSON", hide=True)

        gb.configure_column("record_id", header_name="Record ID", hide=True)
        gb.configure_column("app_id", header_name="App ID")

        gb.configure_column("feedback_id", header_name="Feedback ID", hide=True)
        gb.configure_column("input", header_name="User Input")
        gb.configure_column(
            "output",
            header_name="Response",
        )
        gb.configure_column("total_tokens", header_name="Total Tokens (#)")
        gb.configure_column("total_cost", header_name="Total Cost (USD)")
        gb.configure_column("latency", header_name="Latency (Seconds)")
        gb.configure_column("tags", header_name="Tags")
        gb.configure_column("ts", header_name="Time Stamp", sort="desc")

        non_feedback_cols = [
            "app_id",
            "type",
            "ts",
            "total_tokens",
            "total_cost",
            "record_json",
            "latency",
            "record_id",
            "app_id",
            "cost_json",
            "app_json",
            "input",
            "output",
            "perf_json",
        ]

        # Every remaining column is a feedback column; "<name>_calls"
        # companions are kept in the data but hidden in the grid.
        for feedback_col in evaluations_df.columns.drop(non_feedback_cols):
            if "distance" in feedback_col:
                gb.configure_column(
                    feedback_col, hide=feedback_col.endswith("_calls")
                )
            else:
                # cell highlight depending on feedback direction
                cellstyle = JsCode(
                    cellstyle_jscode[feedback_directions.get(
                        feedback_col, default_direction
                    )]
                )

                gb.configure_column(
                    feedback_col,
                    cellStyle=cellstyle,
                    hide=feedback_col.endswith("_calls")
                )

        gb.configure_pagination()
        gb.configure_side_bar()
        gb.configure_selection(selection_mode="single", use_checkbox=False)
        gridOptions = gb.build()
        data = AgGrid(
            evaluations_df,
            gridOptions=gridOptions,
            update_mode=GridUpdateMode.SELECTION_CHANGED,
            allow_unsafe_jscode=True,
        )

        selected_rows = data["selected_rows"]
        selected_rows = pd.DataFrame(selected_rows)

        if len(selected_rows) == 0:
            st.write("Hint: select a row to display details of a record")

        else:
            # Start the record specific section
            st.divider()

            # Breadcrumbs
            st.caption(
                f"{selected_rows['app_id'][0]} / {selected_rows['record_id'][0]}"
            )
            st.header(f"{selected_rows['record_id'][0]}")

            render_record_metrics(app_df, selected_rows)

            st.markdown("")

            prompt = selected_rows["input"][0]
            response = selected_rows["output"][0]
            details = selected_rows["app_json"][0]

            app_json = json.loads(
                details
            )  # apps may not be deserializable, don't try to, keep it json.

            row = selected_rows.head().iloc[0]

            # Display input/response side by side. In each column, we put them in tabs mainly for
            # formatting/styling purposes.
            input_col, response_col = st.columns(2)

            (input_tab,) = input_col.tabs(["Input"])
            with input_tab:
                with st.expander(
                        f"Input {render_selector_markdown(Select.RecordInput)}",
                        expanded=True):
                    write_or_json(st, obj=prompt)

            (response_tab,) = response_col.tabs(["Response"])
            with response_tab:
                with st.expander(
                        f"Response {render_selector_markdown(Select.RecordOutput)}",
                        expanded=True):
                    write_or_json(st, obj=response)

            feedback_tab, metadata_tab = st.tabs(["Feedback", "Metadata"])

            with metadata_tab:
                metadata = app_json.get("metadata")
                if metadata:
                    with st.expander("Metadata"):
                        st.markdown(draw_metadata(metadata))
                else:
                    st.write("No metadata found")

            with feedback_tab:
                if len(feedback_cols) == 0:
                    st.write("No feedback details")

                for fcol in feedback_cols:
                    feedback_name = fcol
                    feedback_result = row[fcol]

                    # For a multi-call column name, the part before the
                    # delimiter is used to find the "<name>_calls" column.
                    if MULTI_CALL_NAME_DELIMITER in fcol:
                        fcol = fcol.split(MULTI_CALL_NAME_DELIMITER)[0]
                    feedback_calls = row[f"{fcol}_calls"]

                    # Defined inside the loop on purpose: it reads the current
                    # feedback_name / fcol and is called before the next
                    # iteration rebinds them.
                    def display_feedback_call(call):
                        """Show the calls of one feedback as a coloured table."""

                        def highlight(s):
                            """Return one background style per cell of row ``s``."""
                            if "distance" in feedback_name:
                                return [
                                    f"background-color: {CATEGORY.UNKNOWN.color}"
                                ] * len(s)
                            cat = CATEGORY.of_score(
                                s.result,
                                higher_is_better=feedback_directions.get(
                                    fcol, default_direction
                                ) == default_direction
                            )
                            return [f"background-color: {cat.color}"] * len(s)

                        if call is not None and len(call) > 0:
                            df = pd.DataFrame.from_records(
                                [entry["args"] for entry in call]
                            )
                            # A missing return value is shown as -1.
                            df["result"] = pd.DataFrame(
                                [
                                    float(entry["ret"])
                                    if entry["ret"] is not None else -1
                                    for entry in call
                                ]
                            )
                            df["meta"] = pd.Series(
                                [entry["meta"] for entry in call]
                            )
                            # Expand each meta dict into its own columns.
                            df = df.join(
                                df.meta.apply(lambda m: pd.Series(m))
                            ).drop(columns="meta")

                            st.dataframe(
                                df.style.apply(highlight, axis=1).format(
                                    "{:.2}", subset=["result"]
                                )
                            )

                        else:
                            st.text("No feedback details.")

                    with st.expander(f"{feedback_name} = {feedback_result}",
                                     expanded=True):
                        display_feedback_call(feedback_calls)

            record_str = selected_rows["record_json"][0]
            record_json = json.loads(record_str)
            record = Record.model_validate(record_json)

            classes: Iterable[Tuple[Lens, ComponentView]
                             ] = list(instrumented_component_views(app_json))
            classes_map = {path: view for path, view in classes}

            st.markdown("")
            st.subheader("Timeline")
            val = record_viewer(record_json, app_json)
            st.markdown("")

            match_query = None

            # Assumes record_json['perf']['start_time'] is always present
            if val != "":
                # The viewer's value is compared with each call's start time
                # to find the call to show.
                match = None
                for call in record.calls:
                    if call.perf.start_time.isoformat() == val:
                        match = call
                        break

                if match:
                    length = len(match.stack)
                    app_call = match.stack[length - 1]

                    match_query = match.top().path

                    st.subheader(
                        f"{app_call.method.obj.cls.name} {render_selector_markdown(Select.for_app(match_query))}"
                    )

                    draw_call(match)

                    view = classes_map.get(match_query)
                    if view is not None:
                        render_component(
                            query=match_query, component=view, header=False
                        )
                    else:
                        st.write(
                            f"Call by `{match_query}` was not associated with any instrumented"
                            " component."
                        )
                        # Look up whether there was any data at that path even if not an instrumented component:

                        try:
                            app_component_json = list(
                                match_query.get(app_json)
                            )[0]
                            if app_component_json is not None:
                                with st.expander(
                                        "Uninstrumented app component details."
                                ):
                                    st.json(app_component_json)
                        except Exception:
                            st.write(
                                f"Recorded invocation by component `{match_query}` but cannot find this component in the app json."
                            )

                else:
                    st.text("No match found")
            else:
                st.subheader(f"App {render_selector_markdown(Select.App)}")
                with st.expander("App Details:"):
                    st.json(jsonify_for_ui(app_json))

            if match_query is not None:
                # Placeholder that is filled with a heading only if at least
                # one subcomponent is rendered below it.
                container = st.empty()

                has_subcomponents = False
                for query, component in classes:
                    if not match_query.is_immediate_prefix_of(query):
                        continue

                    if len(query.path) == 0:
                        # Skip App, will still list App.app under "app".
                        continue

                    has_subcomponents = True
                    render_component(query, component)

                if has_subcomponents:
                    container.markdown("#### Subcomponents:")

            st.header("More options:")

            if st.button("Display full app json"):
                st.write(jsonify_for_ui(app_json))

            if st.button("Display full record json"):
                st.write(jsonify_for_ui(record_json))

    with tab2:
        feedback = feedback_cols
        cols = 4
        # Ceiling division: just enough rows of `cols` plots, with no extra
        # empty row when the count is an exact multiple of `cols`.
        rows = (len(feedback) + cols - 1) // cols

        for row_num in range(rows):
            with st.container():
                columns = st.columns(cols)
                for col_num in range(cols):
                    with columns[col_num]:
                        ind = row_num * cols + col_num
                        if ind < len(feedback):
                            # Generate histogram
                            fig, ax = plt.subplots()
                            bins = [
                                0, 0.2, 0.4, 0.6, 0.8, 1.0
                            ]  # Quintile buckets
                            ax.hist(
                                app_df[feedback[ind]],
                                bins=bins,
                                edgecolor="black",
                                color="#2D736D"
                            )
                            ax.set_xlabel("Feedback Value")
                            ax.set_ylabel("Frequency")
                            ax.set_title(feedback[ind], loc="center")
                            st.pyplot(fig)
                            # Release the figure; pyplot otherwise keeps
                            # every figure it created alive.
                            plt.close(fig)
|
pages/3_app.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Build a listing of ./raw_documents and of the current directory for display.
try:
    raw_docs_files = ", ".join(os.listdir("./raw_documents"))
    curr_directory_files = ", ".join(os.listdir("."))
    file_ls_str = raw_docs_files + "\n\n" + curr_directory_files
except OSError:
    # Best effort: a missing or unreadable directory shows "NA". Only
    # filesystem errors are caught, so unrelated failures are not hidden.
    file_ls_str = "NA"

st.write(f"Hello World! File list: {file_ls_str}")
|
streamlit_app.py
CHANGED
|
@@ -22,8 +22,11 @@ evaluation_path = pkg_resources.resource_filename(
|
|
| 22 |
ux_path = pkg_resources.resource_filename(
|
| 23 |
"trulens_eval", "ux"
|
| 24 |
)
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
if os.path.exists("./ux"):
|
| 28 |
shutil.rmtree("./ux")
|
| 29 |
shutil.copytree(ux_path, "./ux")
|
|
|
|
| 22 |
ux_path = pkg_resources.resource_filename(
|
| 23 |
"trulens_eval", "ux"
|
| 24 |
)
|
| 25 |
+
|
| 26 |
+
os.makedirs("./pages", exist_ok=True)
|
| 27 |
+
shutil.copyfile(leaderboard_path, os.path.join("./pages", "1_Leaderboard.py"))
|
| 28 |
+
shutil.copyfile(evaluation_path, os.path.join("./pages", "2_Evaluations.py"))
|
| 29 |
+
|
| 30 |
if os.path.exists("./ux"):
|
| 31 |
shutil.rmtree("./ux")
|
| 32 |
shutil.copytree(ux_path, "./ux")
|