shezamunir committed on
Commit
35c36b4
·
verified ·
1 Parent(s): 72b06da

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +101 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,103 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from PIL import Image
5
+ import base64
6
+ from io import BytesIO
7
+
8
# --- Page config ---
# Sets the page title and requests the "wide" layout for the whole app.
st.set_page_config(
    layout="wide",
    page_title="VeriFact Leaderboard",
)
10
+
11
# --- Load images ---
@st.cache_data
def load_image(path):
    """Read the image at *path* fully into memory and return it.

    ``Image.open`` is lazy: it identifies the file but keeps it open until the
    pixel data is actually read. Copying inside a ``with`` block forces the
    read and releases the file handle before the image is returned.

    Args:
        path: Anything ``Image.open`` accepts (a filename or file object).

    Returns:
        An in-memory copy of the image, no longer tied to the source file.
    """
    with Image.open(path) as img:
        # copy() loads the pixel data, so the result stays usable after the
        # context manager has closed the underlying file.
        return img.copy()
15
+
16
logo = load_image("factrbench.png")
chart = load_image("test.png")

# Display logo: the image is re-encoded as PNG and inlined as a base64 data
# URI so it can be centred and sized with inline CSS.
buf = BytesIO()
logo.save(buf, format="PNG")
logo_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
st.markdown(f"""
<div style="text-align:center; margin-bottom:20px;">
<img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
</div>
""", unsafe_allow_html=True)

# Header
# The emoji below were stored as mojibake (their UTF-8 bytes decoded with a
# single-byte code page); they are restored to the intended characters.
# NOTE(review): the links line starts with a literal "#" and all three hrefs
# are empty -- presumably placeholders; confirm before publishing.
st.markdown("""
<div style="text-align:center;">
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation...
</p>
<p style="font-size:20px;">
# 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
</p>
</div>
""", unsafe_allow_html=True)
41
+
42
# --- Load data ---
@st.cache_data
def load_data(path="models.json"):
    """Load per-model scores and add an average column plus per-column ranks.

    The file at *path* is read as JSON Lines (one record per line) and must
    provide the score columns ``T1`` .. ``T11``.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The loaded DataFrame with these columns added:
        ``Avg`` -- row mean of ``T1`` .. ``T11``, rounded to one decimal;
        ``<col>_rank`` for each of ``T1`` .. ``T11`` and ``Avg`` -- integer
        rank where 1 is the highest score and ties share the smallest rank.

    Raises:
        KeyError: If any of ``T1`` .. ``T11`` is absent from the loaded data.
    """
    task_cols = [f"T{i}" for i in range(1, 12)]

    df = pd.read_json(path, lines=True)
    df["Avg"] = df[task_cols].mean(axis=1).round(1)

    # Compute rank per column
    for col in [*task_cols, "Avg"]:
        # na_option="bottom" ranks a missing score last; with the default
        # ("keep") the rank would be NaN and the int cast would raise.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
51
+
52
df = load_data()

# --- Tabs ---
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])


def _score_cell(value, rank, max_rank):
    """Return the ``<td>`` markup for one score, shaded according to *rank*.

    Rank 1 gets the strongest shade and bold text; rank ``max_rank`` is white.
    """
    # "or 1" avoids a zero divisor when the worst rank in the column is 1.
    norm = 1 - (rank - 1) / (max_rank - 1 or 1)
    # Interpolate from white (norm 0) to rgb(182,243,255) (norm 1).
    # NOTE(review): the caption says "shaded green", but with blue pinned at
    # 255 the strongest shade looks light blue -- confirm the intended palette.
    r = int(255 - norm * (255 - 182))
    g = int(255 - norm * (255 - 243))
    b = 255
    style = f"background-color:rgb({r},{g},{b}); padding:4px;"
    bold = "font-weight:bold;" if rank == 1 else ""
    return f"<td style='{style}{bold}'>{value}</td>"


def _leaderboard_html(frame, columns, worst_ranks):
    """Render *frame* as an HTML table: a header row, then one row per model."""
    # Collect fragments and join once instead of growing a string with +=.
    parts = ["<table style='border-collapse:collapse; width:100%;'>"]
    # header
    header_cells = "".join(f"<th style='padding:4px;'>{c}</th>" for c in columns)
    parts.append("<tr>" + header_cells + "</tr>")
    # rows
    for _, row in frame.iterrows():
        parts.append("<tr>")
        for c in columns:
            if c == "Model":
                # The model text is inserted verbatim (no HTML escaping).
                parts.append(
                    f"<td style='padding:4px;text-align:left;'>{row[c]}</td>"
                )
            else:
                parts.append(_score_cell(row[c], row[f"{c}_rank"], worst_ranks[c]))
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)


with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")
    # Build HTML table
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    # Worst (largest) rank per score column, used to normalise the shading.
    max_ranks = {col: df[f"{col}_rank"].max() for col in cols if col != "Model"}

    html = _leaderboard_html(df, cols, max_ranks)
    st.markdown(html, unsafe_allow_html=True)
87
+
88
with tab2:
    # Inline the chart as a base64-encoded PNG data URI inside a centred <div>.
    png_stream = BytesIO()
    chart.save(png_stream, format="PNG")
    encoded_png = base64.b64encode(png_stream.getvalue())
    chart_b64 = encoded_png.decode("utf-8")
    chart_markup = f"""
<div style="text-align:center;">
<img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
</div>
"""
    st.markdown(chart_markup, unsafe_allow_html=True)

    # Each section is a markdown heading followed by its descriptive text.
    sections = (
        ("### What is VERIFACT?", "VERIFACT is a factuality evaluation framework..."),
        (
            "### What is FACTRBENCH?",
            "FACTRBENCH is the first benchmark for long-form factuality evaluation...",
        ),
        ("### Key Findings", "VERIFACT outperforms prior methods [...]"),
    )
    for heading, body_text in sections:
        st.markdown(heading)
        st.write(body_text)
103