XinLiu-cs committed
Commit 296ef9e · 1 Parent(s): c66176f

update logo and description

.history/app_20250318180802.py ADDED
@@ -0,0 +1,347 @@
+ import streamlit as st
+ import pandas as pd
+ from PIL import Image
+ import base64
+ from io import BytesIO
+
+ # Set up page config
+ st.set_page_config(
+     page_title="VeriFact Leaderboard",
+     layout="wide"
+ )
+
+ # load header
+ with open("_header.md", "r") as f:
+     HEADER_MD = f.read()
+
+ # Load the image
+ image = Image.open("verifact_steps.png")
+ logo_image = Image.open("verifact_logo.png")
+
+ # Custom CSS for the page
+ st.markdown(
+     """
+     <style>
+     @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
+
+     html, body, [class*="css"] {
+         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
+         background-color: #f9f9f9; /* Light grey background */
+     }
+
+     .title {
+         font-size: 42px;
+         font-weight: bold;
+         text-align: center;
+         color: #333;
+         margin-bottom: 5px;
+     }
+
+     .description {
+         font-size: 22px;
+         text-align: center;
+         margin-bottom: 30px;
+         color: #555;
+     }
+
+     .header, .metric {
+         align-items: left;
+         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
+         margin-bottom: 20px;
+     }
+
+     .container {
+         max-width: 1000px;
+         margin: 0 auto;
+         padding: 5px;
+     }
+
+     table {
+         width: 100%;
+         border-collapse: collapse;
+         border-radius: 10px;
+         overflow: hidden;
+     }
+
+     th, td {
+         padding: 8px;
+         text-align: center;
+         border: 1px solid #ddd;
+         font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
+         font-size: 16px;
+         transition: background-color 0.3s;
+     }
+
+     th {
+         background-color: #f2f2f2;
+         font-weight: bold;
+     }
+
+     td:hover {
+         background-color: #eaeaea;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # Display title and description
+ st.markdown('<div class="container">', unsafe_allow_html=True)
+ # st.image(logo_image, output_format="PNG", width=200)
+
+ # Convert the image to base64
+ buffered = BytesIO()
+ logo_image.save(buffered, format="PNG")
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
+ st.markdown(
+     f"""
+     <style>
+     .logo-container {{
+         display: flex;
+         justify-content: flex-start; /* Aligns to the left */
+     }}
+     .logo-container img {{
+         width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
+         margin: 0 auto;
+         max-width: 700px; /* Set a maximum width */
+         background-color: transparent;
+     }}
+     </style>
+     <div class="logo-container">
+         <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
+     </div>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # header_md_text = HEADER_MD # make some parameters later
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
+
+ st.markdown(
+     '''
+     <div class="header">
+         <br/>
+         <p style="font-size:22px;">
+             VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
+         </p>
+         <p style="font-size:20px;">
+             # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
+             ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
+         </p>
+     </div>
+     ''',
+     unsafe_allow_html=True
+ )
+
+
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
+ #             unsafe_allow_html=True)
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
+ st.markdown('</div>', unsafe_allow_html=True)
+
+ # Load the data
+ data_path = "verifact_data.csv"
+ df = pd.read_csv(data_path)
+
+ # Assign ranks within each tier based on factuality_score
+ df['rank'] = df.groupby('tier')['Overall'].rank(
+     ascending=False, method='min').astype(int)
+
+ # Replace NaN values with '-'
+ df.fillna('-', inplace=True)
+
+ df['original_order'] = df.groupby('tier').cumcount()
+
+ # Create tabs
+ st.markdown("""
+     <style>
+     .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
+         font-size: 20px;
+     }
+     </style>
+     """, unsafe_allow_html=True)
+
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
+
+ # Tab 1: Leaderboard
+ with tab1:
+     # df['original_order'] = df.groupby('tier').cumcount()
+     # print(df['original_order'])
+
+     # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
+     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+     st.markdown("""
+         <div class="metric" style="font-size:20px; font-weight: bold;">
+             Metrics Explanation
+         </div>
+         """, unsafe_allow_html=True)
+
+     st.markdown("""
+         <div class="metric" style="font-size:16px;">
+             <br/>
+             <p>
+             <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
+             </p>
+             <p>
+             🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
+             </p>
+         </div>
+         """,
+         unsafe_allow_html=True
+     )
+
+     st.markdown("""
+         <style>
+         /* Selectbox text */
+         div[data-baseweb="select"] > div {
+             font-size: 20px;
+         }
+
+         /* Dropdown options */
+         div[role="listbox"] ul li {
+             font-size: 20px !important;
+         }
+
+         /* Checkbox label */
+         .stCheckbox label p {
+             font-size: 20px !important;
+         }
+
+         /* Selectbox label */
+         .stSelectbox label p {
+             font-size: 20px !important;
+         }
+         </style>
+         """, unsafe_allow_html=True)
+
+     # Dropdown menu to filter tiers
+     tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
+     selected_tier = st.selectbox('Select metric:', tiers)
+
+     # Filter the data based on the selected tier
+     if selected_tier != 'All Metrics':
+         filtered_df = df[df['tier'] == selected_tier]
+     else:
+         filtered_df = df
+
+     sort_by_factuality = st.checkbox('Sort by overall score')
+
+     # Sort the dataframe based on Factuality Score if the checkbox is selected
+     if sort_by_factuality:
+         updated_filtered_df = filtered_df.sort_values(
+             by=['tier', 'Overall'], ascending=[True, False]
+         )
+     else:
+         updated_filtered_df = filtered_df.sort_values(
+             by=['tier', 'original_order']
+         )
+
+     # Create HTML for the table
+     if selected_tier == 'All Metrics':
+         html = '''
+         <table>
+             <thead>
+                 <tr>
+                     <th>Metric</th>
+                     <th>Rank</th>
+                     <th>Model</th>
+                     <th>Factbench</th>
+                     <th>Reddit</th>
+                     <th>Overall</th>
+                 </tr>
+             </thead>
+             <tbody>
+         '''
+     else:
+         html = '''
+         <table>
+             <thead>
+                 <tr>
+                     <th>Rank</th>
+                     <th>Model</th>
+                     <th>Factbench</th>
+                     <th>Reddit</th>
+                     <th>Overall</th>
+                 </tr>
+             </thead>
+             <tbody>
+         '''
+
+     # Generate the rows of the table
+     current_tier = None
+     for i, row in updated_filtered_df.iterrows():
+         html += '<tr>'
+
+         # Only display the 'Tier' column if 'All Tiers' is selected
+         if selected_tier == 'All Metrics':
+             if row['tier'] != current_tier:
+                 current_tier = row['tier']
+                 html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
+
+         # Fill in model and scores
+         html += f'''
+             <td>{row['rank']}</td>
+             <td>{row['model']}</td>
+             <td>{row['FactBench']}</td>
+             <td>{row['Reddit']}</td>
+             <td>{row['Overall']}</td>
+         </tr>
+         '''
+
+     # Close the table
+     html += '''
+     </table>
+     '''
+
+     # Display the table
+     st.markdown(html, unsafe_allow_html=True)
+
+     st.markdown('</div>', unsafe_allow_html=True)
+
+ # Tab 2: Details
+ with tab2:
+     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+     # st.markdown('<div class="title"></div>',
+     #             unsafe_allow_html=True)
+     st.image(image, use_column_width=True)
+
+     st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
+     st.write(
+         "Language models (LMs) are widely used by an increasing number of users, "
+         "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
+         "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
+         "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
+     )
+
+     st.markdown('### Content Categorization')
+     st.write(
+         "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
+         "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
+         "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
+     )
+
+     st.markdown('### Hallucination Prompts & FactBench Dataset')
+     st.write(
+         "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
+         "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
+         "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
+         "regularly updated with new prompts."
+     )
+
+     st.markdown('</div>', unsafe_allow_html=True)
+
+ # # Tab 3: Links
+ # with tab3:
+ #     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+ #     st.markdown('<div class="title">Submit your model information on our Github</div>',
+ #                 unsafe_allow_html=True)
+
+ #     st.markdown(
+ #         '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
+ #     st.markdown(
+ #         '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
+
+ #     st.markdown('</div>', unsafe_allow_html=True)
.history/app_20250318181927.py ADDED
@@ -0,0 +1,347 @@
.history/app_20250318182019.py ADDED
@@ -0,0 +1,347 @@
.history/app_20250318182053.py ADDED
@@ -0,0 +1,347 @@
.history/app_20250318182342.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ # Set up page config
8
+ st.set_page_config(
9
+ page_title="VeriFact Leaderboard",
10
+ layout="wide"
11
+ )
12
+
13
+ # load header
14
+ with open("_header.md", "r") as f:
15
+ HEADER_MD = f.read()
16
+
17
+ # Load the image
18
+ image = Image.open("./verifact_steps.pdf")
19
+ logo_image = Image.open("./factrbench.png")
20
+
21
+ # Custom CSS for the page
22
+ st.markdown(
23
+ """
24
+ <style>
25
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ html, body, [class*="css"] {
28
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ background-color: #f9f9f9; /* Light grey background */
30
+ }
31
+
32
+ .title {
33
+ font-size: 42px;
34
+ font-weight: bold;
35
+ text-align: center;
36
+ color: #333;
37
+ margin-bottom: 5px;
38
+ }
39
+
40
+ .description {
41
+ font-size: 22px;
42
+ text-align: center;
43
+ margin-bottom: 30px;
44
+ color: #555;
45
+ }
46
+
47
+ .header, .metric {
48
+ align-items: left;
49
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ margin-bottom: 20px;
51
+ }
52
+
53
+ .container {
54
+ max-width: 1000px;
55
+ margin: 0 auto;
56
+ padding: 5px;
57
+ }
58
+
59
+ table {
60
+ width: 100%;
61
+ border-collapse: collapse;
62
+ border-radius: 10px;
63
+ overflow: hidden;
64
+ }
65
+
66
+ th, td {
67
+ padding: 8px;
68
+ text-align: center;
69
+ border: 1px solid #ddd;
70
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ font-size: 16px;
72
+ transition: background-color 0.3s;
73
+ }
74
+
75
+ th {
76
+ background-color: #f2f2f2;
77
+ font-weight: bold;
78
+ }
79
+
80
+ td:hover {
81
+ background-color: #eaeaea;
82
+ }
83
+ </style>
84
+ """,
85
+ unsafe_allow_html=True
86
+ )
87
+
88
+ # Display title and description
89
+ st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # Convert the image to base64
93
+ buffered = BytesIO()
94
+ logo_image.save(buffered, format="PNG")
95
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ st.markdown(
97
+ f"""
98
+ <style>
99
+ .logo-container {{
100
+ display: flex;
101
+ justify-content: flex-start; /* Aligns to the left */
102
+ }}
103
+ .logo-container img {{
104
+ width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ margin: 0 auto;
106
+ max-width: 700px; /* Set a maximum width */
107
+ background-color: transparent;
108
+ }}
109
+ </style>
110
+ <div class="logo-container">
111
+ <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ </div>
113
+ """,
114
+ unsafe_allow_html=True
115
+ )
116
+
117
+ # header_md_text = HEADER_MD # make some parameters later
118
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ st.markdown(
121
+ '''
122
+ <div class="header">
123
+ <br/>
124
+ <p style="font-size:22px;">
125
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ </p>
127
+ <p style="font-size:20px;">
128
+ # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ </p>
131
+ </div>
132
+ ''',
133
+ unsafe_allow_html=True
134
+ )
135
+
136
+
137
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # unsafe_allow_html=True)
139
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # Load the data
143
+ data_path = "verifact_data.csv"
144
+ df = pd.read_csv(data_path)
145
+
146
+ # Assign ranks within each tier based on factuality_score
147
+ df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ ascending=False, method='min').astype(int)
149
+
150
+ # Replace NaN values with '-'
151
+ df.fillna('-', inplace=True)
152
+
153
+ df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # Create tabs
156
+ st.markdown("""
157
+ <style>
158
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ font-size: 20px;
160
+ }
161
+ </style>
162
+ """, unsafe_allow_html=True)
163
+
164
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # Tab 1: Leaderboard
167
+ with tab1:
168
+ # df['original_order'] = df.groupby('tier').cumcount()
169
+ # print(df['original_order'])
170
+
171
+ # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ st.markdown("""
175
+ <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ Metrics Explanation
177
+ </div>
178
+ """, unsafe_allow_html=True)
179
+
180
+ st.markdown("""
181
+ <div class="metric" style="font-size:16px;">
182
+ <br/>
183
+ <p>
184
+ <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ </p>
186
+ <p>
187
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ </p>
189
+ </div>
190
+ """,
191
+ unsafe_allow_html=True
192
+ )
193
+
194
+ st.markdown("""
195
+ <style>
196
+ /* Selectbox text */
197
+ div[data-baseweb="select"] > div {
198
+ font-size: 20px;
199
+ }
200
+
201
+ /* Dropdown options */
202
+ div[role="listbox"] ul li {
203
+ font-size: 20px !important;
204
+ }
205
+
206
+ /* Checkbox label */
207
+ .stCheckbox label p {
208
+ font-size: 20px !important;
209
+ }
210
+
211
+ /* Selectbox label */
212
+ .stSelectbox label p {
213
+ font-size: 20px !important;
214
+ }
215
+ </style>
216
+ """, unsafe_allow_html=True)
217
+
218
+ # Dropdown menu to filter tiers
219
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # Filter the data based on the selected tier
223
+ if selected_tier != 'All Metrics':
224
+ filtered_df = df[df['tier'] == selected_tier]
225
+ else:
226
+ filtered_df = df
227
+
228
+ sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ if sort_by_factuality:
232
+ updated_filtered_df = filtered_df.sort_values(
233
+ by=['tier', 'Overall'], ascending=[True, False]
234
+ )
235
+ else:
236
+ updated_filtered_df = filtered_df.sort_values(
237
+ by=['tier', 'original_order']
238
+ )
239
+
240
+ # Create HTML for the table
241
+ if selected_tier == 'All Metrics':
242
+ html = '''
243
+ <table>
244
+ <thead>
245
+ <tr>
246
+ <th>Metric</th>
247
+ <th>Rank</th>
248
+ <th>Model</th>
249
+ <th>Factbench</th>
250
+ <th>Reddit</th>
251
+ <th>Overall</th>
252
+ </tr>
253
+ </thead>
254
+ <tbody>
255
+ '''
256
+ else:
257
+ html = '''
258
+ <table>
259
+ <thead>
260
+ <tr>
261
+ <th>Rank</th>
262
+ <th>Model</th>
263
+ <th>Factbench</th>
264
+ <th>Reddit</th>
265
+ <th>Overall</th>
266
+ </tr>
267
+ </thead>
268
+ <tbody>
269
+ '''
270
+
271
+ # Generate the rows of the table
272
+ current_tier = None
273
+ for i, row in updated_filtered_df.iterrows():
274
+ html += '<tr>'
275
+
276
+ # Only display the 'Tier' column if 'All Tiers' is selected
277
+ if selected_tier == 'All Metrics':
278
+ if row['tier'] != current_tier:
279
+ current_tier = row['tier']
280
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # Fill in model and scores
283
+ html += f'''
284
+ <td>{row['rank']}</td>
285
+ <td>{row['model']}</td>
286
+ <td>{row['FactBench']}</td>
287
+ <td>{row['Reddit']}</td>
288
+ <td>{row['Overall']}</td>
289
+ </tr>
290
+ '''
291
+
292
+ # Close the table
293
+ html += '''
294
+ </table>
295
+ '''
296
+
297
+ # Display the table
298
+ st.markdown(html, unsafe_allow_html=True)
299
+
300
+ st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # Tab 2: Details
303
+ with tab2:
304
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # st.markdown('<div class="title"></div>',
307
+ # unsafe_allow_html=True)
308
+ st.image(image, use_column_width=True)
309
+
310
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ st.write(
312
+ "Language models (LMs) are widely used by an increasing number of users, "
313
+ "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ )
317
+
318
+ st.markdown('### Content Categorization')
319
+ st.write(
320
+ "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ )
324
+
325
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ st.write(
327
+ "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ "regularly updated with new prompts."
331
+ )
332
+
333
+ st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # Tab 3: Links
336
+ # with tab3:
337
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # unsafe_allow_html=True)
341
+
342
+ # st.markdown(
343
+ # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # st.markdown(
345
+ # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # st.markdown('</div>', unsafe_allow_html=True)
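For reference, the per-tier ranking that these app.py snapshots apply before rendering the leaderboard can be reproduced in isolation. The sketch below is only an illustration: the tier names, model names and scores are made up, but the groupby/rank/cumcount calls mirror the ones in the file above.

import pandas as pd

# Hypothetical leaderboard rows; only the column names match the app.
df = pd.DataFrame({
    "tier": ["Precision", "Precision", "Recall", "Recall"],
    "model": ["model-a", "model-b", "model-a", "model-b"],
    "Overall": [71.2, 68.5, 55.0, 57.3],
})

# Rank within each tier by Overall (1 = best), as in the app.
df["rank"] = df.groupby("tier")["Overall"].rank(ascending=False, method="min").astype(int)

# Remember the original row order per tier so the unsorted view can be restored.
df["original_order"] = df.groupby("tier").cumcount()

# "Sort by overall score" view: tiers kept together, best model first.
print(df.sort_values(by=["tier", "Overall"], ascending=[True, False]))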
.history/app_20250318182416.py ADDED
@@ -0,0 +1,347 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ # Set up page config
8
+ st.set_page_config(
9
+ page_title="VeriFact Leaderboard",
10
+ layout="wide"
11
+ )
12
+
13
+ # load header
14
+ with open("_header.md", "r") as f:
15
+ HEADER_MD = f.read()
16
+
17
+ # Load the image
18
+ image = Image.open("verifact_steps.pdf")
19
+ logo_image = Image.open("./factrbench.png")
20
+
21
+ # Custom CSS for the page
22
+ st.markdown(
23
+ """
24
+ <style>
25
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ html, body, [class*="css"] {
28
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ background-color: #f9f9f9; /* Light grey background */
30
+ }
31
+
32
+ .title {
33
+ font-size: 42px;
34
+ font-weight: bold;
35
+ text-align: center;
36
+ color: #333;
37
+ margin-bottom: 5px;
38
+ }
39
+
40
+ .description {
41
+ font-size: 22px;
42
+ text-align: center;
43
+ margin-bottom: 30px;
44
+ color: #555;
45
+ }
46
+
47
+ .header, .metric {
48
+ align-items: left;
49
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ margin-bottom: 20px;
51
+ }
52
+
53
+ .container {
54
+ max-width: 1000px;
55
+ margin: 0 auto;
56
+ padding: 5px;
57
+ }
58
+
59
+ table {
60
+ width: 100%;
61
+ border-collapse: collapse;
62
+ border-radius: 10px;
63
+ overflow: hidden;
64
+ }
65
+
66
+ th, td {
67
+ padding: 8px;
68
+ text-align: center;
69
+ border: 1px solid #ddd;
70
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ font-size: 16px;
72
+ transition: background-color 0.3s;
73
+ }
74
+
75
+ th {
76
+ background-color: #f2f2f2;
77
+ font-weight: bold;
78
+ }
79
+
80
+ td:hover {
81
+ background-color: #eaeaea;
82
+ }
83
+ </style>
84
+ """,
85
+ unsafe_allow_html=True
86
+ )
87
+
88
+ # Display title and description
89
+ st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # Convert the image to base64
93
+ buffered = BytesIO()
94
+ logo_image.save(buffered, format="PNG")
95
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ st.markdown(
97
+ f"""
98
+ <style>
99
+ .logo-container {{
100
+ display: flex;
101
+ justify-content: flex-start; /* Aligns to the left */
102
+ }}
103
+ .logo-container img {{
104
+ width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ margin: 0 auto;
106
+ max-width: 700px; /* Set a maximum width */
107
+ background-color: transparent;
108
+ }}
109
+ </style>
110
+ <div class="logo-container">
111
+ <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ </div>
113
+ """,
114
+ unsafe_allow_html=True
115
+ )
116
+
117
+ # header_md_text = HEADER_MD # make some parameters later
118
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ st.markdown(
121
+ '''
122
+ <div class="header">
123
+ <br/>
124
+ <p style="font-size:22px;">
125
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ </p>
127
+ <p style="font-size:20px;">
128
+ # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ </p>
131
+ </div>
132
+ ''',
133
+ unsafe_allow_html=True
134
+ )
135
+
136
+
137
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # unsafe_allow_html=True)
139
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # Load the data
143
+ data_path = "verifact_data.csv"
144
+ df = pd.read_csv(data_path)
145
+
146
+ # Assign ranks within each tier based on factuality_score
147
+ df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ ascending=False, method='min').astype(int)
149
+
150
+ # Replace NaN values with '-'
151
+ df.fillna('-', inplace=True)
152
+
153
+ df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # Create tabs
156
+ st.markdown("""
157
+ <style>
158
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ font-size: 20px;
160
+ }
161
+ </style>
162
+ """, unsafe_allow_html=True)
163
+
164
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # Tab 1: Leaderboard
167
+ with tab1:
168
+ # df['original_order'] = df.groupby('tier').cumcount()
169
+ # print(df['original_order'])
170
+
171
+ # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ st.markdown("""
175
+ <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ Metrics Explanation
177
+ </div>
178
+ """, unsafe_allow_html=True)
179
+
180
+ st.markdown("""
181
+ <div class="metric" style="font-size:16px;">
182
+ <br/>
183
+ <p>
184
+ <strong> 🎯 Factual Precision </strong> measures the fraction of supported units among all units, averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive content within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average number of verifiable units in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ </p>
186
+ <p>
187
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ </p>
189
+ </div>
190
+ """,
191
+ unsafe_allow_html=True
192
+ )
193
+
194
+ st.markdown("""
195
+ <style>
196
+ /* Selectbox text */
197
+ div[data-baseweb="select"] > div {
198
+ font-size: 20px;
199
+ }
200
+
201
+ /* Dropdown options */
202
+ div[role="listbox"] ul li {
203
+ font-size: 20px !important;
204
+ }
205
+
206
+ /* Checkbox label */
207
+ .stCheckbox label p {
208
+ font-size: 20px !important;
209
+ }
210
+
211
+ /* Selectbox label */
212
+ .stSelectbox label p {
213
+ font-size: 20px !important;
214
+ }
215
+ </style>
216
+ """, unsafe_allow_html=True)
217
+
218
+ # Dropdown menu to filter tiers
219
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # Filter the data based on the selected tier
223
+ if selected_tier != 'All Metrics':
224
+ filtered_df = df[df['tier'] == selected_tier]
225
+ else:
226
+ filtered_df = df
227
+
228
+ sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ if sort_by_factuality:
232
+ updated_filtered_df = filtered_df.sort_values(
233
+ by=['tier', 'Overall'], ascending=[True, False]
234
+ )
235
+ else:
236
+ updated_filtered_df = filtered_df.sort_values(
237
+ by=['tier', 'original_order']
238
+ )
239
+
240
+ # Create HTML for the table
241
+ if selected_tier == 'All Metrics':
242
+ html = '''
243
+ <table>
244
+ <thead>
245
+ <tr>
246
+ <th>Metric</th>
247
+ <th>Rank</th>
248
+ <th>Model</th>
249
+ <th>Factbench</th>
250
+ <th>Reddit</th>
251
+ <th>Overall</th>
252
+ </tr>
253
+ </thead>
254
+ <tbody>
255
+ '''
256
+ else:
257
+ html = '''
258
+ <table>
259
+ <thead>
260
+ <tr>
261
+ <th>Rank</th>
262
+ <th>Model</th>
263
+ <th>Factbench</th>
264
+ <th>Reddit</th>
265
+ <th>Overall</th>
266
+ </tr>
267
+ </thead>
268
+ <tbody>
269
+ '''
270
+
271
+ # Generate the rows of the table
272
+ current_tier = None
273
+ for i, row in updated_filtered_df.iterrows():
274
+ html += '<tr>'
275
+
276
+ # Only display the 'Tier' column if 'All Tiers' is selected
277
+ if selected_tier == 'All Metrics':
278
+ if row['tier'] != current_tier:
279
+ current_tier = row['tier']
280
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # Fill in model and scores
283
+ html += f'''
284
+ <td>{row['rank']}</td>
285
+ <td>{row['model']}</td>
286
+ <td>{row['FactBench']}</td>
287
+ <td>{row['Reddit']}</td>
288
+ <td>{row['Overall']}</td>
289
+ </tr>
290
+ '''
291
+
292
+ # Close the table
293
+ html += '''
294
+ </table>
295
+ '''
296
+
297
+ # Display the table
298
+ st.markdown(html, unsafe_allow_html=True)
299
+
300
+ st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # Tab 2: Details
303
+ with tab2:
304
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # st.markdown('<div class="title"></div>',
307
+ # unsafe_allow_html=True)
308
+ st.image(image, use_column_width=True)
309
+
310
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ st.write(
312
+ "Language models (LMs) are widely used by an increasing number of users, "
313
+ "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ )
317
+
318
+ st.markdown('### Content Categorization')
319
+ st.write(
320
+ "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ )
324
+
325
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ st.write(
327
+ "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ "regularly updated with new prompts."
331
+ )
332
+
333
+ st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # Tab 3: Links
336
+ # with tab3:
337
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # unsafe_allow_html=True)
341
+
342
+ # st.markdown(
343
+ # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # st.markdown(
345
+ # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # st.markdown('</div>', unsafe_allow_html=True)
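The logo handling in these snapshots (PIL image → BytesIO → base64 → inline <img> tag styled via CSS) can also be exercised on its own. A minimal sketch, assuming a local PNG such as verifact_logo.png exists; the alt text and width are arbitrary placeholders.

import base64
from io import BytesIO

import streamlit as st
from PIL import Image

logo = Image.open("verifact_logo.png")  # assumed local PNG file

# Re-encode the image as PNG bytes and base64 so it can be inlined in HTML.
buf = BytesIO()
logo.save(buf, format="PNG")
img_data = base64.b64encode(buf.getvalue()).decode("utf-8")

# Rendering through st.markdown is what allows CSS to control the layout.
st.markdown(
    f'<img src="data:image/png;base64,{img_data}" alt="logo" width="300">',
    unsafe_allow_html=True,
)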
.history/app_20250318184022.py ADDED
@@ -0,0 +1,347 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ # Set up page config
8
+ st.set_page_config(
9
+ page_title="VeriFact Leaderboard",
10
+ layout="wide"
11
+ )
12
+
13
+ # load header
14
+ with open("_header.md", "r") as f:
15
+ HEADER_MD = f.read()
16
+
17
+ # Load the image
18
+ image = Image.open("test.png")
19
+ logo_image = Image.open("./factrbench.png")
20
+
21
+ # Custom CSS for the page
22
+ st.markdown(
23
+ """
24
+ <style>
25
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ html, body, [class*="css"] {
28
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ background-color: #f9f9f9; /* Light grey background */
30
+ }
31
+
32
+ .title {
33
+ font-size: 42px;
34
+ font-weight: bold;
35
+ text-align: center;
36
+ color: #333;
37
+ margin-bottom: 5px;
38
+ }
39
+
40
+ .description {
41
+ font-size: 22px;
42
+ text-align: center;
43
+ margin-bottom: 30px;
44
+ color: #555;
45
+ }
46
+
47
+ .header, .metric {
48
+ align-items: left;
49
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ margin-bottom: 20px;
51
+ }
52
+
53
+ .container {
54
+ max-width: 1000px;
55
+ margin: 0 auto;
56
+ padding: 5px;
57
+ }
58
+
59
+ table {
60
+ width: 100%;
61
+ border-collapse: collapse;
62
+ border-radius: 10px;
63
+ overflow: hidden;
64
+ }
65
+
66
+ th, td {
67
+ padding: 8px;
68
+ text-align: center;
69
+ border: 1px solid #ddd;
70
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ font-size: 16px;
72
+ transition: background-color 0.3s;
73
+ }
74
+
75
+ th {
76
+ background-color: #f2f2f2;
77
+ font-weight: bold;
78
+ }
79
+
80
+ td:hover {
81
+ background-color: #eaeaea;
82
+ }
83
+ </style>
84
+ """,
85
+ unsafe_allow_html=True
86
+ )
87
+
88
+ # Display title and description
89
+ st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # Convert the image to base64
93
+ buffered = BytesIO()
94
+ logo_image.save(buffered, format="PNG")
95
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ st.markdown(
97
+ f"""
98
+ <style>
99
+ .logo-container {{
100
+ display: flex;
101
+ justify-content: flex-start; /* Aligns to the left */
102
+ }}
103
+ .logo-container img {{
104
+ width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ margin: 0 auto;
106
+ max-width: 700px; /* Set a maximum width */
107
+ background-color: transparent;
108
+ }}
109
+ </style>
110
+ <div class="logo-container">
111
+ <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ </div>
113
+ """,
114
+ unsafe_allow_html=True
115
+ )
116
+
117
+ # header_md_text = HEADER_MD # make some parameters later
118
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ st.markdown(
121
+ '''
122
+ <div class="header">
123
+ <br/>
124
+ <p style="font-size:22px;">
125
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ </p>
127
+ <p style="font-size:20px;">
128
+ # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ </p>
131
+ </div>
132
+ ''',
133
+ unsafe_allow_html=True
134
+ )
135
+
136
+
137
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # unsafe_allow_html=True)
139
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # Load the data
143
+ data_path = "verifact_data.csv"
144
+ df = pd.read_csv(data_path)
145
+
146
+ # Assign ranks within each tier based on factuality_score
147
+ df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ ascending=False, method='min').astype(int)
149
+
150
+ # Replace NaN values with '-'
151
+ df.fillna('-', inplace=True)
152
+
153
+ df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # Create tabs
156
+ st.markdown("""
157
+ <style>
158
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ font-size: 20px;
160
+ }
161
+ </style>
162
+ """, unsafe_allow_html=True)
163
+
164
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # Tab 1: Leaderboard
167
+ with tab1:
168
+ # df['original_order'] = df.groupby('tier').cumcount()
169
+ # print(df['original_order'])
170
+
171
+ # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ st.markdown("""
175
+ <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ Metrics Explanation
177
+ </div>
178
+ """, unsafe_allow_html=True)
179
+
180
+ st.markdown("""
181
+ <div class="metric" style="font-size:16px;">
182
+ <br/>
183
+ <p>
184
+ <strong> 🎯 Factual Precision </strong> measures the fraction of supported units among all units, averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive content within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average number of verifiable units in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ </p>
186
+ <p>
187
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ </p>
189
+ </div>
190
+ """,
191
+ unsafe_allow_html=True
192
+ )
193
+
194
+ st.markdown("""
195
+ <style>
196
+ /* Selectbox text */
197
+ div[data-baseweb="select"] > div {
198
+ font-size: 20px;
199
+ }
200
+
201
+ /* Dropdown options */
202
+ div[role="listbox"] ul li {
203
+ font-size: 20px !important;
204
+ }
205
+
206
+ /* Checkbox label */
207
+ .stCheckbox label p {
208
+ font-size: 20px !important;
209
+ }
210
+
211
+ /* Selectbox label */
212
+ .stSelectbox label p {
213
+ font-size: 20px !important;
214
+ }
215
+ </style>
216
+ """, unsafe_allow_html=True)
217
+
218
+ # Dropdown menu to filter tiers
219
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # Filter the data based on the selected tier
223
+ if selected_tier != 'All Metrics':
224
+ filtered_df = df[df['tier'] == selected_tier]
225
+ else:
226
+ filtered_df = df
227
+
228
+ sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ if sort_by_factuality:
232
+ updated_filtered_df = filtered_df.sort_values(
233
+ by=['tier', 'Overall'], ascending=[True, False]
234
+ )
235
+ else:
236
+ updated_filtered_df = filtered_df.sort_values(
237
+ by=['tier', 'original_order']
238
+ )
239
+
240
+ # Create HTML for the table
241
+ if selected_tier == 'All Metrics':
242
+ html = '''
243
+ <table>
244
+ <thead>
245
+ <tr>
246
+ <th>Metric</th>
247
+ <th>Rank</th>
248
+ <th>Model</th>
249
+ <th>Factbench</th>
250
+ <th>Reddit</th>
251
+ <th>Overall</th>
252
+ </tr>
253
+ </thead>
254
+ <tbody>
255
+ '''
256
+ else:
257
+ html = '''
258
+ <table>
259
+ <thead>
260
+ <tr>
261
+ <th>Rank</th>
262
+ <th>Model</th>
263
+ <th>Factbench</th>
264
+ <th>Reddit</th>
265
+ <th>Overall</th>
266
+ </tr>
267
+ </thead>
268
+ <tbody>
269
+ '''
270
+
271
+ # Generate the rows of the table
272
+ current_tier = None
273
+ for i, row in updated_filtered_df.iterrows():
274
+ html += '<tr>'
275
+
276
+ # Only display the 'Tier' column if 'All Tiers' is selected
277
+ if selected_tier == 'All Metrics':
278
+ if row['tier'] != current_tier:
279
+ current_tier = row['tier']
280
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # Fill in model and scores
283
+ html += f'''
284
+ <td>{row['rank']}</td>
285
+ <td>{row['model']}</td>
286
+ <td>{row['FactBench']}</td>
287
+ <td>{row['Reddit']}</td>
288
+ <td>{row['Overall']}</td>
289
+ </tr>
290
+ '''
291
+
292
+ # Close the table
293
+ html += '''
294
+ </table>
295
+ '''
296
+
297
+ # Display the table
298
+ st.markdown(html, unsafe_allow_html=True)
299
+
300
+ st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # Tab 2: Details
303
+ with tab2:
304
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # st.markdown('<div class="title"></div>',
307
+ # unsafe_allow_html=True)
308
+ st.image(image, use_column_width=True)
309
+
310
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ st.write(
312
+ "Language models (LMs) are widely used by an increasing number of users, "
313
+ "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ )
317
+
318
+ st.markdown('### Content Categorization')
319
+ st.write(
320
+ "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ )
324
+
325
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ st.write(
327
+ "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ "regularly updated with new prompts."
331
+ )
332
+
333
+ st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # Tab 3: Links
336
+ # with tab3:
337
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # unsafe_allow_html=True)
341
+
342
+ # st.markdown(
343
+ # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # st.markdown(
345
+ # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # st.markdown('</div>', unsafe_allow_html=True)
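The Factual Precision definition quoted in the leaderboard copy (supported units over all units, averaged across responses) is simple enough to state as code. The helper below is only an illustration of that definition, not the VERIFY implementation; the label strings and function name are assumptions.

def factual_precision(responses):
    """responses: list of responses, each a list of unit labels
    drawn from {'supported', 'unsupported', 'undecidable'}."""
    per_response = [
        sum(1 for label in units if label == "supported") / len(units)
        for units in responses
        if units  # skip responses with no extracted units
    ]
    return sum(per_response) / len(per_response) if per_response else 0.0


# Example: one fully supported response and one 2/4 supported response -> 0.75.
print(factual_precision([
    ["supported", "supported"],
    ["supported", "unsupported", "undecidable", "supported"],
]))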
.history/app_20250318190307.py ADDED
@@ -0,0 +1,347 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ # Set up page config
8
+ st.set_page_config(
9
+ page_title="VeriFact Leaderboard",
10
+ layout="wide"
11
+ )
12
+
13
+ # load header
14
+ with open("_header.md", "r") as f:
15
+ HEADER_MD = f.read()
16
+
17
+ # Load the image
18
+ image = Image.open("verifact_steps.svg")
19
+ logo_image = Image.open("./factrbench.png")
20
+
21
+ # Custom CSS for the page
22
+ st.markdown(
23
+ """
24
+ <style>
25
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ html, body, [class*="css"] {
28
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ background-color: #f9f9f9; /* Light grey background */
30
+ }
31
+
32
+ .title {
33
+ font-size: 42px;
34
+ font-weight: bold;
35
+ text-align: center;
36
+ color: #333;
37
+ margin-bottom: 5px;
38
+ }
39
+
40
+ .description {
41
+ font-size: 22px;
42
+ text-align: center;
43
+ margin-bottom: 30px;
44
+ color: #555;
45
+ }
46
+
47
+ .header, .metric {
48
+ align-items: left;
49
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ margin-bottom: 20px;
51
+ }
52
+
53
+ .container {
54
+ max-width: 1000px;
55
+ margin: 0 auto;
56
+ padding: 5px;
57
+ }
58
+
59
+ table {
60
+ width: 100%;
61
+ border-collapse: collapse;
62
+ border-radius: 10px;
63
+ overflow: hidden;
64
+ }
65
+
66
+ th, td {
67
+ padding: 8px;
68
+ text-align: center;
69
+ border: 1px solid #ddd;
70
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ font-size: 16px;
72
+ transition: background-color 0.3s;
73
+ }
74
+
75
+ th {
76
+ background-color: #f2f2f2;
77
+ font-weight: bold;
78
+ }
79
+
80
+ td:hover {
81
+ background-color: #eaeaea;
82
+ }
83
+ </style>
84
+ """,
85
+ unsafe_allow_html=True
86
+ )
87
+
88
+ # Display title and description
89
+ st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # Convert the image to base64
93
+ buffered = BytesIO()
94
+ logo_image.save(buffered, format="PNG")
95
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ st.markdown(
97
+ f"""
98
+ <style>
99
+ .logo-container {{
100
+ display: flex;
101
+ justify-content: flex-start; /* Aligns to the left */
102
+ }}
103
+ .logo-container img {{
104
+ width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ margin: 0 auto;
106
+ max-width: 700px; /* Set a maximum width */
107
+ background-color: transparent;
108
+ }}
109
+ </style>
110
+ <div class="logo-container">
111
+ <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ </div>
113
+ """,
114
+ unsafe_allow_html=True
115
+ )
116
+
117
+ # header_md_text = HEADER_MD # make some parameters later
118
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ st.markdown(
121
+ '''
122
+ <div class="header">
123
+ <br/>
124
+ <p style="font-size:22px;">
125
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ </p>
127
+ <p style="font-size:20px;">
128
+ # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ </p>
131
+ </div>
132
+ ''',
133
+ unsafe_allow_html=True
134
+ )
135
+
136
+
137
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # unsafe_allow_html=True)
139
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # Load the data
143
+ data_path = "verifact_data.csv"
144
+ df = pd.read_csv(data_path)
145
+
146
+ # Assign ranks within each tier based on factuality_score
147
+ df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ ascending=False, method='min').astype(int)
149
+
150
+ # Replace NaN values with '-'
151
+ df.fillna('-', inplace=True)
152
+
153
+ df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # Create tabs
156
+ st.markdown("""
157
+ <style>
158
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ font-size: 20px;
160
+ }
161
+ </style>
162
+ """, unsafe_allow_html=True)
163
+
164
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # Tab 1: Leaderboard
167
+ with tab1:
168
+ # df['original_order'] = df.groupby('tier').cumcount()
169
+ # print(df['original_order'])
170
+
171
+ # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ st.markdown("""
175
+ <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ Metrics Explanation
177
+ </div>
178
+ """, unsafe_allow_html=True)
179
+
180
+ st.markdown("""
181
+ <div class="metric" style="font-size:16px;">
182
+ <br/>
183
+ <p>
184
+ <strong> 🎯 Factual Precision </strong> measures the fraction of supported units among all units, averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive content within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average number of verifiable units in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ </p>
186
+ <p>
187
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ </p>
189
+ </div>
190
+ """,
191
+ unsafe_allow_html=True
192
+ )
193
+
194
+ st.markdown("""
195
+ <style>
196
+ /* Selectbox text */
197
+ div[data-baseweb="select"] > div {
198
+ font-size: 20px;
199
+ }
200
+
201
+ /* Dropdown options */
202
+ div[role="listbox"] ul li {
203
+ font-size: 20px !important;
204
+ }
205
+
206
+ /* Checkbox label */
207
+ .stCheckbox label p {
208
+ font-size: 20px !important;
209
+ }
210
+
211
+ /* Selectbox label */
212
+ .stSelectbox label p {
213
+ font-size: 20px !important;
214
+ }
215
+ </style>
216
+ """, unsafe_allow_html=True)
217
+
218
+ # Dropdown menu to filter tiers
219
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # Filter the data based on the selected tier
223
+ if selected_tier != 'All Metrics':
224
+ filtered_df = df[df['tier'] == selected_tier]
225
+ else:
226
+ filtered_df = df
227
+
228
+ sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ if sort_by_factuality:
232
+ updated_filtered_df = filtered_df.sort_values(
233
+ by=['tier', 'Overall'], ascending=[True, False]
234
+ )
235
+ else:
236
+ updated_filtered_df = filtered_df.sort_values(
237
+ by=['tier', 'original_order']
238
+ )
239
+
240
+ # Create HTML for the table
241
+ if selected_tier == 'All Metrics':
242
+ html = '''
243
+ <table>
244
+ <thead>
245
+ <tr>
246
+ <th>Metric</th>
247
+ <th>Rank</th>
248
+ <th>Model</th>
249
+ <th>Factbench</th>
250
+ <th>Reddit</th>
251
+ <th>Overall</th>
252
+ </tr>
253
+ </thead>
254
+ <tbody>
255
+ '''
256
+ else:
257
+ html = '''
258
+ <table>
259
+ <thead>
260
+ <tr>
261
+ <th>Rank</th>
262
+ <th>Model</th>
263
+ <th>Factbench</th>
264
+ <th>Reddit</th>
265
+ <th>Overall</th>
266
+ </tr>
267
+ </thead>
268
+ <tbody>
269
+ '''
270
+
271
+ # Generate the rows of the table
272
+ current_tier = None
273
+ for i, row in updated_filtered_df.iterrows():
274
+ html += '<tr>'
275
+
276
+ # Only display the 'Tier' column if 'All Tiers' is selected
277
+ if selected_tier == 'All Metrics':
278
+ if row['tier'] != current_tier:
279
+ current_tier = row['tier']
280
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # Fill in model and scores
283
+ html += f'''
284
+ <td>{row['rank']}</td>
285
+ <td>{row['model']}</td>
286
+ <td>{row['FactBench']}</td>
287
+ <td>{row['Reddit']}</td>
288
+ <td>{row['Overall']}</td>
289
+ </tr>
290
+ '''
291
+
292
+ # Close the table
293
+ html += '''
294
+ </table>
295
+ '''
296
+
297
+ # Display the table
298
+ st.markdown(html, unsafe_allow_html=True)
299
+
300
+ st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # Tab 2: Details
303
+ with tab2:
304
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # st.markdown('<div class="title"></div>',
307
+ # unsafe_allow_html=True)
308
+ st.image(image, use_column_width=True)
309
+
310
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ st.write(
312
+ "Language models (LMs) are widely used by an increasing number of users, "
313
+ "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ )
317
+
318
+ st.markdown('### Content Categorization')
319
+ st.write(
320
+ "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ )
324
+
325
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ st.write(
327
+ "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ "regularly updated with new prompts."
331
+ )
332
+
333
+ st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # Tab 3: Links
336
+ # with tab3:
337
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # unsafe_allow_html=True)
341
+
342
+ # st.markdown(
343
+ # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # st.markdown(
345
+ # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # st.markdown('</div>', unsafe_allow_html=True)
.history/app_20250318190528.py ADDED
@@ -0,0 +1,347 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+
7
+ # Set up page config
8
+ st.set_page_config(
9
+ page_title="VeriFact Leaderboard",
10
+ layout="wide"
11
+ )
12
+
13
+ # load header
14
+ with open("_header.md", "r") as f:
15
+ HEADER_MD = f.read()
16
+
17
+ # Load the image
18
+ image = Image.open("test.png")
19
+ logo_image = Image.open("./factrbench.png")
20
+
21
+ # Custom CSS for the page
22
+ st.markdown(
23
+ """
24
+ <style>
25
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ html, body, [class*="css"] {
28
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ background-color: #f9f9f9; /* Light grey background */
30
+ }
31
+
32
+ .title {
33
+ font-size: 42px;
34
+ font-weight: bold;
35
+ text-align: center;
36
+ color: #333;
37
+ margin-bottom: 5px;
38
+ }
39
+
40
+ .description {
41
+ font-size: 22px;
42
+ text-align: center;
43
+ margin-bottom: 30px;
44
+ color: #555;
45
+ }
46
+
47
+ .header, .metric {
48
+ align-items: left;
49
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ margin-bottom: 20px;
51
+ }
52
+
53
+ .container {
54
+ max-width: 1000px;
55
+ margin: 0 auto;
56
+ padding: 5px;
57
+ }
58
+
59
+ table {
60
+ width: 100%;
61
+ border-collapse: collapse;
62
+ border-radius: 10px;
63
+ overflow: hidden;
64
+ }
65
+
66
+ th, td {
67
+ padding: 8px;
68
+ text-align: center;
69
+ border: 1px solid #ddd;
70
+ font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ font-size: 16px;
72
+ transition: background-color 0.3s;
73
+ }
74
+
75
+ th {
76
+ background-color: #f2f2f2;
77
+ font-weight: bold;
78
+ }
79
+
80
+ td:hover {
81
+ background-color: #eaeaea;
82
+ }
83
+ </style>
84
+ """,
85
+ unsafe_allow_html=True
86
+ )
87
+
88
+ # Display title and description
89
+ st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # Convert the image to base64
93
+ buffered = BytesIO()
94
+ logo_image.save(buffered, format="PNG")
95
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ st.markdown(
97
+ f"""
98
+ <style>
99
+ .logo-container {{
100
+ display: flex;
101
+ justify-content: flex-start; /* Aligns to the left */
102
+ }}
103
+ .logo-container img {{
104
+ width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ margin: 0 auto;
106
+ max-width: 700px; /* Set a maximum width */
107
+ background-color: transparent;
108
+ }}
109
+ </style>
110
+ <div class="logo-container">
111
+ <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ </div>
113
+ """,
114
+ unsafe_allow_html=True
115
+ )
116
+
117
+ # header_md_text = HEADER_MD # make some parameters later
118
+ # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ st.markdown(
121
+ '''
122
+ <div class="header">
123
+ <br/>
124
+ <p style="font-size:22px;">
125
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ </p>
127
+ <p style="font-size:20px;">
128
+ # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ </p>
131
+ </div>
132
+ ''',
133
+ unsafe_allow_html=True
134
+ )
135
+
136
+
137
+ # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # unsafe_allow_html=True)
139
+ # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # Load the data
143
+ data_path = "verifact_data.csv"
144
+ df = pd.read_csv(data_path)
145
+
146
+ # Assign ranks within each tier based on factuality_score
147
+ df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ ascending=False, method='min').astype(int)
149
+
150
+ # Replace NaN values with '-'
151
+ df.fillna('-', inplace=True)
152
+
153
+ df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # Create tabs
156
+ st.markdown("""
157
+ <style>
158
+ .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ font-size: 20px;
160
+ }
161
+ </style>
162
+ """, unsafe_allow_html=True)
163
+
164
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # Tab 1: Leaderboard
167
+ with tab1:
168
+ # df['original_order'] = df.groupby('tier').cumcount()
169
+ # print(df['original_order'])
170
+
171
+ # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ st.markdown("""
175
+ <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ Metrics Explanation
177
+ </div>
178
+ """, unsafe_allow_html=True)
179
+
180
+ st.markdown("""
181
+ <div class="metric" style="font-size:16px;">
182
+ <br/>
183
+ <p>
184
+ <strong> 🎯 Factual Precision </strong> measures the fraction of supported units among all units, averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive content within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average number of verifiable units in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ </p>
186
+ <p>
187
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ </p>
189
+ </div>
190
+ """,
191
+ unsafe_allow_html=True
192
+ )
193
+
194
+ st.markdown("""
195
+ <style>
196
+ /* Selectbox text */
197
+ div[data-baseweb="select"] > div {
198
+ font-size: 20px;
199
+ }
200
+
201
+ /* Dropdown options */
202
+ div[role="listbox"] ul li {
203
+ font-size: 20px !important;
204
+ }
205
+
206
+ /* Checkbox label */
207
+ .stCheckbox label p {
208
+ font-size: 20px !important;
209
+ }
210
+
211
+ /* Selectbox label */
212
+ .stSelectbox label p {
213
+ font-size: 20px !important;
214
+ }
215
+ </style>
216
+ """, unsafe_allow_html=True)
217
+
218
+ # Dropdown menu to filter tiers
219
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # Filter the data based on the selected tier
223
+ if selected_tier != 'All Metrics':
224
+ filtered_df = df[df['tier'] == selected_tier]
225
+ else:
226
+ filtered_df = df
227
+
228
+ sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ if sort_by_factuality:
232
+ updated_filtered_df = filtered_df.sort_values(
233
+ by=['tier', 'Overall'], ascending=[True, False]
234
+ )
235
+ else:
236
+ updated_filtered_df = filtered_df.sort_values(
237
+ by=['tier', 'original_order']
238
+ )
239
+
240
+ # Create HTML for the table
241
+ if selected_tier == 'All Metrics':
242
+ html = '''
243
+ <table>
244
+ <thead>
245
+ <tr>
246
+ <th>Metric</th>
247
+ <th>Rank</th>
248
+ <th>Model</th>
249
+ <th>Factbench</th>
250
+ <th>Reddit</th>
251
+ <th>Overall</th>
252
+ </tr>
253
+ </thead>
254
+ <tbody>
255
+ '''
256
+ else:
257
+ html = '''
258
+ <table>
259
+ <thead>
260
+ <tr>
261
+ <th>Rank</th>
262
+ <th>Model</th>
263
+ <th>Factbench</th>
264
+ <th>Reddit</th>
265
+ <th>Overall</th>
266
+ </tr>
267
+ </thead>
268
+ <tbody>
269
+ '''
270
+
271
+ # Generate the rows of the table
272
+ current_tier = None
273
+ for i, row in updated_filtered_df.iterrows():
274
+ html += '<tr>'
275
+
276
+ # Only display the 'Tier' column if 'All Tiers' is selected
277
+ if selected_tier == 'All Metrics':
278
+ if row['tier'] != current_tier:
279
+ current_tier = row['tier']
280
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # Fill in model and scores
283
+ html += f'''
284
+ <td>{row['rank']}</td>
285
+ <td>{row['model']}</td>
286
+ <td>{row['FactBench']}</td>
287
+ <td>{row['Reddit']}</td>
288
+ <td>{row['Overall']}</td>
289
+ </tr>
290
+ '''
291
+
292
+ # Close the table
293
+ html += '''
294
+ </table>
295
+ '''
296
+
297
+ # Display the table
298
+ st.markdown(html, unsafe_allow_html=True)
299
+
300
+ st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # Tab 2: Details
303
+ with tab2:
304
+ st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # st.markdown('<div class="title"></div>',
307
+ # unsafe_allow_html=True)
308
+ st.image(image, use_column_width=True)
309
+
310
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ st.write(
312
+ "Language models (LMs) are widely used by an increasing number of users, "
313
+ "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ )
317
+
318
+ st.markdown('### Content Categorization')
319
+ st.write(
320
+ "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ )
324
+
325
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ st.write(
327
+ "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ "regularly updated with new prompts."
331
+ )
332
+
333
+ st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # Tab 3: Links
336
+ # with tab3:
337
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # unsafe_allow_html=True)
341
+
342
+ # st.markdown(
343
+ # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # st.markdown(
345
+ # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # st.markdown('</div>', unsafe_allow_html=True)
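All of these snapshots read verifact_data.csv and expect at least the columns tier, model, FactBench, Reddit and Overall. The repository copy of that file is not shown here, so the rows below are invented purely to illustrate the assumed schema and how the app's rank column is derived from it.

from io import StringIO

import pandas as pd

# Invented example rows; only the header line matches what app.py reads.
csv_text = """tier,model,FactBench,Reddit,Overall
Precision,example-model-a,72.1,63.4,67.8
Precision,example-model-b,65.0,58.9,62.0
Recall,example-model-a,54.2,49.7,52.0
"""

df = pd.read_csv(StringIO(csv_text))
df["rank"] = df.groupby("tier")["Overall"].rank(ascending=False, method="min").astype(int)
print(df)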
.history/app_20250318190830.py ADDED
@@ -0,0 +1,533 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ import streamlit as st
351
+ import pandas as pd
352
+ from PIL import Image
353
+ import base64
354
+ from io import BytesIO
355
+
356
+ # Set up page config
357
+ st.set_page_config(
358
+ page_title="VeriFact Leaderboard",
359
+ layout="wide"
360
+ )
361
+
362
+ # load header
363
+ with open("_header.md", "r") as f:
364
+ HEADER_MD = f.read()
365
+
366
+ # Load the image
367
+ image = Image.open("test.png")
368
+ logo_image = Image.open("./factrbench.png")
369
+
370
+ # Custom CSS for the page
371
+ st.markdown(
372
+ """
373
+ <style>
374
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ html, body, [class*="css"] {
377
+ font-family: 'Arial', sans-serif;
378
+ background-color: #f9f9f9;
379
+ }
380
+
381
+ .title {
382
+ font-size: 42px;
383
+ font-weight: bold;
384
+ text-align: center;
385
+ color: #333;
386
+ margin-bottom: 5px;
387
+ }
388
+
389
+ .description {
390
+ font-size: 22px;
391
+ text-align: center;
392
+ margin-bottom: 30px;
393
+ color: #555;
394
+ }
395
+
396
+ .header, .metric {
397
+ align-items: left;
398
+ margin-bottom: 20px;
399
+ }
400
+
401
+ .container {
402
+ max-width: 1000px;
403
+ margin: 0 auto;
404
+ padding: 5px;
405
+ }
406
+
407
+ table {
408
+ width: 100%;
409
+ border-collapse: collapse;
410
+ border-radius: 10px;
411
+ overflow: hidden;
412
+ }
413
+
414
+ th, td {
415
+ padding: 8px;
416
+ text-align: center;
417
+ border: 1px solid #ddd;
418
+ font-size: 16px;
419
+ transition: background-color 0.3s;
420
+ }
421
+
422
+ th {
423
+ background-color: #f2f2f2;
424
+ font-weight: bold;
425
+ }
426
+
427
+ td:hover {
428
+ background-color: #eaeaea;
429
+ }
430
+ </style>
431
+ """,
432
+ unsafe_allow_html=True
433
+ )
434
+
435
+ # Display logo
436
+ buffered = BytesIO()
437
+ logo_image.save(buffered, format="PNG")
438
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
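+ # The logo is inlined as a base64 data URI so the HTML/CSS below can control its width and alignment directly.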
439
+
440
+ st.markdown(
441
+ f"""
442
+ <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ </div>
445
+ """,
446
+ unsafe_allow_html=True
447
+ )
448
+
449
+ st.markdown(
450
+ '''
451
+ <div class="header">
452
+ <br/>
453
+ <p style="font-size:22px;">
454
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ </p>
456
+ <p style="font-size:20px;">
457
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ </p>
460
+ </div>
461
+ ''',
462
+ unsafe_allow_html=True
463
+ )
464
+
465
+ # Load the data
466
+ data_path = "verifact_data.csv"
467
+ df = pd.read_csv(data_path)
468
+
469
+ # Assign ranks within each tier
470
+ df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ ascending=False, method='min').astype(int)
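+ # NOTE: method='min' gives tied models the same (best) rank; the int cast assumes 'Overall' has no missing values, since NaNs are only replaced with '-' afterwards.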
472
+
473
+ df.fillna('-', inplace=True)
474
+ df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # Tabs
477
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # Tab 1: Leaderboard
480
+ with tab1:
481
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ st.markdown("""
484
+ <div class="metric" style="font-size:16px;">
485
+ <p>
486
+ <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ </p>
489
+ </div>
490
+ """, unsafe_allow_html=True)
491
+
492
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ if selected_tier != 'All Metrics':
496
+ filtered_df = df[df['tier'] == selected_tier]
497
+ else:
498
+ filtered_df = df
499
+
500
+ sort_by_factuality = st.checkbox('Sort by overall score')
501
+ if sort_by_factuality:
502
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ else:
504
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # Shrink the table: wrap it in a container and cap its maximum width
507
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ current_tier = None
511
+ for _, row in updated_filtered_df.iterrows():
512
+ html += '<tr>'
513
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ current_tier = row['tier']
515
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
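+ # NOTE: rowspan="8" hard-codes eight model rows per tier (matching the 8 models noted in the header); update it if the CSV changes.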
516
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ html += '</tbody></table></div>'
519
+ st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # Tab 2: Benchmark Details
522
+ with tab2:
523
+ # Shrink the displayed PNG figure
524
+ st.image(image, width=800)
525
+
526
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
527
+ st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
528
+
529
+ st.markdown('### Content Categorization')
530
+ st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
531
+
532
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
533
+ st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
.history/app_20250318190934.py ADDED
@@ -0,0 +1,538 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ import streamlit as st
351
+ import pandas as pd
352
+ from PIL import Image
353
+ import base64
354
+ from io import BytesIO
355
+
356
+ # Set up page config
357
+ st.set_page_config(
358
+ page_title="VeriFact Leaderboard",
359
+ layout="wide"
360
+ )
361
+
362
+ # load header
363
+ with open("_header.md", "r") as f:
364
+ HEADER_MD = f.read()
365
+
366
+ # Load the image
367
+ image = Image.open("test.png")
368
+ logo_image = Image.open("./factrbench.png")
369
+
370
+ # Custom CSS for the page
371
+ st.markdown(
372
+ """
373
+ <style>
374
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ html, body, [class*="css"] {
377
+ font-family: 'Arial', sans-serif;
378
+ background-color: #f9f9f9;
379
+ }
380
+
381
+ .title {
382
+ font-size: 42px;
383
+ font-weight: bold;
384
+ text-align: center;
385
+ color: #333;
386
+ margin-bottom: 5px;
387
+ }
388
+
389
+ .description {
390
+ font-size: 22px;
391
+ text-align: center;
392
+ margin-bottom: 30px;
393
+ color: #555;
394
+ }
395
+
396
+ .header, .metric {
397
+ align-items: left;
398
+ margin-bottom: 20px;
399
+ }
400
+
401
+ .container {
402
+ max-width: 1000px;
403
+ margin: 0 auto;
404
+ padding: 5px;
405
+ }
406
+
407
+ table {
408
+ width: 100%;
409
+ border-collapse: collapse;
410
+ border-radius: 10px;
411
+ overflow: hidden;
412
+ }
413
+
414
+ th, td {
415
+ padding: 8px;
416
+ text-align: center;
417
+ border: 1px solid #ddd;
418
+ font-size: 16px;
419
+ transition: background-color 0.3s;
420
+ }
421
+
422
+ th {
423
+ background-color: #f2f2f2;
424
+ font-weight: bold;
425
+ }
426
+
427
+ td:hover {
428
+ background-color: #eaeaea;
429
+ }
430
+ </style>
431
+ """,
432
+ unsafe_allow_html=True
433
+ )
434
+
435
+ # Display logo
436
+ buffered = BytesIO()
437
+ logo_image.save(buffered, format="PNG")
438
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
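+ # The logo is inlined as a base64 data URI so the HTML/CSS below can control its width and alignment directly.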
439
+
440
+ st.markdown(
441
+ f"""
442
+ <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ </div>
445
+ """,
446
+ unsafe_allow_html=True
447
+ )
448
+
449
+ st.markdown(
450
+ '''
451
+ <div class="header">
452
+ <br/>
453
+ <p style="font-size:22px;">
454
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ </p>
456
+ <p style="font-size:20px;">
457
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ </p>
460
+ </div>
461
+ ''',
462
+ unsafe_allow_html=True
463
+ )
464
+
465
+ # Load the data
466
+ data_path = "verifact_data.csv"
467
+ df = pd.read_csv(data_path)
468
+
469
+ # Assign ranks within each tier
470
+ df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ ascending=False, method='min').astype(int)
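+ # NOTE: method='min' gives tied models the same (best) rank; the int cast assumes 'Overall' has no missing values, since NaNs are only replaced with '-' afterwards.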
472
+
473
+ df.fillna('-', inplace=True)
474
+ df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # Tabs
477
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # Tab 1: Leaderboard
480
+ with tab1:
481
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ st.markdown("""
484
+ <div class="metric" style="font-size:16px;">
485
+ <p>
486
+ <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ </p>
489
+ </div>
490
+ """, unsafe_allow_html=True)
491
+
492
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ if selected_tier != 'All Metrics':
496
+ filtered_df = df[df['tier'] == selected_tier]
497
+ else:
498
+ filtered_df = df
499
+
500
+ sort_by_factuality = st.checkbox('Sort by overall score')
501
+ if sort_by_factuality:
502
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ else:
504
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # Shrink the table: wrap it in a container and cap its maximum width
507
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ current_tier = None
511
+ for _, row in updated_filtered_df.iterrows():
512
+ html += '<tr>'
513
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ current_tier = row['tier']
515
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
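+ # NOTE: rowspan="8" hard-codes eight model rows per tier (matching the 8 models noted in the header); update it if the CSV changes.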
516
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ html += '</tbody></table></div>'
519
+ st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # Tab 2: Benchmark Details
522
+ with tab2:
523
+ # Shrink the displayed PNG figure
524
+ st.image(image, width=800)
525
+ # Re-encode the figure as PNG in memory; image.tobytes() yields raw pixel data, not a valid PNG stream for a data URI.
+ fig_buffer = BytesIO()
+ image.save(fig_buffer, format="PNG")
+ fig_b64 = base64.b64encode(fig_buffer.getvalue()).decode()
+ st.markdown(f'''
+ <div style="text-align:center;">
+ <img src="data:image/png;base64,{fig_b64}" style="max-width: 800px; width: 100%;">
+ </div>
+ ''', unsafe_allow_html=True)
530
+
531
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
532
+ st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
533
+
534
+ st.markdown('### Content Categorization')
535
+ st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
536
+
537
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
538
+ st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
.history/app_20250318190954.py ADDED
@@ -0,0 +1,538 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ import streamlit as st
351
+ import pandas as pd
352
+ from PIL import Image
353
+ import base64
354
+ from io import BytesIO
355
+
356
+ # Set up page config
357
+ st.set_page_config(
358
+ page_title="VeriFact Leaderboard",
359
+ layout="wide"
360
+ )
361
+
362
+ # load header
363
+ with open("_header.md", "r") as f:
364
+ HEADER_MD = f.read()
365
+
366
+ # Load the image
367
+ image = Image.open("test.png")
368
+ logo_image = Image.open("./factrbench.png")
369
+
370
+ # Custom CSS for the page
371
+ st.markdown(
372
+ """
373
+ <style>
374
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ html, body, [class*="css"] {
377
+ font-family: 'Arial', sans-serif;
378
+ background-color: #f9f9f9;
379
+ }
380
+
381
+ .title {
382
+ font-size: 42px;
383
+ font-weight: bold;
384
+ text-align: center;
385
+ color: #333;
386
+ margin-bottom: 5px;
387
+ }
388
+
389
+ .description {
390
+ font-size: 22px;
391
+ text-align: center;
392
+ margin-bottom: 30px;
393
+ color: #555;
394
+ }
395
+
396
+ .header, .metric {
397
+ align-items: left;
398
+ margin-bottom: 20px;
399
+ }
400
+
401
+ .container {
402
+ max-width: 1000px;
403
+ margin: 0 auto;
404
+ padding: 5px;
405
+ }
406
+
407
+ table {
408
+ width: 100%;
409
+ border-collapse: collapse;
410
+ border-radius: 10px;
411
+ overflow: hidden;
412
+ }
413
+
414
+ th, td {
415
+ padding: 8px;
416
+ text-align: center;
417
+ border: 1px solid #ddd;
418
+ font-size: 16px;
419
+ transition: background-color 0.3s;
420
+ }
421
+
422
+ th {
423
+ background-color: #f2f2f2;
424
+ font-weight: bold;
425
+ }
426
+
427
+ td:hover {
428
+ background-color: #eaeaea;
429
+ }
430
+ </style>
431
+ """,
432
+ unsafe_allow_html=True
433
+ )
434
+
435
+ # Display logo
436
+ buffered = BytesIO()
437
+ logo_image.save(buffered, format="PNG")
438
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
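+ # The logo is inlined as a base64 data URI so the HTML/CSS below can control its width and alignment directly.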
439
+
440
+ st.markdown(
441
+ f"""
442
+ <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ </div>
445
+ """,
446
+ unsafe_allow_html=True
447
+ )
448
+
449
+ st.markdown(
450
+ '''
451
+ <div class="header">
452
+ <br/>
453
+ <p style="font-size:22px;">
454
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ </p>
456
+ <p style="font-size:20px;">
457
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ </p>
460
+ </div>
461
+ ''',
462
+ unsafe_allow_html=True
463
+ )
464
+
465
+ # Load the data
466
+ data_path = "verifact_data.csv"
467
+ df = pd.read_csv(data_path)
468
+
469
+ # Assign ranks within each tier
470
+ df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ ascending=False, method='min').astype(int)
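+ # NOTE: method='min' gives tied models the same (best) rank; the int cast assumes 'Overall' has no missing values, since NaNs are only replaced with '-' afterwards.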
472
+
473
+ df.fillna('-', inplace=True)
474
+ df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # Tabs
477
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # Tab 1: Leaderboard
480
+ with tab1:
481
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ st.markdown("""
484
+ <div class="metric" style="font-size:16px;">
485
+ <p>
486
+ <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ </p>
489
+ </div>
490
+ """, unsafe_allow_html=True)
491
+
492
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ if selected_tier != 'All Metrics':
496
+ filtered_df = df[df['tier'] == selected_tier]
497
+ else:
498
+ filtered_df = df
499
+
500
+ sort_by_factuality = st.checkbox('Sort by overall score')
501
+ if sort_by_factuality:
502
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ else:
504
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # Shrink the table: wrap it in a container and cap its maximum width
507
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ current_tier = None
511
+ for _, row in updated_filtered_df.iterrows():
512
+ html += '<tr>'
513
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ current_tier = row['tier']
515
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
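+ # NOTE: rowspan="8" hard-codes eight model rows per tier (matching the 8 models noted in the header); update it if the CSV changes.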
516
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ html += '</tbody></table></div>'
519
+ st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # Tab 2: Benchmark Details
522
+ with tab2:
523
+ # Shrink the displayed PNG figure
524
+ # st.image(image, width=800)
525
+ # Re-encode the figure as PNG in memory; image.tobytes() yields raw pixel data, not a valid PNG stream for a data URI.
+ fig_buffer = BytesIO()
+ image.save(fig_buffer, format="PNG")
+ fig_b64 = base64.b64encode(fig_buffer.getvalue()).decode()
+ st.markdown(f'''
+ <div style="text-align:center;">
+ <img src="data:image/png;base64,{fig_b64}" style="max-width: 800px; width: 100%;">
+ </div>
+ ''', unsafe_allow_html=True)
530
+
531
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
532
+ st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
533
+
534
+ st.markdown('### Content Categorization')
535
+ st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
536
+
537
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
538
+ st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
.history/app_20250318191111.py ADDED
@@ -0,0 +1,539 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ import streamlit as st
351
+ import pandas as pd
352
+ from PIL import Image
353
+ import base64
354
+ from io import BytesIO
355
+
356
+ # Set up page config
357
+ st.set_page_config(
358
+ page_title="VeriFact Leaderboard",
359
+ layout="wide"
360
+ )
361
+
362
+ # load header
363
+ with open("_header.md", "r") as f:
364
+ HEADER_MD = f.read()
365
+
366
+ # Load the image
367
+ image = Image.open("test.png")
368
+ logo_image = Image.open("./factrbench.png")
369
+
370
+ # Custom CSS for the page
371
+ st.markdown(
372
+ """
373
+ <style>
374
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ html, body, [class*="css"] {
377
+ font-family: 'Arial', sans-serif;
378
+ background-color: #f9f9f9;
379
+ }
380
+
381
+ .title {
382
+ font-size: 42px;
383
+ font-weight: bold;
384
+ text-align: center;
385
+ color: #333;
386
+ margin-bottom: 5px;
387
+ }
388
+
389
+ .description {
390
+ font-size: 22px;
391
+ text-align: center;
392
+ margin-bottom: 30px;
393
+ color: #555;
394
+ }
395
+
396
+ .header, .metric {
397
+ align-items: left;
398
+ margin-bottom: 20px;
399
+ }
400
+
401
+ .container {
402
+ max-width: 1000px;
403
+ margin: 0 auto;
404
+ padding: 5px;
405
+ }
406
+
407
+ table {
408
+ width: 100%;
409
+ border-collapse: collapse;
410
+ border-radius: 10px;
411
+ overflow: hidden;
412
+ }
413
+
414
+ th, td {
415
+ padding: 8px;
416
+ text-align: center;
417
+ border: 1px solid #ddd;
418
+ font-size: 16px;
419
+ transition: background-color 0.3s;
420
+ }
421
+
422
+ th {
423
+ background-color: #f2f2f2;
424
+ font-weight: bold;
425
+ }
426
+
427
+ td:hover {
428
+ background-color: #eaeaea;
429
+ }
430
+ </style>
431
+ """,
432
+ unsafe_allow_html=True
433
+ )
434
+
435
+ # Display logo
436
+ buffered = BytesIO()
437
+ logo_image.save(buffered, format="PNG")
438
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ st.markdown(
441
+ f"""
442
+ <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ </div>
445
+ """,
446
+ unsafe_allow_html=True
447
+ )
448
+
449
+ st.markdown(
450
+ '''
451
+ <div class="header">
452
+ <br/>
453
+ <p style="font-size:22px;">
454
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ </p>
456
+ <p style="font-size:20px;">
457
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ </p>
460
+ </div>
461
+ ''',
462
+ unsafe_allow_html=True
463
+ )
464
+
465
+ # Load the data
466
+ data_path = "verifact_data.csv"
467
+ df = pd.read_csv(data_path)
468
+
469
+ # Assign ranks within each tier
470
+ df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ ascending=False, method='min').astype(int)
472
+
473
+ df.fillna('-', inplace=True)
474
+ df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # Tabs
477
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # Tab 1: Leaderboard
480
+ with tab1:
481
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ st.markdown("""
484
+ <div class="metric" style="font-size:16px;">
485
+ <p>
486
+ <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ </p>
489
+ </div>
490
+ """, unsafe_allow_html=True)
491
+
492
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ if selected_tier != 'All Metrics':
496
+ filtered_df = df[df['tier'] == selected_tier]
497
+ else:
498
+ filtered_df = df
499
+
500
+ sort_by_factuality = st.checkbox('Sort by overall score')
501
+ if sort_by_factuality:
502
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ else:
504
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # Shrink the table: wrap it in a container and cap its maximum width
507
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ current_tier = None
511
+ for _, row in updated_filtered_df.iterrows():
512
+ html += '<tr>'
513
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ current_tier = row['tier']
515
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ html += '</tbody></table></div>'
519
+ st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # Tab 2: Benchmark Details
522
+ with tab2:
523
+ # Center the displayed image
524
+ buffered_img = BytesIO()
525
+ image.save(buffered_img, format="PNG")
526
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ st.markdown(f'''<div style="text-align:center;">
529
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
530
+ </div>''', unsafe_allow_html=True)
531
+
532
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ st.markdown('### Content Categorization')
536
+ st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
.history/app_20250318191134.py ADDED
@@ -0,0 +1,539 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ import streamlit as st
351
+ import pandas as pd
352
+ from PIL import Image
353
+ import base64
354
+ from io import BytesIO
355
+
356
+ # Set up page config
357
+ st.set_page_config(
358
+ page_title="VeriFact Leaderboard",
359
+ layout="wide"
360
+ )
361
+
362
+ # load header
363
+ with open("_header.md", "r") as f:
364
+ HEADER_MD = f.read()
365
+
366
+ # Load the image
367
+ image = Image.open("test.png")
368
+ logo_image = Image.open("./factrbench.png")
369
+
370
+ # Custom CSS for the page
371
+ st.markdown(
372
+ """
373
+ <style>
374
+ @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ html, body, [class*="css"] {
377
+ font-family: 'Arial', sans-serif;
378
+ background-color: #f9f9f9;
379
+ }
380
+
381
+ .title {
382
+ font-size: 42px;
383
+ font-weight: bold;
384
+ text-align: center;
385
+ color: #333;
386
+ margin-bottom: 5px;
387
+ }
388
+
389
+ .description {
390
+ font-size: 22px;
391
+ text-align: center;
392
+ margin-bottom: 30px;
393
+ color: #555;
394
+ }
395
+
396
+ .header, .metric {
397
+ align-items: left;
398
+ margin-bottom: 20px;
399
+ }
400
+
401
+ .container {
402
+ max-width: 1000px;
403
+ margin: 0 auto;
404
+ padding: 5px;
405
+ }
406
+
407
+ table {
408
+ width: 100%;
409
+ border-collapse: collapse;
410
+ border-radius: 10px;
411
+ overflow: hidden;
412
+ }
413
+
414
+ th, td {
415
+ padding: 8px;
416
+ text-align: center;
417
+ border: 1px solid #ddd;
418
+ font-size: 16px;
419
+ transition: background-color 0.3s;
420
+ }
421
+
422
+ th {
423
+ background-color: #f2f2f2;
424
+ font-weight: bold;
425
+ }
426
+
427
+ td:hover {
428
+ background-color: #eaeaea;
429
+ }
430
+ </style>
431
+ """,
432
+ unsafe_allow_html=True
433
+ )
434
+
435
+ # Display logo
436
+ buffered = BytesIO()
437
+ logo_image.save(buffered, format="PNG")
438
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ st.markdown(
441
+ f"""
442
+ <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ </div>
445
+ """,
446
+ unsafe_allow_html=True
447
+ )
448
+
449
+ st.markdown(
450
+ '''
451
+ <div class="header">
452
+ <br/>
453
+ <p style="font-size:22px;">
454
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ </p>
456
+ <p style="font-size:20px;">
457
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ </p>
460
+ </div>
461
+ ''',
462
+ unsafe_allow_html=True
463
+ )
464
+
465
+ # Load the data
466
+ data_path = "verifact_data.csv"
467
+ df = pd.read_csv(data_path)
468
+
469
+ # Assign ranks within each tier
470
+ df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ ascending=False, method='min').astype(int)
472
+
473
+ df.fillna('-', inplace=True)
474
+ df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # Tabs
477
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # Tab 1: Leaderboard
480
+ with tab1:
481
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ st.markdown("""
484
+ <div class="metric" style="font-size:16px;">
485
+ <p>
486
+ <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ </p>
489
+ </div>
490
+ """, unsafe_allow_html=True)
491
+
492
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ if selected_tier != 'All Metrics':
496
+ filtered_df = df[df['tier'] == selected_tier]
497
+ else:
498
+ filtered_df = df
499
+
500
+ sort_by_factuality = st.checkbox('Sort by overall score')
501
+ if sort_by_factuality:
502
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ else:
504
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # Shrink the table: wrap it in a container and cap its maximum width
507
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ current_tier = None
511
+ for _, row in updated_filtered_df.iterrows():
512
+ html += '<tr>'
513
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ current_tier = row['tier']
515
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ html += '</tbody></table></div>'
519
+ st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # Tab 2: Benchmark Details
522
+ with tab2:
523
+ # Center the displayed image
524
+ buffered_img = BytesIO()
525
+ image.save(buffered_img, format="PNG")
526
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ st.markdown(f'''<div style="text-align:center;">
529
+ <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ </div>''', unsafe_allow_html=True)
531
+
532
+ st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ st.markdown('### Content Categorization')
536
+ st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
.history/app_20250318191435.py ADDED
@@ -0,0 +1,662 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its maximum width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Center the displayed image
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
632
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
633
+
634
+ current_tier = None
635
+ for _, row in updated_filtered_df.iterrows():
636
+ html += '<tr>'
637
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
638
+ current_tier = row['tier']
639
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
640
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
641
+
642
+ html += '</tbody></table></div>'
643
+ st.markdown(html, unsafe_allow_html=True)
644
+
645
+ # Tab 2: Benchmark Details
646
+ with tab2:
647
+ buffered_img = BytesIO()
648
+ image.save(buffered_img, format="PNG")
649
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
650
+
651
+ st.markdown(f'''<div style="text-align:center;">
652
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
653
+ </div>''', unsafe_allow_html=True)
654
+
655
+ st.markdown('### What is VERIFACT?')
656
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
657
+
658
+ st.markdown('### What is FACTRBENCH?')
659
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
660
+
661
+ st.markdown('### Key Findings')
662
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
.history/app_20250319121041.py ADDED
@@ -0,0 +1,662 @@
 
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and limit its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Center the image
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ html = '<div style="max-width: 1200px; margin: 0 auto;"><table>'
632
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
633
+
634
+ current_tier = None
635
+ for _, row in updated_filtered_df.iterrows():
636
+ html += '<tr>'
637
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
638
+ current_tier = row['tier']
639
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
640
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
641
+
642
+ html += '</tbody></table></div>'
643
+ st.markdown(html, unsafe_allow_html=True)
644
+
645
+ # Tab 2: Benchmark Details
646
+ with tab2:
647
+ buffered_img = BytesIO()
648
+ image.save(buffered_img, format="PNG")
649
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
650
+
651
+ st.markdown(f'''<div style="text-align:center;">
652
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
653
+ </div>''', unsafe_allow_html=True)
654
+
655
+ st.markdown('### What is VERIFACT?')
656
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
657
+
658
+ st.markdown('### What is FACTRBENCH?')
659
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
660
+
661
+ st.markdown('### Key Findings')
662
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
.history/app_20250319121128.py ADDED
@@ -0,0 +1,662 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # 缩小表格:用容器包裹并限制最大宽度
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # 图片剧中显示
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
633
+
634
+ current_tier = None
635
+ for _, row in updated_filtered_df.iterrows():
636
+ html += '<tr>'
637
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
638
+ current_tier = row['tier']
639
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
640
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
641
+
642
+ html += '</tbody></table></div>'
643
+ st.markdown(html, unsafe_allow_html=True)
644
+
645
+ # Tab 2: Benchmark Details
646
+ with tab2:
647
+ buffered_img = BytesIO()
648
+ image.save(buffered_img, format="PNG")
649
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
650
+
651
+ st.markdown(f'''<div style="text-align:center;">
652
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
653
+ </div>''', unsafe_allow_html=True)
654
+
655
+ st.markdown('### What is VERIFACT?')
656
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
657
+
658
+ st.markdown('### What is FACTRBENCH?')
659
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
660
+
661
+ st.markdown('### Key Findings')
662
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
.history/app_20250319121213.py ADDED
@@ -0,0 +1,666 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Display the image centered
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 80%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
+ '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
+
638
+ current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
+ html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
+
646
+ html += '</tbody></table></div>'
647
+ st.markdown(html, unsafe_allow_html=True)
648
+
649
+ # Tab 2: Benchmark Details
650
+ with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
+
655
+ st.markdown(f'''<div style="text-align:center;">
656
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ </div>''', unsafe_allow_html=True)
658
+
659
+ st.markdown('### What is VERIFACT?')
660
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
661
+
662
+ st.markdown('### What is FACTRBENCH?')
663
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
664
+
665
+ st.markdown('### Key Findings')
666
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
.history/app_20250319121227.py ADDED
@@ -0,0 +1,666 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Display the image centered
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 70%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
+ '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
+
638
+ current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
+ html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
+
646
+ html += '</tbody></table></div>'
647
+ st.markdown(html, unsafe_allow_html=True)
648
+
649
+ # Tab 2: Benchmark Details
650
+ with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
+
655
+ st.markdown(f'''<div style="text-align:center;">
656
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ </div>''', unsafe_allow_html=True)
658
+
659
+ st.markdown('### What is VERIFACT?')
660
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
661
+
662
+ st.markdown('### What is FACTRBENCH?')
663
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
664
+
665
+ st.markdown('### Key Findings')
666
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
.history/app_20250319121240.py ADDED
@@ -0,0 +1,666 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Center the image
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
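The metrics block above defines Precision, Recall, and F1 in prose; a minimal, self-contained sketch of how the three numbers relate, using invented counts rather than any FACTRBENCH results:

# Illustration only: the counts below are made up, not taken from the benchmark.
supported = 18        # extracted facts judged correct
extracted = 24        # all facts extracted from the response
covered_refs = 15     # reference facts the response covers
total_refs = 30       # reference facts available for the prompt

precision = supported / extracted      # 0.75
recall = covered_refs / total_refs     # 0.50
f1 = 2 * precision * recall / (precision + recall)
print(f"P={precision:.2f} R={recall:.2f} F1={f1:.2f}")  # P=0.75 R=0.50 F1=0.60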
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 60%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
+ '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
+
638
+ current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
+ html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
+
646
+ html += '</tbody></table></div>'
647
+ st.markdown(html, unsafe_allow_html=True)
648
+
649
+ # Tab 2: Benchmark Details
650
+ with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
+
655
+ st.markdown(f'''<div style="text-align:center;">
656
+ <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ </div>''', unsafe_allow_html=True)
658
+
659
+ st.markdown('### What is VERIFACT?')
660
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
661
+
662
+ st.markdown('### What is FACTRBENCH?')
663
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
664
+
665
+ st.markdown('### Key Findings')
666
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
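Every snapshot of this app assigns per-tier ranks with the same pandas groupby-rank call. A small standalone sketch of what that step produces, on a toy frame with invented scores and hypothetical model names:

import pandas as pd

# Toy stand-in for verifact_data.csv; all values are invented.
df = pd.DataFrame({
    "tier": ["Precision", "Precision", "Recall", "Recall"],
    "model": ["model-a", "model-b", "model-a", "model-b"],
    "Overall": [71.2, 68.4, 55.0, 61.3],
})
# The highest Overall score in each tier gets rank 1; ties share the lower number.
df["rank"] = df.groupby("tier")["Overall"].rank(ascending=False, method="min").astype(int)
print(df.sort_values(["tier", "rank"]))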
.history/app_20250319121438.py ADDED
@@ -0,0 +1,669 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Center the image
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 60%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
+ '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>FactBench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
+
638
+ current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
+ html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
+
646
+ html += '</tbody></table></div>'
647
+ st.markdown(html, unsafe_allow_html=True)
648
+
649
+ # Tab 2: Benchmark Details
650
+ with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
+
655
+ # st.markdown(f'''<div style="text-align:center;">
656
+ # <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ # </div>''', unsafe_allow_html=True)
658
+ st.markdown(f'''<div style="text-align:center; width:85%; margin:0 auto;">
659
+ <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
660
+ </div>''', unsafe_allow_html=True)
661
+
662
+ st.markdown('### What is VERIFACT?')
663
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
664
+
665
+ st.markdown('### What is FACTRBENCH?')
666
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
667
+
668
+ st.markdown('### Key Findings')
669
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
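One detail that repeats across these snapshots: the table builder hard-codes rowspan="8", which only holds while exactly 8 models are listed per tier. A hypothetical, self-contained sketch that derives the span from the data instead (toy rows and invented scores; not part of this commit):

import pandas as pd

rows = pd.DataFrame({
    "tier": ["Precision", "Precision", "Recall"],
    "rank": [1, 2, 1],
    "model": ["model-a", "model-b", "model-a"],
    "FactBench": [70.1, 65.4, 52.0],
    "Reddit": [68.0, 66.2, 49.7],
    "Overall": [69.0, 65.8, 50.8],
})
tier_sizes = rows.groupby("tier").size().to_dict()  # e.g. {"Precision": 2, "Recall": 1}

html, current_tier = "<table><tbody>", None
for _, row in rows.iterrows():
    html += "<tr>"
    if row["tier"] != current_tier:
        current_tier = row["tier"]
        html += f'<td rowspan="{tier_sizes[current_tier]}">{current_tier}</td>'
    html += (f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td>'
             f'<td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>')
html += "</tbody></table>"
print(html)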
.history/app_20250319121451.py ADDED
@@ -0,0 +1,669 @@
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap its max width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Center the image
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
+ import streamlit as st
543
+ import pandas as pd
544
+ from PIL import Image
545
+ import base64
546
+ from io import BytesIO
547
+
548
+ # Set up page config
549
+ st.set_page_config(
550
+ page_title="VeriFact Leaderboard",
551
+ layout="wide"
552
+ )
553
+
554
+ # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
+
558
+ # Display logo
559
+ buffered = BytesIO()
560
+ logo_image.save(buffered, format="PNG")
561
+ img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
+ st.markdown(
564
+ f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
+ </div>
568
+ """,
569
+ unsafe_allow_html=True
570
+ )
571
+
572
+ st.markdown(
573
+ '''
574
+ <div class="header">
575
+ <br/>
576
+ <p style="font-size:22px;">
577
+ VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
578
+ </p>
579
+ <p style="font-size:20px;">
580
+ 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
+ </p>
583
+ </div>
584
+ ''',
585
+ unsafe_allow_html=True
586
+ )
587
+
588
+ # Load the data
589
+ data_path = "verifact_data.csv"
590
+ df = pd.read_csv(data_path)
591
+
592
+ # Assign ranks within each tier
593
+ df['rank'] = df.groupby('tier')['Overall'].rank(
594
+ ascending=False, method='min').astype(int)
595
+
596
+ df.fillna('-', inplace=True)
597
+ df['original_order'] = df.groupby('tier').cumcount()
598
+
599
+ # Tabs
600
+ tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
+
602
+ # Tab 1: Leaderboard
603
+ with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
605
+
606
+ st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
+ """, unsafe_allow_html=True)
616
+
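As a rough sketch of how the three metrics relate (illustrative only: the exact definitions and the aggregation over FACTRBENCH prompts follow the paper, and the per-response fact counts assumed here are hypothetical):

def factuality_scores(n_supported, n_extracted, n_covered, n_reference):
    # Precision: supported facts among all facts extracted from the response.
    precision = n_supported / n_extracted if n_extracted else 0.0
    # Recall: reference facts that the response actually covers.
    recall = n_covered / n_reference if n_reference else 0.0
    # F1: harmonic mean of precision and recall.
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1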
617
+ tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
+ selected_tier = st.selectbox('Select metric:', tiers)
619
+
620
+ if selected_tier != 'All Metrics':
621
+ filtered_df = df[df['tier'] == selected_tier]
622
+ else:
623
+ filtered_df = df
624
+
625
+ sort_by_factuality = st.checkbox('Sort by overall score')
626
+ if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
628
+ else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 60%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
+ '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
+
638
+ current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
+ html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
+
646
+ html += '</tbody></table></div>'
647
+ st.markdown(html, unsafe_allow_html=True)
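Note: the rowspan="8" emitted above is tied to the current count of eight models per tier. If models are added later, deriving the span from the data would keep the merged Metric cell aligned; a sketch reusing the dataframe already in scope (variable names as in the code above):

rows_per_tier = updated_filtered_df.groupby('tier').size().to_dict()
# ...then emit f'<td rowspan="{rows_per_tier[current_tier]}" style="vertical-align: middle;">'
# in place of the hardcoded rowspan="8".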
648
+
649
+ # Tab 2: Benchmark Details
650
+ with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
+
655
+ # st.markdown(f'''<div style="text-align:center;">
656
+ # <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ # </div>''', unsafe_allow_html=True)
658
+ st.markdown(f'''<div style="text-align:center; width:65%; margin:0 auto;">
659
+ <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
660
+ </div>''', unsafe_allow_html=True)
661
+
662
+ st.markdown('### What is VERIFACT?')
663
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
664
+
665
+ st.markdown('### What is FACTRBENCH?')
666
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
667
+
668
+ st.markdown('### Key Findings')
669
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
app.py CHANGED
@@ -1,3 +1,544 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from PIL import Image
@@ -10,113 +551,24 @@ st.set_page_config(
10
  layout="wide"
11
  )
12
 
13
- # load header
14
- with open("_header.md", "r") as f:
15
- HEADER_MD = f.read()
16
-
17
  # Load the image
18
- image = Image.open("verifact_steps.png")
19
- logo_image = Image.open("verifact_logo.png")
20
-
21
- # Custom CSS for the page
22
- st.markdown(
23
- """
24
- <style>
25
- @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
-
27
- html, body, [class*="css"] {
28
- font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
- background-color: #f9f9f9; /* Light grey background */
30
- }
31
-
32
- .title {
33
- font-size: 42px;
34
- font-weight: bold;
35
- text-align: center;
36
- color: #333;
37
- margin-bottom: 5px;
38
- }
39
-
40
- .description {
41
- font-size: 22px;
42
- text-align: center;
43
- margin-bottom: 30px;
44
- color: #555;
45
- }
46
-
47
- .header, .metric {
48
- align-items: left;
49
- font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
- margin-bottom: 20px;
51
- }
52
-
53
- .container {
54
- max-width: 1000px;
55
- margin: 0 auto;
56
- padding: 5px;
57
- }
58
-
59
- table {
60
- width: 100%;
61
- border-collapse: collapse;
62
- border-radius: 10px;
63
- overflow: hidden;
64
- }
65
-
66
- th, td {
67
- padding: 8px;
68
- text-align: center;
69
- border: 1px solid #ddd;
70
- font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
- font-size: 16px;
72
- transition: background-color 0.3s;
73
- }
74
-
75
- th {
76
- background-color: #f2f2f2;
77
- font-weight: bold;
78
- }
79
-
80
- td:hover {
81
- background-color: #eaeaea;
82
- }
83
- </style>
84
- """,
85
- unsafe_allow_html=True
86
- )
87
 
88
- # Display title and description
89
- st.markdown('<div class="container">', unsafe_allow_html=True)
90
- # st.image(logo_image, output_format="PNG", width=200)
91
-
92
- # Convert the image to base64
93
  buffered = BytesIO()
94
  logo_image.save(buffered, format="PNG")
95
  img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
 
96
  st.markdown(
97
  f"""
98
- <style>
99
- .logo-container {{
100
- display: flex;
101
- justify-content: flex-start; /* Aligns to the left */
102
- }}
103
- .logo-container img {{
104
- width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
- margin: 0 auto;
106
- max-width: 700px; /* Set a maximum width */
107
- background-color: transparent;
108
- }}
109
- </style>
110
- <div class="logo-container">
111
- <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
  </div>
113
  """,
114
  unsafe_allow_html=True
115
  )
116
 
117
- # header_md_text = HEADER_MD # make some parameters later
118
- # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
-
120
  st.markdown(
121
  '''
122
  <div class="header">
@@ -126,222 +578,92 @@ st.markdown(
126
  </p>
127
  <p style="font-size:20px;">
128
  # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
- ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
  </p>
131
  </div>
132
  ''',
133
  unsafe_allow_html=True
134
  )
135
 
136
-
137
- # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
- # unsafe_allow_html=True)
139
- # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
- st.markdown('</div>', unsafe_allow_html=True)
141
-
142
  # Load the data
143
  data_path = "verifact_data.csv"
144
  df = pd.read_csv(data_path)
145
 
146
- # Assign ranks within each tier based on factuality_score
147
  df['rank'] = df.groupby('tier')['Overall'].rank(
148
  ascending=False, method='min').astype(int)
149
 
150
- # Replace NaN values with '-'
151
  df.fillna('-', inplace=True)
152
-
153
  df['original_order'] = df.groupby('tier').cumcount()
154
 
155
- # Create tabs
156
- st.markdown("""
157
- <style>
158
- .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
- font-size: 20px;
160
- }
161
- </style>
162
- """, unsafe_allow_html=True)
163
-
164
  tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
 
166
  # Tab 1: Leaderboard
167
  with tab1:
168
- # df['original_order'] = df.groupby('tier').cumcount()
169
- # print(df['original_order'])
170
-
171
- # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
- st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
 
174
  st.markdown("""
175
- <div class="metric" style="font-size:20px; font-weight: bold;">
176
- Metrics Explanation
177
- </div>
 
 
 
 
 
178
  """, unsafe_allow_html=True)
179
 
180
- st.markdown("""
181
- <div class="metric" style="font-size:16px;">
182
- <br/>
183
- <p>
184
- <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
- </p>
186
- <p>
187
- 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
- </p>
189
- </div>
190
- """,
191
- unsafe_allow_html=True
192
- )
193
-
194
- st.markdown("""
195
- <style>
196
- /* Selectbox text */
197
- div[data-baseweb="select"] > div {
198
- font-size: 20px;
199
- }
200
-
201
- /* Dropdown options */
202
- div[role="listbox"] ul li {
203
- font-size: 20px !important;
204
- }
205
-
206
- /* Checkbox label */
207
- .stCheckbox label p {
208
- font-size: 20px !important;
209
- }
210
-
211
- /* Selectbox label */
212
- .stSelectbox label p {
213
- font-size: 20px !important;
214
- }
215
- </style>
216
- """, unsafe_allow_html=True)
217
-
218
- # Dropdown menu to filter tiers
219
  tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
  selected_tier = st.selectbox('Select metric:', tiers)
221
 
222
- # Filter the data based on the selected tier
223
  if selected_tier != 'All Metrics':
224
  filtered_df = df[df['tier'] == selected_tier]
225
  else:
226
  filtered_df = df
227
 
228
  sort_by_factuality = st.checkbox('Sort by overall score')
229
-
230
- # Sort the dataframe based on Factuality Score if the checkbox is selected
231
  if sort_by_factuality:
232
- updated_filtered_df = filtered_df.sort_values(
233
- by=['tier', 'Overall'], ascending=[True, False]
234
- )
235
  else:
236
- updated_filtered_df = filtered_df.sort_values(
237
- by=['tier', 'original_order']
238
- )
239
-
240
- # Create HTML for the table
241
- if selected_tier == 'All Metrics':
242
- html = '''
243
- <table>
244
- <thead>
245
- <tr>
246
- <th>Metric</th>
247
- <th>Rank</th>
248
- <th>Model</th>
249
- <th>Factbench</th>
250
- <th>Reddit</th>
251
- <th>Overall</th>
252
- </tr>
253
- </thead>
254
- <tbody>
255
- '''
256
- else:
257
- html = '''
258
- <table>
259
- <thead>
260
- <tr>
261
- <th>Rank</th>
262
- <th>Model</th>
263
- <th>Factbench</th>
264
- <th>Reddit</th>
265
- <th>Overall</th>
266
- </tr>
267
- </thead>
268
- <tbody>
269
  '''
 
270
 
271
- # Generate the rows of the table
272
  current_tier = None
273
- for i, row in updated_filtered_df.iterrows():
274
  html += '<tr>'
 
 
 
 
275
 
276
- # Only display the 'Tier' column if 'All Tiers' is selected
277
- if selected_tier == 'All Metrics':
278
- if row['tier'] != current_tier:
279
- current_tier = row['tier']
280
- html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
-
282
- # Fill in model and scores
283
- html += f'''
284
- <td>{row['rank']}</td>
285
- <td>{row['model']}</td>
286
- <td>{row['FactBench']}</td>
287
- <td>{row['Reddit']}</td>
288
- <td>{row['Overall']}</td>
289
- </tr>
290
- '''
291
-
292
- # Close the table
293
- html += '''
294
- </table>
295
- '''
296
-
297
- # Display the table
298
  st.markdown(html, unsafe_allow_html=True)
299
 
300
- st.markdown('</div>', unsafe_allow_html=True)
301
-
302
- # Tab 2: Details
303
  with tab2:
304
- st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
-
306
- # st.markdown('<div class="title"></div>',
307
- # unsafe_allow_html=True)
308
- st.image(image, use_column_width=True)
309
-
310
- st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
- st.write(
312
- "Language models (LMs) are widely used by an increasing number of users, "
313
- "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
- "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
- "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
- )
317
-
318
- st.markdown('### Content Categorization')
319
- st.write(
320
- "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
- "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
- "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
- )
324
-
325
- st.markdown('### Hallucination Prompts & FactBench Dataset')
326
- st.write(
327
- "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
- "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
- "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
- "regularly updated with new prompts."
331
- )
332
-
333
- st.markdown('</div>', unsafe_allow_html=True)
334
-
335
- # # Tab 3: Links
336
- # with tab3:
337
- # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
 
339
- # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
- # unsafe_allow_html=True)
 
 
 
 
341
 
342
- # st.markdown(
343
- # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
- # st.markdown(
345
- # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
 
347
- # st.markdown('</div>', unsafe_allow_html=True)
 
 
 
 
 
1
+ # import streamlit as st
2
+ # import pandas as pd
3
+ # from PIL import Image
4
+ # import base64
5
+ # from io import BytesIO
6
+
7
+ # # Set up page config
8
+ # st.set_page_config(
9
+ # page_title="VeriFact Leaderboard",
10
+ # layout="wide"
11
+ # )
12
+
13
+ # # load header
14
+ # with open("_header.md", "r") as f:
15
+ # HEADER_MD = f.read()
16
+
17
+ # # Load the image
18
+ # image = Image.open("test.png")
19
+ # logo_image = Image.open("./factrbench.png")
20
+
21
+ # # Custom CSS for the page
22
+ # st.markdown(
23
+ # """
24
+ # <style>
25
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
26
+
27
+ # html, body, [class*="css"] {
28
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
29
+ # background-color: #f9f9f9; /* Light grey background */
30
+ # }
31
+
32
+ # .title {
33
+ # font-size: 42px;
34
+ # font-weight: bold;
35
+ # text-align: center;
36
+ # color: #333;
37
+ # margin-bottom: 5px;
38
+ # }
39
+
40
+ # .description {
41
+ # font-size: 22px;
42
+ # text-align: center;
43
+ # margin-bottom: 30px;
44
+ # color: #555;
45
+ # }
46
+
47
+ # .header, .metric {
48
+ # align-items: left;
49
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
50
+ # margin-bottom: 20px;
51
+ # }
52
+
53
+ # .container {
54
+ # max-width: 1000px;
55
+ # margin: 0 auto;
56
+ # padding: 5px;
57
+ # }
58
+
59
+ # table {
60
+ # width: 100%;
61
+ # border-collapse: collapse;
62
+ # border-radius: 10px;
63
+ # overflow: hidden;
64
+ # }
65
+
66
+ # th, td {
67
+ # padding: 8px;
68
+ # text-align: center;
69
+ # border: 1px solid #ddd;
70
+ # font-family: 'Arial', sans-serif; /* or use a similar sans-serif font */
71
+ # font-size: 16px;
72
+ # transition: background-color 0.3s;
73
+ # }
74
+
75
+ # th {
76
+ # background-color: #f2f2f2;
77
+ # font-weight: bold;
78
+ # }
79
+
80
+ # td:hover {
81
+ # background-color: #eaeaea;
82
+ # }
83
+ # </style>
84
+ # """,
85
+ # unsafe_allow_html=True
86
+ # )
87
+
88
+ # # Display title and description
89
+ # st.markdown('<div class="container">', unsafe_allow_html=True)
90
+ # # st.image(logo_image, output_format="PNG", width=200)
91
+
92
+ # # Convert the image to base64
93
+ # buffered = BytesIO()
94
+ # logo_image.save(buffered, format="PNG")
95
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
96
+ # st.markdown(
97
+ # f"""
98
+ # <style>
99
+ # .logo-container {{
100
+ # display: flex;
101
+ # justify-content: flex-start; /* Aligns to the left */
102
+ # }}
103
+ # .logo-container img {{
104
+ # width: 50%; /* Adjust this to control the width, e.g., 50% of container width */
105
+ # margin: 0 auto;
106
+ # max-width: 700px; /* Set a maximum width */
107
+ # background-color: transparent;
108
+ # }}
109
+ # </style>
110
+ # <div class="logo-container">
111
+ # <img src="data:image/png;base64,{img_data}" alt="VeriFact Leaderboard Logo">
112
+ # </div>
113
+ # """,
114
+ # unsafe_allow_html=True
115
+ # )
116
+
117
+ # # header_md_text = HEADER_MD # make some parameters later
118
+ # # gr.Markdown(header_md_text, elem_classes="markdown-text")
119
+
120
+ # st.markdown(
121
+ # '''
122
+ # <div class="header">
123
+ # <br/>
124
+ # <p style="font-size:22px;">
125
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
126
+ # </p>
127
+ # <p style="font-size:20px;">
128
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
129
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
130
+ # </p>
131
+ # </div>
132
+ # ''',
133
+ # unsafe_allow_html=True
134
+ # )
135
+
136
+
137
+ # # st.markdown('<div class="title">VeriFact Leaderboard</div>',
138
+ # # unsafe_allow_html=True)
139
+ # # st.markdown('<div class="description">Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts</div>', unsafe_allow_html=True)
140
+ # st.markdown('</div>', unsafe_allow_html=True)
141
+
142
+ # # Load the data
143
+ # data_path = "verifact_data.csv"
144
+ # df = pd.read_csv(data_path)
145
+
146
+ # # Assign ranks within each tier based on factuality_score
147
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
148
+ # ascending=False, method='min').astype(int)
149
+
150
+ # # Replace NaN values with '-'
151
+ # df.fillna('-', inplace=True)
152
+
153
+ # df['original_order'] = df.groupby('tier').cumcount()
154
+
155
+ # # Create tabs
156
+ # st.markdown("""
157
+ # <style>
158
+ # .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
159
+ # font-size: 20px;
160
+ # }
161
+ # </style>
162
+ # """, unsafe_allow_html=True)
163
+
164
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
165
+
166
+ # # Tab 1: Leaderboard
167
+ # with tab1:
168
+ # # df['original_order'] = df.groupby('tier').cumcount()
169
+ # # print(df['original_order'])
170
+
171
+ # # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
172
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
173
+
174
+ # st.markdown("""
175
+ # <div class="metric" style="font-size:20px; font-weight: bold;">
176
+ # Metrics Explanation
177
+ # </div>
178
+ # """, unsafe_allow_html=True)
179
+
180
+ # st.markdown("""
181
+ # <div class="metric" style="font-size:16px;">
182
+ # <br/>
183
+ # <p>
184
+ # <strong> 🎯 Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> 🌀 Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average length of the response in terms of the number of tokens, the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecidable</strong>), and the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>).
185
+ # </p>
186
+ # <p>
187
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
188
+ # </p>
189
+ # </div>
190
+ # """,
191
+ # unsafe_allow_html=True
192
+ # )
193
+
194
+ # st.markdown("""
195
+ # <style>
196
+ # /* Selectbox text */
197
+ # div[data-baseweb="select"] > div {
198
+ # font-size: 20px;
199
+ # }
200
+
201
+ # /* Dropdown options */
202
+ # div[role="listbox"] ul li {
203
+ # font-size: 20px !important;
204
+ # }
205
+
206
+ # /* Checkbox label */
207
+ # .stCheckbox label p {
208
+ # font-size: 20px !important;
209
+ # }
210
+
211
+ # /* Selectbox label */
212
+ # .stSelectbox label p {
213
+ # font-size: 20px !important;
214
+ # }
215
+ # </style>
216
+ # """, unsafe_allow_html=True)
217
+
218
+ # # Dropdown menu to filter tiers
219
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
220
+ # selected_tier = st.selectbox('Select metric:', tiers)
221
+
222
+ # # Filter the data based on the selected tier
223
+ # if selected_tier != 'All Metrics':
224
+ # filtered_df = df[df['tier'] == selected_tier]
225
+ # else:
226
+ # filtered_df = df
227
+
228
+ # sort_by_factuality = st.checkbox('Sort by overall score')
229
+
230
+ # # Sort the dataframe based on Factuality Score if the checkbox is selected
231
+ # if sort_by_factuality:
232
+ # updated_filtered_df = filtered_df.sort_values(
233
+ # by=['tier', 'Overall'], ascending=[True, False]
234
+ # )
235
+ # else:
236
+ # updated_filtered_df = filtered_df.sort_values(
237
+ # by=['tier', 'original_order']
238
+ # )
239
+
240
+ # # Create HTML for the table
241
+ # if selected_tier == 'All Metrics':
242
+ # html = '''
243
+ # <table>
244
+ # <thead>
245
+ # <tr>
246
+ # <th>Metric</th>
247
+ # <th>Rank</th>
248
+ # <th>Model</th>
249
+ # <th>Factbench</th>
250
+ # <th>Reddit</th>
251
+ # <th>Overall</th>
252
+ # </tr>
253
+ # </thead>
254
+ # <tbody>
255
+ # '''
256
+ # else:
257
+ # html = '''
258
+ # <table>
259
+ # <thead>
260
+ # <tr>
261
+ # <th>Rank</th>
262
+ # <th>Model</th>
263
+ # <th>Factbench</th>
264
+ # <th>Reddit</th>
265
+ # <th>Overall</th>
266
+ # </tr>
267
+ # </thead>
268
+ # <tbody>
269
+ # '''
270
+
271
+ # # Generate the rows of the table
272
+ # current_tier = None
273
+ # for i, row in updated_filtered_df.iterrows():
274
+ # html += '<tr>'
275
+
276
+ # # Only display the 'Tier' column if 'All Tiers' is selected
277
+ # if selected_tier == 'All Metrics':
278
+ # if row['tier'] != current_tier:
279
+ # current_tier = row['tier']
280
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
281
+
282
+ # # Fill in model and scores
283
+ # html += f'''
284
+ # <td>{row['rank']}</td>
285
+ # <td>{row['model']}</td>
286
+ # <td>{row['FactBench']}</td>
287
+ # <td>{row['Reddit']}</td>
288
+ # <td>{row['Overall']}</td>
289
+ # </tr>
290
+ # '''
291
+
292
+ # # Close the table
293
+ # html += '''
294
+ # </table>
295
+ # '''
296
+
297
+ # # Display the table
298
+ # st.markdown(html, unsafe_allow_html=True)
299
+
300
+ # st.markdown('</div>', unsafe_allow_html=True)
301
+
302
+ # # Tab 2: Details
303
+ # with tab2:
304
+ # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
305
+
306
+ # # st.markdown('<div class="title"></div>',
307
+ # # unsafe_allow_html=True)
308
+ # st.image(image, use_column_width=True)
309
+
310
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
311
+ # st.write(
312
+ # "Language models (LMs) are widely used by an increasing number of users, "
313
+ # "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
314
+ # "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
315
+ # "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
316
+ # )
317
+
318
+ # st.markdown('### Content Categorization')
319
+ # st.write(
320
+ # "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
321
+ # "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
322
+ # "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
323
+ # )
324
+
325
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
326
+ # st.write(
327
+ # "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
328
+ # "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
329
+ # "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
330
+ # "regularly updated with new prompts."
331
+ # )
332
+
333
+ # st.markdown('</div>', unsafe_allow_html=True)
334
+
335
+ # # # Tab 3: Links
336
+ # # with tab3:
337
+ # # st.markdown('<div class="tab-content">', unsafe_allow_html=True)
338
+
339
+ # # st.markdown('<div class="title">Submit your model information on our Github</div>',
340
+ # # unsafe_allow_html=True)
341
+
342
+ # # st.markdown(
343
+ # # '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
344
+ # # st.markdown(
345
+ # # '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
346
+
347
+ # # st.markdown('</div>', unsafe_allow_html=True)
348
+
349
+
350
+ # import streamlit as st
351
+ # import pandas as pd
352
+ # from PIL import Image
353
+ # import base64
354
+ # from io import BytesIO
355
+
356
+ # # Set up page config
357
+ # st.set_page_config(
358
+ # page_title="VeriFact Leaderboard",
359
+ # layout="wide"
360
+ # )
361
+
362
+ # # load header
363
+ # with open("_header.md", "r") as f:
364
+ # HEADER_MD = f.read()
365
+
366
+ # # Load the image
367
+ # image = Image.open("test.png")
368
+ # logo_image = Image.open("./factrbench.png")
369
+
370
+ # # Custom CSS for the page
371
+ # st.markdown(
372
+ # """
373
+ # <style>
374
+ # @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
375
+
376
+ # html, body, [class*="css"] {
377
+ # font-family: 'Arial', sans-serif;
378
+ # background-color: #f9f9f9;
379
+ # }
380
+
381
+ # .title {
382
+ # font-size: 42px;
383
+ # font-weight: bold;
384
+ # text-align: center;
385
+ # color: #333;
386
+ # margin-bottom: 5px;
387
+ # }
388
+
389
+ # .description {
390
+ # font-size: 22px;
391
+ # text-align: center;
392
+ # margin-bottom: 30px;
393
+ # color: #555;
394
+ # }
395
+
396
+ # .header, .metric {
397
+ # align-items: left;
398
+ # margin-bottom: 20px;
399
+ # }
400
+
401
+ # .container {
402
+ # max-width: 1000px;
403
+ # margin: 0 auto;
404
+ # padding: 5px;
405
+ # }
406
+
407
+ # table {
408
+ # width: 100%;
409
+ # border-collapse: collapse;
410
+ # border-radius: 10px;
411
+ # overflow: hidden;
412
+ # }
413
+
414
+ # th, td {
415
+ # padding: 8px;
416
+ # text-align: center;
417
+ # border: 1px solid #ddd;
418
+ # font-size: 16px;
419
+ # transition: background-color 0.3s;
420
+ # }
421
+
422
+ # th {
423
+ # background-color: #f2f2f2;
424
+ # font-weight: bold;
425
+ # }
426
+
427
+ # td:hover {
428
+ # background-color: #eaeaea;
429
+ # }
430
+ # </style>
431
+ # """,
432
+ # unsafe_allow_html=True
433
+ # )
434
+
435
+ # # Display logo
436
+ # buffered = BytesIO()
437
+ # logo_image.save(buffered, format="PNG")
438
+ # img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
439
+
440
+ # st.markdown(
441
+ # f"""
442
+ # <div class="logo-container" style="display:flex; justify-content: flex-start;">
443
+ # <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
444
+ # </div>
445
+ # """,
446
+ # unsafe_allow_html=True
447
+ # )
448
+
449
+ # st.markdown(
450
+ # '''
451
+ # <div class="header">
452
+ # <br/>
453
+ # <p style="font-size:22px;">
454
+ # VERIFACT: Enhancing Long-Form Factuality Evaluation with Refined Fact Extraction and Reference Facts
455
+ # </p>
456
+ # <p style="font-size:20px;">
457
+ # # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
458
+ # ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>???</strong>
459
+ # </p>
460
+ # </div>
461
+ # ''',
462
+ # unsafe_allow_html=True
463
+ # )
464
+
465
+ # # Load the data
466
+ # data_path = "verifact_data.csv"
467
+ # df = pd.read_csv(data_path)
468
+
469
+ # # Assign ranks within each tier
470
+ # df['rank'] = df.groupby('tier')['Overall'].rank(
471
+ # ascending=False, method='min').astype(int)
472
+
473
+ # df.fillna('-', inplace=True)
474
+ # df['original_order'] = df.groupby('tier').cumcount()
475
+
476
+ # # Tabs
477
+ # tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
478
+
479
+ # # Tab 1: Leaderboard
480
+ # with tab1:
481
+ # st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
482
+
483
+ # st.markdown("""
484
+ # <div class="metric" style="font-size:16px;">
485
+ # <p>
486
+ # <strong> 🎯 Factual Precision </strong>, <strong> 🌀 Hallucination Score </strong> and other statistics are described in the paper.
487
+ # 🔒 for closed LLMs; 🔑 for open-weights LLMs; 🚨 for newly added models
488
+ # </p>
489
+ # </div>
490
+ # """, unsafe_allow_html=True)
491
+
492
+ # tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
493
+ # selected_tier = st.selectbox('Select metric:', tiers)
494
+
495
+ # if selected_tier != 'All Metrics':
496
+ # filtered_df = df[df['tier'] == selected_tier]
497
+ # else:
498
+ # filtered_df = df
499
+
500
+ # sort_by_factuality = st.checkbox('Sort by overall score')
501
+ # if sort_by_factuality:
502
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
503
+ # else:
504
+ # updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
505
+
506
+ # # Shrink the table: wrap it in a container and cap the maximum width
507
+ # html = '<div style="max-width: 1000px; margin: 0 auto;"><table>'
508
+ # html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
509
+
510
+ # current_tier = None
511
+ # for _, row in updated_filtered_df.iterrows():
512
+ # html += '<tr>'
513
+ # if selected_tier == 'All Metrics' and row['tier'] != current_tier:
514
+ # current_tier = row['tier']
515
+ # html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
516
+ # html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
517
+
518
+ # html += '</tbody></table></div>'
519
+ # st.markdown(html, unsafe_allow_html=True)
520
+
521
+ # # Tab 2: Benchmark Details
522
+ # with tab2:
523
+ # # Display the image centered
524
+ # buffered_img = BytesIO()
525
+ # image.save(buffered_img, format="PNG")
526
+ # image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
527
+
528
+ # st.markdown(f'''<div style="text-align:center;">
529
+ # <img src="data:image/png;base64,{image_data}" style="max-width:1200px; width:100%; height:auto;" />
530
+ # </div>''', unsafe_allow_html=True)
531
+
532
+ # st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
533
+ # st.write("Language models (LMs) are widely used by an increasing number of users, underscoring the challenge of maintaining factual accuracy across a broad range of topics. We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), a pipeline to evaluate LMs' factual accuracy in real-world user interactions.")
534
+
535
+ # st.markdown('### Content Categorization')
536
+ # st.write("VERIFY considers the verifiability of LM-generated content and categorizes content units as `supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods.")
537
+
538
+ # st.markdown('### Hallucination Prompts & FactBench Dataset')
539
+ # st.write("Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is regularly updated with new prompts.")
540
+
541
+
542
  import streamlit as st
543
  import pandas as pd
544
  from PIL import Image
 
551
  layout="wide"
552
  )
553
 
 
 
 
 
554
  # Load the image
555
+ image = Image.open("test.png")
556
+ logo_image = Image.open("./factrbench.png")
557
 
558
+ # Display logo
 
 
 
 
559
  buffered = BytesIO()
560
  logo_image.save(buffered, format="PNG")
561
  img_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
562
+
563
  st.markdown(
564
  f"""
565
+ <div class="logo-container" style="display:flex; justify-content: center;">
566
+ <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
567
  </div>
568
  """,
569
  unsafe_allow_html=True
570
  )
571
 
 
 
 
572
  st.markdown(
573
  '''
574
  <div class="header">
 
578
  </p>
579
  <p style="font-size:20px;">
580
  # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a>
581
+ ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 8 | Updated: <strong>Feb 2025</strong>
582
  </p>
583
  </div>
584
  ''',
585
  unsafe_allow_html=True
586
  )
587
 
 
 
 
 
 
 
588
  # Load the data
589
  data_path = "verifact_data.csv"
590
  df = pd.read_csv(data_path)
591
 
592
+ # Assign ranks within each tier
593
  df['rank'] = df.groupby('tier')['Overall'].rank(
594
  ascending=False, method='min').astype(int)
595
 
 
596
  df.fillna('-', inplace=True)
 
597
  df['original_order'] = df.groupby('tier').cumcount()
598
 
599
+ # Tabs
 
 
 
 
 
 
 
 
600
  tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
601
 
602
  # Tab 1: Leaderboard
603
  with tab1:
604
+ st.markdown('<div class="metric" style="font-size:20px; font-weight: bold;">Metrics Explanation</div>', unsafe_allow_html=True)
 
 
 
 
605
 
606
  st.markdown("""
607
+ <div class="metric" style="font-size:16px;">
608
+ <p>
609
+ <strong> 🎯 Precision </strong> measures the ratio of correct facts among all extracted facts. <br>
610
+ <strong> 🔎 Recall </strong> assesses how many reference facts are covered by model outputs. <br>
611
+ <strong> ⚖️ F1 </strong> balances precision and recall for comprehensive factual evaluation.<br>
612
+ This leaderboard benchmarks models on FACTRBENCH, a challenging dataset with real-world prompts and verified reference fact sets.<br>
613
+ </p>
614
+ </div>
615
  """, unsafe_allow_html=True)
616
 
617
  tiers = ['All Metrics', 'Precision', 'Recall', 'F1']
618
  selected_tier = st.selectbox('Select metric:', tiers)
619
 
 
620
  if selected_tier != 'All Metrics':
621
  filtered_df = df[df['tier'] == selected_tier]
622
  else:
623
  filtered_df = df
624
 
625
  sort_by_factuality = st.checkbox('Sort by overall score')
 
 
626
  if sort_by_factuality:
627
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'Overall'], ascending=[True, False])
 
 
628
  else:
629
+ updated_filtered_df = filtered_df.sort_values(by=['tier', 'original_order'])
630
+
631
+ # html = '<div style="max-width: 2000px; margin: 0 auto;"><table>'
632
+ html = '''
633
+ <div style="width: 60%; margin: 0 auto;">
634
+ <table style="width: 100%;">
635
  '''
636
+ html += """<thead><tr>""" + ("<th>Metric</th>" if selected_tier == 'All Metrics' else "") + "<th>Rank</th><th>Model</th><th>Factbench</th><th>Reddit</th><th>Overall</th></tr></thead><tbody>"
637
 
 
638
  current_tier = None
639
+ for _, row in updated_filtered_df.iterrows():
640
  html += '<tr>'
641
+ if selected_tier == 'All Metrics' and row['tier'] != current_tier:
642
+ current_tier = row['tier']
643
+ html += f'<td rowspan="8" style="vertical-align: middle;">{current_tier}</td>'
644
+ html += f'<td>{row["rank"]}</td><td>{row["model"]}</td><td>{row["FactBench"]}</td><td>{row["Reddit"]}</td><td>{row["Overall"]}</td></tr>'
645
 
646
+ html += '</tbody></table></div>'
647
  st.markdown(html, unsafe_allow_html=True)
648
 
649
+ # Tab 2: Benchmark Details
 
 
650
  with tab2:
651
+ buffered_img = BytesIO()
652
+ image.save(buffered_img, format="PNG")
653
+ image_data = base64.b64encode(buffered_img.getvalue()).decode("utf-8")
654
 
655
+ # st.markdown(f'''<div style="text-align:center;">
656
+ # <img src="data:image/png;base64,{image_data}" style="max-width:800px; width:100%; height:auto;" />
657
+ # </div>''', unsafe_allow_html=True)
658
+ st.markdown(f'''<div style="text-align:center; width:65%; margin:0 auto;">
659
+ <img src="data:image/png;base64,{image_data}" style="width:100%; height:auto;" />
660
+ </div>''', unsafe_allow_html=True)
661
 
662
+ st.markdown('### What is VERIFACT?')
663
+ st.write("VERIFACT is a factuality evaluation framework for long-form language model responses. It improves fact extraction by identifying and refining incomplete or missing facts and capturing inter-sentence dependencies.")
 
 
664
 
665
+ st.markdown('### What is FACTRBENCH?')
666
+ st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation that provides reference fact sets and external evidence, allowing both precision and recall evaluation across 1,096 real-world prompts from FactBench and Reddit.")
667
+
668
+ st.markdown('### Key Findings')
669
+ st.write("VERIFACT outperforms prior methods in reducing incomplete and missing facts, resulting in more reliable factuality evaluation. Larger models show better precision and recall, but high precision does not always mean high recall — highlighting the need to consider both.")
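For intuition on the precision/recall gap noted above, a quick worked example with invented numbers: a response whose extracted facts are almost all correct but which covers few reference facts still ends up with a low F1.

precision, recall = 0.90, 0.30   # hypothetical model
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 2))              # 0.45: high precision alone is not enough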
verifact_steps.svg ADDED