hongfu_test_20250701
#18
by
hxiang
- opened
- .idea/workspace.xml +0 -58
- app.py +35 -82
- assets/text.py +3 -3
- changelog.md +1 -12
- data/ChineseGuardBench.csv +0 -33
- data/chinese_benchmark_gen.csv +0 -2
- data/chinese_benchmark_per.csv +1 -1
- data/subclass_gen.csv +1 -1
- data/subclass_per.csv +1 -1
.idea/workspace.xml
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="ChangeListManager">
|
4 |
-
<list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
|
5 |
-
<option name="SHOW_DIALOG" value="false" />
|
6 |
-
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
7 |
-
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
8 |
-
<option name="LAST_RESOLUTION" value="IGNORE" />
|
9 |
-
</component>
|
10 |
-
<component name="Git.Settings">
|
11 |
-
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
12 |
-
</component>
|
13 |
-
<component name="MarkdownSettingsMigration">
|
14 |
-
<option name="stateVersion" value="1" />
|
15 |
-
</component>
|
16 |
-
<component name="ProjectColorInfo">{
|
17 |
-
"customColor": "",
|
18 |
-
"associatedIndex": 2
|
19 |
-
}</component>
|
20 |
-
<component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
|
21 |
-
<component name="ProjectViewState">
|
22 |
-
<option name="hideEmptyMiddlePackages" value="true" />
|
23 |
-
<option name="showLibraryContents" value="true" />
|
24 |
-
</component>
|
25 |
-
<component name="PropertiesComponent"><![CDATA[{
|
26 |
-
"keyToString": {
|
27 |
-
"RunOnceActivity.OpenProjectViewOnStart": "true",
|
28 |
-
"RunOnceActivity.ShowReadmeOnStart": "true",
|
29 |
-
"git-widget-placeholder": "pr/18",
|
30 |
-
"last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
|
31 |
-
"nodejs_package_manager_path": "npm",
|
32 |
-
"vue.rearranger.settings.migration": "true"
|
33 |
-
}
|
34 |
-
}]]></component>
|
35 |
-
<component name="SharedIndexes">
|
36 |
-
<attachedChunks>
|
37 |
-
<set>
|
38 |
-
<option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
|
39 |
-
</set>
|
40 |
-
</attachedChunks>
|
41 |
-
</component>
|
42 |
-
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
43 |
-
<component name="TaskManager">
|
44 |
-
<task active="true" id="Default" summary="Default task">
|
45 |
-
<changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
|
46 |
-
<created>1751365967779</created>
|
47 |
-
<option name="number" value="Default" />
|
48 |
-
<option name="presentableId" value="Default" />
|
49 |
-
<updated>1751365967779</updated>
|
50 |
-
<workItem from="1751365968934" duration="39000" />
|
51 |
-
<workItem from="1751366116696" duration="54000" />
|
52 |
-
</task>
|
53 |
-
<servers />
|
54 |
-
</component>
|
55 |
-
<component name="TypeScriptGeneratedFilesManager">
|
56 |
-
<option name="version" value="3" />
|
57 |
-
</component>
|
58 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -6,16 +6,15 @@ import pandas as pd
|
|
6 |
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
|
7 |
|
8 |
|
9 |
-
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv",
|
10 |
-
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv",
|
11 |
|
12 |
-
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv",
|
13 |
-
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv",
|
14 |
-
|
15 |
-
ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
|
16 |
|
17 |
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
|
18 |
|
|
|
19 |
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
20 |
|
21 |
#SPLITS = ["Overall", "Subclass"]
|
@@ -27,10 +26,9 @@ CLASSIFICATION = {
|
|
27 |
"~30B",
|
28 |
"10B~20B",
|
29 |
"5B~10B",
|
30 |
-
"1B~5B",
|
31 |
"API",
|
32 |
]
|
33 |
-
|
34 |
}
|
35 |
|
36 |
|
@@ -38,17 +36,17 @@ CLASSIFICATION = {
|
|
38 |
|
39 |
_BIBTEX = """
|
40 |
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
|
41 |
-
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
|
42 |
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
|
43 |
year={2024},
|
44 |
eprint={2410.18491},
|
45 |
archivePrefix={arXiv},
|
46 |
primaryClass={cs.CL},
|
47 |
-
url={https://arxiv.org/abs/2410.18491},
|
48 |
}
|
49 |
"""
|
50 |
|
51 |
-
_LAST_UPDATED = "
|
52 |
|
53 |
banner_url = "./assets/logo.png"
|
54 |
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
|
@@ -64,31 +62,18 @@ def format_csv_numbers(text):
|
|
64 |
|
65 |
def format_csv_numbers_second(text):
|
66 |
return text.split()
|
67 |
-
|
68 |
-
|
69 |
def format_number(x):
|
70 |
return float(f"{x:.3}")
|
71 |
|
72 |
|
73 |
-
def get_dataset_new_csv(
|
74 |
-
model_size: List[str],
|
75 |
-
):
|
76 |
-
df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
|
77 |
-
df = df.drop(columns="Size")
|
78 |
-
|
79 |
-
leaderboard_table = gr.components.Dataframe(
|
80 |
-
value=df,
|
81 |
-
interactive=False,
|
82 |
-
visible=True,
|
83 |
-
)
|
84 |
-
return leaderboard_table
|
85 |
-
|
86 |
def get_dataset_csv(
|
87 |
model_size: List[str],
|
88 |
):
|
89 |
df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
|
90 |
df = df.drop(columns="Size")
|
91 |
-
|
92 |
leaderboard_table = gr.components.Dataframe(
|
93 |
value=df,
|
94 |
interactive=False,
|
@@ -116,11 +101,11 @@ def get_dataset_csv_sub_gen(
|
|
116 |
):
|
117 |
df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
|
118 |
df = df.drop(columns="Size")
|
119 |
-
|
120 |
# get subclass
|
121 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
122 |
df = df[subclass_choice_label]
|
123 |
-
|
124 |
leaderboard_table = gr.components.Dataframe(
|
125 |
value=df,
|
126 |
interactive=False,
|
@@ -135,11 +120,11 @@ def get_dataset_csv_sub_per(
|
|
135 |
):
|
136 |
df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
|
137 |
df = df.drop(columns="Size")
|
138 |
-
|
139 |
# get subclass
|
140 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
141 |
df = df[subclass_choice_label]
|
142 |
-
|
143 |
leaderboard_table = gr.components.Dataframe(
|
144 |
value=df,
|
145 |
interactive=False,
|
@@ -158,15 +143,7 @@ def get_dataset_classfier_gen(
|
|
158 |
subclass_choice = main_choice
|
159 |
leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
|
160 |
return leaderboard_table
|
161 |
-
|
162 |
-
def get_ChineseGuardBench(
|
163 |
-
model_size: List[str],
|
164 |
-
main_choice: List[str],
|
165 |
-
):
|
166 |
-
leaderboard_table = get_dataset_new_csv(model_size)
|
167 |
-
return leaderboard_table
|
168 |
-
|
169 |
-
|
170 |
def get_dataset_classfier_per(
|
171 |
model_size: List[str],
|
172 |
main_choice: List[str],
|
@@ -187,10 +164,10 @@ with gr.Blocks() as demo:
|
|
187 |
|
188 |
with gr.Row():
|
189 |
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
|
190 |
-
|
191 |
with gr.Row():
|
192 |
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
|
193 |
-
|
194 |
with gr.Row():
|
195 |
with gr.Column(scale=0.8):
|
196 |
main_choice = gr.Dropdown(
|
@@ -199,8 +176,8 @@ with gr.Blocks() as demo:
|
|
199 |
label="Type",
|
200 |
info="Please choose the type to display.",
|
201 |
)
|
202 |
-
|
203 |
-
with gr.Column(scale=10):
|
204 |
model_choice = gr.CheckboxGroup(
|
205 |
choices=CLASSIFICATION["model_size"],
|
206 |
value=CLASSIFICATION["model_size"], # all be choosed
|
@@ -211,29 +188,24 @@ with gr.Blocks() as demo:
|
|
211 |
#👉 this part is for csv table generatived
|
212 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
213 |
# ----------------- modify text -----------------
|
214 |
-
|
215 |
-
with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=
|
216 |
dataframe_all_gen = gr.components.Dataframe(
|
217 |
elem_id="leaderboard-table",
|
218 |
)
|
219 |
-
|
220 |
-
with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=6):
|
221 |
-
dataframe_all_per = gr.components.Dataframe(
|
222 |
-
elem_id="leaderboard-table",
|
223 |
-
)
|
224 |
|
225 |
-
with gr.TabItem("🏅
|
226 |
-
|
227 |
elem_id="leaderboard-table",
|
228 |
)
|
229 |
|
230 |
# ----------------- modify text -----------------
|
231 |
with gr.Row():
|
232 |
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
|
233 |
-
|
234 |
with gr.Row():
|
235 |
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
|
236 |
-
|
237 |
# 👉 this part is for citation
|
238 |
with gr.Row():
|
239 |
with gr.Accordion("📙 Citation", open=True):
|
@@ -244,18 +216,18 @@ with gr.Blocks() as demo:
|
|
244 |
elem_id="citation-button",
|
245 |
show_copy_button=True
|
246 |
)
|
247 |
-
|
248 |
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
|
249 |
-
|
250 |
# --------------------------- all --------------------------------
|
251 |
# this is all result Perplexity
|
252 |
-
|
253 |
main_choice.change(
|
254 |
get_dataset_classfier_per,
|
255 |
inputs=[model_choice, main_choice],
|
256 |
outputs=dataframe_all_per,
|
257 |
)
|
258 |
-
|
259 |
model_choice.change(
|
260 |
get_dataset_classfier_per,
|
261 |
inputs=[model_choice, main_choice],
|
@@ -267,45 +239,26 @@ with gr.Blocks() as demo:
|
|
267 |
inputs=[model_choice, main_choice],
|
268 |
outputs=dataframe_all_per,
|
269 |
)
|
270 |
-
|
271 |
# this is all result generatived
|
272 |
main_choice.change(
|
273 |
get_dataset_classfier_gen,
|
274 |
inputs=[model_choice, main_choice],
|
275 |
outputs=dataframe_all_gen,
|
276 |
)
|
277 |
-
|
278 |
model_choice.change(
|
279 |
get_dataset_classfier_gen,
|
280 |
inputs=[model_choice, main_choice],
|
281 |
outputs=dataframe_all_gen,
|
282 |
)
|
283 |
-
|
284 |
demo.load(
|
285 |
fn=get_dataset_classfier_gen,
|
286 |
inputs=[model_choice, main_choice],
|
287 |
outputs=dataframe_all_gen,
|
288 |
)
|
289 |
-
|
290 |
-
# this is new results for ChineseGuardBench
|
291 |
|
292 |
-
|
293 |
-
# get_ChineseGuardBench,
|
294 |
-
# inputs=[model_choice, main_choice],
|
295 |
-
# outputs=dataframe_all_guardbench,
|
296 |
-
# )
|
297 |
-
|
298 |
-
model_choice.change(
|
299 |
-
get_ChineseGuardBench,
|
300 |
-
inputs=[model_choice, main_choice],
|
301 |
-
outputs=dataframe_all_guardbench,
|
302 |
-
)
|
303 |
-
|
304 |
-
demo.load(
|
305 |
-
fn=get_ChineseGuardBench,
|
306 |
-
inputs=[model_choice, main_choice],
|
307 |
-
outputs=dataframe_all_guardbench,
|
308 |
-
)
|
309 |
-
|
310 |
demo.launch(share=True)
|
311 |
|
|
|
6 |
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
|
7 |
|
8 |
|
9 |
+
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
|
10 |
+
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
|
11 |
|
12 |
+
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
|
13 |
+
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
|
|
|
|
|
14 |
|
15 |
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
|
16 |
|
17 |
+
|
18 |
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
19 |
|
20 |
#SPLITS = ["Overall", "Subclass"]
|
|
|
26 |
"~30B",
|
27 |
"10B~20B",
|
28 |
"5B~10B",
|
|
|
29 |
"API",
|
30 |
]
|
31 |
+
|
32 |
}
|
33 |
|
34 |
|
|
|
36 |
|
37 |
_BIBTEX = """
|
38 |
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
|
39 |
+
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
|
40 |
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
|
41 |
year={2024},
|
42 |
eprint={2410.18491},
|
43 |
archivePrefix={arXiv},
|
44 |
primaryClass={cs.CL},
|
45 |
+
url={https://arxiv.org/abs/2410.18491},
|
46 |
}
|
47 |
"""
|
48 |
|
49 |
+
_LAST_UPDATED = "April 13, 2025"
|
50 |
|
51 |
banner_url = "./assets/logo.png"
|
52 |
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
|
|
|
62 |
|
63 |
def format_csv_numbers_second(text):
|
64 |
return text.split()
|
65 |
+
|
66 |
+
|
67 |
def format_number(x):
|
68 |
return float(f"{x:.3}")
|
69 |
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def get_dataset_csv(
|
72 |
model_size: List[str],
|
73 |
):
|
74 |
df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
|
75 |
df = df.drop(columns="Size")
|
76 |
+
|
77 |
leaderboard_table = gr.components.Dataframe(
|
78 |
value=df,
|
79 |
interactive=False,
|
|
|
101 |
):
|
102 |
df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
|
103 |
df = df.drop(columns="Size")
|
104 |
+
|
105 |
# get subclass
|
106 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
107 |
df = df[subclass_choice_label]
|
108 |
+
|
109 |
leaderboard_table = gr.components.Dataframe(
|
110 |
value=df,
|
111 |
interactive=False,
|
|
|
120 |
):
|
121 |
df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
|
122 |
df = df.drop(columns="Size")
|
123 |
+
|
124 |
# get subclass
|
125 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
126 |
df = df[subclass_choice_label]
|
127 |
+
|
128 |
leaderboard_table = gr.components.Dataframe(
|
129 |
value=df,
|
130 |
interactive=False,
|
|
|
143 |
subclass_choice = main_choice
|
144 |
leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
|
145 |
return leaderboard_table
|
146 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
def get_dataset_classfier_per(
|
148 |
model_size: List[str],
|
149 |
main_choice: List[str],
|
|
|
164 |
|
165 |
with gr.Row():
|
166 |
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
|
167 |
+
|
168 |
with gr.Row():
|
169 |
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
|
170 |
+
|
171 |
with gr.Row():
|
172 |
with gr.Column(scale=0.8):
|
173 |
main_choice = gr.Dropdown(
|
|
|
176 |
label="Type",
|
177 |
info="Please choose the type to display.",
|
178 |
)
|
179 |
+
|
180 |
+
with gr.Column(scale=10):
|
181 |
model_choice = gr.CheckboxGroup(
|
182 |
choices=CLASSIFICATION["model_size"],
|
183 |
value=CLASSIFICATION["model_size"], # all be choosed
|
|
|
188 |
#👉 this part is for csv table generatived
|
189 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
190 |
# ----------------- modify text -----------------
|
191 |
+
|
192 |
+
with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
|
193 |
dataframe_all_gen = gr.components.Dataframe(
|
194 |
elem_id="leaderboard-table",
|
195 |
)
|
|
|
|
|
|
|
|
|
|
|
196 |
|
197 |
+
with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
|
198 |
+
dataframe_all_per = gr.components.Dataframe(
|
199 |
elem_id="leaderboard-table",
|
200 |
)
|
201 |
|
202 |
# ----------------- modify text -----------------
|
203 |
with gr.Row():
|
204 |
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
|
205 |
+
|
206 |
with gr.Row():
|
207 |
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
|
208 |
+
|
209 |
# 👉 this part is for citation
|
210 |
with gr.Row():
|
211 |
with gr.Accordion("📙 Citation", open=True):
|
|
|
216 |
elem_id="citation-button",
|
217 |
show_copy_button=True
|
218 |
)
|
219 |
+
|
220 |
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
|
221 |
+
|
222 |
# --------------------------- all --------------------------------
|
223 |
# this is all result Perplexity
|
224 |
+
|
225 |
main_choice.change(
|
226 |
get_dataset_classfier_per,
|
227 |
inputs=[model_choice, main_choice],
|
228 |
outputs=dataframe_all_per,
|
229 |
)
|
230 |
+
|
231 |
model_choice.change(
|
232 |
get_dataset_classfier_per,
|
233 |
inputs=[model_choice, main_choice],
|
|
|
239 |
inputs=[model_choice, main_choice],
|
240 |
outputs=dataframe_all_per,
|
241 |
)
|
242 |
+
|
243 |
# this is all result generatived
|
244 |
main_choice.change(
|
245 |
get_dataset_classfier_gen,
|
246 |
inputs=[model_choice, main_choice],
|
247 |
outputs=dataframe_all_gen,
|
248 |
)
|
249 |
+
|
250 |
model_choice.change(
|
251 |
get_dataset_classfier_gen,
|
252 |
inputs=[model_choice, main_choice],
|
253 |
outputs=dataframe_all_gen,
|
254 |
)
|
255 |
+
|
256 |
demo.load(
|
257 |
fn=get_dataset_classfier_gen,
|
258 |
inputs=[model_choice, main_choice],
|
259 |
outputs=dataframe_all_gen,
|
260 |
)
|
|
|
|
|
261 |
|
262 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
demo.launch(share=True)
|
264 |
|
assets/text.py
CHANGED
@@ -34,13 +34,13 @@ EVALUTION_TEXT= """
|
|
34 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
35 |
We evaluate the models using two methods: perplexity(multiple choice) and generation.
|
36 |
For perplexity, we select the label which is the lowest perplexity as the predicted results.
|
37 |
-
For generation, we use the content generated by the model to make prediction.
|
38 |
-
|
39 |
-
The following are the results of the evaluation.👇👇👇
|
40 |
</span> <br><br>
|
41 |
|
42 |
|
43 |
""" # noqa
|
|
|
44 |
REFERENCE_TEXT = """
|
45 |
# References
|
46 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
|
|
34 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
35 |
We evaluate the models using two methods: perplexity(multiple choice) and generation.
|
36 |
For perplexity, we select the label which is the lowest perplexity as the predicted results.
|
37 |
+
For generation, we use the content generated by the model to make prediction.
|
38 |
+
The following are the results of the evaluation. 👇👇👇
|
|
|
39 |
</span> <br><br>
|
40 |
|
41 |
|
42 |
""" # noqa
|
43 |
+
|
44 |
REFERENCE_TEXT = """
|
45 |
# References
|
46 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
changelog.md
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
# CHANGELOG
|
2 |
|
3 |
-
|
4 |
### 2024-7-16
|
5 |
version: v1.0.0
|
6 |
|
@@ -67,14 +66,4 @@ version: v1.0.6
|
|
67 |
- Deepseek-chat-v3-0324
|
68 |
- Qwen3
|
69 |
- Gemma-3
|
70 |
-
- OpenThinker2
|
71 |
-
|
72 |
-
### 2025-7-29
|
73 |
-
version: v1.0.7
|
74 |
-
|
75 |
-
changed:
|
76 |
-
- [1]feat: Update the two models required by Deepexi.
|
77 |
-
- Deepexi-Guard-3B
|
78 |
-
- Qwen2.5-3B-Instruct
|
79 |
-
|
80 |
-
- [2]feat: Update a new table ChineseGuardBench required by Deepxi.
|
|
|
1 |
# CHANGELOG
|
2 |
|
|
|
3 |
### 2024-7-16
|
4 |
version: v1.0.0
|
5 |
|
|
|
66 |
- Deepseek-chat-v3-0324
|
67 |
- Qwen3
|
68 |
- Gemma-3
|
69 |
+
- OpenThinker2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/ChineseGuardBench.csv
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
|
2 |
-
Deepexi-Guard-3B,1B~5B,89.63 ,89.72 ,85.53 ,94.15 ,14.24 ,5.85
|
3 |
-
Qwen3-32B,~30B,88.54 ,89.25 ,89.08 ,88.02 ,9.64 ,11.98
|
4 |
-
Qwen3-235B-A22B,>65B,87.92 ,88.96 ,90.86 ,85.17 ,7.66 ,14.83
|
5 |
-
Qwen3-235B-A22B-Instruct-2507,>65B,87.81 ,89.13 ,93.27 ,82.96 ,5.35 ,17.04
|
6 |
-
GLM-Z1-9B-0414,5B~10B,87.36 ,88.03 ,87.11 ,87.61 ,11.59 ,12.39
|
7 |
-
Qwen2.5-72B-Instruct,>65B,86.81 ,88.27 ,92.50 ,81.79 ,5.93 ,18.21
|
8 |
-
QwQ-32B,~30B,86.80 ,88.35 ,93.33 ,81.12 ,5.18 ,18.88
|
9 |
-
Phi-4,10B~20B,85.95 ,86.88 ,86.90 ,85.02 ,11.45 ,14.98
|
10 |
-
Gemma-3-27B-it,~30B,85.29 ,86.78 ,89.83 ,81.19 ,8.22 ,18.81
|
11 |
-
DeepSeek-R1-0528,>65B,85.24 ,87.47 ,96.02 ,76.63 ,2.84 ,23.37
|
12 |
-
Mistral-Small-3.2-24B-Instruct,~30B,85.07 ,87.03 ,93.14 ,78.29 ,5.15 ,21.71
|
13 |
-
GLM-4-9B-chat,5B~10B,84.85 ,86.27 ,88.47 ,81.52 ,9.49 ,18.48
|
14 |
-
MD-Judge-v0_2-internlm2_7B,5B~10B,84.63 ,85.88 ,87.03 ,82.37 ,10.98 ,17.63
|
15 |
-
DeepSeek-R1-Distill-Qwen-32B,~30B,84.55 ,86.64 ,93.05 ,77.47 ,5.17 ,22.53
|
16 |
-
Hunyuan-A13B-Instruct,>65B,84.32 ,86.21 ,90.97 ,78.58 ,6.98 ,21.42
|
17 |
-
Moonlight-16B-A3B-Instruct,10B~20B,84.21 ,84.35 ,80.41 ,88.38 ,19.25 ,11.62
|
18 |
-
GLM-Z1-32B-0414,~30B,83.40 ,85.75 ,92.63 ,75.85 ,5.40 ,24.15
|
19 |
-
Qwen3-8B,5B~10B,83.05 ,85.51 ,92.69 ,75.23 ,5.30 ,24.77
|
20 |
-
Qwen2.5-7B-Instruct,5B~10B,82.96 ,84.99 ,89.41 ,77.37 ,8.20 ,22.63
|
21 |
-
Qwen2.5-1.5B-Instruct,1B~5B,79.48 ,77.08 ,68.83 ,94.03 ,38.07 ,5.97
|
22 |
-
shieldgemma-2B,1B~5B,79.19 ,79.63 ,76.50 ,82.06 ,22.54 ,17.94
|
23 |
-
Qwen2.5-3B-Instruct,1B~5B,79.05 ,77.57 ,70.69 ,89.66 ,33.25 ,10.34
|
24 |
-
SHTEC_safety_fence_model_7B,5B~10B,78.44 ,82.48 ,93.54 ,67.54 ,4.17 ,32.46
|
25 |
-
Qwen3-4B,1B~5B,78.16 ,82.50 ,95.12 ,66.33 ,3.04 ,33.67
|
26 |
-
SmolLM3-3B,1B~5B,76.10 ,79.19 ,83.09 ,70.19 ,12.77 ,29.81
|
27 |
-
ERNIE-4.5-21B-A3B-Paddle,~20B,75.21 ,80.58 ,94.58 ,62.42 ,3.20 ,37.58
|
28 |
-
Qwen3-1.7B,1B~5B,74.46 ,79.34 ,89.36 ,63.82 ,6.79 ,36.18
|
29 |
-
internlm2_5-7B-chat,5B~10B,71.52 ,78.49 ,95.34 ,57.22 ,2.50 ,42.78
|
30 |
-
Llama-Guard-4-12B,10B~20B,65.66 ,74.64 ,90.99 ,51.36 ,4.54 ,48.64
|
31 |
-
Llama-Guard-3-8B,5B~10B,59.33 ,72.44 ,97.80 ,42.58 ,0.86 ,57.42
|
32 |
-
DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27 ,65.53 ,90.36 ,30.20 ,2.88 ,69.80
|
33 |
-
Gemma-3n-E4B-it,5B~10B,44.05 ,64.88 ,88.80 ,29.29 ,3.30 ,70.71
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/chinese_benchmark_gen.csv
CHANGED
@@ -7,8 +7,6 @@ Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,7
|
|
7 |
Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
|
8 |
Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
|
9 |
Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
|
10 |
-
Deepexi-Guard-3B,1B~5B,78.26/0.0,89.35/0.0,64.16/0.0,72.04/0.0,92.35/0.0
|
11 |
-
Qwen2.5-3B-Instruct,1B~5B,71.81/0.0,70.36/0.0,75.36/0.0,73.47/0.0,68.25/0.0
|
12 |
Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
|
13 |
Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
|
14 |
DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
|
|
|
7 |
Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
|
8 |
Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
|
9 |
Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
|
|
|
|
|
10 |
Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
|
11 |
Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
|
12 |
DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
|
data/chinese_benchmark_per.csv
CHANGED
@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
|
|
43 |
Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
|
44 |
Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
|
45 |
Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
|
46 |
-
Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
|
|
|
43 |
Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
|
44 |
Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
|
45 |
Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
|
46 |
+
Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
|
data/subclass_gen.csv
CHANGED
@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
|
|
48 |
QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
|
49 |
Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
|
50 |
Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
|
51 |
-
OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
|
|
|
48 |
QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
|
49 |
Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
|
50 |
Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
|
51 |
+
OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
|
data/subclass_per.csv
CHANGED
@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
|
|
41 |
QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
|
42 |
Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
|
43 |
OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
|
44 |
-
Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
|
|
|
41 |
QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
|
42 |
Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
|
43 |
OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
|
44 |
+
Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
|