hongfu_test_20250701
#18
by
hxiang
- opened
- .idea/workspace.xml +0 -58
- app.py +35 -82
- assets/text.py +3 -3
- changelog.md +1 -12
- data/ChineseGuardBench.csv +0 -33
- data/chinese_benchmark_gen.csv +0 -2
- data/chinese_benchmark_per.csv +1 -1
- data/subclass_gen.csv +1 -1
- data/subclass_per.csv +1 -1
.idea/workspace.xml
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
-
<project version="4">
|
| 3 |
-
<component name="ChangeListManager">
|
| 4 |
-
<list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
|
| 5 |
-
<option name="SHOW_DIALOG" value="false" />
|
| 6 |
-
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
| 7 |
-
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
| 8 |
-
<option name="LAST_RESOLUTION" value="IGNORE" />
|
| 9 |
-
</component>
|
| 10 |
-
<component name="Git.Settings">
|
| 11 |
-
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
| 12 |
-
</component>
|
| 13 |
-
<component name="MarkdownSettingsMigration">
|
| 14 |
-
<option name="stateVersion" value="1" />
|
| 15 |
-
</component>
|
| 16 |
-
<component name="ProjectColorInfo">{
|
| 17 |
-
"customColor": "",
|
| 18 |
-
"associatedIndex": 2
|
| 19 |
-
}</component>
|
| 20 |
-
<component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
|
| 21 |
-
<component name="ProjectViewState">
|
| 22 |
-
<option name="hideEmptyMiddlePackages" value="true" />
|
| 23 |
-
<option name="showLibraryContents" value="true" />
|
| 24 |
-
</component>
|
| 25 |
-
<component name="PropertiesComponent"><![CDATA[{
|
| 26 |
-
"keyToString": {
|
| 27 |
-
"RunOnceActivity.OpenProjectViewOnStart": "true",
|
| 28 |
-
"RunOnceActivity.ShowReadmeOnStart": "true",
|
| 29 |
-
"git-widget-placeholder": "pr/18",
|
| 30 |
-
"last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
|
| 31 |
-
"nodejs_package_manager_path": "npm",
|
| 32 |
-
"vue.rearranger.settings.migration": "true"
|
| 33 |
-
}
|
| 34 |
-
}]]></component>
|
| 35 |
-
<component name="SharedIndexes">
|
| 36 |
-
<attachedChunks>
|
| 37 |
-
<set>
|
| 38 |
-
<option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
|
| 39 |
-
</set>
|
| 40 |
-
</attachedChunks>
|
| 41 |
-
</component>
|
| 42 |
-
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
| 43 |
-
<component name="TaskManager">
|
| 44 |
-
<task active="true" id="Default" summary="Default task">
|
| 45 |
-
<changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
|
| 46 |
-
<created>1751365967779</created>
|
| 47 |
-
<option name="number" value="Default" />
|
| 48 |
-
<option name="presentableId" value="Default" />
|
| 49 |
-
<updated>1751365967779</updated>
|
| 50 |
-
<workItem from="1751365968934" duration="39000" />
|
| 51 |
-
<workItem from="1751366116696" duration="54000" />
|
| 52 |
-
</task>
|
| 53 |
-
<servers />
|
| 54 |
-
</component>
|
| 55 |
-
<component name="TypeScriptGeneratedFilesManager">
|
| 56 |
-
<option name="version" value="3" />
|
| 57 |
-
</component>
|
| 58 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -6,16 +6,15 @@ import pandas as pd
|
|
| 6 |
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
|
| 7 |
|
| 8 |
|
| 9 |
-
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv",
|
| 10 |
-
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv",
|
| 11 |
|
| 12 |
-
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv",
|
| 13 |
-
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv",
|
| 14 |
-
|
| 15 |
-
ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
|
| 16 |
|
| 17 |
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
|
| 18 |
|
|
|
|
| 19 |
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
| 20 |
|
| 21 |
#SPLITS = ["Overall", "Subclass"]
|
|
@@ -27,10 +26,9 @@ CLASSIFICATION = {
|
|
| 27 |
"~30B",
|
| 28 |
"10B~20B",
|
| 29 |
"5B~10B",
|
| 30 |
-
"1B~5B",
|
| 31 |
"API",
|
| 32 |
]
|
| 33 |
-
|
| 34 |
}
|
| 35 |
|
| 36 |
|
|
@@ -38,17 +36,17 @@ CLASSIFICATION = {
|
|
| 38 |
|
| 39 |
_BIBTEX = """
|
| 40 |
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
|
| 41 |
-
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
|
| 42 |
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
|
| 43 |
year={2024},
|
| 44 |
eprint={2410.18491},
|
| 45 |
archivePrefix={arXiv},
|
| 46 |
primaryClass={cs.CL},
|
| 47 |
-
url={https://arxiv.org/abs/2410.18491},
|
| 48 |
}
|
| 49 |
"""
|
| 50 |
|
| 51 |
-
_LAST_UPDATED = "
|
| 52 |
|
| 53 |
banner_url = "./assets/logo.png"
|
| 54 |
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
|
|
@@ -64,31 +62,18 @@ def format_csv_numbers(text):
|
|
| 64 |
|
| 65 |
def format_csv_numbers_second(text):
|
| 66 |
return text.split()
|
| 67 |
-
|
| 68 |
-
|
| 69 |
def format_number(x):
|
| 70 |
return float(f"{x:.3}")
|
| 71 |
|
| 72 |
|
| 73 |
-
def get_dataset_new_csv(
|
| 74 |
-
model_size: List[str],
|
| 75 |
-
):
|
| 76 |
-
df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
|
| 77 |
-
df = df.drop(columns="Size")
|
| 78 |
-
|
| 79 |
-
leaderboard_table = gr.components.Dataframe(
|
| 80 |
-
value=df,
|
| 81 |
-
interactive=False,
|
| 82 |
-
visible=True,
|
| 83 |
-
)
|
| 84 |
-
return leaderboard_table
|
| 85 |
-
|
| 86 |
def get_dataset_csv(
|
| 87 |
model_size: List[str],
|
| 88 |
):
|
| 89 |
df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
|
| 90 |
df = df.drop(columns="Size")
|
| 91 |
-
|
| 92 |
leaderboard_table = gr.components.Dataframe(
|
| 93 |
value=df,
|
| 94 |
interactive=False,
|
|
@@ -116,11 +101,11 @@ def get_dataset_csv_sub_gen(
|
|
| 116 |
):
|
| 117 |
df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
|
| 118 |
df = df.drop(columns="Size")
|
| 119 |
-
|
| 120 |
# get subclass
|
| 121 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
| 122 |
df = df[subclass_choice_label]
|
| 123 |
-
|
| 124 |
leaderboard_table = gr.components.Dataframe(
|
| 125 |
value=df,
|
| 126 |
interactive=False,
|
|
@@ -135,11 +120,11 @@ def get_dataset_csv_sub_per(
|
|
| 135 |
):
|
| 136 |
df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
|
| 137 |
df = df.drop(columns="Size")
|
| 138 |
-
|
| 139 |
# get subclass
|
| 140 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
| 141 |
df = df[subclass_choice_label]
|
| 142 |
-
|
| 143 |
leaderboard_table = gr.components.Dataframe(
|
| 144 |
value=df,
|
| 145 |
interactive=False,
|
|
@@ -158,15 +143,7 @@ def get_dataset_classfier_gen(
|
|
| 158 |
subclass_choice = main_choice
|
| 159 |
leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
|
| 160 |
return leaderboard_table
|
| 161 |
-
|
| 162 |
-
def get_ChineseGuardBench(
|
| 163 |
-
model_size: List[str],
|
| 164 |
-
main_choice: List[str],
|
| 165 |
-
):
|
| 166 |
-
leaderboard_table = get_dataset_new_csv(model_size)
|
| 167 |
-
return leaderboard_table
|
| 168 |
-
|
| 169 |
-
|
| 170 |
def get_dataset_classfier_per(
|
| 171 |
model_size: List[str],
|
| 172 |
main_choice: List[str],
|
|
@@ -187,10 +164,10 @@ with gr.Blocks() as demo:
|
|
| 187 |
|
| 188 |
with gr.Row():
|
| 189 |
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
|
| 190 |
-
|
| 191 |
with gr.Row():
|
| 192 |
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
|
| 193 |
-
|
| 194 |
with gr.Row():
|
| 195 |
with gr.Column(scale=0.8):
|
| 196 |
main_choice = gr.Dropdown(
|
|
@@ -199,8 +176,8 @@ with gr.Blocks() as demo:
|
|
| 199 |
label="Type",
|
| 200 |
info="Please choose the type to display.",
|
| 201 |
)
|
| 202 |
-
|
| 203 |
-
with gr.Column(scale=10):
|
| 204 |
model_choice = gr.CheckboxGroup(
|
| 205 |
choices=CLASSIFICATION["model_size"],
|
| 206 |
value=CLASSIFICATION["model_size"], # all be choosed
|
|
@@ -211,29 +188,24 @@ with gr.Blocks() as demo:
|
|
| 211 |
#👉 this part is for csv table generatived
|
| 212 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 213 |
# ----------------- modify text -----------------
|
| 214 |
-
|
| 215 |
-
with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=
|
| 216 |
dataframe_all_gen = gr.components.Dataframe(
|
| 217 |
elem_id="leaderboard-table",
|
| 218 |
)
|
| 219 |
-
|
| 220 |
-
with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=6):
|
| 221 |
-
dataframe_all_per = gr.components.Dataframe(
|
| 222 |
-
elem_id="leaderboard-table",
|
| 223 |
-
)
|
| 224 |
|
| 225 |
-
with gr.TabItem("🏅
|
| 226 |
-
|
| 227 |
elem_id="leaderboard-table",
|
| 228 |
)
|
| 229 |
|
| 230 |
# ----------------- modify text -----------------
|
| 231 |
with gr.Row():
|
| 232 |
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
|
| 233 |
-
|
| 234 |
with gr.Row():
|
| 235 |
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
|
| 236 |
-
|
| 237 |
# 👉 this part is for citation
|
| 238 |
with gr.Row():
|
| 239 |
with gr.Accordion("📙 Citation", open=True):
|
|
@@ -244,18 +216,18 @@ with gr.Blocks() as demo:
|
|
| 244 |
elem_id="citation-button",
|
| 245 |
show_copy_button=True
|
| 246 |
)
|
| 247 |
-
|
| 248 |
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
|
| 249 |
-
|
| 250 |
# --------------------------- all --------------------------------
|
| 251 |
# this is all result Perplexity
|
| 252 |
-
|
| 253 |
main_choice.change(
|
| 254 |
get_dataset_classfier_per,
|
| 255 |
inputs=[model_choice, main_choice],
|
| 256 |
outputs=dataframe_all_per,
|
| 257 |
)
|
| 258 |
-
|
| 259 |
model_choice.change(
|
| 260 |
get_dataset_classfier_per,
|
| 261 |
inputs=[model_choice, main_choice],
|
|
@@ -267,45 +239,26 @@ with gr.Blocks() as demo:
|
|
| 267 |
inputs=[model_choice, main_choice],
|
| 268 |
outputs=dataframe_all_per,
|
| 269 |
)
|
| 270 |
-
|
| 271 |
# this is all result generatived
|
| 272 |
main_choice.change(
|
| 273 |
get_dataset_classfier_gen,
|
| 274 |
inputs=[model_choice, main_choice],
|
| 275 |
outputs=dataframe_all_gen,
|
| 276 |
)
|
| 277 |
-
|
| 278 |
model_choice.change(
|
| 279 |
get_dataset_classfier_gen,
|
| 280 |
inputs=[model_choice, main_choice],
|
| 281 |
outputs=dataframe_all_gen,
|
| 282 |
)
|
| 283 |
-
|
| 284 |
demo.load(
|
| 285 |
fn=get_dataset_classfier_gen,
|
| 286 |
inputs=[model_choice, main_choice],
|
| 287 |
outputs=dataframe_all_gen,
|
| 288 |
)
|
| 289 |
-
|
| 290 |
-
# this is new results for ChineseGuardBench
|
| 291 |
|
| 292 |
-
|
| 293 |
-
# get_ChineseGuardBench,
|
| 294 |
-
# inputs=[model_choice, main_choice],
|
| 295 |
-
# outputs=dataframe_all_guardbench,
|
| 296 |
-
# )
|
| 297 |
-
|
| 298 |
-
model_choice.change(
|
| 299 |
-
get_ChineseGuardBench,
|
| 300 |
-
inputs=[model_choice, main_choice],
|
| 301 |
-
outputs=dataframe_all_guardbench,
|
| 302 |
-
)
|
| 303 |
-
|
| 304 |
-
demo.load(
|
| 305 |
-
fn=get_ChineseGuardBench,
|
| 306 |
-
inputs=[model_choice, main_choice],
|
| 307 |
-
outputs=dataframe_all_guardbench,
|
| 308 |
-
)
|
| 309 |
-
|
| 310 |
demo.launch(share=True)
|
| 311 |
|
|
|
|
| 6 |
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
|
| 7 |
|
| 8 |
|
| 9 |
+
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
|
| 10 |
+
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
|
| 11 |
|
| 12 |
+
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
|
| 13 |
+
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
|
|
|
|
|
|
|
| 14 |
|
| 15 |
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
|
| 16 |
|
| 17 |
+
|
| 18 |
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
|
| 19 |
|
| 20 |
#SPLITS = ["Overall", "Subclass"]
|
|
|
|
| 26 |
"~30B",
|
| 27 |
"10B~20B",
|
| 28 |
"5B~10B",
|
|
|
|
| 29 |
"API",
|
| 30 |
]
|
| 31 |
+
|
| 32 |
}
|
| 33 |
|
| 34 |
|
|
|
|
| 36 |
|
| 37 |
_BIBTEX = """
|
| 38 |
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
|
| 39 |
+
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
|
| 40 |
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
|
| 41 |
year={2024},
|
| 42 |
eprint={2410.18491},
|
| 43 |
archivePrefix={arXiv},
|
| 44 |
primaryClass={cs.CL},
|
| 45 |
+
url={https://arxiv.org/abs/2410.18491},
|
| 46 |
}
|
| 47 |
"""
|
| 48 |
|
| 49 |
+
_LAST_UPDATED = "April 13, 2025"
|
| 50 |
|
| 51 |
banner_url = "./assets/logo.png"
|
| 52 |
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
|
|
|
|
| 62 |
|
| 63 |
def format_csv_numbers_second(text):
|
| 64 |
return text.split()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
def format_number(x):
|
| 68 |
return float(f"{x:.3}")
|
| 69 |
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def get_dataset_csv(
|
| 72 |
model_size: List[str],
|
| 73 |
):
|
| 74 |
df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
|
| 75 |
df = df.drop(columns="Size")
|
| 76 |
+
|
| 77 |
leaderboard_table = gr.components.Dataframe(
|
| 78 |
value=df,
|
| 79 |
interactive=False,
|
|
|
|
| 101 |
):
|
| 102 |
df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
|
| 103 |
df = df.drop(columns="Size")
|
| 104 |
+
|
| 105 |
# get subclass
|
| 106 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
| 107 |
df = df[subclass_choice_label]
|
| 108 |
+
|
| 109 |
leaderboard_table = gr.components.Dataframe(
|
| 110 |
value=df,
|
| 111 |
interactive=False,
|
|
|
|
| 120 |
):
|
| 121 |
df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
|
| 122 |
df = df.drop(columns="Size")
|
| 123 |
+
|
| 124 |
# get subclass
|
| 125 |
subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
|
| 126 |
df = df[subclass_choice_label]
|
| 127 |
+
|
| 128 |
leaderboard_table = gr.components.Dataframe(
|
| 129 |
value=df,
|
| 130 |
interactive=False,
|
|
|
|
| 143 |
subclass_choice = main_choice
|
| 144 |
leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
|
| 145 |
return leaderboard_table
|
| 146 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def get_dataset_classfier_per(
|
| 148 |
model_size: List[str],
|
| 149 |
main_choice: List[str],
|
|
|
|
| 164 |
|
| 165 |
with gr.Row():
|
| 166 |
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
|
| 167 |
+
|
| 168 |
with gr.Row():
|
| 169 |
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
|
| 170 |
+
|
| 171 |
with gr.Row():
|
| 172 |
with gr.Column(scale=0.8):
|
| 173 |
main_choice = gr.Dropdown(
|
|
|
|
| 176 |
label="Type",
|
| 177 |
info="Please choose the type to display.",
|
| 178 |
)
|
| 179 |
+
|
| 180 |
+
with gr.Column(scale=10):
|
| 181 |
model_choice = gr.CheckboxGroup(
|
| 182 |
choices=CLASSIFICATION["model_size"],
|
| 183 |
value=CLASSIFICATION["model_size"], # all be choosed
|
|
|
|
| 188 |
#👉 this part is for csv table generatived
|
| 189 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 190 |
# ----------------- modify text -----------------
|
| 191 |
+
|
| 192 |
+
with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
|
| 193 |
dataframe_all_gen = gr.components.Dataframe(
|
| 194 |
elem_id="leaderboard-table",
|
| 195 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
|
| 198 |
+
dataframe_all_per = gr.components.Dataframe(
|
| 199 |
elem_id="leaderboard-table",
|
| 200 |
)
|
| 201 |
|
| 202 |
# ----------------- modify text -----------------
|
| 203 |
with gr.Row():
|
| 204 |
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
|
| 205 |
+
|
| 206 |
with gr.Row():
|
| 207 |
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
|
| 208 |
+
|
| 209 |
# 👉 this part is for citation
|
| 210 |
with gr.Row():
|
| 211 |
with gr.Accordion("📙 Citation", open=True):
|
|
|
|
| 216 |
elem_id="citation-button",
|
| 217 |
show_copy_button=True
|
| 218 |
)
|
| 219 |
+
|
| 220 |
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
|
| 221 |
+
|
| 222 |
# --------------------------- all --------------------------------
|
| 223 |
# this is all result Perplexity
|
| 224 |
+
|
| 225 |
main_choice.change(
|
| 226 |
get_dataset_classfier_per,
|
| 227 |
inputs=[model_choice, main_choice],
|
| 228 |
outputs=dataframe_all_per,
|
| 229 |
)
|
| 230 |
+
|
| 231 |
model_choice.change(
|
| 232 |
get_dataset_classfier_per,
|
| 233 |
inputs=[model_choice, main_choice],
|
|
|
|
| 239 |
inputs=[model_choice, main_choice],
|
| 240 |
outputs=dataframe_all_per,
|
| 241 |
)
|
| 242 |
+
|
| 243 |
# this is all result generatived
|
| 244 |
main_choice.change(
|
| 245 |
get_dataset_classfier_gen,
|
| 246 |
inputs=[model_choice, main_choice],
|
| 247 |
outputs=dataframe_all_gen,
|
| 248 |
)
|
| 249 |
+
|
| 250 |
model_choice.change(
|
| 251 |
get_dataset_classfier_gen,
|
| 252 |
inputs=[model_choice, main_choice],
|
| 253 |
outputs=dataframe_all_gen,
|
| 254 |
)
|
| 255 |
+
|
| 256 |
demo.load(
|
| 257 |
fn=get_dataset_classfier_gen,
|
| 258 |
inputs=[model_choice, main_choice],
|
| 259 |
outputs=dataframe_all_gen,
|
| 260 |
)
|
|
|
|
|
|
|
| 261 |
|
| 262 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
demo.launch(share=True)
|
| 264 |
|
assets/text.py
CHANGED
|
@@ -34,13 +34,13 @@ EVALUTION_TEXT= """
|
|
| 34 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
| 35 |
We evaluate the models using two methods: perplexity(multiple choice) and generation.
|
| 36 |
For perplexity, we select the label which is the lowest perplexity as the predicted results.
|
| 37 |
-
For generation, we use the content generated by the model to make prediction.
|
| 38 |
-
|
| 39 |
-
The following are the results of the evaluation.👇👇👇
|
| 40 |
</span> <br><br>
|
| 41 |
|
| 42 |
|
| 43 |
""" # noqa
|
|
|
|
| 44 |
REFERENCE_TEXT = """
|
| 45 |
# References
|
| 46 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
|
|
|
| 34 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
| 35 |
We evaluate the models using two methods: perplexity(multiple choice) and generation.
|
| 36 |
For perplexity, we select the label which is the lowest perplexity as the predicted results.
|
| 37 |
+
For generation, we use the content generated by the model to make prediction.
|
| 38 |
+
The following are the results of the evaluation. 👇👇👇
|
|
|
|
| 39 |
</span> <br><br>
|
| 40 |
|
| 41 |
|
| 42 |
""" # noqa
|
| 43 |
+
|
| 44 |
REFERENCE_TEXT = """
|
| 45 |
# References
|
| 46 |
<span style="font-size:16px; font-family: 'Times New Roman', serif">
|
changelog.md
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
# CHANGELOG
|
| 2 |
|
| 3 |
-
|
| 4 |
### 2024-7-16
|
| 5 |
version: v1.0.0
|
| 6 |
|
|
@@ -67,14 +66,4 @@ version: v1.0.6
|
|
| 67 |
- Deepseek-chat-v3-0324
|
| 68 |
- Qwen3
|
| 69 |
- Gemma-3
|
| 70 |
-
- OpenThinker2
|
| 71 |
-
|
| 72 |
-
### 2025-7-29
|
| 73 |
-
version: v1.0.7
|
| 74 |
-
|
| 75 |
-
changed:
|
| 76 |
-
- [1]feat: Update the two models required by Deepexi.
|
| 77 |
-
- Deepexi-Guard-3B
|
| 78 |
-
- Qwen2.5-3B-Instruct
|
| 79 |
-
|
| 80 |
-
- [2]feat: Update a new table ChineseGuardBench required by Deepxi.
|
|
|
|
| 1 |
# CHANGELOG
|
| 2 |
|
|
|
|
| 3 |
### 2024-7-16
|
| 4 |
version: v1.0.0
|
| 5 |
|
|
|
|
| 66 |
- Deepseek-chat-v3-0324
|
| 67 |
- Qwen3
|
| 68 |
- Gemma-3
|
| 69 |
+
- OpenThinker2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/ChineseGuardBench.csv
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
|
| 2 |
-
Deepexi-Guard-3B,1B~5B,89.63 ,89.72 ,85.53 ,94.15 ,14.24 ,5.85
|
| 3 |
-
Qwen3-32B,~30B,88.54 ,89.25 ,89.08 ,88.02 ,9.64 ,11.98
|
| 4 |
-
Qwen3-235B-A22B,>65B,87.92 ,88.96 ,90.86 ,85.17 ,7.66 ,14.83
|
| 5 |
-
Qwen3-235B-A22B-Instruct-2507,>65B,87.81 ,89.13 ,93.27 ,82.96 ,5.35 ,17.04
|
| 6 |
-
GLM-Z1-9B-0414,5B~10B,87.36 ,88.03 ,87.11 ,87.61 ,11.59 ,12.39
|
| 7 |
-
Qwen2.5-72B-Instruct,>65B,86.81 ,88.27 ,92.50 ,81.79 ,5.93 ,18.21
|
| 8 |
-
QwQ-32B,~30B,86.80 ,88.35 ,93.33 ,81.12 ,5.18 ,18.88
|
| 9 |
-
Phi-4,10B~20B,85.95 ,86.88 ,86.90 ,85.02 ,11.45 ,14.98
|
| 10 |
-
Gemma-3-27B-it,~30B,85.29 ,86.78 ,89.83 ,81.19 ,8.22 ,18.81
|
| 11 |
-
DeepSeek-R1-0528,>65B,85.24 ,87.47 ,96.02 ,76.63 ,2.84 ,23.37
|
| 12 |
-
Mistral-Small-3.2-24B-Instruct,~30B,85.07 ,87.03 ,93.14 ,78.29 ,5.15 ,21.71
|
| 13 |
-
GLM-4-9B-chat,5B~10B,84.85 ,86.27 ,88.47 ,81.52 ,9.49 ,18.48
|
| 14 |
-
MD-Judge-v0_2-internlm2_7B,5B~10B,84.63 ,85.88 ,87.03 ,82.37 ,10.98 ,17.63
|
| 15 |
-
DeepSeek-R1-Distill-Qwen-32B,~30B,84.55 ,86.64 ,93.05 ,77.47 ,5.17 ,22.53
|
| 16 |
-
Hunyuan-A13B-Instruct,>65B,84.32 ,86.21 ,90.97 ,78.58 ,6.98 ,21.42
|
| 17 |
-
Moonlight-16B-A3B-Instruct,10B~20B,84.21 ,84.35 ,80.41 ,88.38 ,19.25 ,11.62
|
| 18 |
-
GLM-Z1-32B-0414,~30B,83.40 ,85.75 ,92.63 ,75.85 ,5.40 ,24.15
|
| 19 |
-
Qwen3-8B,5B~10B,83.05 ,85.51 ,92.69 ,75.23 ,5.30 ,24.77
|
| 20 |
-
Qwen2.5-7B-Instruct,5B~10B,82.96 ,84.99 ,89.41 ,77.37 ,8.20 ,22.63
|
| 21 |
-
Qwen2.5-1.5B-Instruct,1B~5B,79.48 ,77.08 ,68.83 ,94.03 ,38.07 ,5.97
|
| 22 |
-
shieldgemma-2B,1B~5B,79.19 ,79.63 ,76.50 ,82.06 ,22.54 ,17.94
|
| 23 |
-
Qwen2.5-3B-Instruct,1B~5B,79.05 ,77.57 ,70.69 ,89.66 ,33.25 ,10.34
|
| 24 |
-
SHTEC_safety_fence_model_7B,5B~10B,78.44 ,82.48 ,93.54 ,67.54 ,4.17 ,32.46
|
| 25 |
-
Qwen3-4B,1B~5B,78.16 ,82.50 ,95.12 ,66.33 ,3.04 ,33.67
|
| 26 |
-
SmolLM3-3B,1B~5B,76.10 ,79.19 ,83.09 ,70.19 ,12.77 ,29.81
|
| 27 |
-
ERNIE-4.5-21B-A3B-Paddle,~20B,75.21 ,80.58 ,94.58 ,62.42 ,3.20 ,37.58
|
| 28 |
-
Qwen3-1.7B,1B~5B,74.46 ,79.34 ,89.36 ,63.82 ,6.79 ,36.18
|
| 29 |
-
internlm2_5-7B-chat,5B~10B,71.52 ,78.49 ,95.34 ,57.22 ,2.50 ,42.78
|
| 30 |
-
Llama-Guard-4-12B,10B~20B,65.66 ,74.64 ,90.99 ,51.36 ,4.54 ,48.64
|
| 31 |
-
Llama-Guard-3-8B,5B~10B,59.33 ,72.44 ,97.80 ,42.58 ,0.86 ,57.42
|
| 32 |
-
DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27 ,65.53 ,90.36 ,30.20 ,2.88 ,69.80
|
| 33 |
-
Gemma-3n-E4B-it,5B~10B,44.05 ,64.88 ,88.80 ,29.29 ,3.30 ,70.71
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/chinese_benchmark_gen.csv
CHANGED
|
@@ -7,8 +7,6 @@ Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,7
|
|
| 7 |
Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
|
| 8 |
Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
|
| 9 |
Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
|
| 10 |
-
Deepexi-Guard-3B,1B~5B,78.26/0.0,89.35/0.0,64.16/0.0,72.04/0.0,92.35/0.0
|
| 11 |
-
Qwen2.5-3B-Instruct,1B~5B,71.81/0.0,70.36/0.0,75.36/0.0,73.47/0.0,68.25/0.0
|
| 12 |
Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
|
| 13 |
Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
|
| 14 |
DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
|
|
|
|
| 7 |
Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
|
| 8 |
Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
|
| 9 |
Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
|
|
|
|
|
|
|
| 10 |
Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
|
| 11 |
Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
|
| 12 |
DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
|
data/chinese_benchmark_per.csv
CHANGED
|
@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
|
|
| 43 |
Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
|
| 44 |
Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
|
| 45 |
Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
|
| 46 |
-
Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
|
|
|
|
| 43 |
Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
|
| 44 |
Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
|
| 45 |
Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
|
| 46 |
+
Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
|
data/subclass_gen.csv
CHANGED
|
@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
|
|
| 48 |
QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
|
| 49 |
Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
|
| 50 |
Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
|
| 51 |
-
OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
|
|
|
|
| 48 |
QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
|
| 49 |
Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
|
| 50 |
Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
|
| 51 |
+
OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
|
data/subclass_per.csv
CHANGED
|
@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
|
|
| 41 |
QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
|
| 42 |
Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
|
| 43 |
OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
|
| 44 |
-
Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
|
|
|
|
| 41 |
QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
|
| 42 |
Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
|
| 43 |
OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
|
| 44 |
+
Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
|