Spaces:

SUSTech
/

ChineseSafe-Benchmark

Running

App Files Files Community

hongfu_test_20250701

#18

by hxiang - opened Jul 1, 2025

base: refs/heads/main

←

from: refs/pr/18

Discussion Files changed

+42

-193

Files changed (9) hide show

.idea/workspace.xml +0 -58
app.py +35 -82
assets/text.py +3 -3
changelog.md +1 -12
data/ChineseGuardBench.csv +0 -33
data/chinese_benchmark_gen.csv +0 -2
data/chinese_benchmark_per.csv +1 -1
data/subclass_gen.csv +1 -1
data/subclass_per.csv +1 -1

.idea/workspace.xml DELETED Viewed

@@ -1,58 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ChangeListManager">
-    <list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
-    <option name="SHOW_DIALOG" value="false" />
-    <option name="HIGHLIGHT_CONFLICTS" value="true" />
-    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
-    <option name="LAST_RESOLUTION" value="IGNORE" />
-  </component>
-  <component name="Git.Settings">
-    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
-  </component>
-  <component name="MarkdownSettingsMigration">
-    <option name="stateVersion" value="1" />
-  </component>
-  <component name="ProjectColorInfo">{
-  &quot;customColor&quot;: &quot;&quot;,
-  &quot;associatedIndex&quot;: 2
-}</component>
-  <component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
-  <component name="ProjectViewState">
-    <option name="hideEmptyMiddlePackages" value="true" />
-    <option name="showLibraryContents" value="true" />
-  </component>
-  <component name="PropertiesComponent"><![CDATA[{
-  "keyToString": {
-    "RunOnceActivity.OpenProjectViewOnStart": "true",
-    "RunOnceActivity.ShowReadmeOnStart": "true",
-    "git-widget-placeholder": "pr/18",
-    "last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
-    "nodejs_package_manager_path": "npm",
-    "vue.rearranger.settings.migration": "true"
-  }
-}]]></component>
-  <component name="SharedIndexes">
-    <attachedChunks>
-      <set>
-        <option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
-      </set>
-    </attachedChunks>
-  </component>
-  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
-  <component name="TaskManager">
-    <task active="true" id="Default" summary="Default task">
-      <changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
-      <created>1751365967779</created>
-      <option name="number" value="Default" />
-      <option name="presentableId" value="Default" />
-      <updated>1751365967779</updated>
-      <workItem from="1751365968934" duration="39000" />
-      <workItem from="1751366116696" duration="54000" />
-    </task>
-    <servers />
-  </component>
-  <component name="TypeScriptGeneratedFilesManager">
-    <option name="version" value="3" />
-  </component>
-</project>

app.py CHANGED Viewed

@@ -6,16 +6,15 @@ import pandas as pd
 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
-ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
-ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
-ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
-ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
-ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 #SPLITS = ["Overall", "Subclass"]
@@ -27,10 +26,9 @@ CLASSIFICATION = {
         "~30B",
         "10B~20B",
         "5B~10B",
-        "1B~5B",
         "API",
     ]
 }
@@ -38,17 +36,17 @@ CLASSIFICATION = {
 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
-      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2410.18491},
 }
 """
-_LAST_UPDATED = "July 28, 2025"
 banner_url = "./assets/logo.png"
 _BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'  # noqa
@@ -64,31 +62,18 @@ def format_csv_numbers(text):
 # Split `text` with str.split() and no explicit separator, returning the list of pieces.
 # NOTE(review): assumes `text` is a str — confirm against the callers.
 def format_csv_numbers_second(text):
     return text.split()
 # Format `x` with the `.3` format spec and parse the resulting text back into a float.
 # For a float input this keeps three significant digits.
 def format_number(x):
     return float(f"{x:.3}")
-def get_dataset_new_csv(
-    model_size: List[str],
-):
-    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
-    df = df.drop(columns="Size")
-    leaderboard_table = gr.components.Dataframe(
-        value=df,
-        interactive=False,
-        visible=True,
-    )
-    return leaderboard_table
 def get_dataset_csv(
     model_size: List[str],
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -116,11 +101,11 @@ def get_dataset_csv_sub_gen(
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -135,11 +120,11 @@ def get_dataset_csv_sub_per(
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -158,15 +143,7 @@ def get_dataset_classfier_gen(
         subclass_choice = main_choice
         leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
-def get_ChineseGuardBench(
-    model_size: List[str],
-    main_choice: List[str],
-):
-    leaderboard_table = get_dataset_new_csv(model_size)
-    return leaderboard_table
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -187,10 +164,10 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column(scale=0.8):
             main_choice = gr.Dropdown(
@@ -199,8 +176,8 @@ with gr.Blocks() as demo:
                 label="Type",
                 info="Please choose the type to display.",
             )
-        with gr.Column(scale=10):
             model_choice = gr.CheckboxGroup(
                 choices=CLASSIFICATION["model_size"],
                 value=CLASSIFICATION["model_size"],  # all be choosed
@@ -211,29 +188,24 @@ with gr.Blocks() as demo:
     # 👉 this part is for the tables generated from the CSV files
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
-        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
-        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=6):
-            dataframe_all_per = gr.components.Dataframe(
-                elem_id="leaderboard-table",
-            )
-        with gr.TabItem("🏅 NEW", elem_id="od-benchmark-tab-table", id=7):
-            dataframe_all_guardbench = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
     # ----------------- modify text -----------------
     with gr.Row():
         gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
     # 👉 this part is for citation
     with gr.Row():
         with gr.Accordion("📙 Citation", open=True):
@@ -244,18 +216,18 @@ with gr.Blocks() as demo:
                 elem_id="citation-button",
                 show_copy_button=True
             )
     gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
     # --------------------------- all --------------------------------
     # event wiring for the Perplexity results table
     main_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     model_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
@@ -267,45 +239,26 @@ with gr.Blocks() as demo:
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     # event wiring for the Generation results table
     main_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     model_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     demo.load(
         fn=get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
-    # this is new results for ChineseGuardBench
-    # main_choice.change(
-    #     get_ChineseGuardBench,
-    #     inputs=[model_choice, main_choice],
-    #     outputs=dataframe_all_guardbench,
-    # )
-    model_choice.change(
-        get_ChineseGuardBench,
-        inputs=[model_choice, main_choice],
-        outputs=dataframe_all_guardbench,
-    )
-    demo.load(
-        fn=get_ChineseGuardBench,
-        inputs=[model_choice, main_choice],
-        outputs=dataframe_all_guardbench,
-    )
 demo.launch(share=True)

 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
+ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
+ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
+ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
+ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 #SPLITS = ["Overall", "Subclass"]
         "~30B",
         "10B~20B",
         "5B~10B",
         "API",
     ]
 }
 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
+      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.18491},
 }
 """
+_LAST_UPDATED = "April 13, 2025"
 banner_url = "./assets/logo.png"
 _BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'  # noqa
 # Split `text` with str.split() and no explicit separator, returning the list of pieces.
 # NOTE(review): assumes `text` is a str — confirm against the callers.
 def format_csv_numbers_second(text):
     return text.split()
 # Format `x` with the `.3` format spec and parse the resulting text back into a float.
 # For a float input this keeps three significant digits.
 def format_number(x):
     return float(f"{x:.3}")
 def get_dataset_csv(
     model_size: List[str],
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
         subclass_choice = main_choice
         leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
     with gr.Row():
         gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column(scale=0.8):
             main_choice = gr.Dropdown(
                 label="Type",
                 info="Please choose the type to display.",
             )
+        with gr.Column(scale=10):
             model_choice = gr.CheckboxGroup(
                 choices=CLASSIFICATION["model_size"],
                 value=CLASSIFICATION["model_size"],  # all be choosed
     # 👉 this part is for the tables generated from the CSV files
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
+        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
+        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
+            dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
     # ----------------- modify text -----------------
     with gr.Row():
         gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
     # 👉 this part is for citation
     with gr.Row():
         with gr.Accordion("📙 Citation", open=True):
                 elem_id="citation-button",
                 show_copy_button=True
             )
     gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
     # --------------------------- all --------------------------------
     # event wiring for the Perplexity results table
     main_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     model_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     # event wiring for the Generation results table
     main_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     model_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     demo.load(
         fn=get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
 demo.launch(share=True)

assets/text.py CHANGED Viewed

@@ -34,13 +34,13 @@ EVALUTION_TEXT= """
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
-For generation, we use the content generated by the model to make prediction.
-In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.
-The following are the results of the evaluation.👇👇👇
 </span> <br><br>
 """ # noqa
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">

 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
+For generation, we use the content generated by the model to make prediction.
+The following are the results of the evaluation. 👇👇👇
 </span> <br><br>
 """ # noqa
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">

changelog.md CHANGED Viewed

@@ -1,6 +1,5 @@
 # CHANGELOG
 ### 2024-7-16
 version: v1.0.0
@@ -67,14 +66,4 @@ version: v1.0.6
         - Deepseek-chat-v3-0324
         - Qwen3
         - Gemma-3
-        - OpenThinker2
-### 2025-7-29
-version: v1.0.7
-    changed:
-    - [1]feat: Update the two models required by Deepexi.
-        - Deepexi-Guard-3B
-        - Qwen2.5-3B-Instruct
-    - [2]feat: Update a new table ChineseGuardBench required by Deepexi.

 # CHANGELOG
 ### 2024-7-16
 version: v1.0.0
         - Deepseek-chat-v3-0324
         - Qwen3
         - Gemma-3
+        - OpenThinker2

data/ChineseGuardBench.csv DELETED Viewed

@@ -1,33 +0,0 @@
-Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
-Deepexi-Guard-3B,1B~5B,89.63 ,89.72 ,85.53 ,94.15 ,14.24 ,5.85
-Qwen3-32B,~30B,88.54 ,89.25 ,89.08 ,88.02 ,9.64 ,11.98
-Qwen3-235B-A22B,>65B,87.92 ,88.96 ,90.86 ,85.17 ,7.66 ,14.83
-Qwen3-235B-A22B-Instruct-2507,>65B,87.81 ,89.13 ,93.27 ,82.96 ,5.35 ,17.04
-GLM-Z1-9B-0414,5B~10B,87.36 ,88.03 ,87.11 ,87.61 ,11.59 ,12.39
-Qwen2.5-72B-Instruct,>65B,86.81 ,88.27 ,92.50 ,81.79 ,5.93 ,18.21
-QwQ-32B,~30B,86.80 ,88.35 ,93.33 ,81.12 ,5.18 ,18.88
-Phi-4,10B~20B,85.95 ,86.88 ,86.90 ,85.02 ,11.45 ,14.98
-Gemma-3-27B-it,~30B,85.29 ,86.78 ,89.83 ,81.19 ,8.22 ,18.81
-DeepSeek-R1-0528,>65B,85.24 ,87.47 ,96.02 ,76.63 ,2.84 ,23.37
-Mistral-Small-3.2-24B-Instruct,~30B,85.07 ,87.03 ,93.14 ,78.29 ,5.15 ,21.71
-GLM-4-9B-chat,5B~10B,84.85 ,86.27 ,88.47 ,81.52 ,9.49 ,18.48
-MD-Judge-v0_2-internlm2_7B,5B~10B,84.63 ,85.88 ,87.03 ,82.37 ,10.98 ,17.63
-DeepSeek-R1-Distill-Qwen-32B,~30B,84.55 ,86.64 ,93.05 ,77.47 ,5.17 ,22.53
-Hunyuan-A13B-Instruct,>65B,84.32 ,86.21 ,90.97 ,78.58 ,6.98 ,21.42
-Moonlight-16B-A3B-Instruct,10B~20B,84.21 ,84.35 ,80.41 ,88.38 ,19.25 ,11.62
-GLM-Z1-32B-0414,~30B,83.40 ,85.75 ,92.63 ,75.85 ,5.40 ,24.15
-Qwen3-8B,5B~10B,83.05 ,85.51 ,92.69 ,75.23 ,5.30 ,24.77
-Qwen2.5-7B-Instruct,5B~10B,82.96 ,84.99 ,89.41 ,77.37 ,8.20 ,22.63
-Qwen2.5-1.5B-Instruct,1B~5B,79.48 ,77.08 ,68.83 ,94.03 ,38.07 ,5.97
-shieldgemma-2B,1B~5B,79.19 ,79.63 ,76.50 ,82.06 ,22.54 ,17.94
-Qwen2.5-3B-Instruct,1B~5B,79.05 ,77.57 ,70.69 ,89.66 ,33.25 ,10.34
-SHTEC_safety_fence_model_7B,5B~10B,78.44 ,82.48 ,93.54 ,67.54 ,4.17 ,32.46
-Qwen3-4B,1B~5B,78.16 ,82.50 ,95.12 ,66.33 ,3.04 ,33.67
-SmolLM3-3B,1B~5B,76.10 ,79.19 ,83.09 ,70.19 ,12.77 ,29.81
-ERNIE-4.5-21B-A3B-Paddle,~20B,75.21 ,80.58 ,94.58 ,62.42 ,3.20 ,37.58
-Qwen3-1.7B,1B~5B,74.46 ,79.34 ,89.36 ,63.82 ,6.79 ,36.18
-internlm2_5-7B-chat,5B~10B,71.52 ,78.49 ,95.34 ,57.22 ,2.50 ,42.78
-Llama-Guard-4-12B,10B~20B,65.66 ,74.64 ,90.99 ,51.36 ,4.54 ,48.64
-Llama-Guard-3-8B,5B~10B,59.33 ,72.44 ,97.80 ,42.58 ,0.86 ,57.42
-DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27 ,65.53 ,90.36 ,30.20 ,2.88 ,69.80
-Gemma-3n-E4B-it,5B~10B,44.05 ,64.88 ,88.80 ,29.29 ,3.30 ,70.71

data/chinese_benchmark_gen.csv CHANGED Viewed

@@ -7,8 +7,6 @@ Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,7
 Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
 Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
 Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
-Deepexi-Guard-3B,1B~5B,78.26/0.0,89.35/0.0,64.16/0.0,72.04/0.0,92.35/0.0
-Qwen2.5-3B-Instruct,1B~5B,71.81/0.0,70.36/0.0,75.36/0.0,73.47/0.0,68.25/0.0
 Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
 Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
 DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38

 Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
 Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
 Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
 Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
 Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
 DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38

data/chinese_benchmark_per.csv CHANGED Viewed

@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
 Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
 Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
 Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
-Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00

 Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
 Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
 Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
+Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00

data/subclass_gen.csv CHANGED Viewed

@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
 QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
 Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
 Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
-OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516

 QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
 Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
 Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
+OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516

data/subclass_per.csv CHANGED Viewed

@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
 QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
 Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
 OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
-Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814

 QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
 Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
 OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
+Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814