add-o1-mini-o3-mini
#6
by
						
jardinet-souffleton
	
							
						- opened
							
					
    	
        results/GenericAgent-GPT-o1-mini/workarena-l1.json
    CHANGED
    
    | @@ -12,5 +12,19 @@ | |
| 12 | 
             
                    "reproducible": "Yes",
         | 
| 13 | 
             
                    "comments": "NA",
         | 
| 14 | 
             
                    "original_or_reproduced": "Original"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 15 | 
             
                }
         | 
| 16 | 
             
            ]
         | 
|  | |
| 12 | 
             
                    "reproducible": "Yes",
         | 
| 13 | 
             
                    "comments": "NA",
         | 
| 14 | 
             
                    "original_or_reproduced": "Original"
         | 
| 15 | 
            +
                },
         | 
| 16 | 
            +
                {
         | 
| 17 | 
            +
                    "agent_name": "GenericAgent-GPT-o1-mini",
         | 
| 18 | 
            +
                    "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", 
         | 
| 19 | 
            +
                    "date_time": "2025-02-07 14:00:00",
         | 
| 20 | 
            +
                    "benchmark": "WorkArena-L1",
         | 
| 21 | 
            +
                    "score": 51.8,
         | 
| 22 | 
            +
                    "std_err": 2.80,
         | 
| 23 | 
            +
                    "benchmark_specific": "No",
         | 
| 24 | 
            +
                    "benchmark_tuned": "No",
         | 
| 25 | 
            +
                    "followed_evaluation_protocol": "Yes", 
         | 
| 26 | 
            +
                    "reproducible": "Yes",
         | 
| 27 | 
            +
                    "comments": "Additional details",
         | 
| 28 | 
            +
                    "original_or_reproduced": "Reproduced"
         | 
| 29 | 
             
                }
         | 
| 30 | 
             
            ]
         | 
    	
        results/GenericAgent-o3-mini/README.md
    ADDED
    
    | @@ -0,0 +1,46 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ### GenericAgent-o3-mini
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            It uses o3-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/agent_configs.py):
         | 
| 6 | 
            +
            ```python
         | 
| 7 | 
            +
            BASE_FLAGS = FLAGS_GPT_4o = GenericPromptFlags(
         | 
| 8 | 
            +
                obs=dp.ObsFlags(
         | 
| 9 | 
            +
                    use_html=False,
         | 
| 10 | 
            +
                    use_ax_tree=True,
         | 
| 11 | 
            +
                    use_focused_element=True,
         | 
| 12 | 
            +
                    use_error_logs=True,
         | 
| 13 | 
            +
                    use_history=True,
         | 
| 14 | 
            +
                    use_past_error_logs=False,
         | 
| 15 | 
            +
                    use_action_history=True,
         | 
| 16 | 
            +
                    use_think_history=False,
         | 
| 17 | 
            +
                    use_diff=False,
         | 
| 18 | 
            +
                    html_type="pruned_html",
         | 
| 19 | 
            +
                    use_screenshot=False,
         | 
| 20 | 
            +
                    use_som=False,
         | 
| 21 | 
            +
                    extract_visible_tag=True,
         | 
| 22 | 
            +
                    extract_clickable_tag=True,
         | 
| 23 | 
            +
                    extract_coords="False",
         | 
| 24 | 
            +
                    filter_visible_elements_only=False,
         | 
| 25 | 
            +
                ),
         | 
| 26 | 
            +
                action=dp.ActionFlags(
         | 
| 27 | 
            +
                    action_set=bgym.HighLevelActionSetArgs(
         | 
| 28 | 
            +
                        subsets=["bid"],
         | 
| 29 | 
            +
                        multiaction=False,
         | 
| 30 | 
            +
                    ),
         | 
| 31 | 
            +
                    long_description=False,
         | 
| 32 | 
            +
                    individual_examples=False,
         | 
| 33 | 
            +
                ),
         | 
| 34 | 
            +
                use_plan=False,
         | 
| 35 | 
            +
                use_criticise=False,
         | 
| 36 | 
            +
                use_thinking=True,
         | 
| 37 | 
            +
                use_memory=False,
         | 
| 38 | 
            +
                use_concrete_example=True,
         | 
| 39 | 
            +
                use_abstract_example=True,
         | 
| 40 | 
            +
                use_hints=True,
         | 
| 41 | 
            +
                enable_chat=False,
         | 
| 42 | 
            +
                max_prompt_tokens=40_000,
         | 
| 43 | 
            +
                be_cautious=True,
         | 
| 44 | 
            +
                extra_instructions=None,
         | 
| 45 | 
            +
            )
         | 
| 46 | 
            +
            ```
         | 
    	
        results/GenericAgent-o3-mini/workarena-l1.json
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            [
         | 
| 2 | 
            +
                {
         | 
| 3 | 
            +
                    "agent_name": "GenericAgent-o3-mini",
         | 
| 4 | 
            +
                    "study_id": "f3e1fcb8-5fc5-4115-9e00-27251508e2c7", 
         | 
| 5 | 
            +
                    "date_time": "2025-02-07 14:00:00",
         | 
| 6 | 
            +
                    "benchmark": "WorkArena-L1",
         | 
| 7 | 
            +
                    "score": 48.2,
         | 
| 8 | 
            +
                    "std_err": 2.80,
         | 
| 9 | 
            +
                    "benchmark_specific": "No",
         | 
| 10 | 
            +
                    "benchmark_tuned": "No",
         | 
| 11 | 
            +
                    "followed_evaluation_protocol": "Yes", 
         | 
| 12 | 
            +
                    "reproducible": "Yes",
         | 
| 13 | 
            +
                    "comments": "Additional details",
         | 
| 14 | 
            +
                    "original_or_reproduced": "Original"
         | 
| 15 | 
            +
                }
         | 
| 16 | 
            +
            ]
         | 
